You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

392 rivejä
12 KiB

10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
10 vuotta sitten
  1. <?php
  2. use Symfony\Component\HttpFoundation\Request;
  3. use Symfony\Component\HttpFoundation\Response;
  4. use XRay\Formats;
  /**
   * Controller for the "parse" endpoint.
   *
   * Takes either a URL to fetch or HTML supplied in the request, hands the
   * document to a service-specific parser (Twitter, GitHub, Instagram, XKCD)
   * or to the microformats2 parser, and writes the outcome to the response
   * as JSON.
   */
  class Parse {

    /** HTTP client used for every outgoing request (created in the constructor). */
    public $http;

    /** Memcache connection; stays null unless Config::$cache is on and the Memcache class exists. */
    public $mc;

    /** Expiry value passed to Memcache::set() for cached fetch results. */
    private $_cacheTime = 120;

    /** When true, respond() pretty-prints the JSON body. */
    private $_pretty = false;

    /**
     * Returns the complete User-Agent request header line (name and value)
     * that is sent with outgoing requests. The value embeds Config::$base.
     *
     * @return string
     */
    public static function useragent() {
      return 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36 XRay/1.0.0 ('.\Config::$base.')';
    }

    /**
     * Creates the HTTP client and, when caching is enabled in Config and the
     * Memcache extension is available, connects to Memcache on 127.0.0.1.
     */
    public function __construct() {
      $this->http = new p3k\HTTP();
      if(Config::$cache && class_exists('Memcache')) {
        $this->mc = new Memcache();
        $this->mc->addServer('127.0.0.1');
      }
    }

    /**
     * Writes a debug message to syslog and, when $_SERVER has a REMOTE_ADDR
     * entry, also exposes it as a response header.
     *
     * @param string $msg    Message to log
     * @param string $header Name of the response header that carries the message
     */
    public static function debug($msg, $header='X-Parse-Debug') {
      syslog(LOG_INFO, $msg);
      // header() only works while headers are still unsent; skip it otherwise
      // instead of raising a warning.
      if(array_key_exists('REMOTE_ADDR', $_SERVER) && !headers_sent())
        header($header . ": " . $msg);
    }

    /**
     * Fills in the response with a status code, optional extra headers and a
     * JSON-encoded body (terminated by a newline).
     *
     * @param Response $response Response object to populate
     * @param int      $code     HTTP status code
     * @param array    $params   Data to encode as the JSON body
     * @param array    $headers  Extra headers as name => value
     * @return Response The same response object
     */
    private function respond(Response $response, $code, $params, $headers=[]) {
      $response->setStatusCode($code);
      foreach($headers as $name=>$value) {
        $response->headers->set($name, $value);
      }
      // Set last so the content type always wins over caller-supplied headers
      $response->headers->set('Content-Type', 'application/json');

      $opts = JSON_UNESCAPED_SLASHES;
      if($this->_pretty)
        $opts |= JSON_PRETTY_PRINT;

      $response->setContent(json_encode($params, $opts)."\n");
      return $response;
    }

    /**
     * Responds with HTTP 200 and the generic "nothing recognised" payload.
     *
     * @param Response $response
     * @param mixed    $url  Value for the "url" key
     * @param mixed    $code Value for the "code" key
     * @return Response
     */
    private function respondUnknown(Response $response, $url, $code) {
      return $this->respond($response, 200, [
        'data' => [
          'type' => 'unknown'
        ],
        'url' => $url,
        'code' => $code
      ]);
    }

    /**
     * Converts the input to the HTML-ENTITIES encoding so that non-ASCII
     * characters reach DOMDocument::loadHTML() as entities.
     *
     * @param string $input
     * @return string|false Result of mb_convert_encoding()
     */
    private static function toHtmlEntities($input) {
      $encoding = mb_detect_encoding($input);
      // mb_detect_encoding() returns false when it cannot decide; passing that
      // on to mb_convert_encoding() is an error, so assume UTF-8 instead.
      if(!$encoding)
        $encoding = 'UTF-8';
      return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
    }

    /**
     * Collects the truthy request parameters among the given field names.
     *
     * @param Request $request
     * @param array   $fields Parameter names to look up
     * @return array  name => value, containing only the fields that were provided
     */
    private static function requestParams(Request $request, $fields) {
      $values = [];
      foreach($fields as $field) {
        $value = $request->get($field);
        if($value)
          $values[$field] = $value;
      }
      return $values;
    }

    /**
     * Applies the optional "timeout", "max_redirects" and "pretty" request
     * parameters to the HTTP client and to this instance.
     *
     * @param Request $request
     */
    private function applyRequestOptions(Request $request) {
      $timeout = $request->get('timeout');
      // Ignore non-numeric and non-positive values rather than doing
      // arithmetic on them.
      if(is_numeric($timeout) && $timeout > 0) {
        // We might make 2 HTTP requests, so each request gets half the desired timeout
        $this->http->timeout = $timeout / 2;
      }

      $maxRedirects = $request->get('max_redirects');
      // is_numeric() rather than a truthiness test so that "0" is honoured
      if(is_numeric($maxRedirects)) {
        $this->http->max_redirects = (int)$maxRedirects;
      }

      if($request->get('pretty')) {
        $this->_pretty = true;
      }
    }

    /**
     * Fetches the URL, going through Memcache when it is available and the
     * request carries no token.
     *
     * @param Request $request Read for the optional "token" parameter
     * @param string  $url     URL to fetch
     * @return array  Whatever $this->http->get() returned (possibly restored from the cache)
     */
    private function fetch(Request $request, $url) {
      $headers = [self::useragent()];

      $token = $request->get('token');
      if($token) {
        $headers[] = 'Authorization: Bearer ' . $token;
      }

      // Don't cache the response if a token is used to fetch it
      if(!$this->mc || $token) {
        return $this->http->get($url, $headers);
      }

      $cacheKey = 'xray-'.md5($url);
      $cached = $this->mc->get($cacheKey);
      if($cached) {
        $result = json_decode($cached, true);
        // Only trust the cache entry if it decodes back into an array;
        // otherwise fall through and fetch the URL again.
        if(is_array($result)) {
          self::debug('using HTML from cache', 'X-Cache-Debug');
          return $result;
        }
      }

      $result = $this->http->get($url, $headers);
      $cacheData = json_encode($result);
      // json_encode() returns false when the result cannot be encoded; there is nothing to store then.
      // App Engine limits the size of cached items, so don't cache ones larger than that
      if($cacheData !== false && strlen($cacheData) < 1000000)
        $this->mc->set($cacheKey, $cacheData, MEMCACHE_COMPRESSED, $this->_cacheTime);

      return $result;
    }

    /**
     * Inspects a fetch result and builds the matching response when the fetch
     * did not produce a usable document.
     *
     * All of these responses use HTTP 200; the problem is described in the JSON body.
     *
     * @param Response $response
     * @param array    $result Fetch result with "error", "error_description", "body", "url" and "code" keys
     * @return Response|null The response to return, or null when the document can be parsed
     */
    private function fetchFailureResponse(Response $response, $result) {
      if(!empty($result['error'])) {
        return $this->respond($response, 200, [
          'error' => $result['error'],
          'error_description' => $result['error_description'],
          'url' => $result['url'],
          'code' => $result['code']
        ]);
      }

      $body = isset($result['body']) ? (string)$result['body'] : '';
      if(trim($body) == '') {
        if($result['code'] == 410) {
          // 410 Gone responses are valid and should not return an error
          return $this->respondUnknown($response, $result['url'], $result['code']);
        }

        return $this->respond($response, 200, [
          'error' => 'no_content',
          'error_description' => 'We did not get a response body when fetching the URL',
          'url' => $result['url'],
          'code' => $result['code']
        ]);
      }

      // Check for HTTP 401/403
      if($result['code'] == 401) {
        return $this->respond($response, 200, [
          'error' => 'unauthorized',
          'error_description' => 'The URL returned "HTTP 401 Unauthorized"',
          'url' => $result['url'],
          'code' => 401
        ]);
      }

      if($result['code'] == 403) {
        return $this->respond($response, 200, [
          'error' => 'forbidden',
          'error_description' => 'The URL returned "HTTP 403 Forbidden"',
          'url' => $result['url'],
          'code' => 403
        ]);
      }

      return null;
    }

    /**
     * Tells whether the document references the target URL from an <a href>,
     * <img src>, <video src> or <audio src> attribute. Attribute values are
     * compared to the target exactly as written in the document.
     *
     * @param DOMXPath $xpath
     * @param string   $target
     * @return bool
     */
    private static function documentLinksTo($xpath, $target) {
      $found = false;
      $check = function($value) use($target, &$found) {
        if($value === $target)
          $found = true;
      };

      $sources = [
        ['a', 'href'],
        ['img', 'src'],
        ['video', 'src'],
        ['audio', 'src']
      ];
      foreach($sources as $source) {
        self::xPathFindNodeWithAttribute($xpath, $source[0], $source[1], $check);
      }

      return $found;
    }

    /**
     * Handles a parse request.
     *
     * Request parameters read here: url, html, timeout, max_redirects, pretty,
     * token, target, include_original (plus the service-specific ones read by
     * parseTwitterURL() and parseGitHubURL()).
     *
     * Invalid input is answered with HTTP 400. Everything that happens after
     * input validation is answered with HTTP 200, with failures described by
     * the "error" and "error_description" keys of the JSON body.
     *
     * @param Request  $request
     * @param Response $response
     * @return Response
     */
    public function parse(Request $request, Response $response) {
      $this->applyRequestOptions($request);

      $url = $request->get('url');
      $html = $request->get('html');

      if(!$url && !$html) {
        return $this->respond($response, 400, [
          'error' => 'missing_url',
          'error_description' => 'Provide a URL or HTML to fetch'
        ]);
      }

      if($html) {
        // If HTML is provided in the request, parse that, and use the URL provided as the base URL for mf2 resolving.
        // Nothing was fetched, so there is no status code to report.
        $result = [
          'body' => $html,
          'url' => $url,
          'code' => null
        ];
      } else {
        // Attempt some basic URL validation
        $scheme = parse_url($url, PHP_URL_SCHEME);
        if(!in_array($scheme, ['http','https'], true)) {
          return $this->respond($response, 400, [
            'error' => 'invalid_url',
            'error_description' => 'Only http and https URLs are supported'
          ]);
        }

        $host = parse_url($url, PHP_URL_HOST);
        if(!$host) {
          return $this->respond($response, 400, [
            'error' => 'invalid_url',
            'error_description' => 'The URL provided was not valid'
          ]);
        }

        $url = \normalize_url($url);

        // Check if this is a Twitter URL and if they've provided API credentials, use the API
        if(preg_match('/https?:\/\/(?:mobile\.twitter\.com|twitter\.com|twtr\.io)\/(?:[a-z0-9_\/!#]+statuse?s?\/([0-9]+)|([a-zA-Z0-9_]+))/i', $url, $match)) {
          return $this->parseTwitterURL($request, $response, $url, $match);
        }

        if($host == 'github.com') {
          return $this->parseGitHubURL($request, $response, $url);
        }

        // Now fetch the URL and check for any curl errors
        $result = $this->fetch($request, $url);

        $failure = $this->fetchFailureResponse($response, $result);
        if($failure) {
          return $failure;
        }
      }

      // Check for known services
      $host = parse_url((string)$result['url'], PHP_URL_HOST);

      if(in_array($host, ['www.instagram.com','instagram.com'], true)) {
        list($data, $parsed) = Formats\Instagram::parse($result['body'], $result['url'], $this->http);
        if($request->get('include_original'))
          $data['original'] = $parsed;
        $data['url'] = $result['url'];
        $data['code'] = $result['code'];
        return $this->respond($response, 200, $data);
      }

      if($host === 'xkcd.com' && parse_url((string)$url, PHP_URL_PATH) !== '/') {
        $data = Formats\XKCD::parse($result['body'], $url);
        $data['url'] = $result['url'];
        $data['code'] = $result['code'];
        return $this->respond($response, 200, $data);
      }

      // attempt to parse the page as HTML
      $doc = new DOMDocument();
      $markup = self::toHtmlEntities($result['body']);
      // loadHTML() reports failure through its return value, and must not be handed empty input
      $loaded = ($markup !== false && $markup !== '') && @$doc->loadHTML($markup);
      if(!$loaded) {
        return $this->respond($response, 200, [
          'error' => 'invalid_content',
          'error_description' => 'The document could not be parsed as HTML'
        ]);
      }

      $xpath = new DOMXPath($doc);

      // Check for meta http equiv and replace the status code if present
      foreach($xpath->query('//meta[translate(@http-equiv,\'STATUS\',\'status\')=\'status\']') as $el) {
        $equivStatus = ''.$el->getAttribute('content');
        if($equivStatus && preg_match('/^(\d+)/', $equivStatus, $match)) {
          $result['code'] = (int)$match[1];
        }
      }

      // If a target parameter was provided, make sure a link to it exists on the page
      $target = $request->get('target');
      if($target && !self::documentLinksTo($xpath, $target)) {
        return $this->respond($response, 200, [
          'error' => 'no_link_found',
          'error_description' => 'The source document does not have a link to the target URL',
          'url' => $result['url'],
          'code' => $result['code'],
        ]);
      }

      // If the URL has a fragment ID, find the DOM starting at that node and parse it instead
      $html = $result['body'];
      $foundFragment = false;
      $fragment = parse_url((string)$url, PHP_URL_FRAGMENT);
      if($fragment) {
        $fragElement = self::xPathGetElementById($xpath, $fragment);
        if($fragElement) {
          $html = $doc->saveHTML($fragElement);
          $foundFragment = true;
        }
      }

      // Now start pulling in the data from the page. Start by looking for microformats2
      $mf2 = mf2\Parse($html, $result['url']);

      if(is_array($mf2) && !empty($mf2['items'])) {
        $data = Formats\Mf2::parse($mf2, $result['url'], $this->http);
        if($data) {
          if($fragment) {
            $data['info'] = [
              'found_fragment' => $foundFragment
            ];
          }
          if($request->get('include_original'))
            $data['original'] = $html;
          $data['url'] = $result['url']; // this will be the effective URL after following redirects
          $data['code'] = $result['code'];
          return $this->respond($response, 200, $data);
        }
      }

      // TODO: look for other content like OEmbed or other known services later

      return $this->respondUnknown($response, $result['url'], $result['code']);
    }

    /**
     * Calls $callback with the attribute value of every element named $node
     * that carries the attribute $attr.
     *
     * @param DOMXPath $xpath
     * @param string   $node     Element name, e.g. "a"
     * @param string   $attr     Attribute name, e.g. "href"
     * @param callable $callback Receives the attribute value as its only argument
     */
    private static function xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
      foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) {
        $callback($el->getAttribute($attr));
      }
    }

    /**
     * Finds the element whose id attribute equals $id. When several elements
     * share the id, the last one in document order is returned.
     *
     * The id comes from a URL fragment, so it is compared in PHP rather than
     * being interpolated into the XPath expression: XPath string literals
     * have no escape syntax, and a quote in the id would otherwise break or
     * alter the query.
     *
     * @param DOMXPath $xpath
     * @param string   $id
     * @return DOMElement|null
     */
    private static function xPathGetElementById($xpath, $id) {
      $element = null;
      foreach($xpath->query('//*[@id]') as $el) {
        if($el->getAttribute('id') === (string)$id) {
          $element = $el;
        }
      }
      return $element;
    }

    /**
     * Handles a URL that matched the Twitter pattern in parse().
     *
     * Uses the Twitter parser when all four API credentials are present in the
     * request, or when Tweet JSON is supplied in the "json" parameter. A
     * partial set of credentials is answered with HTTP 400. Without
     * credentials or JSON the result is the generic "unknown" payload.
     *
     * @param Request  $request
     * @param Response $response
     * @param string   $url   The normalized URL
     * @param array    $match preg_match() groups from the Twitter URL pattern
     * @return Response
     */
    private function parseTwitterURL(Request $request, Response $response, $url, $match) {
      $creds = self::requestParams($request, [
        'twitter_api_key',
        'twitter_api_secret',
        'twitter_access_token',
        'twitter_access_token_secret'
      ]);

      $data = false;
      $parsed = null;

      if(count($creds) == 4) {
        list($data, $parsed) = Formats\Twitter::parse($url, $match[1], $creds);
      } elseif(count($creds) > 0) {
        // If only some Twitter credentials were present, return an error
        return $this->respond($response, 400, [
          'error' => 'missing_parameters',
          'error_description' => 'All 4 Twitter credentials must be included in the request'
        ]);
      } else {
        // Accept Tweet JSON and parse that if provided
        $json = $request->get('json');
        if($json) {
          list($data, $parsed) = Formats\Twitter::parse($url, $match[1], null, $json);
        }
        // Skip parsing from the Twitter API if they didn't include credentials
      }

      if(!$data) {
        return $this->respondUnknown($response, $url, 0);
      }

      if($request->get('include_original'))
        $data['original'] = $parsed;
      $data['url'] = $url;
      $data['code'] = 200;
      return $this->respond($response, 200, $data);
    }

    /**
     * Handles a URL whose host is github.com.
     *
     * Parses the GitHub JSON supplied in the "json" parameter when present;
     * otherwise lets the GitHub parser fetch the URL, passing along the
     * "github_access_token" parameter if one was provided.
     *
     * @param Request  $request
     * @param Response $response
     * @param string   $url The normalized URL
     * @return Response
     */
    private function parseGitHubURL(Request $request, Response $response, $url) {
      $creds = self::requestParams($request, ['github_access_token']);

      $json = $request->get('json');
      if($json) {
        // Accept GitHub JSON and parse that if provided
        list($data, $original, $code) = Formats\GitHub::parse($this->http, $url, null, $json);
      } else {
        // Otherwise fetch the post unauthenticated or with the provided access token
        list($data, $original, $code) = Formats\GitHub::parse($this->http, $url, $creds);
      }

      if(!$data) {
        return $this->respondUnknown($response, $url, $code);
      }

      if($request->get('include_original'))
        $data['original'] = $original;
      $data['url'] = $url;
      $data['code'] = $code;
      return $this->respond($response, 200, $data);
    }

  }