From 4014da6dc77795dba8024c867afbfdc6cc07e838 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Sat, 29 Apr 2017 09:09:46 -0700 Subject: [PATCH] moves fetching logic into a library class --- controllers/Parse.php | 124 +++++--------------------- lib/XRay.php | 10 ++- lib/XRay/Fetch.php | 158 +++++++++++++++++++++++++++++++++ lib/XRay/Formats/Format.php | 36 ++++++++ lib/XRay/Formats/GitHub.php | 116 +++++++++++++++--------- lib/XRay/Formats/Instagram.php | 9 ++ lib/XRay/Formats/Twitter.php | 37 +++++++- lib/XRay/Formats/XKCD.php | 35 +++----- 8 files changed, 351 insertions(+), 174 deletions(-) create mode 100644 lib/XRay/Fetch.php create mode 100644 lib/XRay/Formats/Format.php diff --git a/controllers/Parse.php b/controllers/Parse.php index d0be707..eb35338 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -46,14 +46,15 @@ class Parse { } public function parse(Request $request, Response $response) { + $opts = []; if($request->get('timeout')) { // We might make 2 HTTP requests, so each request gets half the desired timeout - $this->http->set_timeout($request->get('timeout') / 2); + $opts['timeout'] = $request->get('timeout') / 2; } if($request->get('max_redirects') !== null) { - $this->http->set_max_redirects((int)$request->get('max_redirects')); + $opts['max_redirects'] = (int)$request->get('max_redirects'); } if($request->get('pretty')) { @@ -74,115 +75,30 @@ class Parse { // If HTML is provided in the request, parse that, and use the URL provided as the base URL for mf2 resolving $result['body'] = $html; $result['url'] = $url; + $result['code'] = null; } else { - // Attempt some basic URL validation - $scheme = parse_url($url, PHP_URL_SCHEME); - if(!in_array($scheme, ['http','https'])) { - return $this->respond($response, 400, [ - 'error' => 'invalid_url', - 'error_description' => 'Only http and https URLs are supported' - ]); - } - - $host = parse_url($url, PHP_URL_HOST); - if(!$host) { - return $this->respond($response, 400, [ - 'error' => 'invalid_url', - 'error_description' => 'The URL provided was not valid' - ]); - } - - $url = p3k\XRay\normalize_url($url); - - // Check if this is a Twitter URL and if they've provided API credentials, use the API - if(preg_match('/https?:\/\/(?:mobile\.twitter\.com|twitter\.com|twtr\.io)\/(?:[a-z0-9_\/!#]+statuse?s?\/([0-9]+)|([a-zA-Z0-9_]+))/i', $url, $match)) { - return $this->parseTwitterURL($request, $response, $url, $match); + $fetch = new p3k\XRay\Fetch($this->http); + + $fields = [ + 'twitter_api_key','twitter_api_secret','twitter_access_token','twitter_access_token_secret', + 'github_access_token', + 'token' + ]; + foreach($fields as $f) { + if($v=$request->get($f)) + $opts[$f] = $v; } - if($host == 'github.com') { - return $this->parseGitHubURL($request, $response, $url); - } + $result = $fetch->fetch($url, $opts); - // Special-case appspot.com URLs to not follow redirects. - // https://cloud.google.com/appengine/docs/php/urlfetch/ - if(!p3k\XRay\should_follow_redirects($url)) { - $this->http->set_max_redirects(0); - $this->http->set_transport(new p3k\HTTP\Stream()); - } else { - $this->http->set_transport(new p3k\HTTP\Curl()); - } - - // Now fetch the URL and check for any curl errors - // Don't cache the response if a token is used to fetch it - if($this->mc && !$request->get('token')) { - $cacheKey = 'xray-'.md5($url); - if($cached=$this->mc->get($cacheKey)) { - $result = json_decode($cached, true); - self::debug('using HTML from cache', 'X-Cache-Debug'); - } else { - $result = $this->http->get($url); - $cacheData = json_encode($result); - // App Engine limits the size of cached items, so don't cache ones larger than that - if(strlen($cacheData) < 1000000) - $this->mc->set($cacheKey, $cacheData, MEMCACHE_COMPRESSED, $this->_cacheTime); - } - } else { - $headers = []; - if($request->get('token')) { - $headers[] = 'Authorization: Bearer ' . $request->get('token'); - } - - $result = $this->http->get($url, $headers); - } - - if($result['error']) { - return $this->respond($response, 200, [ - 'error' => $result['error'], - 'error_description' => $result['error_description'], - 'url' => $result['url'], - 'code' => $result['code'] - ]); - } - - if(trim($result['body']) == '') { - if($result['code'] == 410) { - // 410 Gone responses are valid and should not return an error - return $this->respond($response, 200, [ - 'data' => [ - 'type' => 'unknown' - ], - 'url' => $result['url'], - 'code' => $result['code'] - ]); - } - - return $this->respond($response, 200, [ - 'error' => 'no_content', - 'error_description' => 'We did not get a response body when fetching the URL', - 'url' => $result['url'], - 'code' => $result['code'] - ]); + if(!empty($result['error'])) { + $error_code = isset($result['error_code']) ? $result['error_code'] : 200; + unset($result['error_code']); + return $this->respond($response, $error_code, $result); } + } - // Check for HTTP 401/403 - if($result['code'] == 401) { - return $this->respond($response, 200, [ - 'error' => 'unauthorized', - 'error_description' => 'The URL returned "HTTP 401 Unauthorized"', - 'url' => $result['url'], - 'code' => 401 - ]); - } - if($result['code'] == 403) { - return $this->respond($response, 200, [ - 'error' => 'forbidden', - 'error_description' => 'The URL returned "HTTP 403 Forbidden"', - 'url' => $result['url'], - 'code' => 403 - ]); - } - } // Check for known services $host = parse_url($result['url'], PHP_URL_HOST); diff --git a/lib/XRay.php b/lib/XRay.php index 73cca7a..f633045 100644 --- a/lib/XRay.php +++ b/lib/XRay.php @@ -14,8 +14,14 @@ class XRay { } public function parse($url, $opts=[]) { - $parser = new XRay\Parser($this->http); - return $parser->parse($url, $opts); + $fetch = new XRay\Fetch($this->http); + $response = $fetch->fetch($url, $opts); + return $this->parse_doc($response, $url, $opts); + } + + public function parse_doc($response, $url=false, $opts=[]) { + + } } diff --git a/lib/XRay/Fetch.php b/lib/XRay/Fetch.php new file mode 100644 index 0000000..dda839f --- /dev/null +++ b/lib/XRay/Fetch.php @@ -0,0 +1,158 @@ +http = $http; + } + + public function fetch($url, $opts=[]) { + if(isset($opts['timeout'])) + $this->http->set_timeout($opts['timeout']); + if(isset($opts['max_redirects'])) + $this->http->set_max_redirects($opts['max_redirects']); + + // Attempt some basic URL validation + $scheme = parse_url($url, PHP_URL_SCHEME); + if(!in_array($scheme, ['http','https'])) { + return [ + 'error_code' => 400, + 'error' => 'invalid_url', + 'error_description' => 'Only http and https URLs are supported' + ]; + } + + $host = parse_url($url, PHP_URL_HOST); + if(!$host) { + return [ + 'error_code' => 400, + 'error' => 'invalid_url', + 'error_description' => 'The URL provided was not valid' + ]; + } + + $url = normalize_url($url); + $host = parse_url($url, PHP_URL_HOST); + + // Check if this is a Twitter URL and if they've provided API credentials, use the API + if(Formats\Twitter::matches_host($url)) { + return $this->_fetch_tweet($url, $opts); + } + + if(Formats\GitHub::matches_host($url)) { + return $this->_fetch_github($url, $opts); + } + + // Special-case appspot.com URLs to not follow redirects. + // https://cloud.google.com/appengine/docs/php/urlfetch/ + if(!should_follow_redirects($url)) { + $this->http->set_max_redirects(0); + $this->http->set_transport(new \p3k\HTTP\Stream()); + } else { + $this->http->set_transport(new \p3k\HTTP\Curl()); + } + + $headers = []; + if(isset($opts['token'])) + $headers[] = 'Authorization: Bearer ' . $opts['token']; + + $result = $this->http->get($url, $headers); + + if($result['error']) { + return [ + 'error' => $result['error'], + 'error_description' => $result['error_description'], + 'url' => $result['url'], + 'code' => $result['code'], + ]; + } + + if(trim($result['body']) == '') { + if($result['code'] == 410) { + // 410 Gone responses are valid and should not return an error + return $this->respond($response, 200, [ + 'TODO' => [ + ], + 'url' => $result['url'], + 'code' => $result['code'] + ]); + } + + return [ + 'error' => 'no_content', + 'error_description' => 'We did not get a response body when fetching the URL', + 'url' => $result['url'], + 'code' => $result['code'] + ]; + } + + // Check for HTTP 401/403 + if($result['code'] == 401) { + return [ + 'error' => 'unauthorized', + 'error_description' => 'The URL returned "HTTP 401 Unauthorized"', + 'url' => $result['url'], + 'code' => $result['code'] + ]; + } + if($result['code'] == 403) { + return [ + 'error' => 'forbidden', + 'error_description' => 'The URL returned "HTTP 403 Forbidden"', + 'url' => $result['url'], + 'code' => $result['code'] + ]; + } + + return [ + 'url' => $result['url'], + 'body' => $result['body'], + 'code' => $result['code'], + ]; + } + + private function _fetch_tweet($url, $opts) { + $fields = ['twitter_api_key','twitter_api_secret','twitter_access_token','twitter_access_token_secret']; + $creds = []; + foreach($fields as $f) { + if(isset($opts[$f])) + $creds[$f] = $opts[$f]; + } + + if(count($creds) < 4) { + return [ + 'error_code' => 400, + 'error' => 'missing_parameters', + 'error_description' => 'All 4 Twitter credentials must be included in the request' + ]; + } + + $tweet = Formats\Twitter::fetch($url, $creds); + if(!$tweet) { + return [ + 'error' => 'twitter_error', + 'error_description' => $e->getMessage() + ]; + } + + return [ + 'url' => $url, + 'body' => $tweet, + 'code' => 200, + ]; + } + + private function _fetch_github($url, $opts) { + $fields = ['github_access_token']; + $creds = []; + foreach($fields as $f) { + if(isset($opts[$f])) + $creds[$f] = $opts[$f]; + } + + return Formats\GitHub::fetch($this->http, $url, $creds); + } + +} diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php new file mode 100644 index 0000000..47e9625 --- /dev/null +++ b/lib/XRay/Formats/Format.php @@ -0,0 +1,36 @@ + [ + 'type' => 'unknown' + ] + ]; + } + + protected static function _loadHTML($html) { + $doc = new DOMDocument(); + @$doc->loadHTML($html); + + if(!$doc) { + return [null, null]; + } + + $xpath = new DOMXPath($doc); + + return [$doc, $xpath]; + } + +} diff --git a/lib/XRay/Formats/GitHub.php b/lib/XRay/Formats/GitHub.php index 766356d..91c8e96 100644 --- a/lib/XRay/Formats/GitHub.php +++ b/lib/XRay/Formats/GitHub.php @@ -2,53 +2,85 @@ namespace p3k\XRay\Formats; use DateTime, DateTimeZone; -use Parse, Config; +use Config; use cebe\markdown\GithubMarkdown; -class GitHub { +class GitHub extends Format { + + public static function matches_host($url) { + $host = parse_url($url, PHP_URL_HOST); + return $host == 'github.com'; + } + + public static function matches($url) { + return preg_match('~https://github.com/([^/]+)/([^/]+)/pull/(\d+)$~', $url, $match) + || preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)$~', $url, $match) + || preg_match('~https://github.com/([^/]+)/([^/]+)$~', $url, $match) + || preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)#issuecomment-(\d+)~', $url, $match); + } + + public static function fetch($http, $url, $creds) { + // Transform the GitHub URL to an API request + if(preg_match('~https://github.com/([^/]+)/([^/]+)/pull/(\d+)$~', $url, $match)) { + $type = 'pull'; + $org = $match[1]; + $repo = $match[2]; + $pull = $match[3]; + $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/pulls/'.$pull; + + } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)$~', $url, $match)) { + $type = 'issue'; + $org = $match[1]; + $repo = $match[2]; + $issue = $match[3]; + $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/issues/'.$issue; + + } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)$~', $url, $match)) { + $type = 'repo'; + $org = $match[1]; + $repo = $match[2]; + $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo; + + } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)#issuecomment-(\d+)~', $url, $match)) { + $type = 'comment'; + $org = $match[1]; + $repo = $match[2]; + $issue = $match[3]; + $comment = $match[4]; + $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/issues/comments/'.$comment; + + } else { + return [ + 'error' => 'unsupported_url', + 'error_description' => 'This GitHub URL is not supported', + 'error_code' => 400, + ]; + } + + $headers = []; + if(isset($creds['github_access_token'])) { + $headers[] = 'Authorization: Bearer ' . $creds['github_access_token']; + } + + $response = $http->get($apiurl, $headers); + if($response['code'] != 200) { + return [ + 'error' => 'github_error', + 'error_description' => $response['body'], + 'code' => $response['code'], + ]; + } + + return [ + 'url' => $url, + 'body' => $response['body'], + 'code' => $response['code'], + ]; + } public static function parse($http, $url, $creds, $json=null) { - if(!$json) { - // Transform the GitHub URL to an API request - if(preg_match('~https://github.com/([^/]+)/([^/]+)/pull/(\d+)$~', $url, $match)) { - $type = 'pull'; - $org = $match[1]; - $repo = $match[2]; - $pull = $match[3]; - $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/pulls/'.$pull; - - } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)$~', $url, $match)) { - $type = 'issue'; - $org = $match[1]; - $repo = $match[2]; - $issue = $match[3]; - $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/issues/'.$issue; - - } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)$~', $url, $match)) { - $type = 'repo'; - $org = $match[1]; - $repo = $match[2]; - $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo; - - } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)#issuecomment-(\d+)~', $url, $match)) { - $type = 'comment'; - $org = $match[1]; - $repo = $match[2]; - $issue = $match[3]; - $comment = $match[4]; - $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/issues/comments/'.$comment; - - } else { - return [null, null, 0]; - } - - $response = $http->get($apiurl, ['User-Agent: XRay ('.Config::$base.')']); - if($response['code'] != 200) { - return [null, $response['body'], $response['code']]; - } - - $data = json_decode($response['body'], true); + if(false) { } else { $data = json_decode($json, true); } diff --git a/lib/XRay/Formats/Instagram.php b/lib/XRay/Formats/Instagram.php index f49ff15..1cfaee5 100644 --- a/lib/XRay/Formats/Instagram.php +++ b/lib/XRay/Formats/Instagram.php @@ -7,6 +7,15 @@ use Parse; class Instagram { + public static function matches_host($url) { + $host = parse_url($url, PHP_URL_HOST); + return in_array($host, ['www.instagram.com','instagram.com']); + } + + public static function matches($url) { + return self::matches_host($url); + } + public static function parse($html, $url, $http) { $photoData = self::_extractPhotoDataFromPhotoPage($html); diff --git a/lib/XRay/Formats/Twitter.php b/lib/XRay/Formats/Twitter.php index 246d4d2..905f24c 100644 --- a/lib/XRay/Formats/Twitter.php +++ b/lib/XRay/Formats/Twitter.php @@ -2,9 +2,42 @@ namespace p3k\XRay\Formats; use DateTime, DateTimeZone; -use Parse; -class Twitter { +class Twitter extends Format { + + public static function matches_host($url) { + $host = parse_url($url, PHP_URL_HOST); + return in_array($host, ['mobile.twitter.com','twitter.com','www.twitter.com','twtr.io']); + } + + public static function matches($url) { + if(preg_match('/https?:\/\/(?:mobile\.twitter\.com|twitter\.com|twtr\.io)\/(?:[a-z0-9_\/!#]+statuse?s?\/([0-9]+)|([a-zA-Z0-9_]+))/i', $url, $match)) + return $match; + else + return false; + } + + public static function fetch($url, $creds) { + if(!($match = self::matches($url))) { + return false; + } + + $tweet_id = $match[1]; + + $host = parse_url($url, PHP_URL_HOST); + if($host == 'twtr.io') { + $tweet_id = self::b60to10($tweet_id); + } + + $twitter = new \Twitter($creds['twitter_api_key'], $creds['twitter_api_secret'], $creds['twitter_access_token'], $creds['twitter_access_token_secret']); + try { + $tweet = $twitter->request('statuses/show/'.$tweet_id, 'GET', ['tweet_mode'=>'extended']); + } catch(\TwitterException $e) { + return false; + } + + return $tweet; + } public static function parse($url, $tweet_id, $creds, $json=null) { diff --git a/lib/XRay/Formats/XKCD.php b/lib/XRay/Formats/XKCD.php index acdb6e8..d7dc687 100644 --- a/lib/XRay/Formats/XKCD.php +++ b/lib/XRay/Formats/XKCD.php @@ -1,11 +1,19 @@ [ - 'type' => 'unknown' - ] - ]; - } - - private static function _loadHTML($html) { - $doc = new DOMDocument(); - @$doc->loadHTML($html); - - if(!$doc) { - return [null, null]; - } - - $xpath = new DOMXPath($doc); - - return [$doc, $xpath]; - } - }