From 01b53edc9599091f3a9f73f7751cf3816121a460 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Sat, 29 Apr 2017 11:15:39 -0700 Subject: [PATCH] refactor Twitter parser --- README.md | 20 +++++++++ controllers/Parse.php | 80 +----------------------------------- lib/XRay/Fetcher.php | 3 +- lib/XRay/Formats/Twitter.php | 52 +++++++++-------------- lib/XRay/Parser.php | 4 ++ tests/TwitterTest.php | 36 ++++++++-------- 6 files changed, 65 insertions(+), 130 deletions(-) diff --git a/README.md b/README.md index a048ffd..dca7161 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,26 @@ In both cases, the response will be a JSON object containing a key of "type". If You can also make a POST request with the same parameter names. +If you already have an HTML or JSON document you want to parse, you can include it in the `body` parameter. Such a POST request would look like this: + +``` +POST /parse +Content-type: application/x-www-form-urlencoded + +url=https://aaronparecki.com/2016/01/16/11/ +&body=.... +``` + +or, for Twitter/GitHub where you might have JSON: + +``` +POST /parse +Content-type: application/x-www-form-urlencoded + +url=https://github.com/aaronpk/XRay +&body={"repo":......} +``` + ### Authentication If the URL you are fetching requires authentication, include the access token in the parameter "token", and it will be included in an "Authorization" header when fetching the URL. (It is recommended to use a POST request in this case, to avoid the access token potentially being logged as part of the query string.) This is useful for [Private Webmention](https://indieweb.org/Private-Webmention) verification. 
diff --git a/controllers/Parse.php b/controllers/Parse.php index 5b33763..44a8b34 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -62,12 +62,12 @@ class Parse { } $url = $request->get('url'); - $html = $request->get('html'); + $html = $request->get('html') ?: $request->get('body'); if(!$url && !$html) { return $this->respond($response, 400, [ 'error' => 'missing_url', - 'error_description' => 'Provide a URL or HTML to fetch' + 'error_description' => 'Provide a URL or HTML to fetch', ]); } @@ -236,81 +236,5 @@ class Parse { return $element; } - private function parseTwitterURL(&$request, &$response, $url, $match) { - $fields = ['twitter_api_key','twitter_api_secret','twitter_access_token','twitter_access_token_secret']; - $creds = []; - foreach($fields as $f) { - if($v=$request->get($f)) - $creds[$f] = $v; - } - $data = false; - if(count($creds) == 4) { - list($data, $parsed) = Formats\Twitter::parse($url, $match[1], $creds); - } elseif(count($creds) > 0) { - // If only some Twitter credentials were present, return an error - return $this->respond($response, 400, [ - 'error' => 'missing_parameters', - 'error_description' => 'All 4 Twitter credentials must be included in the request' - ]); - } else { - // Accept Tweet JSON and parse that if provided - $json = $request->get('json'); - if($json) { - list($data, $parsed) = Formats\Twitter::parse($url, $match[1], null, $json); - } - // Skip parsing from the Twitter API if they didn't include credentials - } - - if($data) { - if($request->get('include_original')) - $data['original'] = $parsed; - $data['url'] = $url; - $data['code'] = 200; - return $this->respond($response, 200, $data); - } else { - return $this->respond($response, 200, [ - 'data' => [ - 'type' => 'unknown' - ], - 'url' => $url, - 'code' => 0 - ]); - } - } - - private function parseGitHubURL(&$request, &$response, $url) { - $fields = ['github_access_token']; - $creds = []; - foreach($fields as $f) { - if($v=$request->get($f)) - $creds[$f] 
= $v; - } - $data = false; - $json = $request->get('json'); - if($json) { - // Accept GitHub JSON and parse that if provided - list($data, $json, $code) = Formats\GitHub::parse($this->http, $url, null, $json); - } else { - // Otherwise fetch the post unauthenticated or with the provided access token - list($data, $json, $code) = Formats\GitHub::parse($this->http, $url, $creds); - } - - if($data) { - if($request->get('include_original')) - $data['original'] = $json; - $data['url'] = $url; - $data['code'] = $code; - return $this->respond($response, 200, $data); - } else { - return $this->respond($response, 200, [ - 'data' => [ - 'type' => 'unknown' - ], - 'url' => $url, - 'code' => $code - ]); - } - } - } diff --git a/lib/XRay/Fetcher.php b/lib/XRay/Fetcher.php index 9b82909..1ead5e4 100644 --- a/lib/XRay/Fetcher.php +++ b/lib/XRay/Fetcher.php @@ -10,7 +10,7 @@ class Fetcher { public function fetch($url, $opts=[]) { if($opts == false) $opts = []; - + if(isset($opts['timeout'])) $this->http->set_timeout($opts['timeout']); if(isset($opts['max_redirects'])) diff --git a/lib/XRay/Formats/Twitter.php b/lib/XRay/Formats/Twitter.php index 905f24c..7462dd5 100644 --- a/lib/XRay/Formats/Twitter.php +++ b/lib/XRay/Formats/Twitter.php @@ -39,30 +39,17 @@ class Twitter extends Format { return $tweet; } - public static function parse($url, $tweet_id, $creds, $json=null) { + public static function parse($json, $url) { - $host = parse_url($url, PHP_URL_HOST); - if($host == 'twtr.io') { - $tweet_id = self::b60to10($tweet_id); - } + if(is_string($json)) + $tweet = json_decode($json); + else + $tweet = $json; - if($json) { - if(is_string($json)) - $tweet = json_decode($json); - else - $tweet = $json; - } else { - $twitter = new \Twitter($creds['twitter_api_key'], $creds['twitter_api_secret'], $creds['twitter_access_token'], 
$creds['twitter_access_token_secret']); - try { - $tweet = $twitter->request('statuses/show/'.$tweet_id, 'GET', ['tweet_mode'=>'extended']); - } catch(\TwitterException $e) { - return [false, false]; - } + if(!$tweet) { + return self::_unknown(); } - if(!$tweet) - return [false, false]; - $entry = array( 'type' => 'entry', 'url' => $url, @@ -89,9 +76,9 @@ class Twitter extends Format { $repostOf = 'https://twitter.com/' . $reposted->user->screen_name . '/status/' . $reposted->id_str; $entry['repost-of'] = $repostOf; - list($repostedEntry) = self::parse($repostOf, $reposted->id_str, null, $reposted); - if(isset($repostedEntry['refs'])) { - foreach($repostedEntry['refs'] as $k=>$v) { + $repostedEntry = self::parse($reposted, $repostOf); + if(isset($repostedEntry['data']['refs'])) { + foreach($repostedEntry['data']['refs'] as $k=>$v) { $refs[$k] = $v; } } @@ -174,28 +161,27 @@ class Twitter extends Format { // Quoted Status if(property_exists($tweet, 'quoted_status')) { $quoteOf = 'https://twitter.com/' . $tweet->quoted_status->user->screen_name . '/status/' . 
$tweet->quoted_status_id_str; - list($quoted) = self::parse($quoteOf, $tweet->quoted_status_id_str, null, $tweet->quoted_status); - if(isset($quoted['refs'])) { - foreach($quoted['refs'] as $k=>$v) { + $quotedEntry = self::parse($tweet->quoted_status, $quoteOf); + if(isset($quotedEntry['data']['refs'])) { + foreach($quotedEntry['data']['refs'] as $k=>$v) { $refs[$k] = $v; } } - $refs[$quoteOf] = $quoted['data']; + $refs[$quoteOf] = $quotedEntry['data']; } if($author = self::_buildHCardFromTwitterProfile($tweet->user)) { $entry['author'] = $author; } - $response = [ - 'data' => $entry - ]; - if(count($refs)) { - $response['refs'] = $refs; + $entry['refs'] = $refs; } - return [$response, $tweet]; + return [ + 'data' => $entry, + 'original' => $tweet, + ]; } private static function _buildHCardFromTwitterProfile($profile) { diff --git a/lib/XRay/Parser.php b/lib/XRay/Parser.php index 38efb5a..622e12e 100644 --- a/lib/XRay/Parser.php +++ b/lib/XRay/Parser.php @@ -26,6 +26,10 @@ class Parser { return Formats\GitHub::parse($body, $url); } + if(Formats\Twitter::matches($url)) { + return Formats\Twitter::parse($body, $url); + } + if(Formats\XKCD::matches($url)) { return Formats\XKCD::parse($body, $url); } diff --git a/tests/TwitterTest.php b/tests/TwitterTest.php index a766788..f892ad9 100644 --- a/tests/TwitterTest.php +++ b/tests/TwitterTest.php @@ -29,7 +29,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testBasicProfileInfo() { list($url, $json) = $this->loadTweet('818912506496229376'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('aaronpk dev', $data['data']['author']['name']); @@ -43,7 +43,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testProfileWithNonExpandedURL() { list($url, $json) = $this->loadTweet('791704641046052864'); - $data = $this->parse(['url' => $url, 
'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('http://agiletortoise.com', $data['data']['author']['url']); } @@ -51,9 +51,9 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testBasicTestStuff() { list($url, $json) = $this->loadTweet('818913630569664512'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); - $this->assertEquals(200, $data['code']); + $this->assertEquals(null, $data['code']); // no code is expected if we pass in the body $this->assertEquals('https://twitter.com/pkdev/status/818913630569664512', $data['url']); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('A tweet with a URL https://indieweb.org/ #and #some #hashtags', $data['data']['content']['text']); @@ -67,14 +67,14 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testPositiveTimezone() { list($url, $json) = $this->loadTweet('719914707566649344'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals("2016-04-12T16:46:56+01:00", $data['data']['published']); } public function testTweetWithEmoji() { list($url, $json) = $this->loadTweet('818943244553699328'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('Here 🎉 have an emoji', $data['data']['content']['text']); @@ -83,7 +83,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testHTMLEscaping() { list($url, $json) = $this->loadTweet('818928092383166465'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('Double escaping & & amp', $data['data']['content']['text']); @@ -92,7 +92,7 
@@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testTweetWithPhoto() { list($url, $json) = $this->loadTweet('818912506496229376'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('Tweet with a photo and a location', $data['data']['content']['text']); @@ -102,7 +102,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testTweetWithTwoPhotos() { list($url, $json) = $this->loadTweet('818935308813103104'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('Two photos', $data['data']['content']['text']); @@ -113,7 +113,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testTweetWithVideo() { list($url, $json) = $this->loadTweet('818913178260160512'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('Tweet with a video', $data['data']['content']['text']); @@ -123,12 +123,12 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testTweetWithLocation() { list($url, $json) = $this->loadTweet('818912506496229376'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('Tweet with a photo and a location', $data['data']['content']['text']); $this->assertEquals('https://api.twitter.com/1.1/geo/id/ac88a4f17a51c7fc.json', $data['data']['location']); - $location = $data['refs']['https://api.twitter.com/1.1/geo/id/ac88a4f17a51c7fc.json']; + $location = $data['data']['refs']['https://api.twitter.com/1.1/geo/id/ac88a4f17a51c7fc.json']; 
$this->assertEquals('adr', $location['type']); $this->assertEquals('Portland', $location['locality']); $this->assertEquals('United States', $location['country-name']); @@ -138,38 +138,38 @@ class TwitterTest extends PHPUnit_Framework_TestCase { public function testRetweet() { list($url, $json) = $this->loadTweet('818913351623245824'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertArrayNotHasKey('content', $data['data']); $repostOf = 'https://twitter.com/aaronpk/status/817414679131660288'; $this->assertEquals($repostOf, $data['data']['repost-of']); - $tweet = $data['refs'][$repostOf]; + $tweet = $data['data']['refs'][$repostOf]; $this->assertEquals('Yeah that\'s me http://xkcd.com/1782/', $tweet['content']['text']); } public function testRetweetWithPhoto() { list($url, $json) = $this->loadTweet('820039442773798912'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertArrayNotHasKey('content', $data['data']); $this->assertArrayNotHasKey('photo', $data['data']); $repostOf = 'https://twitter.com/phlaimeaux/status/819943954724556800'; $this->assertEquals($repostOf, $data['data']['repost-of']); - $tweet = $data['refs'][$repostOf]; + $tweet = $data['data']['refs'][$repostOf]; $this->assertEquals('this headline is such a rollercoaster', $tweet['content']['text']); } public function testQuotedTweet() { list($url, $json) = $this->loadTweet('818913488609251331'); - $data = $this->parse(['url' => $url, 'json' => $json]); + $data = $this->parse(['url' => $url, 'body' => $json]); $this->assertEquals('entry', $data['data']['type']); $this->assertEquals('Quoted tweet with a #hashtag https://twitter.com/aaronpk/status/817414679131660288', $data['data']['content']['text']); - $tweet = 
$data['refs']['https://twitter.com/aaronpk/status/817414679131660288']; + $tweet = $data['data']['refs']['https://twitter.com/aaronpk/status/817414679131660288']; $this->assertEquals('Yeah that\'s me http://xkcd.com/1782/', $tweet['content']['text']); }