diff --git a/controllers/Parse.php b/controllers/Parse.php index 6e7698e..ca545f8 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -106,7 +106,7 @@ class Parse { } $parser = new p3k\XRay\Parser($this->http); - $parsed = $parser->parse($result['body'], $result['url'], $opts); + $parsed = $parser->parse($result, $opts); // Allow the parser to override the HTTP response code, e.g. a meta-equiv tag if(isset($parsed['code'])) diff --git a/lib/XRay.php b/lib/XRay.php index 7df225c..9758b24 100644 --- a/lib/XRay.php +++ b/lib/XRay.php @@ -35,7 +35,11 @@ class XRay { } $parser = new XRay\Parser($this->http); - $result = $parser->parse($body, $url, $opts); + $result = $parser->parse([ + 'body' => $body, + 'url' => $url, + 'code' => $code, + ], $opts); if(!isset($opts['include_original']) || !$opts['include_original']) unset($result['original']); if(!isset($result['url'])) $result['url'] = $url; @@ -46,7 +50,11 @@ class XRay { public function process($url, $mf2json, $opts=[]) { $parser = new XRay\Parser($this->http); - $result = $parser->parse($mf2json, $url, $opts); + $result = $parser->parse([ + 'body' => $mf2json, + 'url' => $url, + 'code' => null, + ], $opts); if(!isset($opts['include_original']) || !$opts['include_original']) unset($result['original']); if(!isset($result['url'])) $result['url'] = $url; diff --git a/lib/XRay/Feeds.php b/lib/XRay/Feeds.php index 111d50b..1af19da 100644 --- a/lib/XRay/Feeds.php +++ b/lib/XRay/Feeds.php @@ -75,7 +75,7 @@ class Feeds { } else { // Some other document was returned, parse the HTML and look for rel alternates and Microformats - $mf2 = \mf2\Parse($body, $result['url']); + $mf2 = \mf2\Parse($result['body'], $result['url']); if(isset($mf2['rel-urls'])) { foreach($mf2['rel-urls'] as $rel=>$info) { if(isset($info['rels']) && in_array('alternate', $info['rels'])) { @@ -103,7 +103,7 @@ class Feeds { } } - $parsed = Formats\HTML::parse($this->http, $body, $result['url'], array_merge($opts, ['expect'=>'feed'])); + $parsed = Formats\HTML::parse($this->http, $result, array_merge($opts, ['expect'=>'feed'])); if($parsed && isset($parsed['data']['type']) && $parsed['data']['type'] == 'feed') { $feeds[] = [ 'url' => $result['url'], diff --git a/lib/XRay/Fetcher.php b/lib/XRay/Fetcher.php index 1af5fc4..6f22822 100644 --- a/lib/XRay/Fetcher.php +++ b/lib/XRay/Fetcher.php @@ -116,13 +116,7 @@ class Fetcher { if(trim($result['body']) == '') { if($result['code'] == 410) { // 410 Gone responses are valid and should not return an error - return [ - 'data' => [ - 'type' => 'deleted' - ], - 'url' => $result['url'], - 'code' => $result['code'] - ]; + return $result; } return [ diff --git a/lib/XRay/Formats/ActivityStreams.php b/lib/XRay/Formats/ActivityStreams.php index 8dc5fee..05748f1 100644 --- a/lib/XRay/Formats/ActivityStreams.php +++ b/lib/XRay/Formats/ActivityStreams.php @@ -25,7 +25,10 @@ class ActivityStreams extends Format { return true; } - public static function parse($as2, $url, $http, $opts=[]) { + public static function parse($http_response, $http, $opts=[]) { + $as2 = $http_response['body']; + $url = $http_response['url']; + if(!isset($as2['type'])) return false; @@ -49,6 +52,7 @@ class ActivityStreams extends Format { 'type' => 'unknown', ], 'url' => $url, + 'code' => $http_response['code'], ]; return $result; } @@ -165,7 +169,8 @@ class ActivityStreams extends Format { if($reposted && !empty($reposted['body'])) { $repostedData = json_decode($reposted['body'], true); if($repostedData) { - $repost = self::parse($repostedData, $as2['object'], $http, $opts); + $reposted['body'] = $repostedData; + $repost = self::parse($reposted, $http, $opts); if($repost && isset($repost['data']) && $repost['data']['type'] != 'unknown') { $refs[$as2['object']] = $repost['data']; } @@ -180,7 +185,8 @@ class ActivityStreams extends Format { if($liked && !empty($liked['body'])) { $likedData = json_decode($liked['body'], true); if($likedData) { - $like = self::parse($likedData, $as2['object'], $http, $opts); + $liked['body'] = $likedData; + $like = self::parse($liked, $http, $opts); if($like && isset($like['data']['type']) && $like['data']['type'] != 'unknown') { $refs[$as2['object']] = $like['data']; } diff --git a/lib/XRay/Formats/Facebook.php b/lib/XRay/Formats/Facebook.php index 4955b0b..7ab44f4 100644 --- a/lib/XRay/Formats/Facebook.php +++ b/lib/XRay/Formats/Facebook.php @@ -16,7 +16,10 @@ class Facebook extends Format { return self::matches_host($url); } - public static function parse($fbObject, $url) { + public static function parse($http_response) { + $fbObject = $http_response['body']; + $url = $http_response['url']; + if(is_string($fbObject)) $fbObject = json_decode($fbObject, true); $parts = self::extract_url_parts($url); diff --git a/lib/XRay/Formats/GitHub.php b/lib/XRay/Formats/GitHub.php index 9ea94b9..f0c8091 100644 --- a/lib/XRay/Formats/GitHub.php +++ b/lib/XRay/Formats/GitHub.php @@ -91,7 +91,10 @@ class GitHub extends Format { ]; } - public static function parse($json, $url) { + public static function parse($http_response) { + $json = $http_response['body']; + $url = $http_response['url']; + $data = @json_decode($json, true); if(!$data) diff --git a/lib/XRay/Formats/HTML.php b/lib/XRay/Formats/HTML.php index 9f9528b..6ad6311 100644 --- a/lib/XRay/Formats/HTML.php +++ b/lib/XRay/Formats/HTML.php @@ -10,12 +10,16 @@ class HTML extends Format { public static function matches_host($url) { return true; } public static function matches($url) { return true; } - public static function parse($http, $html, $url, $opts=[]) { + public static function parse($http, $http_response, $opts=[]) { + $html = $http_response['body']; + $url = $http_response['url']; + $result = [ 'data' => [ 'type' => 'unknown', ], 'url' => $url, + 'code' => $http_response['code'], ]; // attempt to parse the page as HTML @@ -55,7 +59,8 @@ class HTML extends Format { 'error' => 'no_link_found', 'error_description' => 'The source document does not have a link to the target URL', 'code' => isset($result['code']) ? $result['code'] : 200, - 'url' => $url + 'url' => $url, + 'debug' => $result ]; } } @@ -105,7 +110,8 @@ class HTML extends Format { if(!$jsonpage['error'] && $jsonpage['body']) { $jsondata = json_decode($jsonpage['body'],true); if($jsondata) { - $data = Formats\Mf2::parse($jsondata, $url, $http, $opts); + $jsonpage['body'] = $jsondata; + $data = Formats\Mf2::parse($jsonpage, $http, $opts); if($data && is_array($data) && isset($data['data']['type'])) { $data['url'] = $relurl; $data['source-format'] = 'mf2+json'; @@ -125,7 +131,8 @@ class HTML extends Format { if(!$jsonpage['error'] && $jsonpage['body']) { $jsondata = json_decode($jsonpage['body'],true); if($jsondata) { - $data = Formats\ActivityStreams::parse($jsondata, $url, $http, $opts); + $jsonpage['body'] = $jsondata; + $data = Formats\ActivityStreams::parse($jsonpage, $http, $opts); if($data && is_array($data) && isset($data['data']['type'])) { $data['url'] = $relurl; $data['source-format'] = 'activity+json'; @@ -139,7 +146,8 @@ class HTML extends Format { // Now start pulling in the data from the page. Start by looking for microformats2 if($mf2 && count($mf2['items']) > 0) { - $data = Formats\Mf2::parse($mf2, $url, $http, $opts); + $http_response['body'] = $mf2; + $data = Formats\Mf2::parse($http_response, $http, $opts); if($data) { $result = array_merge($result, $data); if($fragment) { diff --git a/lib/XRay/Formats/Hackernews.php b/lib/XRay/Formats/Hackernews.php index 5b1dd40..6de9eb2 100644 --- a/lib/XRay/Formats/Hackernews.php +++ b/lib/XRay/Formats/Hackernews.php @@ -38,7 +38,10 @@ class Hackernews extends Format { ]; } - public static function parse($json, $url) { + public static function parse($http_response) { + $json = $http_response['body']; + $url = $http_response['url']; + $data = @json_decode($json, true); if(!$data) diff --git a/lib/XRay/Formats/Instagram.php b/lib/XRay/Formats/Instagram.php index b799bdf..04dfb2d 100644 --- a/lib/XRay/Formats/Instagram.php +++ b/lib/XRay/Formats/Instagram.php @@ -15,7 +15,10 @@ class Instagram extends Format { return self::matches_host($url); } - public static function parse($http, $html, $url, $opts=[]) { + public static function parse($http, $http_response, $opts=[]) { + $html = $http_response['body']; + $url = $http_response['url']; + if(preg_match('#instagram.com/([^/]+)/$#', $url)) { if(isset($opts['expect']) && $opts['expect'] == 'feed') return self::parseFeed($http, $html, $url); diff --git a/lib/XRay/Formats/JSONFeed.php b/lib/XRay/Formats/JSONFeed.php index 943aaf1..27b35ab 100644 --- a/lib/XRay/Formats/JSONFeed.php +++ b/lib/XRay/Formats/JSONFeed.php @@ -10,7 +10,10 @@ class JSONFeed extends Format { public static function matches_host($url) { return true; } public static function matches($url) { return true; } - public static function parse($feed, $url) { + public static function parse($http_response) { + $feed = $http_response['body']; + $url = $http_response['url']; + $result = [ 'data' => [ 'type' => 'unknown', diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index 1740736..158dd2d 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -13,7 +13,10 @@ class Mf2 extends Format { return true; } - public static function parse($mf2, $url, $http, $opts=[]) { + public static function parse($http_response, $http, $opts=[]) { + $mf2 = $http_response['body']; + $url = $http_response['url']; + if(!isset($mf2['items']) || count($mf2['items']) == 0) return false; @@ -299,7 +302,11 @@ class Mf2 extends Format { if(!array_key_exists($p, $data)) $data[$p] = []; if(!in_array($u, $data[$p])) $data[$p][] = $u; - $ref = self::parse(['items'=>[$v]], $u, $http); + $ref = self::parse([ + 'body' => ['items'=>[$v]], + 'url' => $u, + 'code' => null, + ], $http); if($ref) { $refs[$u] = $ref['data']; } @@ -355,7 +362,11 @@ class Mf2 extends Format { $data[$p][] = $u; $keys[] = $p; // parse the object and put the result in the "refs" object - $ref = self::parse(['items'=>[$v]], $u, $http); + $ref = self::parse([ + 'body' => ['items'=>[$v]], + 'url' => $u, + 'code' => null, + ], $http); if($ref) { $refs[$u] = $ref['data']; } diff --git a/lib/XRay/Formats/Twitter.php b/lib/XRay/Formats/Twitter.php index afb0d2c..56172c5 100644 --- a/lib/XRay/Formats/Twitter.php +++ b/lib/XRay/Formats/Twitter.php @@ -46,7 +46,9 @@ class Twitter extends Format { ]; } - public static function parse($json, $url) { + public static function parse($http_response) { + $json = is_array($http_response) ? $http_response['body'] : $http_response->body; + $url = is_array($http_response) ? $http_response['url'] : $http_response->url; if(is_string($json)) $tweet = json_decode($json); @@ -77,7 +79,7 @@ class Twitter extends Format { $repostOf = 'https://twitter.com/' . $reposted->user->screen_name . '/status/' . $reposted->id_str; $entry['repost-of'] = $repostOf; - $repostedEntry = self::parse($reposted, $repostOf); + $repostedEntry = self::parse(['body' => $reposted, 'url' => $repostOf]); if(isset($repostedEntry['data']['refs'])) { foreach($repostedEntry['data']['refs'] as $k=>$v) { $refs[$k] = $v; @@ -152,7 +154,7 @@ class Twitter extends Format { // Quoted Status if(property_exists($tweet, 'quoted_status')) { $quoteOf = 'https://twitter.com/' . $tweet->quoted_status->user->screen_name . '/status/' . $tweet->quoted_status_id_str; - $quotedEntry = self::parse($tweet->quoted_status, $quoteOf); + $quotedEntry = self::parse(['body' => $tweet->quoted_status, 'url' => $quoteOf]); if(isset($quotedEntry['data']['refs'])) { foreach($quotedEntry['data']['refs'] as $k=>$v) { $refs[$k] = $v; diff --git a/lib/XRay/Formats/XKCD.php b/lib/XRay/Formats/XKCD.php index 987ec08..65408c2 100644 --- a/lib/XRay/Formats/XKCD.php +++ b/lib/XRay/Formats/XKCD.php @@ -15,7 +15,10 @@ class XKCD extends Format { return self::matches_host($url) && preg_match('/^\/\d+\/$/', ''.parse_url($url, PHP_URL_PATH)); } - public static function parse($html, $url) { + public static function parse($http_response) { + $html = $http_response['body']; + $url = $http_response['url']; + list($doc, $xpath) = self::_loadHTML($html); if(!$doc) diff --git a/lib/XRay/Formats/XML.php b/lib/XRay/Formats/XML.php index 49431d4..312d092 100644 --- a/lib/XRay/Formats/XML.php +++ b/lib/XRay/Formats/XML.php @@ -12,13 +12,17 @@ class XML extends Format { public static function matches_host($url) { return true; } public static function matches($url) { return true; } - public static function parse($xml, $url) { + public static function parse($http_response) { + $xml = $http_response['body']; + $url = $http_response['url']; + $result = [ 'data' => [ 'type' => 'unknown', ], 'url' => $url, 'source-format' => 'xml', + 'code' => $http_response['code'], ]; try { diff --git a/lib/XRay/Parser.php b/lib/XRay/Parser.php index 8e744d8..bce1b64 100644 --- a/lib/XRay/Parser.php +++ b/lib/XRay/Parser.php @@ -10,75 +10,81 @@ class Parser { $this->http = $http; } - public function parse($body, $url, $opts=[]) { + public function parse($http_response, $opts=[]) { if(isset($opts['timeout'])) $this->http->set_timeout($opts['timeout']); if(isset($opts['max_redirects'])) $this->http->set_max_redirects($opts['max_redirects']); // Check if the URL matches a special parser + $url = $http_response['url']; if(Formats\Instagram::matches($url)) { - return Formats\Instagram::parse($this->http, $body, $url, $opts); + return Formats\Instagram::parse($this->http, $http_response, $opts); } if(Formats\GitHub::matches($url)) { - return Formats\GitHub::parse($body, $url); + return Formats\GitHub::parse($http_response); } if(Formats\Twitter::matches($url)) { - return Formats\Twitter::parse($body, $url); + return Formats\Twitter::parse($http_response); } if(Formats\Facebook::matches($url)) { - return Formats\Facebook::parse($body, $url); + return Formats\Facebook::parse($http_response); } if(Formats\XKCD::matches($url)) { - return Formats\XKCD::parse($body, $url); + return Formats\XKCD::parse($http_response); } if(Formats\Hackernews::matches($url)) { - return Formats\Hackernews::parse($body, $url); + return Formats\Hackernews::parse($http_response); } + $body = $http_response['body']; + // Check if an mf2 JSON object was passed in if(is_array($body) && isset($body['items'][0]['type']) && isset($body['items'][0]['properties'])) { - $data = Formats\Mf2::parse($body, $url, $this->http, $opts); + $data = Formats\Mf2::parse($http_response, $this->http, $opts); $data['source-format'] = 'mf2+json'; return $data; } // Check if an ActivityStreams JSON object was passed in if(Formats\ActivityStreams::is_as2_json($body)) { - $data = Formats\ActivityStreams::parse($body, $url, $this->http, $opts); + $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts); $data['source-format'] = 'activity+json'; return $data; } if(substr($body, 0, 5) == 'http, $opts); + $http_response['body'] = $parsed; + $data = Formats\Mf2::parse($http_response, $this->http, $opts); $data['source-format'] = 'mf2+json'; return $data; } elseif($parsed && Formats\ActivityStreams::is_as2_json($parsed)) { // Check if an ActivityStreams JSON string was passed in - $data = Formats\ActivityStreams::parse($parsed, $url, $this->http, $opts); + $http_response['body'] = $parsed; + $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts); $data['source-format'] = 'activity+json'; return $data; } } // No special parsers matched, parse for Microformats now - $data = Formats\HTML::parse($this->http, $body, $url, $opts); + $data = Formats\HTML::parse($this->http, $http_response, $opts); if(!isset($data['source-format']) && isset($data['type']) && $data['type'] != 'unknown') $data['source-format'] = 'mf2+html'; return $data; diff --git a/tests/FetchTest.php b/tests/FetchTest.php index 08509c5..a274197 100644 --- a/tests/FetchTest.php +++ b/tests/FetchTest.php @@ -103,6 +103,49 @@ class FetchTest extends PHPUnit_Framework_TestCase { $this->assertEquals(401, $data->code); } + public function testDeleted() { + $url = 'http://source.example.com/deleted-gone'; + $response = $this->parse([ + 'url' => $url + ]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('error', $data); + $this->assertEquals(410, $data->code); + $this->assertEquals('This post has been deleted.', $data->data->content->text); + } + + public function testDeletedEmptyBody() { + $url = 'http://source.example.com/deleted-empty'; + $response = $this->parse([ + 'url' => $url + ]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('error', $data); + $this->assertEquals(410, $data->code); + $this->assertEquals('unknown', $data->data->type); + } + + public function testDeletedTargetProvided() { + $url = 'http://source.example.com/deleted-gone'; + $response = $this->parse([ + 'url' => $url, + 'target' => 'http://example.com/' + ]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectHasAttribute('error', $data); + $this->assertEquals('no_link_found', $data->error); + $this->assertEquals(410, $data->code); + } + public function testMetaEquivDeleted() { $url = 'http://source.example.com/deleted'; $response = $this->parse([ diff --git a/tests/FindFeedsTest.php b/tests/FindFeedsTest.php index 4b0e4c0..544da95 100644 --- a/tests/FindFeedsTest.php +++ b/tests/FindFeedsTest.php @@ -42,7 +42,7 @@ class FindFeedsTest extends PHPUnit_Framework_TestCase { $feeds = json_decode($body)->feeds; $this->assertEquals(2, count($feeds)); - // Should rank JSONFeed above Atom + // Should rank h-feed above Atom $this->assertEquals('http://feed.example.com/h-feed-with-atom-alternate', $feeds[0]->url); $this->assertEquals('microformats', $feeds[0]->type); $this->assertEquals('http://feed.example.com/atom', $feeds[1]->url); diff --git a/tests/data/source.example.com/deleted-empty b/tests/data/source.example.com/deleted-empty new file mode 100644 index 0000000..e7be5ed --- /dev/null +++ b/tests/data/source.example.com/deleted-empty @@ -0,0 +1,6 @@ +HTTP/1.1 410 Gone +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + diff --git a/tests/data/source.example.com/deleted-gone b/tests/data/source.example.com/deleted-gone new file mode 100644 index 0000000..98f6730 --- /dev/null +++ b/tests/data/source.example.com/deleted-gone @@ -0,0 +1,14 @@ +HTTP/1.1 410 Gone +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This post has been deleted.

+ +