Browse Source

Pass through the HTTP status code and parse deleted pages

Now returns HTTP 410 more consistently.
pull/93/head
Aaron Parecki 2 years ago
parent
commit
19126b5836
No known key found for this signature in database GPG Key ID: 276C2817346D6056
20 changed files with 168 additions and 48 deletions
  1. +1
    -1
      controllers/Parse.php
  2. +10
    -2
      lib/XRay.php
  3. +2
    -2
      lib/XRay/Feeds.php
  4. +1
    -7
      lib/XRay/Fetcher.php
  5. +9
    -3
      lib/XRay/Formats/ActivityStreams.php
  6. +4
    -1
      lib/XRay/Formats/Facebook.php
  7. +4
    -1
      lib/XRay/Formats/GitHub.php
  8. +13
    -5
      lib/XRay/Formats/HTML.php
  9. +4
    -1
      lib/XRay/Formats/Hackernews.php
  10. +4
    -1
      lib/XRay/Formats/Instagram.php
  11. +4
    -1
      lib/XRay/Formats/JSONFeed.php
  12. +14
    -3
      lib/XRay/Formats/Mf2.php
  13. +5
    -3
      lib/XRay/Formats/Twitter.php
  14. +4
    -1
      lib/XRay/Formats/XKCD.php
  15. +5
    -1
      lib/XRay/Formats/XML.php
  16. +20
    -14
      lib/XRay/Parser.php
  17. +43
    -0
      tests/FetchTest.php
  18. +1
    -1
      tests/FindFeedsTest.php
  19. +6
    -0
      tests/data/source.example.com/deleted-empty
  20. +14
    -0
      tests/data/source.example.com/deleted-gone

+ 1
- 1
controllers/Parse.php View File

@ -106,7 +106,7 @@ class Parse {
}
$parser = new p3k\XRay\Parser($this->http);
$parsed = $parser->parse($result['body'], $result['url'], $opts);
$parsed = $parser->parse($result, $opts);
// Allow the parser to override the HTTP response code, e.g. via a <meta http-equiv> tag
if(isset($parsed['code']))

+ 10
- 2
lib/XRay.php View File

@ -35,7 +35,11 @@ class XRay {
}
$parser = new XRay\Parser($this->http);
$result = $parser->parse($body, $url, $opts);
$result = $parser->parse([
'body' => $body,
'url' => $url,
'code' => $code,
], $opts);
if(!isset($opts['include_original']) || !$opts['include_original'])
unset($result['original']);
if(!isset($result['url'])) $result['url'] = $url;
@ -46,7 +50,11 @@ class XRay {
public function process($url, $mf2json, $opts=[]) {
$parser = new XRay\Parser($this->http);
$result = $parser->parse($mf2json, $url, $opts);
$result = $parser->parse([
'body' => $mf2json,
'url' => $url,
'code' => null,
], $opts);
if(!isset($opts['include_original']) || !$opts['include_original'])
unset($result['original']);
if(!isset($result['url'])) $result['url'] = $url;

+ 2
- 2
lib/XRay/Feeds.php View File

@ -75,7 +75,7 @@ class Feeds {
} else {
// Some other document was returned, parse the HTML and look for rel alternates and Microformats
$mf2 = \mf2\Parse($body, $result['url']);
$mf2 = \mf2\Parse($result['body'], $result['url']);
if(isset($mf2['rel-urls'])) {
foreach($mf2['rel-urls'] as $rel=>$info) {
if(isset($info['rels']) && in_array('alternate', $info['rels'])) {
@ -103,7 +103,7 @@ class Feeds {
}
}
$parsed = Formats\HTML::parse($this->http, $body, $result['url'], array_merge($opts, ['expect'=>'feed']));
$parsed = Formats\HTML::parse($this->http, $result, array_merge($opts, ['expect'=>'feed']));
if($parsed && isset($parsed['data']['type']) && $parsed['data']['type'] == 'feed') {
$feeds[] = [
'url' => $result['url'],

+ 1
- 7
lib/XRay/Fetcher.php View File

@ -116,13 +116,7 @@ class Fetcher {
if(trim($result['body']) == '') {
if($result['code'] == 410) {
// 410 Gone responses are valid and should not return an error
return [
'data' => [
'type' => 'deleted'
],
'url' => $result['url'],
'code' => $result['code']
];
return $result;
}
return [

+ 9
- 3
lib/XRay/Formats/ActivityStreams.php View File

@ -25,7 +25,10 @@ class ActivityStreams extends Format {
return true;
}
public static function parse($as2, $url, $http, $opts=[]) {
public static function parse($http_response, $http, $opts=[]) {
$as2 = $http_response['body'];
$url = $http_response['url'];
if(!isset($as2['type']))
return false;
@ -49,6 +52,7 @@ class ActivityStreams extends Format {
'type' => 'unknown',
],
'url' => $url,
'code' => $http_response['code'],
];
return $result;
}
@ -165,7 +169,8 @@ class ActivityStreams extends Format {
if($reposted && !empty($reposted['body'])) {
$repostedData = json_decode($reposted['body'], true);
if($repostedData) {
$repost = self::parse($repostedData, $as2['object'], $http, $opts);
$reposted['body'] = $repostedData;
$repost = self::parse($reposted, $http, $opts);
if($repost && isset($repost['data']) && $repost['data']['type'] != 'unknown') {
$refs[$as2['object']] = $repost['data'];
}
@ -180,7 +185,8 @@ class ActivityStreams extends Format {
if($liked && !empty($liked['body'])) {
$likedData = json_decode($liked['body'], true);
if($likedData) {
$like = self::parse($likedData, $as2['object'], $http, $opts);
$liked['body'] = $likedData;
$like = self::parse($liked, $http, $opts);
if($like && isset($like['data']['type']) && $like['data']['type'] != 'unknown') {
$refs[$as2['object']] = $like['data'];
}

+ 4
- 1
lib/XRay/Formats/Facebook.php View File

@ -16,7 +16,10 @@ class Facebook extends Format {
return self::matches_host($url);
}
public static function parse($fbObject, $url) {
public static function parse($http_response) {
$fbObject = $http_response['body'];
$url = $http_response['url'];
if(is_string($fbObject)) $fbObject = json_decode($fbObject, true);
$parts = self::extract_url_parts($url);

+ 4
- 1
lib/XRay/Formats/GitHub.php View File

@ -91,7 +91,10 @@ class GitHub extends Format {
];
}
public static function parse($json, $url) {
public static function parse($http_response) {
$json = $http_response['body'];
$url = $http_response['url'];
$data = @json_decode($json, true);
if(!$data)

+ 13
- 5
lib/XRay/Formats/HTML.php View File

@ -10,12 +10,16 @@ class HTML extends Format {
public static function matches_host($url) { return true; }
public static function matches($url) { return true; }
public static function parse($http, $html, $url, $opts=[]) {
public static function parse($http, $http_response, $opts=[]) {
$html = $http_response['body'];
$url = $http_response['url'];
$result = [
'data' => [
'type' => 'unknown',
],
'url' => $url,
'code' => $http_response['code'],
];
// attempt to parse the page as HTML
@ -55,7 +59,8 @@ class HTML extends Format {
'error' => 'no_link_found',
'error_description' => 'The source document does not have a link to the target URL',
'code' => isset($result['code']) ? $result['code'] : 200,
'url' => $url
'url' => $url,
'debug' => $result
];
}
}
@ -105,7 +110,8 @@ class HTML extends Format {
if(!$jsonpage['error'] && $jsonpage['body']) {
$jsondata = json_decode($jsonpage['body'],true);
if($jsondata) {
$data = Formats\Mf2::parse($jsondata, $url, $http, $opts);
$jsonpage['body'] = $jsondata;
$data = Formats\Mf2::parse($jsonpage, $http, $opts);
if($data && is_array($data) && isset($data['data']['type'])) {
$data['url'] = $relurl;
$data['source-format'] = 'mf2+json';
@ -125,7 +131,8 @@ class HTML extends Format {
if(!$jsonpage['error'] && $jsonpage['body']) {
$jsondata = json_decode($jsonpage['body'],true);
if($jsondata) {
$data = Formats\ActivityStreams::parse($jsondata, $url, $http, $opts);
$jsonpage['body'] = $jsondata;
$data = Formats\ActivityStreams::parse($jsonpage, $http, $opts);
if($data && is_array($data) && isset($data['data']['type'])) {
$data['url'] = $relurl;
$data['source-format'] = 'activity+json';
@ -139,7 +146,8 @@ class HTML extends Format {
// Now start pulling in the data from the page. Start by looking for microformats2
if($mf2 && count($mf2['items']) > 0) {
$data = Formats\Mf2::parse($mf2, $url, $http, $opts);
$http_response['body'] = $mf2;
$data = Formats\Mf2::parse($http_response, $http, $opts);
if($data) {
$result = array_merge($result, $data);
if($fragment) {

+ 4
- 1
lib/XRay/Formats/Hackernews.php View File

@ -38,7 +38,10 @@ class Hackernews extends Format {
];
}
public static function parse($json, $url) {
public static function parse($http_response) {
$json = $http_response['body'];
$url = $http_response['url'];
$data = @json_decode($json, true);
if(!$data)

+ 4
- 1
lib/XRay/Formats/Instagram.php View File

@ -15,7 +15,10 @@ class Instagram extends Format {
return self::matches_host($url);
}
public static function parse($http, $html, $url, $opts=[]) {
public static function parse($http, $http_response, $opts=[]) {
$html = $http_response['body'];
$url = $http_response['url'];
if(preg_match('#instagram.com/([^/]+)/$#', $url)) {
if(isset($opts['expect']) && $opts['expect'] == 'feed')
return self::parseFeed($http, $html, $url);

+ 4
- 1
lib/XRay/Formats/JSONFeed.php View File

@ -10,7 +10,10 @@ class JSONFeed extends Format {
public static function matches_host($url) { return true; }
public static function matches($url) { return true; }
public static function parse($feed, $url) {
public static function parse($http_response) {
$feed = $http_response['body'];
$url = $http_response['url'];
$result = [
'data' => [
'type' => 'unknown',

+ 14
- 3
lib/XRay/Formats/Mf2.php View File

@ -13,7 +13,10 @@ class Mf2 extends Format {
return true;
}
public static function parse($mf2, $url, $http, $opts=[]) {
public static function parse($http_response, $http, $opts=[]) {
$mf2 = $http_response['body'];
$url = $http_response['url'];
if(!isset($mf2['items']) || count($mf2['items']) == 0)
return false;
@ -299,7 +302,11 @@ class Mf2 extends Format {
if(!array_key_exists($p, $data)) $data[$p] = [];
if(!in_array($u, $data[$p]))
$data[$p][] = $u;
$ref = self::parse(['items'=>[$v]], $u, $http);
$ref = self::parse([
'body' => ['items'=>[$v]],
'url' => $u,
'code' => null,
], $http);
if($ref) {
$refs[$u] = $ref['data'];
}
@ -355,7 +362,11 @@ class Mf2 extends Format {
$data[$p][] = $u;
$keys[] = $p;
// parse the object and put the result in the "refs" object
$ref = self::parse(['items'=>[$v]], $u, $http);
$ref = self::parse([
'body' => ['items'=>[$v]],
'url' => $u,
'code' => null,
], $http);
if($ref) {
$refs[$u] = $ref['data'];
}

+ 5
- 3
lib/XRay/Formats/Twitter.php View File

@ -46,7 +46,9 @@ class Twitter extends Format {
];
}
public static function parse($json, $url) {
public static function parse($http_response) {
$json = is_array($http_response) ? $http_response['body'] : $http_response->body;
$url = is_array($http_response) ? $http_response['url'] : $http_response->url;
if(is_string($json))
$tweet = json_decode($json);
@ -77,7 +79,7 @@ class Twitter extends Format {
$repostOf = 'https://twitter.com/' . $reposted->user->screen_name . '/status/' . $reposted->id_str;
$entry['repost-of'] = $repostOf;
$repostedEntry = self::parse($reposted, $repostOf);
$repostedEntry = self::parse(['body' => $reposted, 'url' => $repostOf]);
if(isset($repostedEntry['data']['refs'])) {
foreach($repostedEntry['data']['refs'] as $k=>$v) {
$refs[$k] = $v;
@ -152,7 +154,7 @@ class Twitter extends Format {
// Quoted Status
if(property_exists($tweet, 'quoted_status')) {
$quoteOf = 'https://twitter.com/' . $tweet->quoted_status->user->screen_name . '/status/' . $tweet->quoted_status_id_str;
$quotedEntry = self::parse($tweet->quoted_status, $quoteOf);
$quotedEntry = self::parse(['body' => $tweet->quoted_status, 'url' => $quoteOf]);
if(isset($quotedEntry['data']['refs'])) {
foreach($quotedEntry['data']['refs'] as $k=>$v) {
$refs[$k] = $v;

+ 4
- 1
lib/XRay/Formats/XKCD.php View File

@ -15,7 +15,10 @@ class XKCD extends Format {
return self::matches_host($url) && preg_match('/^\/\d+\/$/', ''.parse_url($url, PHP_URL_PATH));
}
public static function parse($html, $url) {
public static function parse($http_response) {
$html = $http_response['body'];
$url = $http_response['url'];
list($doc, $xpath) = self::_loadHTML($html);
if(!$doc)

+ 5
- 1
lib/XRay/Formats/XML.php View File

@ -12,13 +12,17 @@ class XML extends Format {
public static function matches_host($url) { return true; }
public static function matches($url) { return true; }
public static function parse($xml, $url) {
public static function parse($http_response) {
$xml = $http_response['body'];
$url = $http_response['url'];
$result = [
'data' => [
'type' => 'unknown',
],
'url' => $url,
'source-format' => 'xml',
'code' => $http_response['code'],
];
try {

+ 20
- 14
lib/XRay/Parser.php View File

@ -10,75 +10,81 @@ class Parser {
$this->http = $http;
}
public function parse($body, $url, $opts=[]) {
public function parse($http_response, $opts=[]) {
if(isset($opts['timeout']))
$this->http->set_timeout($opts['timeout']);
if(isset($opts['max_redirects']))
$this->http->set_max_redirects($opts['max_redirects']);
// Check if the URL matches a special parser
$url = $http_response['url'];
if(Formats\Instagram::matches($url)) {
return Formats\Instagram::parse($this->http, $body, $url, $opts);
return Formats\Instagram::parse($this->http, $http_response, $opts);
}
if(Formats\GitHub::matches($url)) {
return Formats\GitHub::parse($body, $url);
return Formats\GitHub::parse($http_response);
}
if(Formats\Twitter::matches($url)) {
return Formats\Twitter::parse($body, $url);
return Formats\Twitter::parse($http_response);
}
if(Formats\Facebook::matches($url)) {
return Formats\Facebook::parse($body, $url);
return Formats\Facebook::parse($http_response);
}
if(Formats\XKCD::matches($url)) {
return Formats\XKCD::parse($body, $url);
return Formats\XKCD::parse($http_response);
}
if(Formats\Hackernews::matches($url)) {
return Formats\Hackernews::parse($body, $url);
return Formats\Hackernews::parse($http_response);
}
$body = $http_response['body'];
// Check if an mf2 JSON object was passed in
if(is_array($body) && isset($body['items'][0]['type']) && isset($body['items'][0]['properties'])) {
$data = Formats\Mf2::parse($body, $url, $this->http, $opts);
$data = Formats\Mf2::parse($http_response, $this->http, $opts);
$data['source-format'] = 'mf2+json';
return $data;
}
// Check if an ActivityStreams JSON object was passed in
if(Formats\ActivityStreams::is_as2_json($body)) {
$data = Formats\ActivityStreams::parse($body, $url, $this->http, $opts);
$data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
$data['source-format'] = 'activity+json';
return $data;
}
if(substr($body, 0, 5) == '<?xml') {
return Formats\XML::parse($body, $url);
return Formats\XML::parse($http_response);
}
if(substr($body, 0, 1) == '{') {
$parsed = json_decode($body, true);
if($parsed && isset($parsed['version']) && $parsed['version'] == 'https://jsonfeed.org/version/1') {
return Formats\JSONFeed::parse($parsed, $url);
$http_response['body'] = $parsed;
return Formats\JSONFeed::parse($http_response);
} elseif($parsed && isset($parsed['items'][0]['type']) && isset($parsed['items'][0]['properties'])) {
// Check if an mf2 JSON string was passed in
$data = Formats\Mf2::parse($parsed, $url, $this->http, $opts);
$http_response['body'] = $parsed;
$data = Formats\Mf2::parse($http_response, $this->http, $opts);
$data['source-format'] = 'mf2+json';
return $data;
} elseif($parsed && Formats\ActivityStreams::is_as2_json($parsed)) {
// Check if an ActivityStreams JSON string was passed in
$data = Formats\ActivityStreams::parse($parsed, $url, $this->http, $opts);
$http_response['body'] = $parsed;
$data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
$data['source-format'] = 'activity+json';
return $data;
}
}
// No special parsers matched, parse for Microformats now
$data = Formats\HTML::parse($this->http, $body, $url, $opts);
$data = Formats\HTML::parse($this->http, $http_response, $opts);
if(!isset($data['source-format']) && isset($data['type']) && $data['type'] != 'unknown')
$data['source-format'] = 'mf2+html';
return $data;

+ 43
- 0
tests/FetchTest.php View File

@ -103,6 +103,49 @@ class FetchTest extends PHPUnit_Framework_TestCase {
$this->assertEquals(401, $data->code);
}
public function testDeleted() {
$url = 'http://source.example.com/deleted-gone';
$response = $this->parse([
'url' => $url
]);
$body = $response->getContent();
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body);
$this->assertObjectNotHasAttribute('error', $data);
$this->assertEquals(410, $data->code);
$this->assertEquals('This post has been deleted.', $data->data->content->text);
}
public function testDeletedEmptyBody() {
$url = 'http://source.example.com/deleted-empty';
$response = $this->parse([
'url' => $url
]);
$body = $response->getContent();
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body);
$this->assertObjectNotHasAttribute('error', $data);
$this->assertEquals(410, $data->code);
$this->assertEquals('unknown', $data->data->type);
}
public function testDeletedTargetProvided() {
$url = 'http://source.example.com/deleted-gone';
$response = $this->parse([
'url' => $url,
'target' => 'http://example.com/'
]);
$body = $response->getContent();
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body);
$this->assertObjectHasAttribute('error', $data);
$this->assertEquals('no_link_found', $data->error);
$this->assertEquals(410, $data->code);
}
public function testMetaEquivDeleted() {
$url = 'http://source.example.com/deleted';
$response = $this->parse([

+ 1
- 1
tests/FindFeedsTest.php View File

@ -42,7 +42,7 @@ class FindFeedsTest extends PHPUnit_Framework_TestCase {
$feeds = json_decode($body)->feeds;
$this->assertEquals(2, count($feeds));
// Should rank JSONFeed above Atom
// Should rank h-feed above Atom
$this->assertEquals('http://feed.example.com/h-feed-with-atom-alternate', $feeds[0]->url);
$this->assertEquals('microformats', $feeds[0]->type);
$this->assertEquals('http://feed.example.com/atom', $feeds[1]->url);

+ 6
- 0
tests/data/source.example.com/deleted-empty View File

@ -0,0 +1,6 @@
HTTP/1.1 410 Gone
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive

+ 14
- 0
tests/data/source.example.com/deleted-gone View File

@ -0,0 +1,14 @@
HTTP/1.1 410 Gone
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive

<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content">This post has been deleted.</p>
</body>
</html>

Loading…
Cancel
Save