diff --git a/controllers/Parse.php b/controllers/Parse.php index ca545f8..8540dbc 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -120,7 +120,7 @@ class Parse { $data = [ 'data' => $parsed['data'], 'url' => $result['url'], - 'code' => $result['code'] + 'code' => $result['code'], ]; if(isset($parsed['info'])) $data['info'] = $parsed['info']; diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php index 6cd7c5c..6a58288 100644 --- a/lib/XRay/Formats/Format.php +++ b/lib/XRay/Formats/Format.php @@ -104,36 +104,4 @@ abstract class Format implements iFormat { return trim(str_replace(['
','
'],"\n", $sanitized)); } - protected static function findLinksInDocument(&$xpath, $target) { - $found = []; - self::xPathFindNodeWithAttribute($xpath, 'a', 'href', function($u) use($target, &$found){ - if($u == $target) { - $found[$u] = null; - } - }); - self::xPathFindNodeWithAttribute($xpath, 'img', 'src', function($u) use($target, &$found){ - if($u == $target) { - $found[$u] = null; - } - }); - self::xPathFindNodeWithAttribute($xpath, 'video', 'src', function($u) use($target, &$found){ - if($u == $target) { - $found[$u] = null; - } - }); - self::xPathFindNodeWithAttribute($xpath, 'audio', 'src', function($u) use($target, &$found){ - if($u == $target) { - $found[$u] = null; - } - }); - return $found; - } - - public static function xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) { - foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) { - $v = $el->getAttribute($attr); - $callback($v); - } - } - } diff --git a/lib/XRay/Formats/HTML.php b/lib/XRay/Formats/HTML.php index 6ad6311..430bb94 100644 --- a/lib/XRay/Formats/HTML.php +++ b/lib/XRay/Formats/HTML.php @@ -20,6 +20,7 @@ class HTML extends Format { ], 'url' => $url, 'code' => $http_response['code'], + 'html' => $html, ]; // attempt to parse the page as HTML @@ -45,26 +46,6 @@ class HTML extends Format { } } - // If a target parameter was provided, make sure a link to it exists on the page - if(isset($opts['target'])) { - $target = $opts['target']; - - $found = []; - if($target) { - $found = self::findLinksInDocument($xpath, $target); - } - - if(!$found) { - return [ - 'error' => 'no_link_found', - 'error_description' => 'The source document does not have a link to the target URL', - 'code' => isset($result['code']) ? 
$result['code'] : 200, - 'url' => $url, - 'debug' => $result - ]; - } - } - // If the URL has a fragment ID, find the DOM starting at that node and parse it instead $fragment = parse_url($url, PHP_URL_FRAGMENT); if($fragment) { @@ -108,7 +89,7 @@ class HTML extends Format { ]); // Skip and fall back to parsing the HTML if anything about this request fails if(!$jsonpage['error'] && $jsonpage['body']) { - $jsondata = json_decode($jsonpage['body'],true); + $jsondata = json_decode($jsonpage['body'], true); if($jsondata) { $jsonpage['body'] = $jsondata; $data = Formats\Mf2::parse($jsonpage, $http, $opts); diff --git a/lib/XRay/Parser.php b/lib/XRay/Parser.php index 3ee2d4b..108ec6f 100644 --- a/lib/XRay/Parser.php +++ b/lib/XRay/Parser.php @@ -2,6 +2,7 @@ namespace p3k\XRay; use p3k\XRay\Formats; +use DOMDocument, DOMXPath; class Parser { private $http; @@ -11,6 +12,42 @@ class Parser { } public function parse($http_response, $opts=[]) { + $document = $this->parse_document($http_response, $opts); + + // If a target parameter was provided, make sure a link to it exists in the parsed document + if(!isset($document['error']) && !empty($opts['target'])) { + + if(isset($document['data']['type']) && $document['data']['type'] == 'unknown') { + if(isset($document['html'])) { + // Couldn't parse the page, check for the link manually assuming HTML content + $found = $this->_findLinkInHTML($opts['target'], $document['html']); + } else { + // Ignore this check for any non-HTML documents since this will be uncommon anyway + $found = false; + } + } else { + $found = $this->_findLinkInTree($opts['target'], $document['data']); + } + + if(!$found) { + return [ + 'error' => 'no_link_found', + 'error_description' => 'The source document does not have a link to the target URL', + 'code' => isset($document['code']) ? 
$document['code'] : 200, + 'url' => $document['url'], + 'debug' => $document['data'] + ]; + } + } + + // If the HTML parser couldn't parse the page it returns the full HTML for checking the target above, + // but we don't want to return that in the output so remove it here + unset($document['html']); + + return $document; + } + + public function parse_document($http_response, $opts=[]) { if(isset($opts['timeout'])) $this->http->set_timeout($opts['timeout']); if(isset($opts['max_redirects'])) @@ -46,8 +83,15 @@ class Parser { $body = $http_response['body']; // Check if an mf2 JSON object was passed in - if(is_array($body) && isset($body['items'][0]['type']) && isset($body['items'][0]['properties'])) { + if(is_array($body) && isset($body['items']) && isset($body['rels']) && isset($body['rel-urls'])) { $data = Formats\Mf2::parse($http_response, $this->http, $opts); + if($data == false) { + $data = [ + 'data' => [ + 'type' => 'unknown', + ] + ]; + } $data['source-format'] = 'mf2+json'; return $data; } @@ -96,4 +140,80 @@ class Parser { return $data; } + private function _findLinkInTree($link, $document) { + if(!$document) + return false; + + if(is_string($document) || is_numeric($document)) { + return $document == $link; + } + + if(is_array($document)) { + foreach($document as $key=>$value) { + if($key === 'html') { + $found = $this->_findLinkInHTML($link, $value); + if($found) { + return true; + } + } else { + $found = $this->_findLinkInTree($link, $value); + if($found) { + return true; + } + } + } + return false; + } + + throw new \Exception('Unexpected value in tree'); // fully qualified: a bare Exception would resolve to p3k\XRay\Exception + } + + private function _findLinkInHTML($link, $html) { + $doc = new DOMDocument(); + @$doc->loadHTML(self::_toHtmlEntities($html)); + + if(!$doc) + return false; + + $xpath = new DOMXPath($doc); + + return self::_findLinksInDOMDocument($xpath, $link); + } + + private static function _findLinksInDOMDocument(&$xpath, $target) { + $found = []; + self::_xPathFindNodeWithAttribute($xpath, 'a', 'href', 
function($u) use($target, &$found){ + if($u == $target) { + $found[$u] = null; + } + }); + self::_xPathFindNodeWithAttribute($xpath, 'img', 'src', function($u) use($target, &$found){ + if($u == $target) { + $found[$u] = null; + } + }); + self::_xPathFindNodeWithAttribute($xpath, 'video', 'src', function($u) use($target, &$found){ + if($u == $target) { + $found[$u] = null; + } + }); + self::_xPathFindNodeWithAttribute($xpath, 'audio', 'src', function($u) use($target, &$found){ + if($u == $target) { + $found[$u] = null; + } + }); + return $found; + } + + private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) { + foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) { + $v = $el->getAttribute($attr); + $callback($v); + } + } + + private static function _toHtmlEntities($input) { + return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); + } + } diff --git a/tests/LibraryTest.php b/tests/LibraryTest.php index f121da3..5e0109c 100644 --- a/tests/LibraryTest.php +++ b/tests/LibraryTest.php @@ -34,4 +34,34 @@ class LibraryTest extends PHPUnit_Framework_TestCase { $this->assertEquals('Barnaby Walters', $data['data']['name']); } + public function testNoHEntryMarkupMF2JSON() { + $url = 'http://example.com/'; + $html = '

Target

'; + $mf2 = Mf2\parse($html, $url); + + $xray = new p3k\XRay(); + $data = $xray->process($url, $mf2); + $this->assertEquals('unknown', $data['data']['type']); + } + + public function testNoHEntryMarkup() { + $url = 'http://example.com/'; + $html = '

Target

'; + + $xray = new p3k\XRay(); + $data = $xray->parse($url, $html); + $this->assertEquals('unknown', $data['data']['type']); + } + + public function testNoHEntryMarkupWithTarget() { + $url = 'http://example.com/'; + $html = '

Target

'; + + $xray = new p3k\XRay(); + $data = $xray->parse($url, $html, ['target' => 'http://target.example.com/']); + $this->assertEquals('unknown', $data['data']['type']); + $this->assertArrayNotHasKey('error', $data); + $this->assertArrayNotHasKey('html', $data); + } + } diff --git a/tests/ParseTest.php b/tests/ParseTest.php index 49ffd4b..2d94a6a 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -62,6 +62,19 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertObjectNotHasAttribute('error', $data); } + public function testTargetNotFoundInXML() { + $url = 'http://feed.example.com/atom'; + $response = $this->parse(['url' => $url, 'target' => 'http://example.net']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectHasAttribute('error', $data); + $this->assertEquals('no_link_found', $data->error); + $this->assertEquals('200', $data->code); + $this->assertEquals($url, $data->url); + } + public function testHTMLContent() { $url = 'http://source.example.com/html-content'; $response = $this->parse(['url' => $url]); @@ -217,6 +230,47 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertEquals('This page has an audio tag with the target URL.', $data->data->content->text); } + public function testFindTargetLinkInFeed() { + $url = 'http://feed.example.com/jsonfeed'; + $response = $this->parse(['url' => $url, 'target' => 'http://www.manton.org/2017/11/5993.html']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('error', $data); + } + + public function testFindTargetLinkInHTMLInFeed() { + $url = 'http://feed.example.com/jsonfeed'; + $response = $this->parse(['url' => $url, 'target' => 'http://www.manton.org/2016/11/todays-social-networks-are-broken.html']); + + $body = $response->getContent(); + $this->assertEquals(200, 
$response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('error', $data); + } + + public function testNotFindTargetLinkInHTMLInFeed() { + $url = 'http://feed.example.com/jsonfeed'; + $response = $this->parse(['url' => $url, 'target' => 'http://example.com/']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectHasAttribute('error', $data); + $this->assertEquals('no_link_found', $data->error); + } + + public function testFindRelativeTargetLink() { + $url = 'http://source.example.com/multiple-urls'; + $response = $this->parse(['url' => $url, 'target' => 'http://source.example.com/photo.jpg']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('error', $data); + } + public function testTextContent() { $url = 'http://source.example.com/text-content'; $response = $this->parse(['url' => $url]); @@ -316,6 +370,18 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body); $this->assertEquals('unknown', $data->data->type); + $this->assertObjectNotHasAttribute('html', $data); + } + + public function testFindTargetInNoParsedResult() { + $url = 'http://source.example.com/no-h-entry'; + $response = $this->parse(['url' => $url, 'target' => 'http://target.example.com']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('error', $data); + $this->assertEquals('unknown', $data->data->type); } public function testReplyIsURL() {