Browse Source

fix for target check for all formats

move the target check out of the individual format classes and have it operate on the final parsed JSON, so that it works regardless of the input content type
pull/94/head v1.9.1
Aaron Parecki 4 years ago
parent
commit
989d42a85f
No known key found for this signature in database GPG Key ID: 276C2817346D6056
6 changed files with 220 additions and 55 deletions
  1. +1
    -1
      controllers/Parse.php
  2. +0
    -32
      lib/XRay/Formats/Format.php
  3. +2
    -21
      lib/XRay/Formats/HTML.php
  4. +121
    -1
      lib/XRay/Parser.php
  5. +30
    -0
      tests/LibraryTest.php
  6. +66
    -0
      tests/ParseTest.php

+ 1
- 1
controllers/Parse.php View File

@ -120,7 +120,7 @@ class Parse {
$data = [ $data = [
'data' => $parsed['data'], 'data' => $parsed['data'],
'url' => $result['url'], 'url' => $result['url'],
'code' => $result['code']
'code' => $result['code'],
]; ];
if(isset($parsed['info'])) if(isset($parsed['info']))
$data['info'] = $parsed['info']; $data['info'] = $parsed['info'];

+ 0
- 32
lib/XRay/Formats/Format.php View File

@ -104,36 +104,4 @@ abstract class Format implements iFormat {
return trim(str_replace(['<br>','<br />'],"\n", $sanitized)); return trim(str_replace(['<br>','<br />'],"\n", $sanitized));
} }
/**
 * Collects every link-like attribute value in the document that equals $target.
 *
 * The attributes inspected are a[href], img[src], video[src] and audio[src],
 * in that order.
 *
 * @param DOMXPath $xpath  XPath handle for the document to search
 * @param string   $target URL to look for
 * @return array Matching attribute values as keys (each mapped to null); empty when nothing matched
 */
protected static function findLinksInDocument(&$xpath, $target) {
  $matches = [];

  // One shared collector instead of a separate closure per element type
  $collect = function($value) use($target, &$matches) {
    if($value == $target) {
      $matches[$value] = null;
    }
  };

  $candidates = [
    ['a', 'href'],
    ['img', 'src'],
    ['video', 'src'],
    ['audio', 'src'],
  ];
  foreach($candidates as $pair) {
    self::xPathFindNodeWithAttribute($xpath, $pair[0], $pair[1], $collect);
  }

  return $matches;
}
/**
 * Invokes $callback with the value of $attr for every <$node> element in the
 * document that carries that attribute.
 *
 * @param DOMXPath $xpath    XPath handle for the document to search
 * @param string   $node     Element name, e.g. 'a'
 * @param string   $attr     Attribute name, e.g. 'href'
 * @param callable $callback Receives each attribute value as its only argument
 */
public static function xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
  $expression = '//'.$node.'[@'.$attr.']';
  $elements = $xpath->query($expression);
  foreach($elements as $element) {
    $callback($element->getAttribute($attr));
  }
}
} }

+ 2
- 21
lib/XRay/Formats/HTML.php View File

@ -20,6 +20,7 @@ class HTML extends Format {
], ],
'url' => $url, 'url' => $url,
'code' => $http_response['code'], 'code' => $http_response['code'],
'html' => $html,
]; ];
// attempt to parse the page as HTML // attempt to parse the page as HTML
@ -45,26 +46,6 @@ class HTML extends Format {
} }
} }
// If a target parameter was provided, make sure a link to it exists on the page
if(isset($opts['target'])) {
$target = $opts['target'];
$found = [];
if($target) {
$found = self::findLinksInDocument($xpath, $target);
}
if(!$found) {
return [
'error' => 'no_link_found',
'error_description' => 'The source document does not have a link to the target URL',
'code' => isset($result['code']) ? $result['code'] : 200,
'url' => $url,
'debug' => $result
];
}
}
// If the URL has a fragment ID, find the DOM starting at that node and parse it instead // If the URL has a fragment ID, find the DOM starting at that node and parse it instead
$fragment = parse_url($url, PHP_URL_FRAGMENT); $fragment = parse_url($url, PHP_URL_FRAGMENT);
if($fragment) { if($fragment) {
@ -108,7 +89,7 @@ class HTML extends Format {
]); ]);
// Skip and fall back to parsing the HTML if anything about this request fails // Skip and fall back to parsing the HTML if anything about this request fails
if(!$jsonpage['error'] && $jsonpage['body']) { if(!$jsonpage['error'] && $jsonpage['body']) {
$jsondata = json_decode($jsonpage['body'],true);
$jsondata = json_decode($jsonpage['body'], true);
if($jsondata) { if($jsondata) {
$jsonpage['body'] = $jsondata; $jsonpage['body'] = $jsondata;
$data = Formats\Mf2::parse($jsonpage, $http, $opts); $data = Formats\Mf2::parse($jsonpage, $http, $opts);

+ 121
- 1
lib/XRay/Parser.php View File

@ -2,6 +2,7 @@
namespace p3k\XRay; namespace p3k\XRay;
use p3k\XRay\Formats; use p3k\XRay\Formats;
use DOMDocument, DOMXPath;
class Parser { class Parser {
private $http; private $http;
@ -11,6 +12,42 @@ class Parser {
} }
public function parse($http_response, $opts=[]) { public function parse($http_response, $opts=[]) {
$document = $this->parse_document($http_response, $opts);
// If a target parameter was provided, make sure a link to it exists in the parsed document
if(!isset($document['error']) && !empty($opts['target'])) {
if(isset($document['data']['type']) && $document['data']['type'] == 'unknown') {
if(isset($document['html'])) {
// Couldn't parse the page, check for the link manually assuming HTML content
$found = $this->_findLinkInHTML($opts['target'], $document['html']);
} else {
// No HTML is available to inspect, so treat the link as not found (this case should be uncommon)
$found = false;
}
} else {
$found = $this->_findLinkInTree($opts['target'], $document['data']);
}
if(!$found) {
return [
'error' => 'no_link_found',
'error_description' => 'The source document does not have a link to the target URL',
'code' => isset($document['code']) ? $document['code'] : 200,
'url' => $document['url'],
'debug' => $document['data']
];
}
}
// If the HTML parser couldn't parse the page it returns the full HTML for checking the target above,
// but we don't want to return that in the output, so remove it here
unset($document['html']);
return $document;
}
public function parse_document($http_response, $opts=[]) {
if(isset($opts['timeout'])) if(isset($opts['timeout']))
$this->http->set_timeout($opts['timeout']); $this->http->set_timeout($opts['timeout']);
if(isset($opts['max_redirects'])) if(isset($opts['max_redirects']))
@ -46,8 +83,15 @@ class Parser {
$body = $http_response['body']; $body = $http_response['body'];
// Check if an mf2 JSON object was passed in // Check if an mf2 JSON object was passed in
if(is_array($body) && isset($body['items'][0]['type']) && isset($body['items'][0]['properties'])) {
if(is_array($body) && isset($body['items']) && isset($body['rels']) && isset($body['rel-urls'])) {
$data = Formats\Mf2::parse($http_response, $this->http, $opts); $data = Formats\Mf2::parse($http_response, $this->http, $opts);
if($data == false) {
$data = [
'data' => [
'type' => 'unknown',
]
];
}
$data['source-format'] = 'mf2+json'; $data['source-format'] = 'mf2+json';
return $data; return $data;
} }
@ -96,4 +140,80 @@ class Parser {
return $data; return $data;
} }
/**
 * Recursively searches a parsed document tree for the given link.
 *
 * Scalar leaves are compared against $link directly. A string stored under an
 * 'html' key is parsed as HTML and searched for a/img/video/audio references
 * to the link. Every other array value is descended into.
 *
 * @param string $link     URL to look for
 * @param mixed  $document Parsed value: scalar, null, or (nested) array
 * @return bool True when the link was found somewhere in the tree
 * @throws \Exception When the tree contains a value that is neither scalar, null nor array
 */
private function _findLinkInTree($link, $document) {
  // Empty values (null, false, '', 0, []) can never contain the link
  if(!$document)
    return false;

  // A boolean leaf cannot be a link; without this, `true` would fall through to the throw below
  if(is_bool($document))
    return false;

  if(is_string($document) || is_numeric($document)) {
    // Compare as strings so numeric-looking values are not matched by PHP's loose numeric comparison
    return (string)$document === (string)$link;
  }

  if(is_array($document)) {
    foreach($document as $key=>$value) {
      // Strict key check: integer list keys must not be mistaken for the 'html' key
      if($key === 'html' && is_string($value)) {
        if($this->_findLinkInHTML($link, $value)) {
          return true;
        }
      } elseif($this->_findLinkInTree($link, $value)) {
        return true;
      }
    }
    return false;
  }

  // Fully qualified: this class lives in a namespace, so a bare `Exception`
  // would resolve to a class inside that namespace
  throw new \Exception('Unexpected value in tree');
}
/**
 * Parses an HTML string and looks for elements that reference the given link.
 *
 * @param string $link URL to look for
 * @param string $html HTML markup to search
 * @return array|false Truthy (non-empty array of matches) when the link is
 *                     present; falsy (empty array or false) otherwise
 */
private function _findLinkInHTML($link, $html) {
  // DOMDocument::loadHTML() rejects empty input (ValueError on PHP 8, warning before that)
  if(!is_string($html) || trim($html) === '')
    return false;

  $doc = new DOMDocument();
  // Real-world markup is rarely valid, so libxml's parse warnings are silenced;
  // the return value still tells us whether a document was produced
  if(!@$doc->loadHTML(self::_toHtmlEntities($html)))
    return false;

  $xpath = new DOMXPath($doc);
  return self::_findLinksInDOMDocument($xpath, $link);
}
/**
 * Collects every link-like attribute value in the document that equals $target.
 *
 * The attributes inspected are a[href], img[src], video[src] and audio[src],
 * in that order.
 *
 * @param DOMXPath $xpath  XPath handle for the document to search
 * @param string   $target URL to look for
 * @return array Matching attribute values as keys (each mapped to null); empty when nothing matched
 */
private static function _findLinksInDOMDocument(&$xpath, $target) {
  $matches = [];

  // One shared collector instead of a separate closure per element type
  $collect = function($value) use($target, &$matches) {
    if($value == $target) {
      $matches[$value] = null;
    }
  };

  $candidates = [
    ['a', 'href'],
    ['img', 'src'],
    ['video', 'src'],
    ['audio', 'src'],
  ];
  foreach($candidates as $pair) {
    self::_xPathFindNodeWithAttribute($xpath, $pair[0], $pair[1], $collect);
  }

  return $matches;
}
/**
 * Invokes $callback with the value of $attr for every <$node> element in the
 * document that carries that attribute.
 *
 * @param DOMXPath $xpath    XPath handle for the document to search
 * @param string   $node     Element name, e.g. 'a'
 * @param string   $attr     Attribute name, e.g. 'href'
 * @param callable $callback Receives each attribute value as its only argument
 */
private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
  $expression = '//'.$node.'[@'.$attr.']';
  $elements = $xpath->query($expression);
  foreach($elements as $element) {
    $callback($element->getAttribute($attr));
  }
}
/**
 * Re-encodes a string so that every non-ASCII character becomes a numeric
 * HTML entity, leaving ASCII (including markup characters) untouched.
 *
 * This lets DOMDocument::loadHTML() read the text correctly without having
 * to be told the source encoding.
 *
 * @param string $input Markup in an encoding mbstring can detect
 * @return string ASCII-only markup with non-ASCII characters as &#NNN; entities
 */
private static function _toHtmlEntities($input) {
  // mb_detect_encoding() returns false when nothing matches; fall back to
  // UTF-8 rather than passing false on as an encoding name
  $encoding = mb_detect_encoding($input);
  if($encoding === false) {
    $encoding = 'UTF-8';
  }

  $utf8 = mb_convert_encoding($input, 'UTF-8', $encoding);

  // The 'HTML-ENTITIES' pseudo-encoding is deprecated as of PHP 8.2, so
  // encode code points from 0x80 upward explicitly instead
  return mb_encode_numericentity($utf8, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
}
} }

+ 30
- 0
tests/LibraryTest.php View File

@ -34,4 +34,34 @@ class LibraryTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('Barnaby Walters', $data['data']['name']); $this->assertEquals('Barnaby Walters', $data['data']['name']);
} }
/**
 * Feeds pre-parsed mf2 (from markup containing only a plain link) to
 * XRay::process() and expects the result type to be 'unknown'.
 */
public function testNoHEntryMarkupMF2JSON() {
  $sourceUrl = 'http://example.com/';
  $markup = '<p><a href="http://target.example.com/">Target</a></p>';

  $parsedMf2 = Mf2\parse($markup, $sourceUrl);
  $result = (new p3k\XRay())->process($sourceUrl, $parsedMf2);

  $this->assertEquals('unknown', $result['data']['type']);
}
/**
 * Parses raw HTML that contains only a plain link and expects the result
 * type to be 'unknown'.
 */
public function testNoHEntryMarkup() {
  $sourceUrl = 'http://example.com/';
  $markup = '<p><a href="http://target.example.com/">Target</a></p>';

  $result = (new p3k\XRay())->parse($sourceUrl, $markup);

  $this->assertEquals('unknown', $result['data']['type']);
}
/**
 * Parses raw HTML that contains only a plain link, passing that link as the
 * target. Expects type 'unknown', no error, and no 'html' key in the result.
 */
public function testNoHEntryMarkupWithTarget() {
  $sourceUrl = 'http://example.com/';
  $markup = '<p><a href="http://target.example.com/">Target</a></p>';
  $options = ['target' => 'http://target.example.com/'];

  $result = (new p3k\XRay())->parse($sourceUrl, $markup, $options);

  $this->assertEquals('unknown', $result['data']['type']);
  $this->assertArrayNotHasKey('error', $result);
  $this->assertArrayNotHasKey('html', $result);
}
} }

+ 66
- 0
tests/ParseTest.php View File

@ -62,6 +62,19 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$this->assertObjectNotHasAttribute('error', $data); $this->assertObjectNotHasAttribute('error', $data);
} }
/**
 * Requests http://feed.example.com/atom with a target of http://example.net
 * and expects a 'no_link_found' error that reports code 200 and the
 * requested URL.
 */
public function testTargetNotFoundInXML() {
  $feedUrl = 'http://feed.example.com/atom';
  $params = ['url' => $feedUrl, 'target' => 'http://example.net'];
  $response = $this->parse($params);

  $rawBody = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $result = json_decode($rawBody);
  $this->assertObjectHasAttribute('error', $result);
  $this->assertEquals('no_link_found', $result->error);
  $this->assertEquals('200', $result->code);
  $this->assertEquals($feedUrl, $result->url);
}
public function testHTMLContent() { public function testHTMLContent() {
$url = 'http://source.example.com/html-content'; $url = 'http://source.example.com/html-content';
$response = $this->parse(['url' => $url]); $response = $this->parse(['url' => $url]);
@ -217,6 +230,47 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('This page has an audio tag with the target URL.', $data->data->content->text); $this->assertEquals('This page has an audio tag with the target URL.', $data->data->content->text);
} }
/**
 * Requests the jsonfeed fixture URL with a manton.org post URL as the target
 * and expects a 200 response without an error.
 */
public function testFindTargetLinkInFeed() {
  $params = [
    'url' => 'http://feed.example.com/jsonfeed',
    'target' => 'http://www.manton.org/2017/11/5993.html',
  ];
  $response = $this->parse($params);

  $rawBody = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $result = json_decode($rawBody);
  $this->assertObjectNotHasAttribute('error', $result);
}
/**
 * Requests the jsonfeed fixture URL with a different manton.org URL as the
 * target and expects a 200 response without an error.
 */
public function testFindTargetLinkInHTMLInFeed() {
  $params = [
    'url' => 'http://feed.example.com/jsonfeed',
    'target' => 'http://www.manton.org/2016/11/todays-social-networks-are-broken.html',
  ];
  $response = $this->parse($params);

  $rawBody = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $result = json_decode($rawBody);
  $this->assertObjectNotHasAttribute('error', $result);
}
/**
 * Requests the jsonfeed fixture URL with http://example.com/ as the target
 * and expects a 'no_link_found' error.
 */
public function testNotFindTargetLinkInHTMLInFeed() {
  $params = [
    'url' => 'http://feed.example.com/jsonfeed',
    'target' => 'http://example.com/',
  ];
  $response = $this->parse($params);

  $rawBody = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $result = json_decode($rawBody);
  $this->assertObjectHasAttribute('error', $result);
  $this->assertEquals('no_link_found', $result->error);
}
/**
 * Requests the multiple-urls fixture with an absolute photo URL on the same
 * host as the target and expects a 200 response without an error.
 */
public function testFindRelativeTargetLink() {
  $params = [
    'url' => 'http://source.example.com/multiple-urls',
    'target' => 'http://source.example.com/photo.jpg',
  ];
  $response = $this->parse($params);

  $rawBody = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $result = json_decode($rawBody);
  $this->assertObjectNotHasAttribute('error', $result);
}
public function testTextContent() { public function testTextContent() {
$url = 'http://source.example.com/text-content'; $url = 'http://source.example.com/text-content';
$response = $this->parse(['url' => $url]); $response = $this->parse(['url' => $url]);
@ -316,6 +370,18 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$this->assertEquals(200, $response->getStatusCode()); $this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body); $data = json_decode($body);
$this->assertEquals('unknown', $data->data->type); $this->assertEquals('unknown', $data->data->type);
$this->assertObjectNotHasAttribute('html', $data);
}
public function testFindTargetInNoParsedResult() {
$url = 'http://source.example.com/no-h-entry';
$response = $this->parse(['url' => $url, 'target' => 'http://target.example.com']);
$body = $response->getContent();
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body);
$this->assertObjectNotHasAttribute('error', $data);
$this->assertEquals('unknown', $data->data->type);
} }
public function testReplyIsURL() { public function testReplyIsURL() {

Loading…
Cancel
Save