diff --git a/controllers/Parse.php b/controllers/Parse.php
index ca545f8..8540dbc 100644
--- a/controllers/Parse.php
+++ b/controllers/Parse.php
@@ -120,7 +120,7 @@ class Parse {
$data = [
'data' => $parsed['data'],
'url' => $result['url'],
- 'code' => $result['code']
+ 'code' => $result['code'],
];
if(isset($parsed['info']))
$data['info'] = $parsed['info'];
diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php
index 6cd7c5c..6a58288 100644
--- a/lib/XRay/Formats/Format.php
+++ b/lib/XRay/Formats/Format.php
@@ -104,36 +104,4 @@ abstract class Format implements iFormat {
return trim(str_replace(['
','
'],"\n", $sanitized));
}
- protected static function findLinksInDocument(&$xpath, $target) {
- $found = [];
- self::xPathFindNodeWithAttribute($xpath, 'a', 'href', function($u) use($target, &$found){
- if($u == $target) {
- $found[$u] = null;
- }
- });
- self::xPathFindNodeWithAttribute($xpath, 'img', 'src', function($u) use($target, &$found){
- if($u == $target) {
- $found[$u] = null;
- }
- });
- self::xPathFindNodeWithAttribute($xpath, 'video', 'src', function($u) use($target, &$found){
- if($u == $target) {
- $found[$u] = null;
- }
- });
- self::xPathFindNodeWithAttribute($xpath, 'audio', 'src', function($u) use($target, &$found){
- if($u == $target) {
- $found[$u] = null;
- }
- });
- return $found;
- }
-
- public static function xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
- foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) {
- $v = $el->getAttribute($attr);
- $callback($v);
- }
- }
-
}
diff --git a/lib/XRay/Formats/HTML.php b/lib/XRay/Formats/HTML.php
index 6ad6311..430bb94 100644
--- a/lib/XRay/Formats/HTML.php
+++ b/lib/XRay/Formats/HTML.php
@@ -20,6 +20,7 @@ class HTML extends Format {
],
'url' => $url,
'code' => $http_response['code'],
+ 'html' => $html,
];
// attempt to parse the page as HTML
@@ -45,26 +46,6 @@ class HTML extends Format {
}
}
- // If a target parameter was provided, make sure a link to it exists on the page
- if(isset($opts['target'])) {
- $target = $opts['target'];
-
- $found = [];
- if($target) {
- $found = self::findLinksInDocument($xpath, $target);
- }
-
- if(!$found) {
- return [
- 'error' => 'no_link_found',
- 'error_description' => 'The source document does not have a link to the target URL',
- 'code' => isset($result['code']) ? $result['code'] : 200,
- 'url' => $url,
- 'debug' => $result
- ];
- }
- }
-
// If the URL has a fragment ID, find the DOM starting at that node and parse it instead
$fragment = parse_url($url, PHP_URL_FRAGMENT);
if($fragment) {
@@ -108,7 +89,7 @@ class HTML extends Format {
]);
// Skip and fall back to parsing the HTML if anything about this request fails
if(!$jsonpage['error'] && $jsonpage['body']) {
- $jsondata = json_decode($jsonpage['body'],true);
+ $jsondata = json_decode($jsonpage['body'], true);
if($jsondata) {
$jsonpage['body'] = $jsondata;
$data = Formats\Mf2::parse($jsonpage, $http, $opts);
diff --git a/lib/XRay/Parser.php b/lib/XRay/Parser.php
index 3ee2d4b..108ec6f 100644
--- a/lib/XRay/Parser.php
+++ b/lib/XRay/Parser.php
@@ -2,6 +2,7 @@
namespace p3k\XRay;
use p3k\XRay\Formats;
+use DOMDocument, DOMXPath;
class Parser {
private $http;
@@ -11,6 +12,42 @@ class Parser {
}
+  /**
+   * Parse an HTTP response and, when a target URL is requested, verify that the
+   * parsed document links to it.
+   *
+   * The parsing itself is delegated to parse_document(). If $opts['target'] is
+   * non-empty and parsing did not report an error, the target is looked up in the
+   * parsed data tree or, for documents of type "unknown", in the raw HTML.
+   *
+   * @param array $http_response Response passed through to parse_document()
+   * @param array $opts Options; a non-empty 'target' enables the link check
+   * @return array The parsed document (without the internal 'html' key), or a
+   *               'no_link_found' error array when the target is not linked
+   */
public function parse($http_response, $opts=[]) {
+    $document = $this->parse_document($http_response, $opts);
+
+    // If a target parameter was provided, make sure a link to it exists in the parsed document
+    if(!isset($document['error']) && !empty($opts['target'])) {
+
+      if(isset($document['data']['type']) && $document['data']['type'] == 'unknown') {
+        if(isset($document['html'])) {
+          // Couldn't parse the page, check for the link manually assuming HTML content
+          $found = $this->_findLinkInHTML($opts['target'], $document['html']);
+        } else {
+          // No raw HTML to search (non-HTML document). This is expected to be uncommon,
+          // so the link is simply reported as not found
+          $found = false;
+        }
+      } else {
+        $found = $this->_findLinkInTree($opts['target'], $document['data']);
+      }
+
+      if(!$found) {
+        return [
+          'error' => 'no_link_found',
+          'error_description' => 'The source document does not have a link to the target URL',
+          'code' => isset($document['code']) ? $document['code'] : 200,
+          // Not every document carries a URL (e.g. the mf2+json fallback result), so guard it like 'code'
+          'url' => isset($document['url']) ? $document['url'] : null,
+          'debug' => $document['data']
+        ];
+      }
+    }
+
+    // If the HTML parser couldn't parse the page it returns the full HTML for checking the target above,
+    // but we don't want to return that in the output, so remove it here
+    unset($document['html']);
+
+    return $document;
+  }
+
+ public function parse_document($http_response, $opts=[]) {
if(isset($opts['timeout']))
$this->http->set_timeout($opts['timeout']);
if(isset($opts['max_redirects']))
@@ -46,8 +83,15 @@ class Parser {
$body = $http_response['body'];
// Check if an mf2 JSON object was passed in
- if(is_array($body) && isset($body['items'][0]['type']) && isset($body['items'][0]['properties'])) {
+ if(is_array($body) && isset($body['items']) && isset($body['rels']) && isset($body['rel-urls'])) {
$data = Formats\Mf2::parse($http_response, $this->http, $opts);
+ if($data == false) {
+ $data = [
+ 'data' => [
+ 'type' => 'unknown',
+ ]
+ ];
+ }
$data['source-format'] = 'mf2+json';
return $data;
}
@@ -96,4 +140,80 @@ class Parser {
return $data;
}
+  /**
+   * Recursively search a parsed document tree for a value equal to $link.
+   *
+   * Strings and numbers are compared against $link directly. Arrays are walked
+   * recursively; a string stored under an 'html' key is parsed as HTML and its
+   * a/img/video/audio URLs are compared instead.
+   *
+   * @param string $link The target URL to look for
+   * @param mixed $document A subtree of the parsed data (array, string or number)
+   * @return bool True if the link was found anywhere in the tree
+   * @throws \Exception If a truthy value is neither a string, a number nor an array
+   */
+  private function _findLinkInTree($link, $document) {
+    // Empty values (null, false, '', 0, []) cannot contain the link
+    if(!$document)
+      return false;
+
+    if(is_string($document) || is_numeric($document)) {
+      return $document == $link;
+    }
+
+    if(is_array($document)) {
+      foreach($document as $key=>$value) {
+        // Only string HTML fragments can be parsed; anything else is searched as a regular subtree
+        if($key === 'html' && is_string($value)) {
+          $found = $this->_findLinkInHTML($link, $value);
+        } else {
+          $found = $this->_findLinkInTree($link, $value);
+        }
+        if($found) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    // This class lives in the p3k\XRay namespace, so the global Exception class must be fully qualified
+    throw new \Exception('Unexpected value in tree');
+  }
+
+  /**
+   * Check whether an HTML string contains an a[href], img[src], video[src] or
+   * audio[src] attribute equal to $link.
+   *
+   * @param string $link The target URL to look for
+   * @param string $html HTML markup to search
+   * @return array|bool A non-empty array keyed by the matched URL when found;
+   *                    false or an empty array otherwise. Callers should only
+   *                    rely on the truthiness of the result.
+   */
+  private function _findLinkInHTML($link, $html) {
+    // Only non-empty strings can be parsed; loadHTML() rejects empty input
+    if(!is_string($html) || trim($html) === '')
+      return false;
+
+    $entities = self::_toHtmlEntities($html);
+    if(!is_string($entities) || $entities === '')
+      return false;
+
+    $doc = new DOMDocument();
+    // Malformed markup triggers libxml warnings; suppress them and rely on the return value instead
+    if(!@$doc->loadHTML($entities))
+      return false;
+
+    $xpath = new DOMXPath($doc);
+
+    return self::_findLinksInDOMDocument($xpath, $link);
+  }
+
+  /**
+   * Collect the attribute values equal to $target from the a[href], img[src],
+   * video[src] and audio[src] elements of a document.
+   *
+   * @param DOMXPath $xpath XPath object for the document to search
+   * @param string $target URL each attribute value is compared against
+   * @return array Matching URLs as keys (every value is null); empty when nothing matched
+   */
+  private static function _findLinksInDOMDocument(&$xpath, $target) {
+    $matches = [];
+
+    // Shared collector: remember every attribute value that equals the target
+    $collect = function($value) use($target, &$matches){
+      if($value == $target) {
+        $matches[$value] = null;
+      }
+    };
+
+    // Element name => attribute carrying its URL, checked in this order
+    $sources = ['a' => 'href', 'img' => 'src', 'video' => 'src', 'audio' => 'src'];
+    foreach($sources as $element => $attribute) {
+      self::_xPathFindNodeWithAttribute($xpath, $element, $attribute, $collect);
+    }
+
+    return $matches;
+  }
+
+  /**
+   * Call $callback with the value of $attr for every $node element that has that attribute.
+   *
+   * @param DOMXPath $xpath XPath object to run the query on
+   * @param string $node Element name to match anywhere in the document
+   * @param string $attr Attribute that must be present on the element
+   * @param callable $callback Receives each attribute value as its only argument
+   */
+  private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
+    $expression = '//'.$node.'[@'.$attr.']';
+    $elements = $xpath->query($expression);
+    foreach($elements as $element) {
+      $callback($element->getAttribute($attr));
+    }
+  }
+
+  /**
+   * Convert $input to the HTML-ENTITIES encoding (used to prepare markup for
+   * DOMDocument::loadHTML()).
+   *
+   * @param string $input Text in an encoding that mb_detect_encoding() can identify
+   * @return string|bool The converted string, or false if the conversion fails
+   */
+  private static function _toHtmlEntities($input) {
+    // Detection returns false for input it cannot classify; fall back to UTF-8 rather
+    // than passing an invalid source encoding to mb_convert_encoding()
+    $encoding = mb_detect_encoding($input);
+    if($encoding === false) {
+      $encoding = 'UTF-8';
+    }
+    return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
+  }
+
}
diff --git a/tests/LibraryTest.php b/tests/LibraryTest.php
index f121da3..5e0109c 100644
--- a/tests/LibraryTest.php
+++ b/tests/LibraryTest.php
@@ -34,4 +34,34 @@ class LibraryTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('Barnaby Walters', $data['data']['name']);
}
+ public function testNoHEntryMarkupMF2JSON() {
+ $url = 'http://example.com/';
+ $html = '