diff --git a/controllers/Parse.php b/controllers/Parse.php index 8540dbc..49f00af 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -67,6 +67,10 @@ class Parse { $this->_pretty = true; } + if($request->get('include-mf1')) { + $opts['include-mf1'] = $request->get('include-mf1') == 'false' ? false : true; + } + $url = $request->get('url'); $html = $request->get('html') ?: $request->get('body'); diff --git a/lib/XRay/Formats/HTML.php b/lib/XRay/Formats/HTML.php index 5846266..eb53c7e 100644 --- a/lib/XRay/Formats/HTML.php +++ b/lib/XRay/Formats/HTML.php @@ -58,7 +58,11 @@ class HTML extends Format { } } - $mf2 = \mf2\Parse($html, $url); + $includeMF1 = true; + if(isset($opts['include-mf1']) && $opts['include-mf1'] == false) + $includeMF1 = false; + + $mf2 = \Mf2\parse($html, $url, $includeMF1); $canonical = false; diff --git a/lib/XRay/Parser.php b/lib/XRay/Parser.php index e2900db..28886b8 100644 --- a/lib/XRay/Parser.php +++ b/lib/XRay/Parser.php @@ -29,6 +29,25 @@ class Parser { } else { $found = $this->_findLinkInTree($opts['target'], $document['data']); $error_description = 'The Microformats at the source URL do not contain a link to the target URL. Check the source URL in a Microformats parser such as php.microformats.io'; + + if(!$found && isset($document['html'])) { + // If no link was found in the parsed mf2 tree, check for a link in the HTML + $found = $this->_findLinkInHTML($opts['target'], $document['html']); + // If there is a link, and if the HTML document has no mf2, then downgrade to a regular mention + if($found) { + $mf2Data = Formats\HTML::parse($this->http, $http_response, ['include-mf1'=>false]); + if(isset($mf2Data['data']['type']) && $mf2Data['data']['type'] == 'unknown') { + // Since the link was found in the HTML, but not in the parsed tree, it shouldn't return the parsed document + $document['data'] = [ + 'type' => 'unknown' + ]; + } else { + // Otherwise, the document did have mf2, but the link wasn't in it (checked earlier), so set found=false + $found = false; + } + } + } + } if(!$found) { diff --git a/tests/ParseTest.php b/tests/ParseTest.php index d9afeb2..61a2f68 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -1135,4 +1135,73 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertEquals('https://aaronparecki.com/2019/12/01/10/homeautomation', $data['data']['url']); $this->assertEquals('https://aaronparecki.com/2019/12/01/10/homeautomation', $data['data']['rels']['canonical']); } + + public function testTargetLinkOutsideHEntry() { + $url = 'http://source.example.com/target-test-link-outside-h-entry'; + $response = $this->parse(['url' => $url, 'target' => 'https://target.example.com/']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('no_link_found', $data['error']); + } + + public function testTargetLinkWithBadMf1() { + $url = 'http://source.example.com/target-test-only-bad-mf1'; + $response = $this->parse(['url' => $url, 'target' => 'https://target.example.com/']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('unknown', $data['data']['type']); + } + + public function testTargetLinkWithValidMf1() { + $url = 'http://source.example.com/target-test-only-good-mf1'; + $response = $this->parse(['url' => $url, 'target' => 'https://target.example.com/']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('entry', $data['data']['type']); + $this->assertEquals('target', $data['data']['content']['html']); + } + + public function testTargetLinkOutsideValidMf1() { + $url = 'http://source.example.com/target-test-link-outside-valid-mf1'; + $response = $this->parse(['url' => $url, 'target' => 'https://target.example.com/']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + // Since the link was found in the HTML, but not in the parsed tree, it shouldn't return the parsed document + $this->assertEquals('unknown', $data['data']['type']); + } + + public function testDisableMf1Parsing() { + $url = 'http://source.example.com/target-test-only-good-mf1'; + $response = $this->parse(['url' => $url, 'include-mf1' => 'false']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('unknown', $data['data']['type']); + } + + public function testEnableMf1Parsing() { + $url = 'http://source.example.com/target-test-only-good-mf1'; + $response = $this->parse(['url' => $url, 'include-mf1' => 'true']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('entry', $data['data']['type']); + } + } diff --git a/tests/data/source.example.com/target-test-link-outside-h-entry b/tests/data/source.example.com/target-test-link-outside-h-entry new file mode 100644 index 0000000..3d719fd --- /dev/null +++ b/tests/data/source.example.com/target-test-link-outside-h-entry @@ -0,0 +1,17 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +
+

hello world

+
+ + + diff --git a/tests/data/source.example.com/target-test-link-outside-valid-mf1 b/tests/data/source.example.com/target-test-link-outside-valid-mf1 new file mode 100644 index 0000000..046e433 --- /dev/null +++ b/tests/data/source.example.com/target-test-link-outside-valid-mf1 @@ -0,0 +1,17 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +
+

hello world

+
+ + + diff --git a/tests/data/source.example.com/target-test-only-bad-mf1 b/tests/data/source.example.com/target-test-only-bad-mf1 new file mode 100644 index 0000000..6de94d1 --- /dev/null +++ b/tests/data/source.example.com/target-test-only-bad-mf1 @@ -0,0 +1,16 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +
+

target

+
+ + diff --git a/tests/data/source.example.com/target-test-only-good-mf1 b/tests/data/source.example.com/target-test-only-good-mf1 new file mode 100644 index 0000000..97ee246 --- /dev/null +++ b/tests/data/source.example.com/target-test-only-good-mf1 @@ -0,0 +1,16 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +
+

target

+
+ +