From 3bdafad98e9d444dbe11f419f114599c545df5d9 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Thu, 22 Dec 2016 08:06:00 -0800 Subject: [PATCH] parse URLs with fragment IDs If the input URL contains a fragment, finds the DOM tree at that ID and runs the subtree through the mf2 parser. closes #15 --- controllers/Parse.php | 34 ++++++++++++++++++++--- lib/HTTPTest.php | 3 ++ tests/ParseTest.php | 24 ++++++++++++++++ tests/data/source.example.com/fragment-id | 22 +++++++++++++++ 4 files changed, 79 insertions(+), 4 deletions(-) create mode 100644 tests/data/source.example.com/fragment-id diff --git a/controllers/Parse.php b/controllers/Parse.php index 586181d..ee8691a 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -154,10 +154,10 @@ class Parse { ]); } + $xpath = new DOMXPath($doc); + // If a target parameter was provided, make sure a link to it exists on the page if($target=$request->get('target')) { - $xpath = new DOMXPath($doc); - $found = []; if($target) { self::xPathFindNodeWithAttribute($xpath, 'a', 'href', function($u) use($target, &$found){ @@ -190,19 +190,37 @@ class Parse { } } + // If the URL has a fragment ID, find the DOM starting at that node and parse it instead + $html = $result['body']; + + $fragment = parse_url($url, PHP_URL_FRAGMENT); + if($fragment) { + $fragElement = self::xPathGetElementById($xpath, $fragment); + if($fragElement) { + $html = $doc->saveHTML($fragElement); + $foundFragment = true; + } else { + $foundFragment = false; + } + } + // Now start pulling in the data from the page. Start by looking for microformats2 - $mf2 = mf2\Parse($result['body'], $result['url']); + $mf2 = mf2\Parse($html, $result['url']); if($mf2 && count($mf2['items']) > 0) { $data = Formats\Mf2::parse($mf2, $result['url'], $this->http); if($data) { + if($fragment) { + $data['info'] = [ + 'found_fragment' => $foundFragment + ]; + } return $this->respond($response, 200, $data); } } // TODO: look for other content like OEmbed or other known services later - return $this->respond($response, 200, [ 'data' => [ 'type' => 'unknown', @@ -217,4 +235,12 @@ class Parse { } } + private static function xPathGetElementById($xpath, $id) { + $element = null; + foreach($xpath->query("//*[@id='$id']") as $el) { + $element = $el; + } + return $element; + } + } diff --git a/lib/HTTPTest.php b/lib/HTTPTest.php index a0858e3..ca3880e 100644 --- a/lib/HTTPTest.php +++ b/lib/HTTPTest.php @@ -10,6 +10,9 @@ class HTTPTest extends HTTPCurl { } public function get($url, $headers=[]) { + $parts = parse_url($url); + unset($parts['fragment']); + $url = \build_url($parts); return $this->_read_file($url); } diff --git a/tests/ParseTest.php b/tests/ParseTest.php index c6b2e8f..5475d61 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -358,4 +358,28 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertEquals('Tantek Çelik', $data['refs']['https://www.facebook.com/tantek.celik']['name']); } + public function testEntryAtFragmentID() { + $url = 'http://source.example.com/fragment-id#comment-1000'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + $this->assertEquals('entry', $data['data']['type']); + $this->assertEquals('http://source.example.com/fragment-id#comment-1000', $data['data']['url']); + $this->assertTrue($data['info']['found_fragment']); + } + + public function testEntryAtNonExistentFragmentID() { + $url = 'http://source.example.com/fragment-id#comment-404'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + $this->assertEquals('entry', $data['data']['type']); + $this->assertEquals('http://source.example.com/fragment-id', $data['data']['url']); + $this->assertFalse($data['info']['found_fragment']); + } + } diff --git a/tests/data/source.example.com/fragment-id b/tests/data/source.example.com/fragment-id new file mode 100644 index 0000000..7a6c566 --- /dev/null +++ b/tests/data/source.example.com/fragment-id @@ -0,0 +1,22 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This page has comments.

+ + permalink + +