Browse Source

parse URLs with fragment IDs

If the input URL contains a fragment, find the element with that ID in the DOM and run only that element's subtree through the mf2 parser.

closes #15
pull/39/head
Aaron Parecki 4 years ago
parent
commit
3bdafad98e
No known key found for this signature in database GPG Key ID: 276C2817346D6056
4 changed files with 79 additions and 4 deletions
  1. +30
    -4
      controllers/Parse.php
  2. +3
    -0
      lib/HTTPTest.php
  3. +24
    -0
      tests/ParseTest.php
  4. +22
    -0
      tests/data/source.example.com/fragment-id

+ 30
- 4
controllers/Parse.php View File

@ -154,10 +154,10 @@ class Parse {
]);
}
$xpath = new DOMXPath($doc);
// If a target parameter was provided, make sure a link to it exists on the page
if($target=$request->get('target')) {
$xpath = new DOMXPath($doc);
$found = [];
if($target) {
self::xPathFindNodeWithAttribute($xpath, 'a', 'href', function($u) use($target, &$found){
@ -190,19 +190,37 @@ class Parse {
}
}
// If the URL has a fragment ID, find the DOM starting at that node and parse it instead
$html = $result['body'];
$fragment = parse_url($url, PHP_URL_FRAGMENT);
if($fragment) {
$fragElement = self::xPathGetElementById($xpath, $fragment);
if($fragElement) {
$html = $doc->saveHTML($fragElement);
$foundFragment = true;
} else {
$foundFragment = false;
}
}
// Now start pulling in the data from the page. Start by looking for microformats2
$mf2 = mf2\Parse($result['body'], $result['url']);
$mf2 = mf2\Parse($html, $result['url']);
if($mf2 && count($mf2['items']) > 0) {
$data = Formats\Mf2::parse($mf2, $result['url'], $this->http);
if($data) {
if($fragment) {
$data['info'] = [
'found_fragment' => $foundFragment
];
}
return $this->respond($response, 200, $data);
}
}
// TODO: look for other content like OEmbed or other known services later
return $this->respond($response, 200, [
'data' => [
'type' => 'unknown',
@ -217,4 +235,12 @@ class Parse {
}
}
/**
 * Look up an element by its `id` attribute using XPath.
 *
 * The ID is embedded in the expression as a properly quoted XPath string
 * literal, so values containing quote characters cannot break the query.
 * If several elements share the ID, the first one in document order is
 * returned.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the document to search
 * @param string   $id    Value of the `id` attribute to look for
 * @return DOMNode|null   The matching element, or null if there is none
 */
private static function xPathGetElementById($xpath, $id) {
  $id = (string)$id;

  // XPath 1.0 has no escape sequences inside string literals, so choose a
  // delimiter the value does not contain, or fall back to concat().
  if(strpos($id, "'") === false) {
    $literal = "'" . $id . "'";
  } elseif(strpos($id, '"') === false) {
    $literal = '"' . $id . '"';
  } else {
    // Split on single quotes and splice each one back in as "'".
    $literal = "concat('" . str_replace("'", "', \"'\", '", $id) . "')";
  }

  $nodes = $xpath->query('//*[@id=' . $literal . ']');

  // query() returns false when the expression cannot be evaluated.
  if($nodes === false || $nodes->length === 0) {
    return null;
  }

  return $nodes->item(0);
}
}

+ 3
- 0
lib/HTTPTest.php View File

@ -10,6 +10,9 @@ class HTTPTest extends HTTPCurl {
}
/**
 * Serve a GET request from the local test fixtures.
 *
 * The fragment component is removed from the URL before it is handed to
 * _read_file(), so a URL with a "#fragment" is looked up the same way as
 * the URL without it.
 *
 * @param string $url     URL being requested
 * @param array  $headers Not used by this implementation
 */
public function get($url, $headers=[]) {
  $components = parse_url($url);
  unset($components['fragment']);

  return $this->_read_file(\build_url($components));
}

+ 24
- 0
tests/ParseTest.php View File

@ -358,4 +358,28 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('Tantek Çelik', $data['refs']['https://www.facebook.com/tantek.celik']['name']);
}
/**
 * Requesting a URL whose fragment names an element ID should return the
 * entry at that element, with the fragment URL as the entry URL, and
 * report that the fragment was found.
 */
public function testEntryAtFragmentID() {
  $requestUrl = 'http://source.example.com/fragment-id#comment-1000';

  $res = $this->parse(['url' => $requestUrl]);
  $payload = $res->getContent();
  $this->assertEquals(200, $res->getStatusCode());

  $decoded = json_decode($payload, true);
  $this->assertEquals('entry', $decoded['data']['type']);
  $this->assertEquals($requestUrl, $decoded['data']['url']);
  $this->assertTrue($decoded['info']['found_fragment']);
}
/**
 * Requesting a URL whose fragment does not name any element ID should
 * still return an entry, with the fragment-less page URL as the entry
 * URL, and report that the fragment was not found.
 */
public function testEntryAtNonExistentFragmentID() {
  $requestUrl = 'http://source.example.com/fragment-id#comment-404';
  $pageUrl = 'http://source.example.com/fragment-id';

  $res = $this->parse(['url' => $requestUrl]);
  $payload = $res->getContent();
  $this->assertEquals(200, $res->getStatusCode());

  $decoded = json_decode($payload, true);
  $this->assertEquals('entry', $decoded['data']['type']);
  $this->assertEquals($pageUrl, $decoded['data']['url']);
  $this->assertFalse($decoded['info']['found_fragment']);
}
}

+ 22
- 0
tests/data/source.example.com/fragment-id View File

@ -0,0 +1,22 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content p-name">This page has comments.</p>
<ul>
<li class="p-comment h-entry" id="comment-1000">
<span class="p-content p-name">Comment text</span>
<a href="http://user.example.com/" class="p-author h-card">A. Commenter</a>
<a href="#comment-1000" class="u-url">comment permalink</a>
</li>
</ul>
<a href="" class="u-url">permalink</a>
</body>
</html>

Loading…
Cancel
Save