From 05f7d9c86cb663f0d91773d32d4a0bb5565bbfd8 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Sat, 11 Nov 2017 10:28:21 -0800 Subject: [PATCH] implement h-feed and other microformats feed parsing --- README.md | 63 +++++------- TODO.md | 40 ++++++++ controllers/Parse.php | 4 + lib/XRay/Formats/HTML.php | 2 +- lib/XRay/Formats/Mf2.php | 37 ++++--- lib/XRay/Formats/Mf2Feed.php | 97 +++++++++++++++++++ tests/FeedTest.php | 73 ++++++++++++++ .../feed.example.com/h-card-with-child-h-feed | 4 + .../h-card-with-sibling-h-entrys | 35 +++++++ .../list-of-hentrys-with-h-card | 4 + 10 files changed, 300 insertions(+), 59 deletions(-) create mode 100644 TODO.md create mode 100644 lib/XRay/Formats/Mf2Feed.php create mode 100644 tests/data/feed.example.com/h-card-with-sibling-h-entrys diff --git a/README.md b/README.md index b5d778d..7d268c7 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ In both cases, you can add an additional parameter to configure various options * `max_redirects` - The maximum number of redirects to follow * `include_original` - Will also return the full document fetched * `target` - Specify a target URL, and XRay will first check if that URL is on the page, and only if it is, will continue to parse the page. This is useful when you're using XRay to verify an incoming webmention. +* `expect=feed` - If you know the thing you are parsing is a feed, include this parameter which will avoid running the autodetection rules and will provide better results for some feeds. Additionally, the following parameters are supported when making requests that use the Twitter or GitHub API. See the authentication section below for details. @@ -272,57 +273,43 @@ If a property supports multiple values, it will always be returned as an array. The content will be an object that always contains a "text" property and may contain an "html" property if the source documented published HTML content. The "text" property must always be HTML escaped before displaying it as HTML, as it may include unescaped characters such as `<` and `>`. -The author will always be set in the entry if available. The service follows the [authorship discovery](http://indiewebcamp.com/authorship) algorithm to try to find the author information elsewhere on the page if it is not inside the entry in the source document. +The author will always be set in the entry if available. The service follows the [authorship discovery](https://indieweb.org/authorship) algorithm to try to find the author information elsewhere on the page if it is not inside the entry in the source document. All URLs provided in the output are absolute URLs. If the source document contains a relative URL, it will be resolved first. -In a future version, replies, likes, reposts, etc. of this post will be included if they are listed on the page. + +#### Other Properties + +Other properties are returned in the response at the same level as the `data` property. + +* `url` - The effective URL that the document was retrieved from. This will be the final URL after following any redirects. +* `code` - The HTTP response code returned by the URL. Typically this will be 200, but if the URL returned an alternate HTTP code that also included an h-entry (such as a 410 deleted notice with a stub h-entry), you can use this to find out that the original URL was actually deleted. + + +#### Feeds + +XRay can return information for several kinds of feeds. The URL (or body) passed to XRay will be checked for the following formats: + +* XML (Atom and RSS) +* JSONFeed (https://jsonfeed.org) +* Microformats [h-feed](https://indieweb.org/h-feed) + +If the page being parsed represents a feed, then the response will look like the following: ```json { "data": { - "type": "entry", - ... - "like": [ - { - "type": "cite", - "author": { - "type": "card", - "name": "Thomas Dunlap", - "photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/9055c458a67762637c0071006b16c78f25cb610b224dbc98f48961d772faff4d.jpeg", - "url": "https://twitter.com/spladow" - }, - "url": "https://twitter.com/aaronpk/status/688518372170977280#favorited-by-16467582" - } - ], - "comment": [ - { - "type": "cite", - "author": { - "type": "card", - "name": "Poetica", - "photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/192664bb706b2998ed42a50a860490b6aa1bb4926b458ba293b4578af599aa6f.png", - "url": "http://poetica.com/" - }, - "url": "https://twitter.com/poetica/status/689045331426803712", - "published": "2016-01-18T03:23:03-08:00", - "content": { - "text": "@aaronpk @mozillapersona thanks very much! :)" - } - } + "type": "feed", + "items": [ + ] } } - ``` -#### Other Properties - -Other properties are returned in the response at the same level as the `data` property. - -* `url` - The effective URL that the document was retrieved from. This will be the final URL after following any redirects. -* `code` - The HTTP response code returned by the URL. Typically this will be 200, but if the URL returned an alternate HTTP code that also included an h-entry (such as a 410 deleted notice with a stub h-entry), you can use this to find out that the original URL was actually deleted. +Each object in the `items` array will contain a parsed version of the item, in the same format that XRay normally returns. When parsing Microformats feeds, the [authorship discovery](https://indieweb.org/authorship) will be run for each item to build out the author info. +Atom, RSS and JSONFeed will all be normalized to XRay's vocabulary, and only recognized properties will be returned. ## Rels diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..adcd506 --- /dev/null +++ b/TODO.md @@ -0,0 +1,40 @@ + +In a future version, replies, likes, reposts, etc. of this post will be included if they are listed on the page. + +```json +{ + "data": { + "type": "entry", + ... + "like": [ + { + "type": "cite", + "author": { + "type": "card", + "name": "Thomas Dunlap", + "photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/9055c458a67762637c0071006b16c78f25cb610b224dbc98f48961d772faff4d.jpeg", + "url": "https://twitter.com/spladow" + }, + "url": "https://twitter.com/aaronpk/status/688518372170977280#favorited-by-16467582" + } + ], + "comment": [ + { + "type": "cite", + "author": { + "type": "card", + "name": "Poetica", + "photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/192664bb706b2998ed42a50a860490b6aa1bb4926b458ba293b4578af599aa6f.png", + "url": "http://poetica.com/" + }, + "url": "https://twitter.com/poetica/status/689045331426803712", + "published": "2016-01-18T03:23:03-08:00", + "content": { + "text": "@aaronpk @mozillapersona thanks very much! :)" + } + } + ] + } +} + +``` diff --git a/controllers/Parse.php b/controllers/Parse.php index ed6a753..fc18e07 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -59,6 +59,10 @@ class Parse { $opts['target'] = $request->get('target'); } + if($request->get('expect')) { + $opts['expect'] = $request->get('expect'); + } + if($request->get('pretty')) { $this->_pretty = true; } diff --git a/lib/XRay/Formats/HTML.php b/lib/XRay/Formats/HTML.php index 45c6c45..8c39345 100644 --- a/lib/XRay/Formats/HTML.php +++ b/lib/XRay/Formats/HTML.php @@ -95,7 +95,7 @@ class HTML extends Format { $mf2 = \mf2\Parse($html, $url); if($mf2 && count($mf2['items']) > 0) { - $data = Formats\Mf2::parse($mf2, $url, $http); + $data = Formats\Mf2::parse($mf2, $url, $http, $opts); $result = array_merge($result, $data); if($data) { if($fragment) { diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index 05e9fbb..f8ffd1a 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -5,6 +5,8 @@ use HTMLPurifier, HTMLPurifier_Config; class Mf2 extends Format { + use Mf2Feed; + public static function matches_host($url) { return true; } @@ -13,10 +15,15 @@ class Mf2 extends Format { return true; } - public static function parse($mf2, $url, $http) { + public static function parse($mf2, $url, $http, $opts=[]) { if(count($mf2['items']) == 0) return false; + // If they are expecting a feed, always return a feed or an error + if(isset($opts['expect']) && $opts['expect'] == 'feed') { + return self::parseAsHFeed($mf2, $http); + } + // If there is only one item on the page, just use that if(count($mf2['items']) == 1) { $item = $mf2['items'][0]; @@ -44,18 +51,18 @@ class Mf2 extends Format { #Parse::debug("mf2:0: Recognized $url as an h-product it is the only item on the page"); return self::parseAsHItem($mf2, $item, $http); } - if(in_array('h-feed', $item['type'])) { - #Parse::debug("mf2:0: Recognized $url as an h-feed because it is the only item on the page"); - return self::parseAsHFeed($mf2, $http); - } if(in_array('h-card', $item['type'])) { #Parse::debug("mf2:0: Recognized $url as an h-card it is the only item on the page"); return self::parseAsHCard($item, $http, $url); } + if(in_array('h-feed', $item['type'])) { + #Parse::debug("mf2:0: Recognized $url as an h-feed because it is the only item on the page"); + return self::parseAsHFeed($mf2, $http); + } } // Check the list of items on the page to see if one matches the URL of the page, - // and treat as a permalink for that object if so. Otherwise, parse as a feed. + // and treat as a permalink for that object if so. foreach($mf2['items'] as $item) { if(array_key_exists('url', $item['properties'])) { $urls = $item['properties']['url']; @@ -76,6 +83,8 @@ class Mf2 extends Format { return self::parseAsHProduct($mf2, $item, $http); } elseif(in_array('h-item', $item['type'])) { return self::parseAsHItem($mf2, $item, $http); + } elseif(in_array('h-feed', $item['type'])) { + return self::parseAsHFeed($mf2, $http); } else { #Parse::debug('This object was not a recognized type.'); return false; @@ -135,7 +144,7 @@ class Mf2 extends Format { // Fallback case, but hopefully we have found something before this point foreach($mf2['items'] as $item) { - // Otherwise check for a recognized h-entr* object + // Otherwise check for a recognized h-* object if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) { #Parse::debug("mf2:6: $url is falling back to the first h-entry on the page"); return self::parseAsHEntry($mf2, $item, $http); @@ -532,18 +541,6 @@ class Mf2 extends Format { return $response; } - private static function parseAsHFeed($mf2, $http) { - $data = [ - 'type' => 'feed', - 'todo' => 'Not yet implemented. Please see https://github.com/aaronpk/XRay/issues/1', - 'items' => [], - ]; - - return [ - 'data' => $data - ]; - } - private static function parseAsHCard($item, $http, $authorURL=false) { $data = [ 'type' => 'card', @@ -731,7 +728,7 @@ class Mf2 extends Format { } private static function getURL($url, $http) { - if(!$url) return null; + if(!$url || !$http) return null; // TODO: consider adding caching here $result = $http->get($url); if($result['error'] || !$result['body']) { diff --git a/lib/XRay/Formats/Mf2Feed.php b/lib/XRay/Formats/Mf2Feed.php new file mode 100644 index 0000000..af51e56 --- /dev/null +++ b/lib/XRay/Formats/Mf2Feed.php @@ -0,0 +1,97 @@ + 'feed', + 'items' => [], + ]; + + // Given an mf2 data structure from a web page, assume it is a feed of entries + // and return the XRay data structure for the feed. + // Look for the first (BFS) h-feed if present, otherwise use the list of items. + // Normalize this into a simpler mf2 structure, (h-feed -> h-* children) + $feed = self::_findFirstOfType($mf2, 'h-feed'); + if(!$feed) { + // There was no h-feed. + // Check for a top-level h-card with children + if(isset($mf2['items'][0]) && in_array('h-card', $mf2['items'][0]['type'])) { + $feed = $mf2['items'][0]; + // If the h-card has children, use them, otherwise look for siblings + if(!isset($feed['children'])) { + $items = self::_findAllObjectsExcept($mf2, ['h-card']); + $feed['children'] = $items; + } + } else { + $children = self::_findAllObjectsExcept($mf2, ['h-card','h-feed']); + $feed = [ + 'type' => ['h-feed'], + 'properties' => [], + 'children' => $children + ]; + } + } + if(!isset($feed['children'])) + $feed['children'] = []; + + // Now that the feed has been normalized so all the items are under "children", we + // can transform each entry into the XRay format, including finding the author, etc + foreach($feed['children'] as $item) { + $parsed = false; + if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) { + $parsed = self::parseAsHEntry($mf2, $item, false); + } + elseif(in_array('h-event', $item['type'])) { + $parsed = self::parseAsHEvent($mf2, $item, false); + } + elseif(in_array('h-review', $item['type'])) { + $parsed = self::parseAsHReview($mf2, $item, false); + } + elseif(in_array('h-recipe', $item['type'])) { + $parsed = self::parseAsHRecipe($mf2, $item, false); + } + elseif(in_array('h-product', $item['type'])) { + $parsed = self::parseAsHProduct($mf2, $item, false); + } + elseif(in_array('h-item', $item['type'])) { + $parsed = self::parseAsHItem($mf2, $item, false); + } + elseif(in_array('h-card', $item['type'])) { + $parsed = self::parseAsHCard($mf2, $item, false); + } + if($parsed) { + $data['items'][] = $parsed['data']; + } + } + + return [ + 'data' => $data + ]; + } + + private static function _findFirstOfType($mf2, $type) { + foreach($mf2['items'] as $item) { + if(in_array($type, $item['type'])) { + return $item; + } else { + if(isset($item['children'])) { + $items = $item['children']; + return self::_findFirstOfType(['items'=>$items], $type); + } + } + } + } + + private static function _findAllObjectsExcept($mf2, $types) { + $items = []; + foreach($mf2['items'] as $item) { + if(count(array_intersect($item['type'], $types)) == 0) { + $items[] = $item; + } + } + return $items; + } + +} diff --git a/tests/FeedTest.php b/tests/FeedTest.php index 148f18c..f00aaaf 100644 --- a/tests/FeedTest.php +++ b/tests/FeedTest.php @@ -27,6 +27,11 @@ class FeedTest extends PHPUnit_Framework_TestCase { $data = json_decode($body)->data; $this->assertEquals('feed', $data->type); + $this->assertEquals(4, count($data->items)); + $this->assertEquals('One', $data->items[0]->name); + $this->assertEquals('Two', $data->items[1]->name); + $this->assertEquals('Three', $data->items[2]->name); + $this->assertEquals('Four', $data->items[3]->name); } public function testListOfHEntrysWithHCard() { @@ -38,6 +43,17 @@ class FeedTest extends PHPUnit_Framework_TestCase { $data = json_decode($body)->data; $this->assertEquals('feed', $data->type); + $this->assertEquals(4, count($data->items)); + $this->assertEquals('One', $data->items[0]->name); + $this->assertEquals('Two', $data->items[1]->name); + $this->assertEquals('Three', $data->items[2]->name); + $this->assertEquals('Four', $data->items[3]->name); + + // Check that the author h-card was matched up with each h-entry + $this->assertEquals('Author Name', $data->items[0]->author->name); + $this->assertEquals('Author Name', $data->items[1]->author->name); + $this->assertEquals('Author Name', $data->items[2]->author->name); + $this->assertEquals('Author Name', $data->items[3]->author->name); } public function testShortListOfHEntrysWithHCard() { @@ -49,6 +65,10 @@ class FeedTest extends PHPUnit_Framework_TestCase { $data = json_decode($body)->data; $this->assertEquals('feed', $data->type); + // This test should find the h-entry rather than the h-card, because expect=feed + $this->assertEquals('entry', $data->items[0]->type); + $this->assertEquals('http://feed.example.com/1', $data->items[0]->url); + $this->assertEquals('Author', $data->items[0]->author->name); } public function testTopLevelHFeed() { @@ -60,6 +80,11 @@ class FeedTest extends PHPUnit_Framework_TestCase { $data = json_decode($body)->data; $this->assertEquals('feed', $data->type); + $this->assertEquals(4, count($data->items)); + $this->assertEquals('One', $data->items[0]->name); + $this->assertEquals('Two', $data->items[1]->name); + $this->assertEquals('Three', $data->items[2]->name); + $this->assertEquals('Four', $data->items[3]->name); } public function testHCardWithChildHEntrys() { @@ -71,6 +96,32 @@ class FeedTest extends PHPUnit_Framework_TestCase { $data = json_decode($body)->data; $this->assertEquals('feed', $data->type); + $this->assertEquals(4, count($data->items)); + $this->assertEquals('One', $data->items[0]->name); + $this->assertEquals('Two', $data->items[1]->name); + $this->assertEquals('Three', $data->items[2]->name); + $this->assertEquals('Four', $data->items[3]->name); + } + + public function testHCardWithSiblingHEntrys() { + $url = 'http://feed.example.com/h-card-with-sibling-h-entrys'; + $response = $this->parse(['url' => $url, 'expect' => 'feed']); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body)->data; + + $this->assertEquals('feed', $data->type); + $this->assertEquals(4, count($data->items)); + $this->assertEquals('One', $data->items[0]->name); + $this->assertEquals('Two', $data->items[1]->name); + $this->assertEquals('Three', $data->items[2]->name); + $this->assertEquals('Four', $data->items[3]->name); + // Check that the author h-card was matched up with each h-entry + $this->assertEquals('Author Name', $data->items[0]->author->name); + $this->assertEquals('Author Name', $data->items[1]->author->name); + $this->assertEquals('Author Name', $data->items[2]->author->name); + $this->assertEquals('Author Name', $data->items[3]->author->name); } public function testHCardWithChildHFeed() { @@ -82,6 +133,28 @@ class FeedTest extends PHPUnit_Framework_TestCase { $data = json_decode($body)->data; $this->assertEquals('feed', $data->type); + $this->assertEquals(4, count($data->items)); + $this->assertEquals('One', $data->items[0]->name); + $this->assertEquals('Two', $data->items[1]->name); + $this->assertEquals('Three', $data->items[2]->name); + $this->assertEquals('Four', $data->items[3]->name); + // Check that the author h-card was matched up with each h-entry + $this->assertEquals('Author Name', $data->items[0]->author->name); + $this->assertEquals('Author Name', $data->items[1]->author->name); + $this->assertEquals('Author Name', $data->items[2]->author->name); + $this->assertEquals('Author Name', $data->items[3]->author->name); + } + + public function testHCardWithChildHFeedNoExpect() { + $url = 'http://feed.example.com/h-card-with-child-h-feed'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body)->data; + + $this->assertEquals('card', $data->type); + $this->assertEquals('Author Name', $data->name); } public function testJSONFeed() { diff --git a/tests/data/feed.example.com/h-card-with-child-h-feed b/tests/data/feed.example.com/h-card-with-child-h-feed index 405443a..e499cc0 100644 --- a/tests/data/feed.example.com/h-card-with-child-h-feed +++ b/tests/data/feed.example.com/h-card-with-child-h-feed @@ -16,15 +16,19 @@ Connection: keep-alive diff --git a/tests/data/feed.example.com/h-card-with-sibling-h-entrys b/tests/data/feed.example.com/h-card-with-sibling-h-entrys new file mode 100644 index 0000000..c70e06e --- /dev/null +++ b/tests/data/feed.example.com/h-card-with-sibling-h-entrys @@ -0,0 +1,35 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + + + Author Name + + + + + diff --git a/tests/data/feed.example.com/list-of-hentrys-with-h-card b/tests/data/feed.example.com/list-of-hentrys-with-h-card index 996c72a..40bab56 100644 --- a/tests/data/feed.example.com/list-of-hentrys-with-h-card +++ b/tests/data/feed.example.com/list-of-hentrys-with-h-card @@ -13,15 +13,19 @@ Connection: keep-alive