From 18dc92966bb0d40b5463b09aa7cba3a168cb5c3b Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 9 Nov 2018 09:46:43 -0800 Subject: [PATCH] recognize pattern of h-entry + h-card * a single h-entry and h-card, where the h-entry has no URL, will result in a permalink page with that h-entry * multiple h-entrys followed by an h-card is a feed --- lib/XRay/Formats/Mf2.php | 32 ++++++++++---- tests/FeedTest.php | 42 +++++++++++++++++++ .../source.example.com/rel-alternate-mf2-json | 2 +- .../rel-alternate-mf2-json.json | 2 +- 4 files changed, 69 insertions(+), 9 deletions(-) diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index 7023f1b..4333a70 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -22,8 +22,26 @@ class Mf2 extends Format { return self::parseAsHFeed($mf2, $http, $url); } - // If there is only one item on the page, just use that - if(count($mf2['items']) == 1) { + // Remove h-breadcrumb since we never use it and it causes problems determining + // whether a page is a feed or permalink + $mf2['items'] = array_values(array_filter($mf2['items'], function($item){ + return !in_array('h-breadcrumb', $item['type']); + })); + + $items = $mf2['items']; + + // If there is more than one item on the page, it may be a feed. + // Remove an h-card if there is one that doesn't match the page URL, then try again. 
+ // (Don't modify the actual tree, but compare on the modified tree) + if(count($items) > 1) { + $tmpmf2 = array_filter($items, function($item) use($url){ + return !(in_array('h-card', $item['type']) && isset($item['properties']['url'][0]) && $item['properties']['url'][0] != $url); + }); + $items = array_values($tmpmf2); + } + + // If there is only one item left on the page, it's a permalink, and just use that + if(count($items) == 1) { - $item = $mf2['items'][0]; + $item = $items[0]; if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) { #Parse::debug("mf2:0: Recognized $url as an h-entry it is the only item on the page"); @@ -130,18 +148,18 @@ class Mf2 extends Format { } } - // If there was more than one h-entry on the page, treat the whole page as a feed - if(count($mf2['items']) > 1) { - if(count(array_filter($mf2['items'], function($item){ + // At this point, if there are any h-entrys left on the page, it's probably a feed. + if(count($items) > 0) { + if(count(array_filter($items, function($item){ return in_array('h-entry', $item['type']); - })) > 1) { + })) > 0) { #Parse::debug("mf2:2: Recognized $url as an h-feed because there are more than one object on the page"); return self::parseAsHFeed($mf2, $http, $url); } } // If the first item is an h-feed, parse as a feed - $first = $mf2['items'][0]; + $first = $items[0]; if(in_array('h-feed', $first['type'])) { #Parse::debug("mf2:3: Recognized $url as an h-feed because the first item is an h-feed"); return self::parseAsHFeed($mf2, $http, $url); diff --git a/tests/FeedTest.php b/tests/FeedTest.php index b206857..4bb6246 100644 --- a/tests/FeedTest.php +++ b/tests/FeedTest.php @@ -61,6 +61,48 @@ class FeedTest extends PHPUnit_Framework_TestCase { $this->assertEquals('Author Name', $data->items[3]->author->name); } + public function testListOfHEntrysWithHCardNoExpect() { + $url = 'http://feed.example.com/list-of-hentrys-with-h-card'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); 
+ $this->assertEquals(200, $response->getStatusCode()); + $result = json_decode($body); + $this->assertEquals('mf2+html', $result->{'source-format'}); + $data = $result->data; + + $this->assertEquals('feed', $data->type); + $this->assertEquals(4, count($data->items)); + $this->assertEquals('One', $data->items[0]->name); + $this->assertEquals('article', $data->items[0]->{'post-type'}); + $this->assertEquals('Two', $data->items[1]->name); + $this->assertEquals('Three', $data->items[2]->name); + $this->assertEquals('Four', $data->items[3]->name); + + // Check that the author h-card was matched up with each h-entry + $this->assertEquals('Author Name', $data->items[0]->author->name); + $this->assertEquals('Author Name', $data->items[1]->author->name); + $this->assertEquals('Author Name', $data->items[2]->author->name); + $this->assertEquals('Author Name', $data->items[3]->author->name); + } + + public function testShortListOfHEntrysWithHCardNoExpect() { + $url = 'http://feed.example.com/short-list-of-hentrys-with-h-card'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $result = json_decode($body); + $this->assertEquals('mf2+html', $result->{'source-format'}); + $data = $result->data; + + // In this case, this looks like a page permalink + $this->assertEquals('entry', $data->type); + // This test should find the h-entry rather than the h-card, because the h-card does not contain the page URL + $this->assertEquals('http://feed.example.com/1', $data->url); + $this->assertEquals('Author', $data->author->name); + } + public function testShortListOfHEntrysWithHCard() { $url = 'http://feed.example.com/short-list-of-hentrys-with-h-card'; $response = $this->parse(['url' => $url, 'expect' => 'feed']); diff --git a/tests/data/source.example.com/rel-alternate-mf2-json b/tests/data/source.example.com/rel-alternate-mf2-json index 48085f6..2b96d4c 100644 --- 
a/tests/data/source.example.com/rel-alternate-mf2-json +++ b/tests/data/source.example.com/rel-alternate-mf2-json @@ -256,7 +256,7 @@ Connection: keep-alive #indieauth
- + diff --git a/tests/data/source.example.com/rel-alternate-mf2-json.json b/tests/data/source.example.com/rel-alternate-mf2-json.json index 2e4fd65..f7bdcb7 100644 --- a/tests/data/source.example.com/rel-alternate-mf2-json.json +++ b/tests/data/source.example.com/rel-alternate-mf2-json.json @@ -18,7 +18,7 @@ Connection: keep-alive "indieauth" ], "url": [ - "https://aaronparecki.com/2018/07/12/10/indieauth" + "http://source.example.com/rel-alternate-mf2-json" ], "syndication": [ "https://twitter.com/aaronpk/status/1017500609631567872"