From 5d8fb4e13c17c8764893d2e64d12e65c54d5ad57 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Wed, 25 Jan 2017 13:42:00 -0800 Subject: [PATCH] support h-review and h-product vocab * closes #23 * major refactor of the methods for extracting properties to consolidate the logic * hReview parsing is incomplete due to issues with the php-mf2 backcompat parsing. see https://github.com/indieweb/php-mf2/issues/107 --- lib/Formats/Mf2.php | 330 ++++++++++-------- tests/FeedTest.php | 2 +- tests/ParseTest.php | 68 +++- .../source.example.com/h-review-of-h-card | 27 ++ .../source.example.com/h-review-of-product | 32 ++ tests/data/source.example.com/hReview | 34 ++ 6 files changed, 353 insertions(+), 140 deletions(-) create mode 100644 tests/data/source.example.com/h-review-of-h-card create mode 100644 tests/data/source.example.com/h-review-of-product create mode 100644 tests/data/source.example.com/hReview diff --git a/lib/Formats/Mf2.php b/lib/Formats/Mf2.php index accdf37..4d74881 100644 --- a/lib/Formats/Mf2.php +++ b/lib/Formats/Mf2.php @@ -9,6 +9,7 @@ class Mf2 { public static function parse($mf2, $url, $http) { if(count($mf2['items']) == 0) return false; + // If there is only one item on the page, just use that if(count($mf2['items']) == 1) { $item = $mf2['items'][0]; @@ -20,79 +21,106 @@ class Mf2 { Parse::debug("mf2:0: Recognized $url as an h-event it is the only item on the page"); return self::parseAsHEvent($mf2, $item, $http); } - } - - // Check if the list of items is a bunch of h-entrys and return as a feed - // Unless this page's URL matches one of the entries, then treat it as a permalink - $hentrys = 0; - $lastSeenEntry = false; - foreach($mf2['items'] as $item) { - if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) { - if(array_key_exists('url', $item['properties'])) { - $urls = $item['properties']['url']; - $urls = array_map('self::normalize_url', $urls); - if(in_array($url, $urls)) { - Parse::debug("mf2:1: Recognized $url as an h-entry because an h-entry on the page matched the URL of the request"); - return self::parseAsHEntry($mf2, $item, $http); - } - $lastSeenEntry = $item; - } - $hentrys++; + if(in_array('h-review', $item['type'])) { + Parse::debug("mf2:0: Recognized $url as an h-review it is the only item on the page"); + return self::parseAsHReview($mf2, $item, $http); + } + if(in_array('h-product', $item['type'])) { + Parse::debug("mf2:0: Recognized $url as an h-product it is the only item on the page"); + return self::parseAsHProduct($mf2, $item, $http); + } + if(in_array('h-feed', $item['type'])) { + Parse::debug("mf2:0: Recognized $url as an h-feed because it is the only item on the page"); + return self::parseAsHFeed($mf2, $http); } } - // If there was more than one h-entry on the page, treat the whole page as a feed - if($hentrys > 1) { - Parse::debug("mf2:2: Recognized $url as an h-feed because there are more than one h-entry on the page"); - return self::parseAsHFeed($mf2, $http); - } - - // If the first item is an h-feed, parse as a feed - $first = $mf2['items'][0]; - if(in_array('h-feed', $first['type'])) { - Parse::debug("mf2:3: Recognized $url as an h-feed because the first item is an h-feed"); - return self::parseAsHFeed($mf2, $http); - } - - // Check each top-level h-card and h-event, and if there is one that matches this URL, the page is an h-card + // Check the list of items on the page to see if one matches the URL of the page, + // and treat as a permalink for that object if so. Otherwise, parse as a feed. foreach($mf2['items'] as $item) { - if((in_array('h-card', $item['type']) or in_array('h-event', $item['type'])) - and array_key_exists('url', $item['properties']) - ) { + if(array_key_exists('url', $item['properties'])) { $urls = $item['properties']['url']; $urls = array_map('self::normalize_url', $urls); if(in_array($url, $urls)) { - // TODO: check for children h-entrys (like tantek.com), or sibling h-entries (like aaronparecki.com) - // and return the result as a feed instead + Parse::debug("mf2:1: Recognized $url as a permalink because an object on the page matched the URL of the request"); if(in_array('h-card', $item['type'])) { - Parse::debug("mf2:4: Recognized $url as an h-card because an h-card on the page matched the URL of the request"); return self::parseAsHCard($item, $http, $url); - } else { - Parse::debug("mf2:4: Recognized $url as an h-event because an h-event on the page matched the URL of the request"); + } elseif(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) { + return self::parseAsHEntry($mf2, $item, $http); + } elseif(in_array('h-event', $item['type'])) { return self::parseAsHEvent($mf2, $item, $http); + } elseif(in_array('h-review', $item['type'])) { + return self::parseAsHReview($mf2, $item, $http); + } elseif(in_array('h-product', $item['type'])) { + return self::parseAsHProduct($mf2, $item, $http); + } else { + Parse::debug('This object was not a recognized type.'); + return false; } } } } - // If there was only one h-entry, but the URL for it is not the same as this page, then treat as a feed - if($hentrys == 1) { - if($lastSeenEntry) { - $urls = $lastSeenEntry['properties']['url']; - $urls = array_map('self::normalize_url', $urls); - if(count($urls) && !in_array($url, $urls)) { - Parse::debug("mf2:5: Recognized $url as an h-feed no h-entrys on the page matched the URL of the request"); - return self::parseAsHFeed($mf2, $http); + // Check for an h-card matching rel=author or the author URL of any h-* on the page, + // and return the h-* object if so + if(isset($mf2['rels']['author'])) { + foreach($mf2['items'] as $card) { + if(in_array('h-card', $card['type']) && array_key_exists('url', $card['properties'])) { + $urls = $card['properties']['url']; + $urls = array_map('self::normalize_url', $urls); + if(count(array_intersect($urls, $mf2['rels']['author'])) > 0) { + // There is an author h-card on this page + // Now look for the first h-* object other than an h-card and use that as the object + foreach($mf2['items'] as $item) { + if(!in_array('h-card', $item['type'])) { + if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) { + return self::parseAsHEntry($mf2, $item, $http); + } elseif(in_array('h-event', $item['type'])) { + return self::parseAsHEvent($mf2, $item, $http); + } elseif(in_array('h-review', $item['type'])) { + return self::parseAsHReview($mf2, $item, $http); + } elseif(in_array('h-product', $item['type'])) { + return self::parseAsHProduct($mf2, $item, $http); + } + } + } + } } } } + // If there was more than one h-entry on the page, treat the whole page as a feed + if(count($mf2['items']) > 1) { + if(count(array_filter($mf2['items'], function($item){ + return in_array('h-entry', $item['type']); + })) > 1) { + Parse::debug("mf2:2: Recognized $url as an h-feed because there are more than one object on the page"); + return self::parseAsHFeed($mf2, $http); + } + } + + // If the first item is an h-feed, parse as a feed + $first = $mf2['items'][0]; + if(in_array('h-feed', $first['type'])) { + Parse::debug("mf2:3: Recognized $url as an h-feed because the first item is an h-feed"); + return self::parseAsHFeed($mf2, $http); + } + // Fallback case, but hopefully we have found something before this point foreach($mf2['items'] as $item) { - // Otherwise check for an h-entry + // Otherwise check for a recognized h-entr* object if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) { Parse::debug("mf2:6: $url is falling back to the first h-entry on the page"); return self::parseAsHEntry($mf2, $item, $http); + } elseif(in_array('h-event', $item['type'])) { + Parse::debug("mf2:6: $url is falling back to the first h-event on the page"); + return self::parseAsHEvent($mf2, $item, $http); + } elseif(in_array('h-review', $item['type'])) { + Parse::debug("mf2:6: $url is falling back to the first h-review on the page"); + return self::parseAsHReview($mf2, $item, $http); + } elseif(in_array('h-product', $item['type'])) { + Parse::debug("mf2:6: $url is falling back to the first h-product on the page"); + return self::parseAsHProduct($mf2, $item, $http); } } @@ -101,70 +129,70 @@ class Mf2 { return false; } - private static function parseAsHEntry($mf2, $item, $http) { - $data = [ - 'type' => 'entry' - ]; - $refs = []; - - // Single plaintext values - $properties = ['url','published','summary','rsvp']; + private static function collectSingleValues($properties, $urlProperties, $item, &$data) { foreach($properties as $p) { if(($v = self::getPlaintext($item, $p)) !== null) { - if($p == 'url') { - if(self::isURL($v)) - $data[$p] = $v; - } else { + $data[$p] = $v; + } + } + foreach($urlProperties as $p) { + if(($v = self::getPlaintext($item, $p)) !== null) { + if(self::isURL($v)) $data[$p] = $v; - } } } + } - // Always arrays - $properties = ['photo','video','audio','syndication']; + // Always return arrays, and may contain plaintext content + // Nested objects are added to refs and the URL is used as the value if present + private static function collectArrayValues($properties, $item, &$data, &$refs, &$http) { foreach($properties as $p) { if(array_key_exists($p, $item['properties'])) { foreach($item['properties'][$p] as $v) { - if(is_string($v) && self::isURL($v)) { + if(is_string($v)) { if(!array_key_exists($p, $data)) $data[$p] = []; $data[$p][] = $v; - } - elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) { - if(!array_key_exists($p, $data)) $data[$p] = []; - $data[$p][] = $v['value']; - } - } - } - } - - // Always returned as arrays, and may also create external references - // If these are not objects, they must be URLs - $set = [ - 'normal' => ['category','invitee'], - 'url' => ['in-reply-to','like-of','repost-of','bookmark-of'] - ]; - foreach($set as $type=>$properties) { - foreach($properties as $p) { - if(array_key_exists($p, $item['properties'])) { - foreach($item['properties'][$p] as $v) { - if(is_string($v) && ($type == 'normal' || self::isURL($v))) { - if(!array_key_exists($p, $data)) $data[$p] = []; - $data[$p][] = $v; - } - elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) { + } elseif(self::isMicroformat($v)) { + if(($u=self::getPlaintext($v, 'url')) && self::isURL($u)) { if(!array_key_exists($p, $data)) $data[$p] = []; $data[$p][] = $u; - // parse the object and put the result in the "refs" object $ref = self::parse(['items'=>[$v]], $u, $http); if($ref) { $refs[$u] = $ref['data']; } + } else { + if(!array_key_exists($p, $data)) $data[$p] = []; + $data[$p][] = $v['value']; } } - } + } } } + } + + private static function collectArrayURLValues($properties, $item, &$data, &$refs, &$http) { + foreach($properties as $p) { + if(array_key_exists($p, $item['properties'])) { + foreach($item['properties'][$p] as $v) { + if(is_string($v) && self::isURL($v)) { + if(!array_key_exists($p, $data)) $data[$p] = []; + $data[$p][] = $v; + } + elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) { + if(!array_key_exists($p, $data)) $data[$p] = []; + $data[$p][] = $u; + // parse the object and put the result in the "refs" object + $ref = self::parse(['items'=>[$v]], $u, $http); + if($ref) { + $refs[$u] = $ref['data']; + } + } + } + } + } + } + private static function determineNameAndContent($item, &$data) { // Determine if the name is distinct from the content $name = self::getPlaintext($item, 'name'); $content = null; @@ -210,8 +238,56 @@ class Mf2 { $data['content']['html'] = $htmlContent; } // TODO: If no HTML content was included in the post, create HTML by autolinking? + } + } + + private static function parseAsHEntry($mf2, $item, $http) { + $data = [ + 'type' => 'entry' + ]; + $refs = []; + + // Single plaintext and URL values + self::collectSingleValues(['published','summary','rsvp'], ['url'], $item, $data); + + // These properties are always returned as arrays and may contain plaintext content + self::collectArrayValues(['category','invitee'], $item, $data, $refs, $http); + + // These properties are always returned as arrays and always URLs + // If the value is an h-* object with a URL, the URL is used and a "ref" is added as well + self::collectArrayURLValues(['photo','video','audio','syndication','in-reply-to','like-of','repost-of','bookmark-of'], $item, $data, $refs, $http); + + self::determineNameAndContent($item, $data); + + if($author = self::findAuthor($mf2, $item, $http)) + $data['author'] = $author; + + $response = [ + 'data' => $data + ]; + + if(count($refs)) { + $response['refs'] = $refs; } + return $response; + } + + private static function parseAsHReview($mf2, $item, $http) { + $data = [ + 'type' => 'review' + ]; + $refs = []; + + // TODO: add description as an HTML value + self::collectSingleValues(['summary','published','rating','best','worst'], ['url'], $item, $data); + + self::collectArrayValues(['category'], $item, $data, $refs, $http); + + self::collectArrayURLValues(['item'], $item, $data, $refs, $http); + + self::determineNameAndContent($item, $data); + if($author = self::findAuthor($mf2, $item, $http)) $data['author'] = $author; @@ -226,61 +302,39 @@ class Mf2 { return $response; } + private static function parseAsHProduct($mf2, $item, $http) { + $data = [ + 'type' => 'product' + ]; + + self::collectSingleValues(['name','identifier','price'], ['url'], $item, $data); + + self::collectArrayValues(['category','brand'], $item, $data, $refs, $http); + + self::collectArrayURLValues(['photo','video','audio'], $item, $data, $refs, $http); + + $response = [ + 'data' => $data + ]; + + return $response; + } + private static function parseAsHEvent($mf2, $item, $http) { $data = [ 'type' => 'event' ]; $refs = []; - // Single plaintext values - $properties = ['name','summary','url','published','start','end','duration']; - foreach($properties as $p) { - if(($v = self::getPlaintext($item, $p)) !== null) { - if($p == 'url') { - if(self::isURL($v)) - $data[$p] = $v; - } else { - $data[$p] = $v; - } - } - } + // Single plaintext and URL values + self::collectSingleValues(['name','summary','published','start','end','duration'], ['url'], $item, $data); - // Always arrays - $properties = ['photo','video','audio','syndication']; - foreach($properties as $p) { - if(array_key_exists($p, $item['properties'])) { - foreach($item['properties'][$p] as $v) { - if(is_string($v) && self::isURL($v)) { - if(!array_key_exists($p, $data)) $data[$p] = []; - $data[$p][] = $v; - } - elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) { - if(!array_key_exists($p, $data)) $data[$p] = []; - $data[$p][] = $v['value']; - } - } - } - } + // These properties are always returned as arrays and may contain plaintext content + self::collectArrayValues(['category','location','attendee'], $item, $data, $refs, $http); - // Always returned as arrays, and may also create external references - $properties = ['category','location','attendee']; - foreach($properties as $p) { - if(array_key_exists($p, $item['properties'])) { - $data[$p] = []; - foreach($item['properties'][$p] as $v) { - if(is_string($v)) - $data[$p][] = $v; - elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) { - $data[$p][] = $u; - // parse the object and put the result in the "refs" object - $ref = self::parse(['items'=>[$v]], $u, $http); - if($ref) { - $refs[$u] = $ref['data']; - } - } - } - } - } + // These properties are always returned as arrays and always URLs + // If the value is an h-* object with a URL, the URL is used and a "ref" is added as well + self::collectArrayURLValues(['photo','video','audio','syndication'], $item, $data, $refs, $http); // If there is a description, always return the plaintext description, and return HTML description if it's different $textDescription = null; diff --git a/tests/FeedTest.php b/tests/FeedTest.php index a7e5932..7acefc1 100644 --- a/tests/FeedTest.php +++ b/tests/FeedTest.php @@ -48,7 +48,7 @@ class FeedTest extends PHPUnit_Framework_TestCase { $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body); - $this->assertEquals('feed', $data->data->type); + $this->assertEquals('entry', $data->data->type); } public function testTopLevelHFeed() { diff --git a/tests/ParseTest.php b/tests/ParseTest.php index de96563..22acbd3 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -222,7 +222,6 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body, true); $this->assertEquals('entry', $data['data']['type']); - print_r($data['data']); $this->assertEquals('http://syndicated.example/', $data['data']['syndication'][0]); } @@ -357,6 +356,73 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertEquals('Venue', $data['refs']['http://source.example.com/venue']['name']); } + public function testMf2ReviewOfProduct() { + $url = 'http://source.example.com/h-review-of-product'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('review', $data['data']['type']); + $this->assertEquals('Review', $data['data']['name']); + $this->assertEquals('Not great', $data['data']['summary']); + $this->assertEquals('3', $data['data']['rating']); + $this->assertEquals('5', $data['data']['best']); + $this->assertEquals('This is the full text of the review', $data['data']['content']['text']); + $this->assertContains('red', $data['data']['category']); + $this->assertContains('blue', $data['data']['category']); + $this->assertContains('http://product.example.com/', $data['data']['item']); + $this->assertArrayHasKey('http://product.example.com/', $data['refs']); + $this->assertEquals('product', $data['refs']['http://product.example.com/']['type']); + $this->assertEquals('The Reviewed Product', $data['refs']['http://product.example.com/']['name']); + $this->assertEquals('http://product.example.com/', $data['refs']['http://product.example.com/']['url']); + } + + public function testMf2ReviewOfHCard() { + $url = 'http://source.example.com/h-review-of-h-card'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('review', $data['data']['type']); + $this->assertEquals('Review', $data['data']['name']); + $this->assertEquals('Not great', $data['data']['summary']); + $this->assertEquals('3', $data['data']['rating']); + $this->assertEquals('5', $data['data']['best']); + $this->assertEquals('This is the full text of the review', $data['data']['content']['text']); + $this->assertContains('http://business.example.com/', $data['data']['item']); + $this->assertArrayHasKey('http://business.example.com/', $data['refs']); + $this->assertEquals('card', $data['refs']['http://business.example.com/']['type']); + $this->assertEquals('The Reviewed Business', $data['refs']['http://business.example.com/']['name']); + $this->assertEquals('http://business.example.com/', $data['refs']['http://business.example.com/']['url']); + } + + public function testMf1Review() { + $url = 'http://source.example.com/hReview'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('review', $data['data']['type']); + $this->assertEquals('Review', $data['data']['name']); + # TODO: backcompat of mf1 parser is kind of messed up right now + #$this->assertEquals('Not great', $data['data']['summary']); + $this->assertEquals('3', $data['data']['rating']); + $this->assertEquals('5', $data['data']['best']); + #$this->assertEquals('This is the full text of the review', $data['data']['content']['text']); + // $this->assertContains('http://product.example.com/', $data['data']['item']); + // $this->assertArrayHasKey('http://product.example.com/', $data['refs']); + // $this->assertEquals('product', $data['refs']['http://product.example.com/']['type']); + // $this->assertEquals('The Reviewed Product', $data['refs']['http://product.example.com/']['name']); + // $this->assertEquals('http://product.example.com/', $data['refs']['http://product.example.com/']['url']); + + } + public function testEntryIsAnInvitee() { $url = 'http://source.example.com/bridgy-invitee'; $response = $this->parse(['url' => $url]); diff --git a/tests/data/source.example.com/h-review-of-h-card b/tests/data/source.example.com/h-review-of-h-card new file mode 100644 index 0000000..0b5558d --- /dev/null +++ b/tests/data/source.example.com/h-review-of-h-card @@ -0,0 +1,27 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Review + + +

Review

+ + permalink + +

The Reviewed Business

+ + 3 out of 5 + +
Not great
+ +
+ This is the full text of the review +
+ + + diff --git a/tests/data/source.example.com/h-review-of-product b/tests/data/source.example.com/h-review-of-product new file mode 100644 index 0000000..6fff5b0 --- /dev/null +++ b/tests/data/source.example.com/h-review-of-product @@ -0,0 +1,32 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Review + + +

Review

+ + permalink + +

The Reviewed Product

+ + 3 out of 5 + +
Not great
+ +
+ This is the full text of the review +
+ + + + + diff --git a/tests/data/source.example.com/hReview b/tests/data/source.example.com/hReview new file mode 100644 index 0000000..a3d6c33 --- /dev/null +++ b/tests/data/source.example.com/hReview @@ -0,0 +1,34 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Review + + +

Review

+ + + +
+

The Reviewed Product

+
+ + 3 out of 5 + + Aaron Parecki + + 2016-12-15T22:32:42+01:00 + +
Not great
+ +
+ This is the full text of the review +
+ + + +