Browse Source

support h-review and h-product vocab

* closes #23
* major refactor of the methods for extracting properties to consolidate the logic
* hReview parsing is incomplete due to issues with the php-mf2 backcompat parsing. See https://github.com/indieweb/php-mf2/issues/107
pull/39/head
Aaron Parecki 4 years ago
parent
commit
5d8fb4e13c
No known key found for this signature in database GPG Key ID: 276C2817346D6056
6 changed files with 353 additions and 140 deletions
  1. +192
    -138
      lib/Formats/Mf2.php
  2. +1
    -1
      tests/FeedTest.php
  3. +67
    -1
      tests/ParseTest.php
  4. +27
    -0
      tests/data/source.example.com/h-review-of-h-card
  5. +32
    -0
      tests/data/source.example.com/h-review-of-product
  6. +34
    -0
      tests/data/source.example.com/hReview

+ 192
- 138
lib/Formats/Mf2.php View File

@ -9,6 +9,7 @@ class Mf2 {
public static function parse($mf2, $url, $http) {
if(count($mf2['items']) == 0)
return false;
// If there is only one item on the page, just use that
if(count($mf2['items']) == 1) {
$item = $mf2['items'][0];
@ -20,79 +21,106 @@ class Mf2 {
Parse::debug("mf2:0: Recognized $url as an h-event it is the only item on the page");
return self::parseAsHEvent($mf2, $item, $http);
}
}
// Check if the list of items is a bunch of h-entrys and return as a feed
// Unless this page's URL matches one of the entries, then treat it as a permalink
$hentrys = 0;
$lastSeenEntry = false;
foreach($mf2['items'] as $item) {
if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
if(array_key_exists('url', $item['properties'])) {
$urls = $item['properties']['url'];
$urls = array_map('self::normalize_url', $urls);
if(in_array($url, $urls)) {
Parse::debug("mf2:1: Recognized $url as an h-entry because an h-entry on the page matched the URL of the request");
return self::parseAsHEntry($mf2, $item, $http);
}
$lastSeenEntry = $item;
}
$hentrys++;
if(in_array('h-review', $item['type'])) {
Parse::debug("mf2:0: Recognized $url as an h-review it is the only item on the page");
return self::parseAsHReview($mf2, $item, $http);
}
if(in_array('h-product', $item['type'])) {
Parse::debug("mf2:0: Recognized $url as an h-product it is the only item on the page");
return self::parseAsHProduct($mf2, $item, $http);
}
if(in_array('h-feed', $item['type'])) {
Parse::debug("mf2:0: Recognized $url as an h-feed because it is the only item on the page");
return self::parseAsHFeed($mf2, $http);
}
}
// If there was more than one h-entry on the page, treat the whole page as a feed
if($hentrys > 1) {
Parse::debug("mf2:2: Recognized $url as an h-feed because there are more than one h-entry on the page");
return self::parseAsHFeed($mf2, $http);
}
// If the first item is an h-feed, parse as a feed
$first = $mf2['items'][0];
if(in_array('h-feed', $first['type'])) {
Parse::debug("mf2:3: Recognized $url as an h-feed because the first item is an h-feed");
return self::parseAsHFeed($mf2, $http);
}
// Check each top-level h-card and h-event, and if there is one that matches this URL, the page is an h-card
// Check the list of items on the page to see if one matches the URL of the page,
// and treat as a permalink for that object if so. Otherwise, parse as a feed.
foreach($mf2['items'] as $item) {
if((in_array('h-card', $item['type']) or in_array('h-event', $item['type']))
and array_key_exists('url', $item['properties'])
) {
if(array_key_exists('url', $item['properties'])) {
$urls = $item['properties']['url'];
$urls = array_map('self::normalize_url', $urls);
if(in_array($url, $urls)) {
// TODO: check for children h-entrys (like tantek.com), or sibling h-entries (like aaronparecki.com)
// and return the result as a feed instead
Parse::debug("mf2:1: Recognized $url as a permalink because an object on the page matched the URL of the request");
if(in_array('h-card', $item['type'])) {
Parse::debug("mf2:4: Recognized $url as an h-card because an h-card on the page matched the URL of the request");
return self::parseAsHCard($item, $http, $url);
} else {
Parse::debug("mf2:4: Recognized $url as an h-event because an h-event on the page matched the URL of the request");
} elseif(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
return self::parseAsHEntry($mf2, $item, $http);
} elseif(in_array('h-event', $item['type'])) {
return self::parseAsHEvent($mf2, $item, $http);
} elseif(in_array('h-review', $item['type'])) {
return self::parseAsHReview($mf2, $item, $http);
} elseif(in_array('h-product', $item['type'])) {
return self::parseAsHProduct($mf2, $item, $http);
} else {
Parse::debug('This object was not a recognized type.');
return false;
}
}
}
}
// If there was only one h-entry, but the URL for it is not the same as this page, then treat as a feed
if($hentrys == 1) {
if($lastSeenEntry) {
$urls = $lastSeenEntry['properties']['url'];
$urls = array_map('self::normalize_url', $urls);
if(count($urls) && !in_array($url, $urls)) {
Parse::debug("mf2:5: Recognized $url as an h-feed no h-entrys on the page matched the URL of the request");
return self::parseAsHFeed($mf2, $http);
// Check for an h-card matching rel=author or the author URL of any h-* on the page,
// and return the h-* object if so
if(isset($mf2['rels']['author'])) {
foreach($mf2['items'] as $card) {
if(in_array('h-card', $card['type']) && array_key_exists('url', $card['properties'])) {
$urls = $card['properties']['url'];
$urls = array_map('self::normalize_url', $urls);
if(count(array_intersect($urls, $mf2['rels']['author'])) > 0) {
// There is an author h-card on this page
// Now look for the first h-* object other than an h-card and use that as the object
foreach($mf2['items'] as $item) {
if(!in_array('h-card', $item['type'])) {
if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
return self::parseAsHEntry($mf2, $item, $http);
} elseif(in_array('h-event', $item['type'])) {
return self::parseAsHEvent($mf2, $item, $http);
} elseif(in_array('h-review', $item['type'])) {
return self::parseAsHReview($mf2, $item, $http);
} elseif(in_array('h-product', $item['type'])) {
return self::parseAsHProduct($mf2, $item, $http);
}
}
}
}
}
}
}
// If there was more than one h-entry on the page, treat the whole page as a feed
if(count($mf2['items']) > 1) {
if(count(array_filter($mf2['items'], function($item){
return in_array('h-entry', $item['type']);
})) > 1) {
Parse::debug("mf2:2: Recognized $url as an h-feed because there are more than one object on the page");
return self::parseAsHFeed($mf2, $http);
}
}
// If the first item is an h-feed, parse as a feed
$first = $mf2['items'][0];
if(in_array('h-feed', $first['type'])) {
Parse::debug("mf2:3: Recognized $url as an h-feed because the first item is an h-feed");
return self::parseAsHFeed($mf2, $http);
}
// Fallback case, but hopefully we have found something before this point
foreach($mf2['items'] as $item) {
// Otherwise check for an h-entry
// Otherwise check for a recognized h-entr* object
if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
Parse::debug("mf2:6: $url is falling back to the first h-entry on the page");
return self::parseAsHEntry($mf2, $item, $http);
} elseif(in_array('h-event', $item['type'])) {
Parse::debug("mf2:6: $url is falling back to the first h-event on the page");
return self::parseAsHEvent($mf2, $item, $http);
} elseif(in_array('h-review', $item['type'])) {
Parse::debug("mf2:6: $url is falling back to the first h-review on the page");
return self::parseAsHReview($mf2, $item, $http);
} elseif(in_array('h-product', $item['type'])) {
Parse::debug("mf2:6: $url is falling back to the first h-product on the page");
return self::parseAsHProduct($mf2, $item, $http);
}
}
@ -101,70 +129,70 @@ class Mf2 {
return false;
}
private static function parseAsHEntry($mf2, $item, $http) {
$data = [
'type' => 'entry'
];
$refs = [];
// Single plaintext values
$properties = ['url','published','summary','rsvp'];
private static function collectSingleValues($properties, $urlProperties, $item, &$data) {
foreach($properties as $p) {
if(($v = self::getPlaintext($item, $p)) !== null) {
if($p == 'url') {
if(self::isURL($v))
$data[$p] = $v;
} else {
$data[$p] = $v;
}
}
foreach($urlProperties as $p) {
if(($v = self::getPlaintext($item, $p)) !== null) {
if(self::isURL($v))
$data[$p] = $v;
}
}
}
}
// Always arrays
$properties = ['photo','video','audio','syndication'];
// Always return arrays, and may contain plaintext content
// Nested objects are added to refs and the URL is used as the value if present
private static function collectArrayValues($properties, $item, &$data, &$refs, &$http) {
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
foreach($item['properties'][$p] as $v) {
if(is_string($v) && self::isURL($v)) {
if(is_string($v)) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v;
}
elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v['value'];
}
}
}
}
// Always returned as arrays, and may also create external references
// If these are not objects, they must be URLs
$set = [
'normal' => ['category','invitee'],
'url' => ['in-reply-to','like-of','repost-of','bookmark-of']
];
foreach($set as $type=>$properties) {
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
foreach($item['properties'][$p] as $v) {
if(is_string($v) && ($type == 'normal' || self::isURL($v))) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v;
}
elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
} elseif(self::isMicroformat($v)) {
if(($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $u;
// parse the object and put the result in the "refs" object
$ref = self::parse(['items'=>[$v]], $u, $http);
if($ref) {
$refs[$u] = $ref['data'];
}
} else {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v['value'];
}
}
}
}
}
}
}
// Gathers the listed properties of $item into $data as arrays of URLs.
//
// A plain string value is kept only when it passes self::isURL(). A nested
// microformat is kept only when its "url" property is a URL; in that case the
// URL becomes the value and the nested object is parsed, with the parsed data
// stored in $refs under that URL. Anything else is ignored.
//
// $properties - list of property names to look for
// $item       - the mf2 item whose 'properties' are read
// $data       - (by reference) receives $data[$property][] entries
// $refs       - (by reference) receives parsed nested objects keyed by URL
// $http       - passed through unchanged to self::parse()
private static function collectArrayURLValues($properties, $item, &$data, &$refs, &$http) {
  foreach($properties as $property) {
    if(!array_key_exists($property, $item['properties']))
      continue;

    foreach($item['properties'][$property] as $value) {
      if(is_string($value) && self::isURL($value)) {
        $resolved = $value;
        $isNested = false;
      } elseif(self::isMicroformat($value) && ($resolved = self::getPlaintext($value, 'url')) && self::isURL($resolved)) {
        $isNested = true;
      } else {
        // Neither a URL string nor a microformat with a usable URL
        continue;
      }

      if(!array_key_exists($property, $data))
        $data[$property] = [];
      $data[$property][] = $resolved;

      if($isNested) {
        // parse the object and put the result in the "refs" object
        $parsed = self::parse(['items'=>[$value]], $resolved, $http);
        if($parsed) {
          $refs[$resolved] = $parsed['data'];
        }
      }
    }
  }
}
private static function determineNameAndContent($item, &$data) {
// Determine if the name is distinct from the content
$name = self::getPlaintext($item, 'name');
$content = null;
@ -210,8 +238,56 @@ class Mf2 {
$data['content']['html'] = $htmlContent;
}
// TODO: If no HTML content was included in the post, create HTML by autolinking?
}
}
// Builds the response for an h-entry item: a 'data' array of type "entry"
// and, when any nested objects were collected, a 'refs' array alongside it.
//
// $mf2  - the full parsed document, handed to self::findAuthor()
// $item - the h-entry item being converted
// $http - passed through to the collectors and self::findAuthor()
private static function parseAsHEntry($mf2, $item, $http) {
  $refs = [];
  $data = ['type' => 'entry'];

  // Single plaintext and URL values
  $plaintextProps = ['published','summary','rsvp'];
  $singleUrlProps = ['url'];
  self::collectSingleValues($plaintextProps, $singleUrlProps, $item, $data);

  // These properties are always returned as arrays and may contain plaintext content
  $arrayProps = ['category','invitee'];
  self::collectArrayValues($arrayProps, $item, $data, $refs, $http);

  // These properties are always returned as arrays and always URLs
  // If the value is an h-* object with a URL, the URL is used and a "ref" is added as well
  $arrayUrlProps = ['photo','video','audio','syndication','in-reply-to','like-of','repost-of','bookmark-of'];
  self::collectArrayURLValues($arrayUrlProps, $item, $data, $refs, $http);

  self::determineNameAndContent($item, $data);

  $author = self::findAuthor($mf2, $item, $http);
  if($author) {
    $data['author'] = $author;
  }

  $response = ['data' => $data];
  // Only expose "refs" when something was actually collected
  if(count($refs) > 0) {
    $response['refs'] = $refs;
  }

  return $response;
}
private static function parseAsHReview($mf2, $item, $http) {
$data = [
'type' => 'review'
];
$refs = [];
// TODO: add description as an HTML value
self::collectSingleValues(['summary','published','rating','best','worst'], ['url'], $item, $data);
self::collectArrayValues(['category'], $item, $data, $refs, $http);
self::collectArrayURLValues(['item'], $item, $data, $refs, $http);
self::determineNameAndContent($item, $data);
if($author = self::findAuthor($mf2, $item, $http))
$data['author'] = $author;
@ -226,61 +302,39 @@ class Mf2 {
return $response;
}
// Builds the response for an h-product item: a 'data' array of type "product"
// and, when any nested objects were collected, a 'refs' array alongside it.
//
// $mf2  - the full parsed document (not read by this method)
// $item - the h-product item being converted
// $http - passed through to the array collectors
private static function parseAsHProduct($mf2, $item, $http) {
  $data = [
    'type' => 'product'
  ];
  // Initialise explicitly before handing it to the collectors by reference,
  // the same way the entry, review and event parsers do
  $refs = [];

  // Single plaintext and URL values
  self::collectSingleValues(['name','identifier','price'], ['url'], $item, $data);

  // Always arrays; nested objects (e.g. a brand marked up as an h-* object
  // with a URL) are recorded in $refs by the collector
  self::collectArrayValues(['category','brand'], $item, $data, $refs, $http);

  // Always arrays of URLs
  self::collectArrayURLValues(['photo','video','audio'], $item, $data, $refs, $http);

  $response = [
    'data' => $data
  ];

  // Return collected references instead of silently discarding them
  if(count($refs)) {
    $response['refs'] = $refs;
  }

  return $response;
}
private static function parseAsHEvent($mf2, $item, $http) {
$data = [
'type' => 'event'
];
$refs = [];
// Single plaintext values
$properties = ['name','summary','url','published','start','end','duration'];
foreach($properties as $p) {
if(($v = self::getPlaintext($item, $p)) !== null) {
if($p == 'url') {
if(self::isURL($v))
$data[$p] = $v;
} else {
$data[$p] = $v;
}
}
}
// Single plaintext and URL values
self::collectSingleValues(['name','summary','published','start','end','duration'], ['url'], $item, $data);
// Always arrays
$properties = ['photo','video','audio','syndication'];
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
foreach($item['properties'][$p] as $v) {
if(is_string($v) && self::isURL($v)) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v;
}
elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v['value'];
}
}
}
}
// These properties are always returned as arrays and may contain plaintext content
self::collectArrayValues(['category','location','attendee'], $item, $data, $refs, $http);
// Always returned as arrays, and may also create external references
$properties = ['category','location','attendee'];
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
$data[$p] = [];
foreach($item['properties'][$p] as $v) {
if(is_string($v))
$data[$p][] = $v;
elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
$data[$p][] = $u;
// parse the object and put the result in the "refs" object
$ref = self::parse(['items'=>[$v]], $u, $http);
if($ref) {
$refs[$u] = $ref['data'];
}
}
}
}
}
// These properties are always returned as arrays and always URLs
// If the value is an h-* object with a URL, the URL is used and a "ref" is added as well
self::collectArrayURLValues(['photo','video','audio','syndication'], $item, $data, $refs, $http);
// If there is a description, always return the plaintext description, and return HTML description if it's different
$textDescription = null;

+ 1
- 1
tests/FeedTest.php View File

@ -48,7 +48,7 @@ class FeedTest extends PHPUnit_Framework_TestCase {
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body);
$this->assertEquals('feed', $data->data->type);
$this->assertEquals('entry', $data->data->type);
}
public function testTopLevelHFeed() {

+ 67
- 1
tests/ParseTest.php View File

@ -222,7 +222,6 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body, true);
$this->assertEquals('entry', $data['data']['type']);
print_r($data['data']);
$this->assertEquals('http://syndicated.example/', $data['data']['syndication'][0]);
}
@ -357,6 +356,73 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('Venue', $data['refs']['http://source.example.com/venue']['name']);
}
// An h-review whose item is a nested h-product: the review fields are
// returned in "data" and the product is returned as a ref keyed by its URL.
public function testMf2ReviewOfProduct() {
  $response = $this->parse(['url' => 'http://source.example.com/h-review-of-product']);

  $body = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $data = json_decode($body, true);

  $review = $data['data'];
  $expectedFields = [
    'type' => 'review',
    'name' => 'Review',
    'summary' => 'Not great',
    'rating' => '3',
    'best' => '5',
  ];
  foreach($expectedFields as $field => $expected) {
    $this->assertEquals($expected, $review[$field]);
  }
  $this->assertEquals('This is the full text of the review', $review['content']['text']);

  foreach(['red', 'blue'] as $category) {
    $this->assertContains($category, $review['category']);
  }

  $productUrl = 'http://product.example.com/';
  $this->assertContains($productUrl, $review['item']);
  $this->assertArrayHasKey($productUrl, $data['refs']);

  $product = $data['refs'][$productUrl];
  $this->assertEquals('product', $product['type']);
  $this->assertEquals('The Reviewed Product', $product['name']);
  $this->assertEquals($productUrl, $product['url']);
}
// An h-review whose item is a nested h-card: the review fields are returned
// in "data" and the card is returned as a ref keyed by its URL.
public function testMf2ReviewOfHCard() {
  $response = $this->parse(['url' => 'http://source.example.com/h-review-of-h-card']);

  $body = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $data = json_decode($body, true);

  $review = $data['data'];
  $expectedFields = [
    'type' => 'review',
    'name' => 'Review',
    'summary' => 'Not great',
    'rating' => '3',
    'best' => '5',
  ];
  foreach($expectedFields as $field => $expected) {
    $this->assertEquals($expected, $review[$field]);
  }
  $this->assertEquals('This is the full text of the review', $review['content']['text']);

  $businessUrl = 'http://business.example.com/';
  $this->assertContains($businessUrl, $review['item']);
  $this->assertArrayHasKey($businessUrl, $data['refs']);

  $business = $data['refs'][$businessUrl];
  $this->assertEquals('card', $business['type']);
  $this->assertEquals('The Reviewed Business', $business['name']);
  $this->assertEquals($businessUrl, $business['url']);
}
// A legacy (mf1) hReview page should still come back as a review. Only the
// fields that currently parse are asserted; the rest are disabled below.
public function testMf1Review() {
  $response = $this->parse(['url' => 'http://source.example.com/hReview']);

  $body = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $data = json_decode($body, true);

  $review = $data['data'];
  $this->assertEquals('review', $review['type']);
  $this->assertEquals('Review', $review['name']);
  $this->assertEquals('3', $review['rating']);
  $this->assertEquals('5', $review['best']);

  // TODO: backcompat of mf1 parser is kind of messed up right now, so the
  // assertions below stay disabled until it is sorted out
  // $this->assertEquals('Not great', $data['data']['summary']);
  // $this->assertEquals('This is the full text of the review', $data['data']['content']['text']);
  // $this->assertContains('http://product.example.com/', $data['data']['item']);
  // $this->assertArrayHasKey('http://product.example.com/', $data['refs']);
  // $this->assertEquals('product', $data['refs']['http://product.example.com/']['type']);
  // $this->assertEquals('The Reviewed Product', $data['refs']['http://product.example.com/']['name']);
  // $this->assertEquals('http://product.example.com/', $data['refs']['http://product.example.com/']['url']);
}
public function testEntryIsAnInvitee() {
$url = 'http://source.example.com/bridgy-invitee';
$response = $this->parse(['url' => $url]);

+ 27
- 0
tests/data/source.example.com/h-review-of-h-card View File

@ -0,0 +1,27 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Review</title>
</head>
<body class="h-review">
<h2 class="p-name">Review</h2>
<a href="/h-review" class="u-url">permalink</a>
<h3><a href="http://business.example.com/" class="p-item h-card">The Reviewed Business</a></h3>
<span class="rating"><span class="p-rating">3</span> out of <span class="p-best">5</span></span>
<div class="p-summary">Not great</div>
<div class="e-content">
This is the full text of the review
</div>
</body>
</html>

+ 32
- 0
tests/data/source.example.com/h-review-of-product View File

@ -0,0 +1,32 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Review</title>
</head>
<body class="h-review">
<h2 class="p-name">Review</h2>
<a href="/h-review" class="u-url">permalink</a>
<h3><a href="http://product.example.com/" class="p-item h-product">The Reviewed Product</a></h3>
<span class="rating"><span class="p-rating">3</span> out of <span class="p-best">5</span></span>
<div class="p-summary">Not great</div>
<div class="e-content">
This is the full text of the review
</div>
<ul>
<li class="p-category">red</li>
<li class="p-category">blue</li>
</ul>
</body>
</html>

+ 34
- 0
tests/data/source.example.com/hReview View File

@ -0,0 +1,34 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Review</title>
</head>
<body class="hreview">
<h2 class="fn">Review</h2>
<a href="/hReview" class="permalink">permalink</a>
<div class="item">
<h3><a href="http://product.example.com/" class="url fn">The Reviewed Product</a></h3>
</div>
<span class="rating"><span class="value">3</span> out of <span class="best">5</span></span>
<span class="reviewer vcard"><a class="url fn" href="https://author.example.com/">Aaron Parecki</a></span>
<span class="dtreviewed">2016-12-15T22:32:42+01:00</span>
<div class="summary">Not great</div>
<div class="description">
This is the full text of the review
</div>
</body>
</html>

Loading…
Cancel
Save