diff --git a/controllers/Parse.php b/controllers/Parse.php index 628edbf..44470c7 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -28,6 +28,11 @@ class Parse { public function parse(Request $request, Response $response) { + if($request->get('timeout')) { + // We might make 2 HTTP requests, so each request gets half the desired timeout + $this->http->timeout = $request->get('timeout') / 2; + } + $url = $request->get('url'); if(!$url) { @@ -100,18 +105,18 @@ class Parse { } // Now start pulling in the data from the page. Start by looking for microformats2 - $mf2 = mf2\Parse($result['body']); + $mf2 = mf2\Parse($result['body'], $url); + if($mf2 && count($mf2['items']) > 0) { - $data = Formats\Mf2::parse($mf2); + $data = Formats\Mf2::parse($mf2, $url, $this->http); if($data) { return $this->respond($response, 200, [ 'data' => $data, - 'mf2' => $mf2 ]); } } - // TODO: look for other content like OEmbed or known services later + // TODO: look for other content like OEmbed or other known services later return $this->respond($response, 400, [ diff --git a/lib/Formats/Mf2.php b/lib/Formats/Mf2.php index a6a0ff6..34cb4ac 100644 --- a/lib/Formats/Mf2.php +++ b/lib/Formats/Mf2.php @@ -3,7 +3,36 @@ namespace Percolator\Formats; class Mf2 { - public static function parse($mf2) { + public static function parse($mf2, $url, $http) { + + if($item = $mf2['items'][0]) { + // If the first item is a feed, the page is a feed + if(in_array('h-feed', $item['type'])) { + return self::parseHFeed($mf2, $http); + } + + // Check each top-level h-card, and if there is one that matches this URL, the page is an h-card + foreach($mf2['items'] as $i) { + if(in_array('h-card', $i['type']) + and array_key_exists('url', $i['properties']) + and in_array($url, $i['properties']['url']) + ) { + // TODO: check for children h-entrys (like tantek.com), or sibling h-entries (like aaronparecki.com) + // and return the result as a feed instead + return self::parseHCard($i, $http); + } + } + + // Otherwise check for an h-entry + if(in_array('h-entry', $item['type'])) { + return self::parseHEntry($mf2, $http); + } + } + + return false; + } + + private static function parseHEntry($mf2, \p3k\HTTP $http) { $data = [ 'type' => 'entry', 'author' => [ @@ -14,68 +43,190 @@ class Mf2 { ] ]; - if($item = $mf2['items'][0]) { - if(in_array('h-entry', $item['type'])) { + $item = $mf2['items'][0]; + + // Single plaintext values + $properties = ['url','published','summary','rsvp']; + foreach($properties as $p) { + if($v = self::getPlaintext($item, $p)) + $data[$p] = $v; + } + + // Always arrays + $properties = ['photo','video','syndication','in-reply-to','like-of','repost-of']; + foreach($properties as $p) { + if(array_key_exists($p, $item['properties'])) + $data[$p] = $item['properties'][$p]; + } - // Single plaintext values - $properties = ['url','published','summary','rsvp']; - foreach($properties as $p) { - if($v = self::getPlaintext($item, $p)) - $data[$p] = $v; + // Determine if the name is distinct from the content + $name = self::getPlaintext($item, 'name'); + $content = null; + $textContent = null; + $htmlContent = null; + if(array_key_exists('content', $item['properties'])) { + $content = $item['properties']['content'][0]; + if(is_string($content)) { + $textContent = $content; + } elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) { + if(array_key_exists('html', $content)) { + $textContent = strip_tags($content['html']); + $htmlContent = $content['html']; + } else { + $textContent = $content['value']; } + } + + // Trim ellipses from the name + $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name); + + // Check if the name is a prefix of the content + if(strpos($textContent, $name) === 0) { + $name = null; + } + } + + if($name) { + $data['name'] = $name; + } + if($content) { + $data['content'] = [ + 'text' => $textContent + ]; + if($textContent != $htmlContent) { + $data['content']['html'] = $htmlContent; + } + } + + $data['author'] = self::findAuthor($mf2, $item, $http); + + return $data; + } + + private static function parseHFeed($mf2, \p3k\HTTP $http) { + $data = [ + 'type' => 'feed', + 'author' => [ + 'type' => 'card', + 'name' => null, + 'url' => null, + 'photo' => null + ], + 'items' => [] + ]; - // Always arrays - $properties = ['photo','video','syndication','in-reply-to','like-of','repost-of']; - foreach($properties as $p) { - if(array_key_exists($p, $item['properties'])) - $data[$p] = $item['properties'][$p]; + return $data; + } + + private static function parseHCard($item, \p3k\HTTP $http) { + $data = [ + 'type' => 'card', + 'name' => null, + 'url' => null, + 'photo' => null + ]; + + $properties = ['url','name','photo']; + foreach($properties as $p) { + if($v = self::getPlaintext($item, $p)) + $data[$p] = $v; + } + + return $data; + } + + private static function findAuthor($mf2, $item, \p3k\HTTP $http) { + $author = [ + 'type' => 'card', + 'name' => null, + 'url' => null, + 'photo' => null + ]; + + // Author Discovery + // http://indiewebcamp.com/authorship + + $authorPage = false; + if(array_key_exists('author', $item['properties'])) { + + // Check if any of the values of the author property are an h-card + foreach($item['properties']['author'] as $a) { + if(self::isHCard($a)) { + // 5.1 "if it has an h-card, use it, exit." + return self::parseHCard($a, $http); + } elseif(is_string($a)) { + if(self::isURL($a)) { + // 5.2 "otherwise if author property is an http(s) URL, let the author-page have that URL" + $authorPage = $a; + } else { + // 5.3 "otherwise use the author property as the author name, exit" + // We can only set the name, no h-card or URL was found + $author['name'] = self::getPlaintext($item, 'author'); + return $author; + } + } else { + // This case is only hit when the author property is an mf2 object that is not an h-card + $author['name'] = self::getPlaintext($item, 'author'); + return $author; } + } + + } + + // 6. "if no author page was found" ... check for rel-author link + if(!$authorPage) { + if(isset($mf2['rels']) && isset($mf2['rels']['author'])) + $authorPage = $mf2['rels']['author'][0]; + } + + // 7. "if there is an author-page URL" ... + if($authorPage) { - // Determine if the name is distinct from the content - $name = self::getPlaintext($item, 'name'); - $content = null; - $textContent = null; - $htmlContent = null; - if(array_key_exists('content', $item['properties'])) { - $content = $item['properties']['content'][0]; - if(is_string($content)) { - $textContent = $content; - } elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) { - if(array_key_exists('html', $content)) { - $textContent = strip_tags($content['html']); - $htmlContent = $content['html']; - } else { - $textContent = $content['value']; + // 7.1 "get the author-page from that URL and parse it for microformats2" + $authorPageContents = self::getURL($authorPage, $http); + + if($authorPageContents) { + foreach($authorPageContents['items'] as $i) { + if(self::isHCard($i)) { + + // 7.2 "if author-page has 1+ h-card with url == uid == author-page's URL, then use first such h-card, exit." + if(array_key_exists('url', $i['properties']) + and in_array($authorPage, $i['properties']['url']) + and array_key_exists('uid', $i['properties']) + and in_array($authorPage, $i['properties']['uid']) + ) { + return self::parseHCard($i, $http); } - } - // Trim ellipses from the name - $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name); + // 7.3 "else if author-page has 1+ h-card with url property which matches the href of a rel-me link on the author-page" + $relMeLinks = (isset($authorPageContents['rels']) && isset($authorPageContents['rels']['me'])) ? $authorPageContents['rels']['me'] : []; + if(count($relMeLinks) > 0 + and array_key_exists('url', $i['properties']) + and count(array_intersect($i['properties']['url'], $relMeLinks)) > 0 + ) { + return self::parseHCard($i, $http); + } - // Check if the name is a prefix of the content - if(strpos($textContent, $name) === 0) { - $name = null; } - } + } - if($name) { - $data['name'] = $name; - } - if($content) { - $data['content'] = [ - 'text' => $textContent - ]; - if($textContent != $htmlContent) { - $data['content']['html'] = $htmlContent; + // 7.4 "if the h-entry's page has 1+ h-card with url == author-page URL, use first such h-card, exit." + foreach($mf2['items'] as $i) { + if(self::isHCard($i)) { + + if(array_key_exists('url', $i['properties']) + and in_array($authorPage, $i['properties']['url']) + ) { + return self::parseHCard($i, $http); } - } - return $data; + } } - } - return false; + } + + return $author; } private static function responseDisplayText($name, $summary, $content) { @@ -118,6 +269,17 @@ class Mf2 { and isset($mf['properties']); } + private static function isHCard($mf) { + return is_array($mf) + and !empty($mf['type']) + and is_array($mf['type']) + and in_array('h-card', $mf['type']); + } + + private static function isURL($string) { + return preg_match('/^https?:\/\/.+\..+$/', $string); + } + // Given an array of microformats properties and a key name, return the plaintext value // at that property // e.g. @@ -135,4 +297,14 @@ class Mf2 { return $fallback; } + private static function getURL($url, \p3k\HTTP $http) { + if(!$url) return null; + // TODO: consider adding caching here + $result = $http->get($url); + if($result['error'] || !$result['body']) { + return null; + } + return \mf2\Parse($result['body'], $url); + } + } diff --git a/lib/HTTP.php b/lib/HTTP.php index 63be97e..c83cc0d 100644 --- a/lib/HTTP.php +++ b/lib/HTTP.php @@ -12,7 +12,7 @@ class HTTP { curl_setopt($ch, CURLOPT_HEADER, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects); - curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_TIMEOUT_MS, round($this->timeout * 1000)); $response = curl_exec($ch); $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE); return array( @@ -33,7 +33,7 @@ class HTTP { curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_HEADER, true); - curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_TIMEOUT_MS, round($this->timeout * 1000)); $response = curl_exec($ch); $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE); return array( @@ -53,7 +53,7 @@ class HTTP { curl_setopt($ch, CURLOPT_NOBODY, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects); - curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($ch, CURLOPT_TIMEOUT_MS, round($this->timeout * 1000)); $response = curl_exec($ch); return array( 'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE), diff --git a/lib/HTTPTest.php b/lib/HTTPTest.php index 85337bb..3859a7a 100644 --- a/lib/HTTPTest.php +++ b/lib/HTTPTest.php @@ -35,10 +35,11 @@ class HTTPTest extends HTTP { $response = file_get_contents($filename); $split = explode("\r\n\r\n", $response); - if(count($split) != 2) { + if(count($split) < 2) { throw new \Exception("Invalid file contents in test data, check that newlines are CRLF: $url"); } - list($headers, $body) = $split; + $headers = array_shift($split); + $body = implode("\r\n", $split); if(preg_match('/HTTP\/1\.1 (\d+)/', $headers, $match)) { $code = $match[1]; diff --git a/tests/AuthorTest.php b/tests/AuthorTest.php new file mode 100644 index 0000000..ed3229f --- /dev/null +++ b/tests/AuthorTest.php @@ -0,0 +1,141 @@ +client = new Parse(); + $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/'); + } + + private function parse($params) { + $request = new Request($params); + $response = new Response(); + return $this->client->parse($request, $response); + } + + public function testHEntryAuthorIsName() { + $url = 'http://author.example.com/h-entry-author-is-name'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEmpty($data->data->author->url); + $this->assertEquals('Author Name', $data->data->author->name); + $this->assertEmpty($data->data->author->photo); + } + + public function testHEntryAuthorIsRelLinkToHCardOnPage() { + $url = 'http://author.example.com/h-entry-author-is-rel-link-to-h-card-on-page'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('http://author.example.com/about', $data->data->author->url); + $this->assertEquals('Author', $data->data->author->name); + $this->assertEquals('http://author.example.com/photo.jpg', $data->data->author->photo); + } + + public function testHEntryAuthorIsRelLinkToHCardWithRelMe() { + $url = 'http://author.example.com/h-entry-author-is-rel-link-to-h-card-with-rel-me'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('http://author.example.com/about-rel-me', $data->data->author->url); + $this->assertEquals('Author Full Name', $data->data->author->name); + $this->assertEquals('http://author.example.com/photo.jpg', $data->data->author->photo); + } + + public function testHEntryAuthorIsRelLinkToHCardWithUrlUid() { + $url = 'http://author.example.com/h-entry-author-is-rel-link-to-h-card-with-url-uid'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('http://author.example.com/about-url-uid', $data->data->author->url); + $this->assertEquals('Author Full Name', $data->data->author->name); + $this->assertEquals('http://author.example.com/photo.jpg', $data->data->author->photo); + } + + public function testHEntryAuthorIsUrlToHCardOnPage() { + $url = 'http://author.example.com/h-entry-author-is-url-to-h-card-on-page'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('http://author.example.com/about', $data->data->author->url); + $this->assertEquals('Author', $data->data->author->name); + $this->assertEquals('http://author.example.com/photo.jpg', $data->data->author->photo); + } + + public function testHEntryHasHCardAndUrlAuthor() { + $url = 'http://author.example.com/h-entry-has-h-card-and-url-author'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('http://author.example.com/about', $data->data->author->url); + $this->assertEquals('Author', $data->data->author->name); + $this->assertEquals('http://author.example.com/photo.jpg', $data->data->author->photo); + } + + public function testHEntryHasHCardAuthor() { + $url = 'http://author.example.com/h-entry-has-h-card-author'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('http://author.example.com/about', $data->data->author->url); + $this->assertEquals('Author', $data->data->author->name); + $this->assertEquals('http://author.example.com/photo.jpg', $data->data->author->photo); + } + + public function testPageIsHCard() { + $url = 'http://author.example.com/about'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('card', $data->data->type); + $this->assertEquals('http://author.example.com/about', $data->data->url); + $this->assertEquals('Author Full Name', $data->data->name); + $this->assertEquals('http://author.example.com/photo.jpg', $data->data->photo); + } + + /* + public function testHFeedHasHCardAuthor() { + $url = 'http://author.example.com/h-feed-has-h-card-author'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + print_r($body); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertEquals('http://author.example.com/about', $data->data->author->url); + $this->assertEquals('Author', $data->data->author->name); + $this->assertEquals('http://author.example.com/photo.jpg', $data->data->author->photo); + } + */ + +} diff --git a/tests/ParseTest.php b/tests/ParseTest.php index bee07fb..93049c2 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -39,7 +39,7 @@ class ParseTest extends PHPUnit_Framework_TestCase { } public function testTargetNotFound() { - $url = 'http://source.example.com/baseictest'; + $url = 'http://source.example.com/basictest'; $response = $this->parse(['url' => $url, 'target' => 'http://example.net']); $body = $response->getContent(); diff --git a/tests/data/author.example.com/about b/tests/data/author.example.com/about new file mode 100644 index 0000000..b532739 --- /dev/null +++ b/tests/data/author.example.com/about @@ -0,0 +1,23 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Author + + + +
+ + + Author Full Name + + +
+ + + \ No newline at end of file diff --git a/tests/data/author.example.com/about-rel-me b/tests/data/author.example.com/about-rel-me new file mode 100644 index 0000000..813b446 --- /dev/null +++ b/tests/data/author.example.com/about-rel-me @@ -0,0 +1,23 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Author + + + + +
+ + + Author Full Name + +
+ + + \ No newline at end of file diff --git a/tests/data/author.example.com/about-url-uid b/tests/data/author.example.com/about-url-uid new file mode 100644 index 0000000..cffd871 --- /dev/null +++ b/tests/data/author.example.com/about-url-uid @@ -0,0 +1,22 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Author + + + +
+ + + Author Full Name + +
+ + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-entry-author-is-name b/tests/data/author.example.com/h-entry-author-is-name new file mode 100644 index 0000000..4c98282 --- /dev/null +++ b/tests/data/author.example.com/h-entry-author-is-name @@ -0,0 +1,20 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+
Author Name
+

Hello World

+
+ + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-on-page b/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-on-page new file mode 100644 index 0000000..c357f51 --- /dev/null +++ b/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-on-page @@ -0,0 +1,28 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+

Hello World

+
+ +
+ + + Author + +
+ + + + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-with-rel-me b/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-with-rel-me new file mode 100644 index 0000000..536c7cd --- /dev/null +++ b/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-with-rel-me @@ -0,0 +1,21 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+

Hello World

+
+ + + + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-with-url-uid b/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-with-url-uid new file mode 100644 index 0000000..d575daa --- /dev/null +++ b/tests/data/author.example.com/h-entry-author-is-rel-link-to-h-card-with-url-uid @@ -0,0 +1,21 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+

Hello World

+
+ + + + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-entry-author-is-url-to-h-card-on-page b/tests/data/author.example.com/h-entry-author-is-url-to-h-card-on-page new file mode 100644 index 0000000..fd61e9d --- /dev/null +++ b/tests/data/author.example.com/h-entry-author-is-url-to-h-card-on-page @@ -0,0 +1,27 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+

Hello World

+ Author +
+ +
+ + + Author + +
+ + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-entry-has-h-card-and-url-author b/tests/data/author.example.com/h-entry-has-h-card-and-url-author new file mode 100644 index 0000000..28dbeb0 --- /dev/null +++ b/tests/data/author.example.com/h-entry-has-h-card-and-url-author @@ -0,0 +1,30 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+ + Author + +
+ + + Author + +
+ +

Hello World

+ +
+ + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-entry-has-h-card-author b/tests/data/author.example.com/h-entry-has-h-card-author new file mode 100644 index 0000000..cf7630d --- /dev/null +++ b/tests/data/author.example.com/h-entry-has-h-card-author @@ -0,0 +1,28 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+ +
+ + + Author + +
+ +

Hello World

+ +
+ + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-feed-has-h-card-author b/tests/data/author.example.com/h-feed-has-h-card-author new file mode 100644 index 0000000..736ac33 --- /dev/null +++ b/tests/data/author.example.com/h-feed-has-h-card-author @@ -0,0 +1,30 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+ +
+ + + Author + +
+ +
+

Hello World

+
+ +
+ + + \ No newline at end of file diff --git a/tests/data/author.example.com/h-feed-has-multiple-entries-with-different-authors b/tests/data/author.example.com/h-feed-has-multiple-entries-with-different-authors new file mode 100644 index 0000000..98e8e9b --- /dev/null +++ b/tests/data/author.example.com/h-feed-has-multiple-entries-with-different-authors @@ -0,0 +1,23 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+ +
+

Hello World

+
+ +
+ + + \ No newline at end of file