diff --git a/README.md b/README.md index 7fb01cb..7ddcddc 100644 --- a/README.md +++ b/README.md @@ -32,9 +32,8 @@ In both cases, the response will be a JSON object containing a key of "type". If ```json { - "type": "error", - "code": "not_found", - "summary": "The URL provided was not found" + "error": "not_found", + "error_description": "The URL provided was not found" } ``` @@ -52,26 +51,28 @@ Other possible errors are listed below: ```json { - "type": "entry", - "author": { - "type": "card", - "name": "Aaron Parecki", - "photo": "https://aaronparecki.com/images/aaronpk-256.jpg", - "url": "https://aaronparecki.com/" - }, - "url": "https://aaronparecki.com/2016/01/16/11/", - "published": "2016-01-16T16:26:43-08:00", - "photo": [ - "https://aaronparecki.com/2016/01/16/11/photo.png" - ], - "syndication": [ - "https://twitter.com/aaronpk/status/688518372170977280" - ], - "summary": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it.", - "content": { - "html": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it.", - "text": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it." 
- }, + "data": { + "type": "entry", + "author": { + "type": "card", + "name": "Aaron Parecki", + "photo": "https://aaronparecki.com/images/aaronpk-256.jpg", + "url": "https://aaronparecki.com/" + }, + "url": "https://aaronparecki.com/2016/01/16/11/", + "published": "2016-01-16T16:26:43-08:00", + "photo": [ + "https://aaronparecki.com/2016/01/16/11/photo.png" + ], + "syndication": [ + "https://twitter.com/aaronpk/status/688518372170977280" + ], + "summary": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it.", + "content": { + "html": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it.", + "text": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it." + } + } } ``` @@ -91,36 +92,38 @@ Replies, likes, reposts, etc. of this post will be included if they are listed o ```json { - "type": "entry", - ... 
+ "like": [ + { + "type": "cite", + "author": { + "type": "card", + "name": "Thomas Dunlap", + "photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/9055c458a67762637c0071006b16c78f25cb610b224dbc98f48961d772faff4d.jpeg", + "url": "https://twitter.com/spladow" + }, + "url": "https://twitter.com/aaronpk/status/688518372170977280#favorited-by-16467582" } - } - ] + ], + "comment": [ + { + "type": "cite", + "author": { + "type": "card", + "name": "Poetica", + "photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/192664bb706b2998ed42a50a860490b6aa1bb4926b458ba293b4578af599aa6f.png", + "url": "http://poetica.com/" + }, + "url": "https://twitter.com/poetica/status/689045331426803712", + "published": "2016-01-18T03:23:03-08:00", + "content": { + "text": "@aaronpk @mozillapersona thanks very much! :)" + } + } + ] + } } ``` diff --git a/composer.lock b/composer.lock index 258b242..0e4a93b 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", "This file is @generated automatically" ], - "hash": "887779f2c9c5c0c3cbf524a606a776c2", + "hash": "a620639fa69cc332e4d4f49436f81ecc", "content-hash": "b26a0c76296b944624f36bbb163392ae", "packages": [ { diff --git a/controllers/Parse.php b/controllers/Parse.php index bf58ae2..628edbf 100644 --- a/controllers/Parse.php +++ b/controllers/Parse.php @@ -32,7 +32,6 @@ class Parse { if(!$url) { return $this->respond($response, 400, [ - 'type' => 'error', 'error' => 'missing_url', 'error_description' => 'Provide a URL to fetch' ]); @@ -42,7 +41,6 @@ class Parse { $scheme = parse_url($url, PHP_URL_SCHEME); if(!in_array($scheme, ['http','https'])) { return $this->respond($response, 400, [ - 'type' => 'error', 'error' => 'invalid_url', 'error_description' => 'Only http and https URLs are supported' ]); @@ -51,7 +49,6 @@ class Parse { $host = parse_url($url, PHP_URL_HOST); if(!$host) { return 
$this->respond($response, 400, [ - 'type' => 'error', 'error' => 'invalid_url', 'error_description' => 'The URL provided was not valid' ]); @@ -62,7 +59,6 @@ class Parse { if($result['error']) { return $this->respond($response, 400, [ - 'type' => 'error', 'error' => $result['error'], 'error_description' => $result['error_description'] ]); @@ -74,7 +70,6 @@ class Parse { if(!$doc) { return $this->respond($response, 400, [ - 'type' => 'error', 'error' => 'invalid_content', 'error_description' => 'The document could not be parsed as HTML' ]); @@ -98,7 +93,6 @@ class Parse { if(!$found) { return $this->respond($response, 400, [ - 'type' => 'error', 'error' => 'no_link_found', 'error_description' => 'The source document does not have a link to the target URL' ]); @@ -109,14 +103,18 @@ class Parse { $mf2 = mf2\Parse($result['body']); if($mf2 && count($mf2['items']) > 0) { $data = Formats\Mf2::parse($mf2); - return $this->respond($response, 200, $data); + if($data) { + return $this->respond($response, 200, [ + 'data' => $data, + 'mf2' => $mf2 + ]); + } } // TODO: look for other content like OEmbed or known services later return $this->respond($response, 400, [ - 'type' => 'error', 'error' => 'no_content', 'error_description' => 'No usable content could be found at the given URL' ]); diff --git a/lib/Formats/Mf2.php b/lib/Formats/Mf2.php index 95c4b83..a6a0ff6 100644 --- a/lib/Formats/Mf2.php +++ b/lib/Formats/Mf2.php @@ -4,9 +4,135 @@ namespace Percolator\Formats; class Mf2 { public static function parse($mf2) { + $data = [ + 'type' => 'entry', + 'author' => [ + 'type' => 'card', + 'name' => null, + 'url' => null, + 'photo' => null + ] + ]; + + if($item = $mf2['items'][0]) { + if(in_array('h-entry', $item['type'])) { + + // Single plaintext values + $properties = ['url','published','summary','rsvp']; + foreach($properties as $p) { + if($v = self::getPlaintext($item, $p)) + $data[$p] = $v; + } + + // Always arrays + $properties = 
['photo','video','syndication','in-reply-to','like-of','repost-of']; + foreach($properties as $p) { + if(array_key_exists($p, $item['properties'])) + $data[$p] = $item['properties'][$p]; + } + + // Determine if the name is distinct from the content + $name = self::getPlaintext($item, 'name'); + $content = null; + $textContent = null; + $htmlContent = null; + if(array_key_exists('content', $item['properties'])) { + $content = $item['properties']['content'][0]; + if(is_string($content)) { + $textContent = $content; + } elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) { + if(array_key_exists('html', $content)) { + $textContent = strip_tags($content['html']); + $htmlContent = $content['html']; + } else { + $textContent = $content['value']; + } + } + + // Trim ellipses from the name + $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name); + + // Check if the name is a prefix of the content + if(strpos($textContent, $name) === 0) { + $name = null; + } + + } + + if($name) { + $data['name'] = $name; + } + if($content) { + $data['content'] = [ + 'text' => $textContent + ]; + if($textContent != $htmlContent) { + $data['content']['html'] = $htmlContent; + } + } + + return $data; + } + } + + return false; + } + + private static function responseDisplayText($name, $summary, $content) { + + // Build a fake h-entry to pass to the comments parser + $input = [ + 'type' => ['h-entry'], + 'properties' => [ + 'name' => [trim($name)], + 'summary' => [trim($summary)], + 'content' => [trim($content)] + ] + ]; + + if(!trim($name)) + unset($input['properties']['name']); + + if(!trim($summary)) + unset($input['properties']['summary']); + + $result = \IndieWeb\comments\parse($input, false, 1024, 4); + return [ - 'type' => 'entry' + 'name' => trim($result['name']), + 'content' => $result['text'] ]; + } + + private static function hasNumericKeys(array $arr) { + foreach($arr as $key=>$val) + if (is_numeric($key)) + return true; + return false; + } + + 
private static function isMicroformat($mf) { + return is_array($mf) + and !self::hasNumericKeys($mf) + and !empty($mf['type']) + and isset($mf['properties']); + } + + // Given an array of microformats properties and a key name, return the plaintext value + // at that property + // e.g. + // {"properties":{"published":["foo"]}} results in "foo" + private static function getPlaintext($mf2, $k, $fallback=null) { + if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) { + // $mf2['properties'][$v] will always be an array since the input was from the mf2 parser + $value = $mf2['properties'][$k][0]; + if(is_string($value)) { + return $value; + } elseif(self::isMicroformat($value) && array_key_exists('value', $value)) { + return $value['value']; + } + } + return $fallback; } } diff --git a/tests/ParseTest.php b/tests/ParseTest.php index f103179..bee07fb 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -23,7 +23,7 @@ class ParseTest extends PHPUnit_Framework_TestCase { $body = $response->getContent(); $this->assertEquals(400, $response->getStatusCode()); $data = json_decode($body); - $this->assertEquals('error', $data->type); + $this->assertObjectHasAttribute('error', $data); $this->assertEquals('missing_url', $data->error); } @@ -34,7 +34,7 @@ class ParseTest extends PHPUnit_Framework_TestCase { $body = $response->getContent(); $this->assertEquals(400, $response->getStatusCode()); $data = json_decode($body); - $this->assertEquals('error', $data->type); + $this->assertObjectHasAttribute('error', $data); $this->assertEquals('invalid_url', $data->error); } @@ -45,7 +45,7 @@ class ParseTest extends PHPUnit_Framework_TestCase { $body = $response->getContent(); $this->assertEquals(400, $response->getStatusCode()); $data = json_decode($body); - $this->assertEquals('error', $data->type); + $this->assertObjectHasAttribute('error', $data); $this->assertEquals('no_link_found', $data->error); } @@ -56,8 +56,66 @@ class ParseTest extends 
PHPUnit_Framework_TestCase { $body = $response->getContent(); $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body); - $this->assertNotEquals('error', $data->type); $this->assertObjectNotHasAttribute('error', $data); + $this->assertObjectNotHasAttribute('error', $data); + } + + public function testHTMLContent() { + $url = 'http://source.example.com/html-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('This page has a link to target.example.com and some formatted text.', $data->data->content->text); + $this->assertEquals('This page has a link to target.example.com and some formatted text.', $data->data->content->html); + } + + public function testTextContent() { + $url = 'http://source.example.com/text-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('This page has a link to target.example.com and some formatted text but is in a p-content element so is plaintext.', $data->data->content->text); + } + + public function testContentWithPrefixedName() { + $url = 'http://source.example.com/content-with-prefixed-name'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('This page has a link to target.example.com and some formatted text.', $data->data->content->text); + $this->assertEquals('This page has a link to target.example.com and some formatted text.', $data->data->content->html); + } + + public function testContentWithDistinctName() 
{ + $url = 'http://source.example.com/content-with-distinct-name'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertEquals('Hello World', $data->data->name); + $this->assertEquals('This page has a link to target.example.com and some formatted text.', $data->data->content->text); + $this->assertEquals('This page has a link to target.example.com and some formatted text.', $data->data->content->html); + } + + public function testNameWithNoContent() { + $url = 'http://source.example.com/name-no-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertEquals('Hello World', $data->data->name); + $this->assertObjectNotHasAttribute('content', $data->data); } } \ No newline at end of file diff --git a/tests/data/source.example.com/content-with-distinct-name b/tests/data/source.example.com/content-with-distinct-name new file mode 100644 index 0000000..a428379 --- /dev/null +++ b/tests/data/source.example.com/content-with-distinct-name @@ -0,0 +1,15 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

Hello World

+

This page has a link to target.example.com and some formatted text.

+ + diff --git a/tests/data/source.example.com/content-with-prefixed-name b/tests/data/source.example.com/content-with-prefixed-name new file mode 100644 index 0000000..136cb46 --- /dev/null +++ b/tests/data/source.example.com/content-with-prefixed-name @@ -0,0 +1,15 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This page has a link...

+

This page has a link to target.example.com and some formatted text.

+ + diff --git a/tests/data/source.example.com/html-content b/tests/data/source.example.com/html-content new file mode 100644 index 0000000..3f99cee --- /dev/null +++ b/tests/data/source.example.com/html-content @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This page has a link to target.example.com and some formatted text.

+ + diff --git a/tests/data/source.example.com/name-no-content b/tests/data/source.example.com/name-no-content new file mode 100644 index 0000000..8105878 --- /dev/null +++ b/tests/data/source.example.com/name-no-content @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

Hello World

+ + diff --git a/tests/data/source.example.com/text-content b/tests/data/source.example.com/text-content new file mode 100644 index 0000000..ae1dae8 --- /dev/null +++ b/tests/data/source.example.com/text-content @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This page has a link to target.example.com and some formatted text but is in a p-content element so is plaintext.

+ +