Browse Source

parse content and name from the entry

pull/39/head
Aaron Parecki 9 years ago
parent
commit
9eecc31571
10 changed files with 323 additions and 66 deletions
  1. +55
    -52
      README.md
  2. +1
    -1
      composer.lock
  3. +6
    -8
      controllers/Parse.php
  4. +127
    -1
      lib/Formats/Mf2.php
  5. +62
    -4
      tests/ParseTest.php
  6. +15
    -0
      tests/data/source.example.com/content-with-distinct-name
  7. +15
    -0
      tests/data/source.example.com/content-with-prefixed-name
  8. +14
    -0
      tests/data/source.example.com/html-content
  9. +14
    -0
      tests/data/source.example.com/name-no-content
  10. +14
    -0
      tests/data/source.example.com/text-content

+ 55
- 52
README.md View File

@ -32,9 +32,8 @@ In both cases, the response will be a JSON object containing a key of "type". If
```json
{
"type": "error",
"code": "not_found",
"summary": "The URL provided was not found"
"error": "not_found",
"error_description": "The URL provided was not found"
}
```
@ -52,26 +51,28 @@ Other possible errors are listed below:
```json
{
"type": "entry",
"author": {
"type": "card",
"name": "Aaron Parecki",
"photo": "https://aaronparecki.com/images/aaronpk-256.jpg",
"url": "https://aaronparecki.com/"
},
"url": "https://aaronparecki.com/2016/01/16/11/",
"published": "2016-01-16T16:26:43-08:00",
"photo": [
"https://aaronparecki.com/2016/01/16/11/photo.png"
],
"syndication": [
"https://twitter.com/aaronpk/status/688518372170977280"
],
"summary": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it.",
"content": {
"html": "Now that <a href=\"https://twitter.com/MozillaPersona\">@MozillaPersona</a> is shutting down, the only good way to do email-based login is how <a href=\"https://twitter.com/poetica\">@poetica</a> does it.",
"text": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it."
},
"data": {
"type": "entry",
"author": {
"type": "card",
"name": "Aaron Parecki",
"photo": "https://aaronparecki.com/images/aaronpk-256.jpg",
"url": "https://aaronparecki.com/"
},
"url": "https://aaronparecki.com/2016/01/16/11/",
"published": "2016-01-16T16:26:43-08:00",
"photo": [
"https://aaronparecki.com/2016/01/16/11/photo.png"
],
"syndication": [
"https://twitter.com/aaronpk/status/688518372170977280"
],
"summary": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it.",
"content": {
"html": "Now that <a href=\"https://twitter.com/MozillaPersona\">@MozillaPersona</a> is shutting down, the only good way to do email-based login is how <a href=\"https://twitter.com/poetica\">@poetica</a> does it.",
"text": "Now that @MozillaPersona is shutting down, the only good way to do email-based login is how @poetica does it."
}
}
}
```
@ -91,36 +92,38 @@ Replies, likes, reposts, etc. of this post will be included if they are listed o
```json
{
"type": "entry",
...
"like": [
{
"type": "cite",
"author": {
"type": "card",
"name": "Thomas Dunlap",
"photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/9055c458a67762637c0071006b16c78f25cb610b224dbc98f48961d772faff4d.jpeg",
"url": "https://twitter.com/spladow"
},
"url": "https://twitter.com/aaronpk/status/688518372170977280#favorited-by-16467582"
}
],
"comment": [
{
"type": "cite",
"author": {
"type": "card",
"name": "Poetica",
"photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/192664bb706b2998ed42a50a860490b6aa1bb4926b458ba293b4578af599aa6f.png",
"url": "http://poetica.com/"
},
"url": "https://twitter.com/poetica/status/689045331426803712",
"published": "2016-01-18T03:23:03-08:00",
"content": {
"text": "@aaronpk @mozillapersona thanks very much! :)"
"data": {
"type": "entry",
...
"like": [
{
"type": "cite",
"author": {
"type": "card",
"name": "Thomas Dunlap",
"photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/9055c458a67762637c0071006b16c78f25cb610b224dbc98f48961d772faff4d.jpeg",
"url": "https://twitter.com/spladow"
},
"url": "https://twitter.com/aaronpk/status/688518372170977280#favorited-by-16467582"
}
}
]
],
"comment": [
{
"type": "cite",
"author": {
"type": "card",
"name": "Poetica",
"photo": "https://s3-us-west-2.amazonaws.com/aaronparecki.com/twitter.com/192664bb706b2998ed42a50a860490b6aa1bb4926b458ba293b4578af599aa6f.png",
"url": "http://poetica.com/"
},
"url": "https://twitter.com/poetica/status/689045331426803712",
"published": "2016-01-18T03:23:03-08:00",
"content": {
"text": "@aaronpk @mozillapersona thanks very much! :)"
}
}
]
}
}
```

+ 1
- 1
composer.lock View File

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"hash": "887779f2c9c5c0c3cbf524a606a776c2",
"hash": "a620639fa69cc332e4d4f49436f81ecc",
"content-hash": "b26a0c76296b944624f36bbb163392ae",
"packages": [
{

+ 6
- 8
controllers/Parse.php View File

@ -32,7 +32,6 @@ class Parse {
if(!$url) {
return $this->respond($response, 400, [
'type' => 'error',
'error' => 'missing_url',
'error_description' => 'Provide a URL to fetch'
]);
@ -42,7 +41,6 @@ class Parse {
$scheme = parse_url($url, PHP_URL_SCHEME);
if(!in_array($scheme, ['http','https'])) {
return $this->respond($response, 400, [
'type' => 'error',
'error' => 'invalid_url',
'error_description' => 'Only http and https URLs are supported'
]);
@ -51,7 +49,6 @@ class Parse {
$host = parse_url($url, PHP_URL_HOST);
if(!$host) {
return $this->respond($response, 400, [
'type' => 'error',
'error' => 'invalid_url',
'error_description' => 'The URL provided was not valid'
]);
@ -62,7 +59,6 @@ class Parse {
if($result['error']) {
return $this->respond($response, 400, [
'type' => 'error',
'error' => $result['error'],
'error_description' => $result['error_description']
]);
@ -74,7 +70,6 @@ class Parse {
if(!$doc) {
return $this->respond($response, 400, [
'type' => 'error',
'error' => 'invalid_content',
'error_description' => 'The document could not be parsed as HTML'
]);
@ -98,7 +93,6 @@ class Parse {
if(!$found) {
return $this->respond($response, 400, [
'type' => 'error',
'error' => 'no_link_found',
'error_description' => 'The source document does not have a link to the target URL'
]);
@ -109,14 +103,18 @@ class Parse {
$mf2 = mf2\Parse($result['body']);
if($mf2 && count($mf2['items']) > 0) {
$data = Formats\Mf2::parse($mf2);
return $this->respond($response, 200, $data);
if($data) {
return $this->respond($response, 200, [
'data' => $data,
'mf2' => $mf2
]);
}
}
// TODO: look for other content like OEmbed or known services later
return $this->respond($response, 400, [
'type' => 'error',
'error' => 'no_content',
'error_description' => 'No usable content could be found at the given URL'
]);

+ 127
- 1
lib/Formats/Mf2.php View File

@ -4,9 +4,135 @@ namespace Percolator\Formats;
class Mf2 {

  /**
   * Convert parsed microformats2 data into a simplified "entry" structure.
   *
   * Only the first top-level item is inspected, and only when it is an h-entry.
   * The result always contains 'type' and a placeholder 'author' card; 'url',
   * 'published', 'summary', 'rsvp', the list properties, 'name' and 'content'
   * are added only when present in the input.
   *
   * The name is omitted when it is just a (possibly ellipsis-truncated) prefix
   * of the content text, so that notes do not get a redundant title.
   *
   * @param array $mf2 Parsed mf2 structure with an 'items' list
   * @return array|false The simplified entry, or false when the first item is not a usable h-entry
   */
  public static function parse($mf2) {
    // Nothing to do without a first item (also covers non-array input)
    if(empty($mf2['items'][0]))
      return false;

    $item = $mf2['items'][0];

    if(!self::isMicroformat($item)
      || !is_array($item['type'])
      || !is_array($item['properties'])
      || !in_array('h-entry', $item['type']))
      return false;

    $properties = $item['properties'];

    $data = [
      'type' => 'entry',
      'author' => [
        'type' => 'card',
        'name' => null,
        'url' => null,
        'photo' => null
      ]
    ];

    // Single plaintext values
    foreach(['url','published','summary','rsvp'] as $p) {
      if($v = self::getPlaintext($item, $p))
        $data[$p] = $v;
    }

    // Always arrays
    foreach(['photo','video','syndication','in-reply-to','like-of','repost-of'] as $p) {
      if(array_key_exists($p, $properties))
        $data[$p] = $properties[$p];
    }

    // Determine if the name is distinct from the content
    $name = self::getPlaintext($item, 'name');
    $content = null;
    $textContent = null;
    $htmlContent = null;

    if(array_key_exists('content', $properties)) {
      // Guard against an empty or malformed content list instead of reading offset 0 blindly
      if(is_array($properties['content']) && isset($properties['content'][0]))
        $content = $properties['content'][0];

      if(is_string($content)) {
        $textContent = $content;
      } elseif(is_array($content) && array_key_exists('value', $content)) {
        if(array_key_exists('html', $content)) {
          $textContent = strip_tags($content['html']);
          $htmlContent = $content['html'];
        } else {
          $textContent = $content['value'];
        }
      }

      if(is_string($name)) {
        // Trim ellipses from the name
        $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name);

        // Check if the name is a prefix of the content. An empty name is dropped
        // up front because strpos() must not be given an empty needle.
        if($name === null || $name === ''
          || (is_string($textContent) && strpos($textContent, $name) === 0)) {
          $name = null;
        }
      }
    }

    if($name) {
      $data['name'] = $name;
    }

    // Only emit content when text was actually extracted
    if($content && $textContent !== null) {
      $data['content'] = [
        'text' => $textContent
      ];
      // Plaintext-only content has no HTML; never emit "html": null
      if($htmlContent !== null && $htmlContent !== $textContent) {
        $data['content']['html'] = $htmlContent;
      }
    }

    return $data;
  }

  /**
   * Run name/summary/content through the IndieWeb comments parser to get
   * display-ready text.
   *
   * Not called by any other method visible in this class.
   *
   * @param string $name
   * @param string $summary
   * @param string $content
   * @return array Array with keys 'name' and 'content'
   */
  private static function responseDisplayText($name, $summary, $content) {
    // Build a fake h-entry to pass to the comments parser
    $input = [
      'type' => ['h-entry'],
      'properties' => [
        'name' => [trim($name)],
        'summary' => [trim($summary)],
        'content' => [trim($content)]
      ]
    ];

    // Empty name/summary are removed rather than passed as empty strings
    if(!trim($name))
      unset($input['properties']['name']);

    if(!trim($summary))
      unset($input['properties']['summary']);

    $result = \IndieWeb\comments\parse($input, false, 1024, 4);

    return [
      'name' => trim($result['name']),
      'content' => $result['text']
    ];
  }

  /**
   * @param array $arr
   * @return bool True if at least one key of $arr is numeric
   */
  private static function hasNumericKeys(array $arr) {
    foreach($arr as $key=>$val)
      if (is_numeric($key))
        return true;
    return false;
  }

  /**
   * @param mixed $mf
   * @return bool True if $mf is an array with only non-numeric keys, a
   *              non-empty 'type' and a 'properties' key
   */
  private static function isMicroformat($mf) {
    return is_array($mf)
      and !self::hasNumericKeys($mf)
      and !empty($mf['type'])
      and isset($mf['properties']);
  }

  // Given a microformats item and a property name, return the plaintext value
  // of the first entry of that property, or $fallback if there is none.
  // e.g.
  //   {"properties":{"published":["foo"]}} results in "foo"
  private static function getPlaintext($mf2, $k, $fallback=null) {
    if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) {
      // $mf2['properties'][$k] will always be an array since the input was from the mf2 parser
      $value = $mf2['properties'][$k][0];
      if(is_string($value)) {
        return $value;
      } elseif(self::isMicroformat($value) && array_key_exists('value', $value)) {
        // Nested microformat: use its plaintext 'value'
        return $value['value'];
      }
    }
    return $fallback;
  }
}

+ 62
- 4
tests/ParseTest.php View File

@ -23,7 +23,7 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$body = $response->getContent();
$this->assertEquals(400, $response->getStatusCode());
$data = json_decode($body);
$this->assertEquals('error', $data->type);
$this->assertObjectHasAttribute('error', $data);
$this->assertEquals('missing_url', $data->error);
}
@ -34,7 +34,7 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$body = $response->getContent();
$this->assertEquals(400, $response->getStatusCode());
$data = json_decode($body);
$this->assertEquals('error', $data->type);
$this->assertObjectHasAttribute('error', $data);
$this->assertEquals('invalid_url', $data->error);
}
@ -45,7 +45,7 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$body = $response->getContent();
$this->assertEquals(400, $response->getStatusCode());
$data = json_decode($body);
$this->assertEquals('error', $data->type);
$this->assertObjectHasAttribute('error', $data);
$this->assertEquals('no_link_found', $data->error);
}
@ -56,8 +56,66 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$body = $response->getContent();
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body);
$this->assertNotEquals('error', $data->type);
$this->assertObjectNotHasAttribute('error', $data);
$this->assertObjectNotHasAttribute('error', $data);
}
// e-content with markup and no p-name: no name is reported, and both the
// plaintext and the HTML form of the content are returned.
public function testHTMLContent() {
  $response = $this->parse(['url' => 'http://source.example.com/html-content']);
  $json = $response->getContent();

  $this->assertEquals(200, $response->getStatusCode());

  $entry = json_decode($json)->data;
  $this->assertObjectNotHasAttribute('name', $entry);

  $parsedContent = $entry->content;
  $this->assertEquals('This page has a link to target.example.com and some formatted text.', $parsedContent->text);
  $this->assertEquals('This page has a link to <a href="http://target.example.com">target.example.com</a> and some <b>formatted text</b>.', $parsedContent->html);
}
// p-content (plaintext) with no p-name: no name is reported and the text
// content has the markup stripped.
public function testTextContent() {
  $response = $this->parse(['url' => 'http://source.example.com/text-content']);
  $json = $response->getContent();

  $this->assertEquals(200, $response->getStatusCode());

  $entry = json_decode($json)->data;
  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals('This page has a link to target.example.com and some formatted text but is in a p-content element so is plaintext.', $entry->content->text);
}
// The p-name is an ellipsis-truncated prefix of the content, so it must not
// be reported as a separate name.
public function testContentWithPrefixedName() {
  $response = $this->parse(['url' => 'http://source.example.com/content-with-prefixed-name']);
  $json = $response->getContent();

  $this->assertEquals(200, $response->getStatusCode());

  $entry = json_decode($json)->data;
  $this->assertObjectNotHasAttribute('name', $entry);

  $parsedContent = $entry->content;
  $this->assertEquals('This page has a link to target.example.com and some formatted text.', $parsedContent->text);
  $this->assertEquals('This page has a link to <a href="http://target.example.com">target.example.com</a> and some <b>formatted text</b>.', $parsedContent->html);
}
// The p-name differs from the content, so it is reported alongside the
// text and HTML content.
public function testContentWithDistinctName() {
  $response = $this->parse(['url' => 'http://source.example.com/content-with-distinct-name']);
  $json = $response->getContent();

  $this->assertEquals(200, $response->getStatusCode());

  $entry = json_decode($json)->data;
  $this->assertEquals('Hello World', $entry->name);

  $parsedContent = $entry->content;
  $this->assertEquals('This page has a link to target.example.com and some formatted text.', $parsedContent->text);
  $this->assertEquals('This page has a link to <a href="http://target.example.com">target.example.com</a> and some <b>formatted text</b>.', $parsedContent->html);
}
// An entry with only a p-name: the name is reported and no content key exists.
public function testNameWithNoContent() {
  $response = $this->parse(['url' => 'http://source.example.com/name-no-content']);
  $json = $response->getContent();

  $this->assertEquals(200, $response->getStatusCode());

  $entry = json_decode($json)->data;
  $this->assertEquals('Hello World', $entry->name);
  $this->assertObjectNotHasAttribute('content', $entry);
}
}

+ 15
- 0
tests/data/source.example.com/content-with-distinct-name View File

@ -0,0 +1,15 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<h2 class="p-name">Hello World</h2>
<p class="e-content">This page has a link to <a href="http://target.example.com">target.example.com</a> and some <b>formatted text</b>.</p>
</body>
</html>

+ 15
- 0
tests/data/source.example.com/content-with-prefixed-name View File

@ -0,0 +1,15 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<h2 class="p-name">This page has a link...</h2>
<p class="e-content">This page has a link to <a href="http://target.example.com">target.example.com</a> and some <b>formatted text</b>.</p>
</body>
</html>

+ 14
- 0
tests/data/source.example.com/html-content View File

@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content">This page has a link to <a href="http://target.example.com">target.example.com</a> and some <b>formatted text</b>.</p>
</body>
</html>

+ 14
- 0
tests/data/source.example.com/name-no-content View File

@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<h2 class="p-name">Hello World</h2>
</body>
</html>

+ 14
- 0
tests/data/source.example.com/text-content View File

@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="p-content">This page has a link to <a href="http://target.example.com">target.example.com</a> and some <b>formatted text</b> but is in a p-content element so is plaintext.</p>
</body>
</html>

Loading…
Cancel
Save