Browse Source

Merge remote-tracking branch 'aaronpk/master'

pull/39/head
sebsel 5 years ago
parent
commit
c66c691f63
12 changed files with 377 additions and 67 deletions
  1. +6
    -13
      lib/XRay/Fetcher.php
  2. +49
    -0
      lib/XRay/Formats/Format.php
  3. +88
    -0
      lib/XRay/Formats/Hackernews.php
  4. +41
    -52
      lib/XRay/Formats/Mf2.php
  5. +9
    -2
      lib/XRay/Formats/Twitter.php
  6. +4
    -0
      lib/XRay/Parser.php
  7. +70
    -0
      tests/HackernewsTest.php
  8. +36
    -0
      tests/ParseTest.php
  9. +11
    -0
      tests/data/hacker-news.firebaseio.com/v0_item_14516538.json
  10. +11
    -0
      tests/data/hacker-news.firebaseio.com/v0_item_14516923.json
  11. +28
    -0
      tests/data/source.example.com/checkin
  12. +24
    -0
      tests/data/source.example.com/checkin-url

+ 6
- 13
lib/XRay/Fetcher.php View File

@ -53,6 +53,11 @@ class Fetcher {
return $this->_fetch_github($url, $opts);
}
// Check if this is a Hackernews URL and use the API
if(Formats\Hackernews::matches($url)) {
return Formats\Hackernews::fetch($this->http, $url, $opts);
}
// All other URLs are fetched normally
// Special-case appspot.com URLs to not follow redirects.
@ -145,19 +150,7 @@ class Fetcher {
];
}
$tweet = Formats\Twitter::fetch($url, $creds);
if(!$tweet) {
return [
'error' => 'twitter_error',
'error_description' => $e->getMessage()
];
}
return [
'url' => $url,
'body' => $tweet,
'code' => 200,
];
return Formats\Twitter::fetch($url, $creds);
}
private function _fetch_facebook($url, $opts) {

+ 49
- 0
lib/XRay/Formats/Format.php View File

@ -2,6 +2,7 @@
namespace p3k\XRay\Formats;
use DOMDocument, DOMXPath;
use HTMLPurifier, HTMLPurifier_Config;
interface iFormat {
@ -33,4 +34,52 @@ abstract class Format implements iFormat {
return [$doc, $xpath];
}
/**
 * Runs an HTML fragment through HTMLPurifier using a fixed element whitelist.
 *
 * Only the elements listed under HTML.AllowedElements survive. A custom
 * definition for <time> (with a "datetime" attribute) is registered, and the
 * "Class" attribute type is replaced by HTMLPurifier_AttrDef_HTML_Microformats2.
 *
 * @param string $html  HTML fragment to purify
 * @return string       the purified HTML, with "&#xD;" references turned back into "\r"
 */
protected static function sanitizeHTML($html) {
  $config = HTMLPurifier_Config::createDefault();
  // No definition cache implementation is configured
  $config->set('Cache.DefinitionImpl', null);
  $config->set('HTML.AllowedElements', [
    'a',
    'abbr',
    'b',
    'code',
    'del',
    'em',
    'i',
    'img',
    'q',
    'strike',
    'strong',
    'time',
    'blockquote',
    'pre',
    'p',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'ul',
    'li',
    'ol'
  ]);
  // The raw definition must be fetched before the purifier is constructed so
  // the custom <time> element below becomes part of the configuration.
  $def = $config->getHTMLDefinition(true);
  $def->addElement(
    'time',
    'Inline',
    'Inline',
    'Common',
    [
      'datetime' => 'Text'
    ]
  );
  // Override the allowed classes to only support Microformats2 classes
  $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());
  $purifier = new HTMLPurifier($config);
  $sanitized = $purifier->purify($html);
  // Turn the literal character reference "&#xD;" back into a carriage return.
  // The search string must be the six-character entity text, not a raw line break.
  $sanitized = str_replace("&#xD;","\r",$sanitized);
  return $sanitized;
}
}

+ 88
- 0
lib/XRay/Formats/Hackernews.php View File

@ -0,0 +1,88 @@
<?php
namespace p3k\XRay\Formats;
use DateTime, DateTimeZone;
use Config;
use cebe\markdown\GithubMarkdown;
/**
 * Format handler for Hacker News item pages.
 *
 * Item URLs of the form https://news.ycombinator.com/item?id=N are not scraped;
 * fetch() requests the item from the hacker-news.firebaseio.com JSON API and
 * parse() turns that JSON into an "entry" array.
 */
class Hackernews extends Format {

  /**
   * Tells whether the URL's host is news.ycombinator.com.
   *
   * @param string $url
   * @return bool
   */
  public static function matches_host($url) {
    $host = parse_url($url, PHP_URL_HOST);
    return $host === 'news.ycombinator.com';
  }

  /**
   * Tests whether the URL is a Hacker News item permalink.
   *
   * The pattern is anchored at both ends so that a URL which merely ends with
   * an item permalink (for example inside another site's query string) is not
   * treated as a Hacker News URL.
   *
   * @param string $url
   * @return array|false  the preg_match() groups ([1] is the numeric item id), or false
   */
  public static function matches($url) {
    if(preg_match('~^https?://news\.ycombinator\.com/item\?id=(\d+)$~', $url, $match))
      return $match;
    else
      return false;
  }

  /**
   * Fetches the item JSON for a Hacker News item URL.
   *
   * @param object $http  HTTP client; its get($url) result is read as an array with 'code' and 'body'
   * @param string $url   Hacker News item URL
   * @param array  $opts  not used by this method
   * @return array  ['url','body','code'] on success,
   *                ['error','error_description','code'] on failure
   */
  public static function fetch($http, $url, $opts) {
    $match = self::matches($url);

    // Without a match there is no item id to request; report an error instead
    // of indexing into false and requesting a malformed API URL.
    if(!$match) {
      return [
        'error' => 'hackernews_error',
        'error_description' => 'The URL is not a Hacker News item URL',
        'code' => 0, // no HTTP request was made
      ];
    }

    $response = $http->get('https://hacker-news.firebaseio.com/v0/item/'.$match[1].'.json');

    if($response['code'] != 200) {
      return [
        'error' => 'hackernews_error',
        'error_description' => $response['body'],
        'code' => $response['code'],
      ];
    }

    return [
      'url' => $url,
      'body' => $response['body'],
      'code' => $response['code'],
    ];
  }

  /**
   * Converts item JSON into an "entry" array.
   *
   * "author" and "published" are only included when the item carries "by" and
   * a parseable "time"; "name", "content" and "in-reply-to" are only included
   * when "title", "text" and "parent" are present.
   * NOTE(review): the API documentation describes deleted items, which
   * presumably lack "by" and "text" - confirm against a live deleted item.
   *
   * @param string $json  body returned by the item API
   * @param string $url   the Hacker News item URL the JSON was fetched for
   * @return array  ['data' => entry, 'original' => $json], or the result of
   *                self::_unknown() when the JSON does not decode to a non-empty array
   */
  public static function parse($json, $url) {
    $data = is_string($json) ? json_decode($json, true) : null;

    // Rejects invalid JSON, the literal "null", and bare scalars.
    if(!is_array($data) || !$data)
      return self::_unknown();

    // Start building the h-entry
    $entry = [
      'type' => 'entry',
      'url' => $url,
    ];

    if(isset($data['by'])) {
      $entry['author'] = [
        'type' => 'card',
        'name' => $data['by'],
        'photo' => null,
        'url' => 'https://news.ycombinator.com/user?id='.$data['by']
      ];
    }

    if(isset($data['time'])) {
      // "time" is a Unix timestamp; createFromFormat() returns false if it cannot be parsed
      $date = DateTime::createFromFormat('U', (string)$data['time']);
      if($date !== false) {
        $entry['published'] = $date->format('c');
      }
    }

    if(isset($data['title'])) {
      $entry['name'] = $data['title'];
    }

    if(isset($data['text'])) {
      $htmlContent = trim(self::sanitizeHTML($data['text']));
      // Put a line break in front of every paragraph before the tags are stripped
      $textContent = str_replace('<p>', "\n<p>", $htmlContent);
      $textContent = strip_tags($textContent);
      $entry['content'] = [
        'html' => $htmlContent,
        'text' => $textContent
      ];
    }

    if(isset($data['parent'])) {
      $entry['in-reply-to'] = ['https://news.ycombinator.com/item?id='.$data['parent']];
    }

    return [
      'data' => $entry,
      'original' => $json
    ];
  }
}

+ 41
- 52
lib/XRay/Formats/Mf2.php View File

@ -3,7 +3,15 @@ namespace p3k\XRay\Formats;
use HTMLPurifier, HTMLPurifier_Config;
class Mf2 {
class Mf2 extends Format {
/**
 * Host check for the Microformats2 format: every URL is accepted.
 *
 * @param string $url  ignored
 * @return bool        always true
 */
public static function matches_host($url) {
  return true;
}
/**
 * URL check for the Microformats2 format: every URL is accepted.
 *
 * @param string $url  ignored
 * @return bool        always true
 */
public static function matches($url) {
  return true;
}
public static function parse($mf2, $url, $http) {
if(count($mf2['items']) == 0)
@ -227,6 +235,31 @@ class Mf2 {
}
}
/**
 * Builds a simplified "card" array from the first value of a property on $item.
 *
 * - A string value accepted by self::isURL() yields a card holding only that URL.
 * - A value accepted by self::isMicroformat() whose type list contains "h-card"
 *   yields a card with whichever of name, latitude, longitude, locality, region,
 *   country and url self::getPlaintext() returns a truthy value for.
 *
 * @param string $property  key to look up in $item['properties']
 * @param array  $item      item array with a 'properties' key
 * @param mixed  $http      taken by reference; not used inside this method
 * @return array|false      the card, or false when the property is absent or
 *                          its first value is neither of the shapes above
 */
private static function parseEmbeddedHCard($property, $item, &$http) {
  if(!array_key_exists($property, $item['properties'])) {
    return false;
  }

  // Only the first value of the property is considered
  $value = $item['properties'][$property][0];

  if(is_string($value) && self::isURL($value)) {
    return [
      'type' => 'card',
      'url' => $value
    ];
  }

  if(!self::isMicroformat($value) || !in_array('h-card', $value['type'])) {
    return false;
  }

  $card = [
    'type' => 'card',
  ];
  foreach(['name','latitude','longitude','locality','region','country','url'] as $key) {
    $text = self::getPlaintext($value, $key);
    if($text) {
      $card[$key] = $text;
    }
  }
  return $card;
}
private static function collectArrayURLValues($properties, $item, &$data, &$refs, &$http) {
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
@ -295,7 +328,7 @@ class Mf2 {
$refs = [];
// Single plaintext and URL values
self::collectSingleValues(['published','summary','rsvp','swarm-coins'], ['url'], $item, $data);
self::collectSingleValues(['published','summary','rsvp','swarm-coins'], ['url'], $item, $data, $http);
// These properties are always returned as arrays and may contain plaintext content
// First strip leading hashtags from category values if present
@ -316,6 +349,9 @@ class Mf2 {
if($author = self::findAuthor($mf2, $item, $http))
$data['author'] = $author;
if($checkin = self::parseEmbeddedHCard('checkin', $item, $http))
$data['checkin'] = $checkin;
$response = [
'data' => $data
];
@ -333,7 +369,7 @@ class Mf2 {
];
$refs = [];
self::collectSingleValues(['summary','published','rating','best','worst'], ['url'], $item, $data);
self::collectSingleValues(['summary','published','rating','best','worst'], ['url'], $item, $data, $http);
// Fallback for Mf1 "description" as content. The PHP parser does not properly map this to "content"
$description = self::parseHTMLValue('description', $item);
@ -397,7 +433,7 @@ class Mf2 {
'type' => 'product'
];
self::collectSingleValues(['name','identifier','price'], ['url'], $item, $data);
self::collectSingleValues(['name','identifier','price'], ['url'], $item, $data, $http);
$description = self::parseHTMLValue('description', $item);
if($description) {
@ -446,7 +482,7 @@ class Mf2 {
$refs = [];
// Single plaintext and URL values
self::collectSingleValues(['name','summary','published','start','end','duration'], ['url'], $item, $data);
self::collectSingleValues(['name','summary','published','start','end','duration'], ['url'], $item, $data, $http);
// These properties are always returned as arrays and may contain plaintext content
self::collectArrayValues(['category','location','attendee'], $item, $data, $refs, $http);
@ -655,53 +691,6 @@ class Mf2 {
return $author;
}
/**
 * Runs an HTML fragment through HTMLPurifier using a fixed element whitelist.
 *
 * @param string $html  HTML fragment to purify
 * @return string       the purified HTML, with "&#xD;" references turned back into "\r"
 */
private static function sanitizeHTML($html) {
$config = HTMLPurifier_Config::createDefault();
// No definition cache implementation is configured
$config->set('Cache.DefinitionImpl', null);
// Only these elements are kept in the output
$config->set('HTML.AllowedElements', [
'a',
'abbr',
'b',
'code',
'del',
'em',
'i',
'img',
'q',
'strike',
'strong',
'time',
'blockquote',
'pre',
'p',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'ul',
'li',
'ol'
]);
// Fetch the raw definition so a custom <time> element can be registered
$def = $config->getHTMLDefinition(true);
$def->addElement(
'time',
'Inline',
'Inline',
'Common',
[
'datetime' => 'Text'
]
);
// Override the allowed classes to only support Microformats2 classes
$def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());
$purifier = new HTMLPurifier($config);
$sanitized = $purifier->purify($html);
// Turn the literal character reference "&#xD;" back into a carriage return
$sanitized = str_replace("&#xD;","\r",$sanitized);
return $sanitized;
}
private static function hasNumericKeys(array $arr) {
foreach($arr as $key=>$val)
if (is_numeric($key))

+ 9
- 2
lib/XRay/Formats/Twitter.php View File

@ -33,10 +33,17 @@ class Twitter extends Format {
try {
$tweet = $twitter->request('statuses/show/'.$tweet_id, 'GET', ['tweet_mode'=>'extended']);
} catch(\TwitterException $e) {
return false;
return [
'error' => 'twitter_error',
'error_description' => $e->getMessage()
];
}
return $tweet;
return [
'url' => $url,
'body' => $tweet,
'code' => 200,
];
}
public static function parse($json, $url) {

+ 4
- 0
lib/XRay/Parser.php View File

@ -38,6 +38,10 @@ class Parser {
return Formats\XKCD::parse($body, $url);
}
if(Formats\Hackernews::matches($url)) {
return Formats\Hackernews::parse($body, $url);
}
// No special parsers matched, parse for Microformats now
return Formats\HTML::parse($this->http, $body, $url, $opts);
}

+ 70
- 0
tests/HackernewsTest.php View File

@ -0,0 +1,70 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * End-to-end tests for parsing Hacker News item URLs.
 *
 * The HTTP client is a p3k\HTTP\Test instance pointed at the tests' data
 * directory, so the expected values below correspond to recorded responses
 * rather than live API calls.
 */
class HackernewsTest extends PHPUnit_Framework_TestCase {

  // The Parse controller under test. Declared explicitly: setUp() assigns it,
  // and an undeclared (dynamic) property is deprecated in newer PHP versions.
  private $client;

  /**
   * Creates a fresh Parse controller with the fixture-backed HTTP client and
   * its "mc" property set to null.
   */
  public function setUp() {
    $this->client = new Parse();
    $this->client->http = new p3k\HTTP\Test(dirname(__FILE__).'/data/');
    $this->client->mc = null;
  }

  /**
   * Sends $params to the controller's parse() action and returns its result.
   *
   * @param array $params  request parameters, e.g. ['url' => ...]
   */
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->parse($request, $response);
  }

  /**
   * A story submission: has a title ("name") and text content.
   */
  public function testSubmission() {
    $url = 'https://news.ycombinator.com/item?id=14516538';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body, true);

    $this->assertEquals('entry', $data['data']['type']);
    $this->assertEquals('2017-06-08T19:32:12+00:00', $data['data']['published']);
    $this->assertEquals('vkb', $data['data']['author']['name']);
    $this->assertEquals('https://news.ycombinator.com/user?id=vkb', $data['data']['author']['url']);
    $this->assertEquals('What are we doing about Facebook, Google, and the closed internet?', $data['data']['name']);
    // The expected strings span several source lines; the line breaks are part of the value.
    $this->assertEquals('There have been many, many posts about how toxic advertising and Facebook are (I\'ve written many myself[1][2][3]) for our internet ecosystem today.<p>What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?</p><p>[1]http://veekaybee.github.io/facebook-is-collecting-this/
[2]http://veekaybee.github.io/content-is-dead/
[3] http://veekaybee.github.io/who-is-doing-this-to-my-internet/</p>', $data['data']['content']['html']);
    $this->assertEquals('There have been many, many posts about how toxic advertising and Facebook are (I\'ve written many myself[1][2][3]) for our internet ecosystem today.
What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?
[1]http://veekaybee.github.io/facebook-is-collecting-this/
[2]http://veekaybee.github.io/content-is-dead/
[3] http://veekaybee.github.io/who-is-doing-this-to-my-internet/', $data['data']['content']['text']);
  }

  /**
   * A comment: has no "name" and replies to its parent item.
   */
  public function testComment() {
    $url = 'https://news.ycombinator.com/item?id=14516923';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body, true);

    $this->assertEquals('entry', $data['data']['type']);
    $this->assertEquals('2017-06-08T20:23:20+00:00', $data['data']['published']);
    $this->assertEquals('aaronpk', $data['data']['author']['name']);
    $this->assertEquals('https://news.ycombinator.com/user?id=aaronpk', $data['data']['author']['url']);
    $this->assertEquals('https://news.ycombinator.com/item?id=14516538', $data['data']['in-reply-to'][0]);
    $this->assertArrayNotHasKey('name', $data['data']);
    // The expected text spans several source lines; the line breaks are part of the value.
    $this->assertEquals('I am a member of the W3C Social Web Working Group (<a href="https://www.w3.org/wiki/Socialwg">https://www.w3.org/wiki/Socialwg</a>), and have been organizing IndieWebCamp (<a href="https://indieweb.org/">https://indieweb.org/</a>) conferences in this space for the last 7 years. We\'ve been making a lot of progress:<p>* <a href="https://www.w3.org/TR/webmention/">https://www.w3.org/TR/webmention/</a> - cross-site commenting</p><p>* <a href="https://www.w3.org/TR/micropub/">https://www.w3.org/TR/micropub/</a> - API for apps to create posts on various servers</p><p>* <a href="https://www.w3.org/TR/websub/">https://www.w3.org/TR/websub/</a> - realtime subscriptions to feeds</p><p>* More: <a href="https://indieweb.org/specs">https://indieweb.org/specs</a></p><p>We focus on making sure there are a plurality of implementations and approaches rather than trying to build a single software solution to solve everything.</p><p>Try commenting on my copy of this post on my website by sending me a webmention! <a href="https://aaronparecki.com/2017/06/08/9/indieweb">https://aaronparecki.com/2017/06/08/9/indieweb</a></p>', $data['data']['content']['html']);
    $this->assertEquals('I am a member of the W3C Social Web Working Group (https://www.w3.org/wiki/Socialwg), and have been organizing IndieWebCamp (https://indieweb.org/) conferences in this space for the last 7 years. We\'ve been making a lot of progress:
* https://www.w3.org/TR/webmention/ - cross-site commenting
* https://www.w3.org/TR/micropub/ - API for apps to create posts on various servers
* https://www.w3.org/TR/websub/ - realtime subscriptions to feeds
* More: https://indieweb.org/specs
We focus on making sure there are a plurality of implementations and approaches rather than trying to build a single software solution to solve everything.
Try commenting on my copy of this post on my website by sending me a webmention! https://aaronparecki.com/2017/06/08/9/indieweb', $data['data']['content']['text']);
  }
}

+ 36
- 0
tests/ParseTest.php View File

@ -499,6 +499,42 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$this->assertFalse($data['info']['found_fragment']);
}
/**
 * A checkin property carrying an embedded h-card is exposed as
 * data.checkin with url, name and latitude, alongside the entry's
 * content, photo and published date; the entry gets no "name".
 */
public function testCheckin() {
  $response = $this->parse(['url' => 'http://source.example.com/checkin']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json, true);

  $this->assertEquals('entry', $parsed['data']['type']);

  // The embedded venue card
  $checkin = $parsed['data']['checkin'];
  $this->assertEquals('https://foursquare.com/v/57104d2e498ece022e169dca', $checkin['url']);
  $this->assertEquals('DreamHost', $checkin['name']);
  $this->assertEquals('45.518716', $checkin['latitude']);

  // The rest of the entry
  $this->assertEquals('Homebrew Website Club!', $parsed['data']['content']['text']);
  $this->assertEquals('https://aaronparecki.com/2017/06/07/12/photo.jpg', $parsed['data']['photo'][0]);
  $this->assertEquals('2017-06-07T17:14:40-07:00', $parsed['data']['published']);
  $this->assertArrayNotHasKey('name', $parsed['data']);
}
/**
 * A checkin property that is only a URL is exposed as data.checkin
 * with that url, alongside the entry's content, photo and published
 * date; the entry gets no "name".
 */
public function testCheckinURLOnly() {
  $response = $this->parse(['url' => 'http://source.example.com/checkin-url']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json, true);

  $this->assertEquals('entry', $parsed['data']['type']);

  // The venue is known only by its URL
  $checkin = $parsed['data']['checkin'];
  $this->assertEquals('https://foursquare.com/v/57104d2e498ece022e169dca', $checkin['url']);

  // The rest of the entry
  $this->assertEquals('Homebrew Website Club!', $parsed['data']['content']['text']);
  $this->assertEquals('https://aaronparecki.com/2017/06/07/12/photo.jpg', $parsed['data']['photo'][0]);
  $this->assertEquals('2017-06-07T17:14:40-07:00', $parsed['data']['published']);
  $this->assertArrayNotHasKey('name', $parsed['data']);
}
public function testXKCD() {
$url = 'http://xkcd.com/1810/';
$response = $this->parse(['url' => $url]);

+ 11
- 0
tests/data/hacker-news.firebaseio.com/v0_item_14516538.json View File

@ -0,0 +1,11 @@
HTTP/1.1 200 OK
Server: nginx
Date: Thu, 08 Jun 2017 21:28:24 GMT
Content-Type: application/json; charset=utf-8
Content-Length: 949
Connection: keep-alive
Access-Control-Allow-Origin: *
Cache-Control: no-cache
Strict-Transport-Security: max-age=31556926; includeSubDomains; preload
{"by":"vkb","descendants":51,"id":14516538,"kids":[14516923,14517320,14517322,14517224,14516999,14516850,14517290,14516926,14516808,14517088,14517137,14516981,14516706,14517080,14517055,14516805,14516785,14516890,14517104,14516723,14516853,14517094],"score":84,"text":"There have been many, many posts about how toxic advertising and Facebook are (I&#x27;ve written many myself[1][2][3]) for our internet ecosystem today.<p>What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?<p>[1]http:&#x2F;&#x2F;veekaybee.github.io&#x2F;facebook-is-collecting-this&#x2F;\n[2]http:&#x2F;&#x2F;veekaybee.github.io&#x2F;content-is-dead&#x2F;\n[3] http:&#x2F;&#x2F;veekaybee.github.io&#x2F;who-is-doing-this-to-my-internet&#x2F;","time":1496950332,"title":"What are we doing about Facebook, Google, and the closed internet?","type":"story"}

+ 11
- 0
tests/data/hacker-news.firebaseio.com/v0_item_14516923.json View File

@ -0,0 +1,11 @@
HTTP/1.1 200 OK
Server: nginx
Date: Fri, 09 Jun 2017 14:30:19 GMT
Content-Type: application/json; charset=utf-8
Content-Length: 1701
Connection: keep-alive
Access-Control-Allow-Origin: *
Cache-Control: no-cache
Strict-Transport-Security: max-age=31556926; includeSubDomains; preload
{"by":"aaronpk","id":14516923,"kids":[14517124,14517655,14516983,14518902,14518663],"parent":14516538,"text":"I am a member of the W3C Social Web Working Group (<a href=\"https:&#x2F;&#x2F;www.w3.org&#x2F;wiki&#x2F;Socialwg\" rel=\"nofollow\">https:&#x2F;&#x2F;www.w3.org&#x2F;wiki&#x2F;Socialwg</a>), and have been organizing IndieWebCamp (<a href=\"https:&#x2F;&#x2F;indieweb.org&#x2F;\" rel=\"nofollow\">https:&#x2F;&#x2F;indieweb.org&#x2F;</a>) conferences in this space for the last 7 years. We&#x27;ve been making a lot of progress:<p>* <a href=\"https:&#x2F;&#x2F;www.w3.org&#x2F;TR&#x2F;webmention&#x2F;\" rel=\"nofollow\">https:&#x2F;&#x2F;www.w3.org&#x2F;TR&#x2F;webmention&#x2F;</a> - cross-site commenting<p>* <a href=\"https:&#x2F;&#x2F;www.w3.org&#x2F;TR&#x2F;micropub&#x2F;\" rel=\"nofollow\">https:&#x2F;&#x2F;www.w3.org&#x2F;TR&#x2F;micropub&#x2F;</a> - API for apps to create posts on various servers<p>* <a href=\"https:&#x2F;&#x2F;www.w3.org&#x2F;TR&#x2F;websub&#x2F;\" rel=\"nofollow\">https:&#x2F;&#x2F;www.w3.org&#x2F;TR&#x2F;websub&#x2F;</a> - realtime subscriptions to feeds<p>* More: <a href=\"https:&#x2F;&#x2F;indieweb.org&#x2F;specs\" rel=\"nofollow\">https:&#x2F;&#x2F;indieweb.org&#x2F;specs</a><p>We focus on making sure there are a plurality of implementations and approaches rather than trying to build a single software solution to solve everything.<p>Try commenting on my copy of this post on my website by sending me a webmention! <a href=\"https:&#x2F;&#x2F;aaronparecki.com&#x2F;2017&#x2F;06&#x2F;08&#x2F;9&#x2F;indieweb\" rel=\"nofollow\">https:&#x2F;&#x2F;aaronparecki.com&#x2F;2017&#x2F;06&#x2F;08&#x2F;9&#x2F;indieweb</a>","time":1496953400,"type":"comment"}

+ 28
- 0
tests/data/source.example.com/checkin View File

@ -0,0 +1,28 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<div class="u-checkin h-card">
at <a href="https://foursquare.com/v/57104d2e498ece022e169dca" class="u-url p-name">DreamHost</a>
<div style="display:none;">
<span class="p-latitude">45.518716</span>
<span class="p-longitude">-122.679614</span>
</div>
</div>
<p class="e-content p-name">Homebrew Website Club!</p>
<img src="https://aaronparecki.com/2017/06/07/12/photo.jpg" class="u-photo">
<a href="http://source.example.com/checkin" class="u-url">
<time class="dt-published" datetime="2017-06-07T17:14:40-07:00">
Wed, Jun 7, 2017 5:14pm -07:00
</time>
</a>
</body>
</html>

+ 24
- 0
tests/data/source.example.com/checkin-url View File

@ -0,0 +1,24 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<a class="u-checkin" href="https://foursquare.com/v/57104d2e498ece022e169dca">
at DreamHost
</a>
<p class="e-content p-name">Homebrew Website Club!</p>
<img src="https://aaronparecki.com/2017/06/07/12/photo.jpg" class="u-photo">
<a href="http://source.example.com/checkin" class="u-url">
<time class="dt-published" datetime="2017-06-07T17:14:40-07:00">
Wed, Jun 7, 2017 5:14pm -07:00
</time>
</a>
</body>
</html>

Loading…
Cancel
Save