diff --git a/lib/XRay/Fetcher.php b/lib/XRay/Fetcher.php index ec8e175..dd9e861 100644 --- a/lib/XRay/Fetcher.php +++ b/lib/XRay/Fetcher.php @@ -53,6 +53,11 @@ class Fetcher { return $this->_fetch_github($url, $opts); } + // Check if this is a Hackernews URL and use the API + if(Formats\Hackernews::matches($url)) { + return Formats\Hackernews::fetch($this->http, $url, $opts); + } + // All other URLs are fetched normally // Special-case appspot.com URLs to not follow redirects. @@ -145,19 +150,7 @@ class Fetcher { ]; } - $tweet = Formats\Twitter::fetch($url, $creds); - if(!$tweet) { - return [ - 'error' => 'twitter_error', - 'error_description' => $e->getMessage() - ]; - } - - return [ - 'url' => $url, - 'body' => $tweet, - 'code' => 200, - ]; + return Formats\Twitter::fetch($url, $creds); } private function _fetch_facebook($url, $opts) { diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php index 47e9625..0c94101 100644 --- a/lib/XRay/Formats/Format.php +++ b/lib/XRay/Formats/Format.php @@ -2,6 +2,7 @@ namespace p3k\XRay\Formats; use DOMDocument, DOMXPath; +use HTMLPurifier, HTMLPurifier_Config; interface iFormat { @@ -33,4 +34,52 @@ abstract class Format implements iFormat { return [$doc, $xpath]; } + protected static function sanitizeHTML($html) { + $config = HTMLPurifier_Config::createDefault(); + $config->set('Cache.DefinitionImpl', null); + $config->set('HTML.AllowedElements', [ + 'a', + 'abbr', + 'b', + 'code', + 'del', + 'em', + 'i', + 'img', + 'q', + 'strike', + 'strong', + 'time', + 'blockquote', + 'pre', + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'ul', + 'li', + 'ol' + ]); + $def = $config->getHTMLDefinition(true); + $def->addElement( + 'time', + 'Inline', + 'Inline', + 'Common', + [ + 'datetime' => 'Text' + ] + ); + // Override the allowed classes to only support Microformats2 classes + $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2()); + $purifier = new HTMLPurifier($config); + $sanitized = $purifier->purify($html); + $sanitized = str_replace(" ","\r",$sanitized); + return $sanitized; + } + + } diff --git a/lib/XRay/Formats/Hackernews.php b/lib/XRay/Formats/Hackernews.php new file mode 100644 index 0000000..ac2aa71 --- /dev/null +++ b/lib/XRay/Formats/Hackernews.php @@ -0,0 +1,88 @@ +get('https://hacker-news.firebaseio.com/v0/item/'.$match[1].'.json'); + if($response['code'] != 200) { + return [ + 'error' => 'hackernews_error', + 'error_description' => $response['body'], + 'code' => $response['code'], + ]; + } + + return [ + 'url' => $url, + 'body' => $response['body'], + 'code' => $response['code'], + ]; + } + + public static function parse($json, $url) { + $data = @json_decode($json, true); + + if(!$data) + return self::_unknown(); + + $match = self::matches($url); + + $date = DateTime::createFromFormat('U', $data['time']); + + // Start building the h-entry + $entry = array( + 'type' => 'entry', + 'url' => $url, + 'author' => [ + 'type' => 'card', + 'name' => $data['by'], + 'photo' => null, + 'url' => 'https://news.ycombinator.com/user?id='.$data['by'] + ], + 'published' => $date->format('c') + ); + + if(isset($data['title'])) { + $entry['name'] = $data['title']; + } + + if(isset($data['text'])) { + $htmlContent = trim(self::sanitizeHTML($data['text'])); + $textContent = str_replace('

', "\n

", $htmlContent); + $textContent = strip_tags($textContent); + $entry['content'] = [ + 'html' => $htmlContent, + 'text' => $textContent + ]; + } + + if(isset($data['parent'])) { + $entry['in-reply-to'] = ['https://news.ycombinator.com/item?id='.$data['parent']]; + } + + return [ + 'data' => $entry, + 'original' => $json + ]; + } + +} diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index c25d9a3..b749c1c 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -3,7 +3,15 @@ namespace p3k\XRay\Formats; use HTMLPurifier, HTMLPurifier_Config; -class Mf2 { +class Mf2 extends Format { + + public static function matches_host($url) { + return true; + } + + public static function matches($url) { + return true; + } public static function parse($mf2, $url, $http) { if(count($mf2['items']) == 0) @@ -227,6 +235,31 @@ class Mf2 { } } + private static function parseEmbeddedHCard($property, $item, &$http) { + if(array_key_exists($property, $item['properties'])) { + $mf2 = $item['properties'][$property][0]; + if(is_string($mf2) && self::isURL($mf2)) { + $hcard = [ + 'type' => 'card', + 'url' => $mf2 + ]; + return $hcard; + } if(self::isMicroformat($mf2) && in_array('h-card', $mf2['type'])) { + $hcard = [ + 'type' => 'card', + ]; + $properties = ['name','latitude','longitude','locality','region','country','url']; + foreach($properties as $p) { + if($v=self::getPlaintext($mf2, $p)) { + $hcard[$p] = $v; + } + } + return $hcard; + } + } + return false; + } + private static function collectArrayURLValues($properties, $item, &$data, &$refs, &$http) { foreach($properties as $p) { if(array_key_exists($p, $item['properties'])) { @@ -295,7 +328,7 @@ class Mf2 { $refs = []; // Single plaintext and URL values - self::collectSingleValues(['published','summary','rsvp','swarm-coins'], ['url'], $item, $data); + self::collectSingleValues(['published','summary','rsvp','swarm-coins'], ['url'], $item, $data, $http); // These properties are always returned as arrays and may contain plaintext content // First strip leading hashtags from category values if present @@ -316,6 +349,9 @@ class Mf2 { if($author = self::findAuthor($mf2, $item, $http)) $data['author'] = $author; + if($checkin = self::parseEmbeddedHCard('checkin', $item, $http)) + $data['checkin'] = $checkin; + $response = [ 'data' => $data ]; @@ -333,7 +369,7 @@ class Mf2 { ]; $refs = []; - self::collectSingleValues(['summary','published','rating','best','worst'], ['url'], $item, $data); + self::collectSingleValues(['summary','published','rating','best','worst'], ['url'], $item, $data, $http); // Fallback for Mf1 "description" as content. The PHP parser does not properly map this to "content" $description = self::parseHTMLValue('description', $item); @@ -397,7 +433,7 @@ class Mf2 { 'type' => 'product' ]; - self::collectSingleValues(['name','identifier','price'], ['url'], $item, $data); + self::collectSingleValues(['name','identifier','price'], ['url'], $item, $data, $http); $description = self::parseHTMLValue('description', $item); if($description) { @@ -446,7 +482,7 @@ class Mf2 { $refs = []; // Single plaintext and URL values - self::collectSingleValues(['name','summary','published','start','end','duration'], ['url'], $item, $data); + self::collectSingleValues(['name','summary','published','start','end','duration'], ['url'], $item, $data, $http); // These properties are always returned as arrays and may contain plaintext content self::collectArrayValues(['category','location','attendee'], $item, $data, $refs, $http); @@ -655,53 +691,6 @@ class Mf2 { return $author; } - private static function sanitizeHTML($html) { - $config = HTMLPurifier_Config::createDefault(); - $config->set('Cache.DefinitionImpl', null); - $config->set('HTML.AllowedElements', [ - 'a', - 'abbr', - 'b', - 'code', - 'del', - 'em', - 'i', - 'img', - 'q', - 'strike', - 'strong', - 'time', - 'blockquote', - 'pre', - 'p', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'ul', - 'li', - 'ol' - ]); - $def = $config->getHTMLDefinition(true); - $def->addElement( - 'time', - 'Inline', - 'Inline', - 'Common', - [ - 'datetime' => 'Text' - ] - ); - // Override the allowed classes to only support Microformats2 classes - $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2()); - $purifier = new HTMLPurifier($config); - $sanitized = $purifier->purify($html); - $sanitized = str_replace(" ","\r",$sanitized); - return $sanitized; - } - private static function hasNumericKeys(array $arr) { foreach($arr as $key=>$val) if (is_numeric($key)) diff --git a/lib/XRay/Formats/Twitter.php b/lib/XRay/Formats/Twitter.php index 7462dd5..db2fd9c 100644 --- a/lib/XRay/Formats/Twitter.php +++ b/lib/XRay/Formats/Twitter.php @@ -33,10 +33,17 @@ class Twitter extends Format { try { $tweet = $twitter->request('statuses/show/'.$tweet_id, 'GET', ['tweet_mode'=>'extended']); } catch(\TwitterException $e) { - return false; + return [ + 'error' => 'twitter_error', + 'error_description' => $e->getMessage() + ]; } - return $tweet; + return [ + 'url' => $url, + 'body' => $tweet, + 'code' => 200, + ]; } public static function parse($json, $url) { diff --git a/lib/XRay/Parser.php b/lib/XRay/Parser.php index bf9f13a..369dde3 100644 --- a/lib/XRay/Parser.php +++ b/lib/XRay/Parser.php @@ -38,6 +38,10 @@ class Parser { return Formats\XKCD::parse($body, $url); } + if(Formats\Hackernews::matches($url)) { + return Formats\Hackernews::parse($body, $url); + } + // No special parsers matched, parse for Microformats now return Formats\HTML::parse($this->http, $body, $url, $opts); } diff --git a/tests/HackernewsTest.php b/tests/HackernewsTest.php new file mode 100644 index 0000000..2fbceb5 --- /dev/null +++ b/tests/HackernewsTest.php @@ -0,0 +1,70 @@ +client = new Parse(); + $this->client->http = new p3k\HTTP\Test(dirname(__FILE__).'/data/'); + $this->client->mc = null; + } + + private function parse($params) { + $request = new Request($params); + $response = new Response(); + return $this->client->parse($request, $response); + } + + public function testSubmission() { + $url = 'https://news.ycombinator.com/item?id=14516538'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('entry', $data['data']['type']); + $this->assertEquals('2017-06-08T19:32:12+00:00', $data['data']['published']); + $this->assertEquals('vkb', $data['data']['author']['name']); + $this->assertEquals('https://news.ycombinator.com/user?id=vkb', $data['data']['author']['url']); + $this->assertEquals('What are we doing about Facebook, Google, and the closed internet?', $data['data']['name']); + $this->assertEquals('There have been many, many posts about how toxic advertising and Facebook are (I\'ve written many myself[1][2][3]) for our internet ecosystem today.

What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?

[1]http://veekaybee.github.io/facebook-is-collecting-this/ +[2]http://veekaybee.github.io/content-is-dead/ +[3] http://veekaybee.github.io/who-is-doing-this-to-my-internet/

', $data['data']['content']['html']); + $this->assertEquals('There have been many, many posts about how toxic advertising and Facebook are (I\'ve written many myself[1][2][3]) for our internet ecosystem today. +What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals? +[1]http://veekaybee.github.io/facebook-is-collecting-this/ +[2]http://veekaybee.github.io/content-is-dead/ +[3] http://veekaybee.github.io/who-is-doing-this-to-my-internet/', $data['data']['content']['text']); + } + + public function testComment() { + $url = 'https://news.ycombinator.com/item?id=14516923'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('entry', $data['data']['type']); + $this->assertEquals('2017-06-08T20:23:20+00:00', $data['data']['published']); + $this->assertEquals('aaronpk', $data['data']['author']['name']); + $this->assertEquals('https://news.ycombinator.com/user?id=aaronpk', $data['data']['author']['url']); + $this->assertEquals('https://news.ycombinator.com/item?id=14516538', $data['data']['in-reply-to'][0]); + $this->assertArrayNotHasKey('name', $data['data']); + $this->assertEquals('I am a member of the W3C Social Web Working Group (https://www.w3.org/wiki/Socialwg), and have been organizing IndieWebCamp (https://indieweb.org/) conferences in this space for the last 7 years. We\'ve been making a lot of progress:

* https://www.w3.org/TR/webmention/ - cross-site commenting

* https://www.w3.org/TR/micropub/ - API for apps to create posts on various servers

* https://www.w3.org/TR/websub/ - realtime subscriptions to feeds

* More: https://indieweb.org/specs

We focus on making sure there are a plurality of implementations and approaches rather than trying to build a single software solution to solve everything.

Try commenting on my copy of this post on my website by sending me a webmention! https://aaronparecki.com/2017/06/08/9/indieweb

', $data['data']['content']['html']); + $this->assertEquals('I am a member of the W3C Social Web Working Group (https://www.w3.org/wiki/Socialwg), and have been organizing IndieWebCamp (https://indieweb.org/) conferences in this space for the last 7 years. We\'ve been making a lot of progress: +* https://www.w3.org/TR/webmention/ - cross-site commenting +* https://www.w3.org/TR/micropub/ - API for apps to create posts on various servers +* https://www.w3.org/TR/websub/ - realtime subscriptions to feeds +* More: https://indieweb.org/specs +We focus on making sure there are a plurality of implementations and approaches rather than trying to build a single software solution to solve everything. +Try commenting on my copy of this post on my website by sending me a webmention! https://aaronparecki.com/2017/06/08/9/indieweb', $data['data']['content']['text']); + } + + +} + diff --git a/tests/ParseTest.php b/tests/ParseTest.php index 8853c6f..6d5b0a4 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -499,6 +499,42 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertFalse($data['info']['found_fragment']); } + public function testCheckin() { + $url = 'http://source.example.com/checkin'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('entry', $data['data']['type']); + $venue = $data['data']['checkin']; + $this->assertEquals('https://foursquare.com/v/57104d2e498ece022e169dca', $venue['url']); + $this->assertEquals('DreamHost', $venue['name']); + $this->assertEquals('45.518716', $venue['latitude']); + $this->assertEquals('Homebrew Website Club!', $data['data']['content']['text']); + $this->assertEquals('https://aaronparecki.com/2017/06/07/12/photo.jpg', $data['data']['photo'][0]); + $this->assertEquals('2017-06-07T17:14:40-07:00', $data['data']['published']); + $this->assertArrayNotHasKey('name', $data['data']); + } + + public function testCheckinURLOnly() { + $url = 'http://source.example.com/checkin-url'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('entry', $data['data']['type']); + $venue = $data['data']['checkin']; + $this->assertEquals('https://foursquare.com/v/57104d2e498ece022e169dca', $venue['url']); + $this->assertEquals('Homebrew Website Club!', $data['data']['content']['text']); + $this->assertEquals('https://aaronparecki.com/2017/06/07/12/photo.jpg', $data['data']['photo'][0]); + $this->assertEquals('2017-06-07T17:14:40-07:00', $data['data']['published']); + $this->assertArrayNotHasKey('name', $data['data']); + } + public function testXKCD() { $url = 'http://xkcd.com/1810/'; $response = $this->parse(['url' => $url]); diff --git a/tests/data/hacker-news.firebaseio.com/v0_item_14516538.json b/tests/data/hacker-news.firebaseio.com/v0_item_14516538.json new file mode 100644 index 0000000..47b64cc --- /dev/null +++ b/tests/data/hacker-news.firebaseio.com/v0_item_14516538.json @@ -0,0 +1,11 @@ +HTTP/1.1 200 OK +Server: nginx +Date: Thu, 08 Jun 2017 21:28:24 GMT +Content-Type: application/json; charset=utf-8 +Content-Length: 949 +Connection: keep-alive +Access-Control-Allow-Origin: * +Cache-Control: no-cache +Strict-Transport-Security: max-age=31556926; includeSubDomains; preload + +{"by":"vkb","descendants":51,"id":14516538,"kids":[14516923,14517320,14517322,14517224,14516999,14516850,14517290,14516926,14516808,14517088,14517137,14516981,14516706,14517080,14517055,14516805,14516785,14516890,14517104,14516723,14516853,14517094],"score":84,"text":"There have been many, many posts about how toxic advertising and Facebook are (I've written many myself[1][2][3]) for our internet ecosystem today.

What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?

[1]http://veekaybee.github.io/facebook-is-collecting-this/\n[2]http://veekaybee.github.io/content-is-dead/\n[3] http://veekaybee.github.io/who-is-doing-this-to-my-internet/","time":1496950332,"title":"What are we doing about Facebook, Google, and the closed internet?","type":"story"} \ No newline at end of file diff --git a/tests/data/hacker-news.firebaseio.com/v0_item_14516923.json b/tests/data/hacker-news.firebaseio.com/v0_item_14516923.json new file mode 100644 index 0000000..21c8119 --- /dev/null +++ b/tests/data/hacker-news.firebaseio.com/v0_item_14516923.json @@ -0,0 +1,11 @@ +HTTP/1.1 200 OK +Server: nginx +Date: Fri, 09 Jun 2017 14:30:19 GMT +Content-Type: application/json; charset=utf-8 +Content-Length: 1701 +Connection: keep-alive +Access-Control-Allow-Origin: * +Cache-Control: no-cache +Strict-Transport-Security: max-age=31556926; includeSubDomains; preload + +{"by":"aaronpk","id":14516923,"kids":[14517124,14517655,14516983,14518902,14518663],"parent":14516538,"text":"I am a member of the W3C Social Web Working Group (https://www.w3.org/wiki/Socialwg), and have been organizing IndieWebCamp (https://indieweb.org/) conferences in this space for the last 7 years. We've been making a lot of progress:

* https://www.w3.org/TR/webmention/ - cross-site commenting

* https://www.w3.org/TR/micropub/ - API for apps to create posts on various servers

* https://www.w3.org/TR/websub/ - realtime subscriptions to feeds

* More: https://indieweb.org/specs

We focus on making sure there are a plurality of implementations and approaches rather than trying to build a single software solution to solve everything.

Try commenting on my copy of this post on my website by sending me a webmention! https://aaronparecki.com/2017/06/08/9/indieweb","time":1496953400,"type":"comment"} \ No newline at end of file diff --git a/tests/data/source.example.com/checkin b/tests/data/source.example.com/checkin new file mode 100644 index 0000000..53150d9 --- /dev/null +++ b/tests/data/source.example.com/checkin @@ -0,0 +1,28 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

+ at DreamHost +
+ 45.518716 + -122.679614 +
+
+

Homebrew Website Club!

+ + + + + + + diff --git a/tests/data/source.example.com/checkin-url b/tests/data/source.example.com/checkin-url new file mode 100644 index 0000000..9621917 --- /dev/null +++ b/tests/data/source.example.com/checkin-url @@ -0,0 +1,24 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + + + at DreamHost + +

Homebrew Website Club!

+ + + + + + +