From d0de523746f33063ecd42251eb3c30241400ba45 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Thu, 8 Jun 2017 16:33:51 -0700 Subject: [PATCH] add hackernews support closes #40 --- lib/XRay/Fetcher.php | 5 ++ lib/XRay/Formats/Format.php | 49 +++++++++++ lib/XRay/Formats/Hackernews.php | 84 +++++++++++++++++++ lib/XRay/Formats/Mf2.php | 57 ++----------- lib/XRay/Parser.php | 4 + tests/HackernewsTest.php | 42 ++++++++++ .../v0_item_14516538.json | 11 +++ 7 files changed, 204 insertions(+), 48 deletions(-) create mode 100644 lib/XRay/Formats/Hackernews.php create mode 100644 tests/HackernewsTest.php create mode 100644 tests/data/hacker-news.firebaseio.com/v0_item_14516538.json diff --git a/lib/XRay/Fetcher.php b/lib/XRay/Fetcher.php index 8139cf8..608baea 100644 --- a/lib/XRay/Fetcher.php +++ b/lib/XRay/Fetcher.php @@ -48,6 +48,11 @@ class Fetcher { return $this->_fetch_github($url, $opts); } + // Check if this is a Hackernews URL and use the API + if(Formats\Hackernews::matches($url)) { + return Formats\Hackernews::fetch($this->http, $url, $opts); + } + // All other URLs are fetched normally // Special-case appspot.com URLs to not follow redirects. diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php index 47e9625..0c94101 100644 --- a/lib/XRay/Formats/Format.php +++ b/lib/XRay/Formats/Format.php @@ -2,6 +2,7 @@ namespace p3k\XRay\Formats; use DOMDocument, DOMXPath; +use HTMLPurifier, HTMLPurifier_Config; interface iFormat { @@ -33,4 +34,52 @@ abstract class Format implements iFormat { return [$doc, $xpath]; } + protected static function sanitizeHTML($html) { + $config = HTMLPurifier_Config::createDefault(); + $config->set('Cache.DefinitionImpl', null); + $config->set('HTML.AllowedElements', [ + 'a', + 'abbr', + 'b', + 'code', + 'del', + 'em', + 'i', + 'img', + 'q', + 'strike', + 'strong', + 'time', + 'blockquote', + 'pre', + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'ul', + 'li', + 'ol' + ]); + $def = $config->getHTMLDefinition(true); + $def->addElement( + 'time', + 'Inline', + 'Inline', + 'Common', + [ + 'datetime' => 'Text' + ] + ); + // Override the allowed classes to only support Microformats2 classes + $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2()); + $purifier = new HTMLPurifier($config); + $sanitized = $purifier->purify($html); + $sanitized = str_replace(" ","\r",$sanitized); + return $sanitized; + } + + } diff --git a/lib/XRay/Formats/Hackernews.php b/lib/XRay/Formats/Hackernews.php new file mode 100644 index 0000000..780683d --- /dev/null +++ b/lib/XRay/Formats/Hackernews.php @@ -0,0 +1,84 @@ +get('https://hacker-news.firebaseio.com/v0/item/'.$match[1].'.json'); + if($response['code'] != 200) { + return [ + 'error' => 'hackernews_error', + 'error_description' => $response['body'], + 'code' => $response['code'], + ]; + } + + return [ + 'url' => $url, + 'body' => $response['body'], + 'code' => $response['code'], + ]; + } + + public static function parse($json, $url) { + $data = @json_decode($json, true); + + if(!$data) + return self::_unknown(); + + $match = self::matches($url); + + $date = DateTime::createFromFormat('U', $data['time']); + + // Start building the h-entry + $entry = array( + 'type' => 'entry', + 'url' => $url, + 'author' => [ + 'type' => 'card', + 'name' => $data['by'], + 'photo' => null, + 'url' => 'https://news.ycombinator.com/user?id='.$data['by'] + ], + 'published' => $date->format('c') + ); + + if(isset($data['title'])) { + $entry['name'] = $data['title']; + } + + if(isset($data['text'])) { + $htmlContent = trim(self::sanitizeHTML($data['text'])); + $textContent = str_replace('

', "

\n

", $htmlContent); + $textContent = strip_tags($textContent); + $entry['content'] = [ + 'html' => $htmlContent, + 'text' => $textContent + ]; + } + + return [ + 'data' => $entry, + 'original' => $json + ]; + } + +} diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index c25d9a3..4df9349 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -3,7 +3,15 @@ namespace p3k\XRay\Formats; use HTMLPurifier, HTMLPurifier_Config; -class Mf2 { +class Mf2 extends Format { + + public static function matches_host($url) { + return true; + } + + public static function matches($url) { + return true; + } public static function parse($mf2, $url, $http) { if(count($mf2['items']) == 0) @@ -655,53 +663,6 @@ class Mf2 { return $author; } - private static function sanitizeHTML($html) { - $config = HTMLPurifier_Config::createDefault(); - $config->set('Cache.DefinitionImpl', null); - $config->set('HTML.AllowedElements', [ - 'a', - 'abbr', - 'b', - 'code', - 'del', - 'em', - 'i', - 'img', - 'q', - 'strike', - 'strong', - 'time', - 'blockquote', - 'pre', - 'p', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'ul', - 'li', - 'ol' - ]); - $def = $config->getHTMLDefinition(true); - $def->addElement( - 'time', - 'Inline', - 'Inline', - 'Common', - [ - 'datetime' => 'Text' - ] - ); - // Override the allowed classes to only support Microformats2 classes - $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2()); - $purifier = new HTMLPurifier($config); - $sanitized = $purifier->purify($html); - $sanitized = str_replace(" ","\r",$sanitized); - return $sanitized; - } - private static function hasNumericKeys(array $arr) { foreach($arr as $key=>$val) if (is_numeric($key)) diff --git a/lib/XRay/Parser.php b/lib/XRay/Parser.php index 639aba7..bb0101a 100644 --- a/lib/XRay/Parser.php +++ b/lib/XRay/Parser.php @@ -34,6 +34,10 @@ class Parser { return Formats\XKCD::parse($body, $url); } + if(Formats\Hackernews::matches($url)) { + return Formats\Hackernews::parse($body, $url); + } + // No special parsers matched, parse for Microformats now return Formats\HTML::parse($this->http, $body, $url, $opts); } diff --git a/tests/HackernewsTest.php b/tests/HackernewsTest.php new file mode 100644 index 0000000..71cd9d4 --- /dev/null +++ b/tests/HackernewsTest.php @@ -0,0 +1,42 @@ +client = new Parse(); + $this->client->http = new p3k\HTTP\Test(dirname(__FILE__).'/data/'); + $this->client->mc = null; + } + + private function parse($params) { + $request = new Request($params); + $response = new Response(); + return $this->client->parse($request, $response); + } + + public function testSubmission() { + $url = 'https://news.ycombinator.com/item?id=14516538'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('entry', $data['data']['type']); + $this->assertEquals('2017-06-08T19:32:12+00:00', $data['data']['published']); + $this->assertEquals('vkb', $data['data']['author']['name']); + $this->assertEquals('https://news.ycombinator.com/user?id=vkb', $data['data']['author']['url']); + $this->assertEquals('What are we doing about Facebook, Google, and the closed internet?', $data['data']['name']); + $this->assertEquals('There have been many, many posts about how toxic advertising and Facebook are (I\'ve written many myself[1][2][3]) for our internet ecosystem today.

What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?

[1]http://veekaybee.github.io/facebook-is-collecting-this/ +[2]http://veekaybee.github.io/content-is-dead/ +[3] http://veekaybee.github.io/who-is-doing-this-to-my-internet/

', $data['data']['content']['html']); + } + + + +} + diff --git a/tests/data/hacker-news.firebaseio.com/v0_item_14516538.json b/tests/data/hacker-news.firebaseio.com/v0_item_14516538.json new file mode 100644 index 0000000..47b64cc --- /dev/null +++ b/tests/data/hacker-news.firebaseio.com/v0_item_14516538.json @@ -0,0 +1,11 @@ +HTTP/1.1 200 OK +Server: nginx +Date: Thu, 08 Jun 2017 21:28:24 GMT +Content-Type: application/json; charset=utf-8 +Content-Length: 949 +Connection: keep-alive +Access-Control-Allow-Origin: * +Cache-Control: no-cache +Strict-Transport-Security: max-age=31556926; includeSubDomains; preload + +{"by":"vkb","descendants":51,"id":14516538,"kids":[14516923,14517320,14517322,14517224,14516999,14516850,14517290,14516926,14516808,14517088,14517137,14516981,14516706,14517080,14517055,14516805,14516785,14516890,14517104,14516723,14516853,14517094],"score":84,"text":"There have been many, many posts about how toxic advertising and Facebook are (I've written many myself[1][2][3]) for our internet ecosystem today.

What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?

[1]http://veekaybee.github.io/facebook-is-collecting-this/\n[2]http://veekaybee.github.io/content-is-dead/\n[3] http://veekaybee.github.io/who-is-doing-this-to-my-internet/","time":1496950332,"title":"What are we doing about Facebook, Google, and the closed internet?","type":"story"} \ No newline at end of file