Browse Source

add hackernews support

closes #40
pull/49/head
Aaron Parecki 4 years ago
parent
commit
d0de523746
No known key found for this signature in database GPG Key ID: 276C2817346D6056
7 changed files with 204 additions and 48 deletions
  1. +5
    -0
      lib/XRay/Fetcher.php
  2. +49
    -0
      lib/XRay/Formats/Format.php
  3. +84
    -0
      lib/XRay/Formats/Hackernews.php
  4. +9
    -48
      lib/XRay/Formats/Mf2.php
  5. +4
    -0
      lib/XRay/Parser.php
  6. +42
    -0
      tests/HackernewsTest.php
  7. +11
    -0
      tests/data/hacker-news.firebaseio.com/v0_item_14516538.json

+ 5
- 0
lib/XRay/Fetcher.php View File

@ -48,6 +48,11 @@ class Fetcher {
return $this->_fetch_github($url, $opts);
}
// Check if this is a Hackernews URL and use the API
if(Formats\Hackernews::matches($url)) {
return Formats\Hackernews::fetch($this->http, $url, $opts);
}
// All other URLs are fetched normally
// Special-case appspot.com URLs to not follow redirects.

+ 49
- 0
lib/XRay/Formats/Format.php View File

@ -2,6 +2,7 @@
namespace p3k\XRay\Formats;
use DOMDocument, DOMXPath;
use HTMLPurifier, HTMLPurifier_Config;
interface iFormat {
@ -33,4 +34,52 @@ abstract class Format implements iFormat {
return [$doc, $xpath];
}
/**
 * Runs an HTML fragment through HTMLPurifier, keeping only a fixed
 * whitelist of elements and restricting class attributes via the
 * Microformats2 attribute definition.
 *
 * @param string $html  the HTML fragment to clean
 * @return string       the purified HTML
 */
protected static function sanitizeHTML($html) {
  $config = HTMLPurifier_Config::createDefault();
  // Disable the on-disk definition cache
  $config->set('Cache.DefinitionImpl', null);
  $config->set('HTML.AllowedElements', [
    'a',
    'abbr',
    'b',
    'code',
    'del',
    'em',
    'i',
    'img',
    'q',
    'strike',
    'strong',
    'time',
    'blockquote',
    'pre',
    'p',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'ul',
    'li',
    'ol'
  ]);

  // Register <time datetime="..."> as an inline element on the raw definition
  $def = $config->getHTMLDefinition(true);
  $def->addElement(
    'time',
    'Inline',
    'Inline',
    'Common',
    [
      'datetime' => 'Text'
    ]
  );

  // Override the allowed classes to only support Microformats2 classes
  $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());

  $purifier = new HTMLPurifier($config);
  $sanitized = $purifier->purify($html);

  // Turn the &#xD; character reference in the purified output back into a
  // literal carriage return. The search string must be the entity itself,
  // not a raw line break, otherwise every newline would be rewritten.
  $sanitized = str_replace("&#xD;", "\r", $sanitized);

  return $sanitized;
}
}

+ 84
- 0
lib/XRay/Formats/Hackernews.php View File

@ -0,0 +1,84 @@
<?php
namespace p3k\XRay\Formats;
use DateTime, DateTimeZone;
use Config;
use cebe\markdown\GithubMarkdown;
/**
 * Format handler for Hacker News item pages.
 *
 * Item URLs of the form news.ycombinator.com/item?id=N are fetched as JSON
 * from the Hacker News Firebase API and converted into an entry array.
 */
class Hackernews extends Format {

  /**
   * Reports whether the URL's host is news.ycombinator.com.
   *
   * @param string $url
   * @return bool
   */
  public static function matches_host($url) {
    $host = parse_url($url, PHP_URL_HOST);
    return $host === 'news.ycombinator.com';
  }

  /**
   * Tests whether the URL is a Hacker News item URL.
   *
   * The pattern is anchored at both ends so that a URL which merely ends
   * with an item URL (for example inside another site's query string) is
   * not treated as a Hacker News item.
   *
   * @param string $url
   * @return array|false  the preg_match result ([1] is the numeric item ID),
   *                      or false when the URL does not match
   */
  public static function matches($url) {
    if(preg_match('~^https?://news\.ycombinator\.com/item\?id=(\d+)$~', $url, $match))
      return $match;
    else
      return false;
  }

  /**
   * Fetches the item's JSON from the Hacker News API.
   *
   * Callers are expected to have checked matches($url) first; the item ID
   * is taken from that match.
   *
   * @param object $http  HTTP client exposing get($url), which returns an
   *                      array with 'code' and 'body' keys
   * @param string $url   the Hacker News item URL
   * @param array  $opts  unused here; kept for signature parity with callers
   * @return array  on success: 'url', 'body', 'code';
   *                on a non-200 response: 'error', 'error_description', 'code'
   */
  public static function fetch($http, $url, $opts) {
    $match = self::matches($url);

    $response = $http->get('https://hacker-news.firebaseio.com/v0/item/'.$match[1].'.json');

    if($response['code'] != 200) {
      return [
        'error' => 'hackernews_error',
        'error_description' => $response['body'],
        'code' => $response['code'],
      ];
    }

    return [
      'url' => $url,
      'body' => $response['body'],
      'code' => $response['code'],
    ];
  }

  /**
   * Converts the API's JSON for one item into an entry array.
   *
   * @param string $json  raw JSON body returned by the API
   * @param string $url   the Hacker News item URL, used as the entry URL
   * @return array  ['data' => entry, 'original' => $json], or the result of
   *                self::_unknown() when the JSON cannot be used
   */
  public static function parse($json, $url) {
    $data = json_decode($json, true);

    // The entry needs an author and a timestamp. Anything that is not a JSON
    // object, or lacks these fields (NOTE(review): the API documents deleted
    // items, which presumably omit "by" - confirm), is reported as unknown
    // rather than triggering undefined-index errors below.
    if(!is_array($data) || !isset($data['time']) || !isset($data['by']))
      return self::_unknown();

    // createFromFormat() returns false when the value is not a Unix timestamp
    $date = DateTime::createFromFormat('U', $data['time']);
    if($date === false)
      return self::_unknown();

    // Start building the h-entry
    $entry = [
      'type' => 'entry',
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => $data['by'],
        'photo' => null,
        'url' => 'https://news.ycombinator.com/user?id='.$data['by']
      ],
      'published' => $date->format('c')
    ];

    if(isset($data['title'])) {
      $entry['name'] = $data['title'];
    }

    if(isset($data['text'])) {
      $htmlContent = trim(self::sanitizeHTML($data['text']));
      // Put a line break between paragraphs before the tags are stripped
      $textContent = str_replace('</p><p>', "</p>\n<p>", $htmlContent);
      $textContent = strip_tags($textContent);
      $entry['content'] = [
        'html' => $htmlContent,
        'text' => $textContent
      ];
    }

    return [
      'data' => $entry,
      'original' => $json
    ];
  }

}

+ 9
- 48
lib/XRay/Formats/Mf2.php View File

@ -3,7 +3,15 @@ namespace p3k\XRay\Formats;
use HTMLPurifier, HTMLPurifier_Config;
class Mf2 {
class Mf2 extends Format {
/**
 * Host check for this format. Always returns true: no restriction is
 * placed on which host a URL comes from.
 */
public static function matches_host($url) {
return true;
}
/**
 * URL check for this format. Always returns true, so every URL is
 * considered a match.
 */
public static function matches($url) {
return true;
}
public static function parse($mf2, $url, $http) {
if(count($mf2['items']) == 0)
@ -655,53 +663,6 @@ class Mf2 {
return $author;
}
/**
 * Cleans an HTML fragment with HTMLPurifier using a fixed element
 * whitelist and the Microformats2 class attribute definition.
 *
 * @param string $html  markup to purify
 * @return string       purified markup
 */
private static function sanitizeHTML($html) {
  $allowedElements = [
    'a', 'abbr', 'b', 'code', 'del', 'em', 'i', 'img',
    'q', 'strike', 'strong', 'time', 'blockquote', 'pre', 'p',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'ul', 'li', 'ol'
  ];

  $settings = HTMLPurifier_Config::createDefault();
  $settings->set('Cache.DefinitionImpl', null);
  $settings->set('HTML.AllowedElements', $allowedElements);

  // Teach the purifier about <time datetime="...">
  $definition = $settings->getHTMLDefinition(true);
  $timeAttributes = ['datetime' => 'Text'];
  $definition->addElement('time', 'Inline', 'Inline', 'Common', $timeAttributes);

  // Override the allowed classes to only support Microformats2 classes
  $definition->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());

  $cleaner = new HTMLPurifier($settings);

  // Swap the &#xD; character reference in the output for a literal carriage return
  return str_replace("&#xD;", "\r", $cleaner->purify($html));
}
private static function hasNumericKeys(array $arr) {
foreach($arr as $key=>$val)
if (is_numeric($key))

+ 4
- 0
lib/XRay/Parser.php View File

@ -34,6 +34,10 @@ class Parser {
return Formats\XKCD::parse($body, $url);
}
if(Formats\Hackernews::matches($url)) {
return Formats\Hackernews::parse($body, $url);
}
// No special parsers matched, parse for Microformats now
return Formats\HTML::parse($this->http, $body, $url, $opts);
}

+ 42
- 0
tests/HackernewsTest.php View File

@ -0,0 +1,42 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * Exercises the Parse endpoint with a Hacker News item URL and checks the
 * entry built from the API response.
 */
class HackernewsTest extends PHPUnit_Framework_TestCase {

  /** @var Parse the controller under test, rebuilt for every test in setUp() */
  private $client;

  /**
   * Builds a Parse controller whose HTTP client is p3k\HTTP\Test pointed at
   * the tests/data/ directory (presumably serving the recorded responses
   * stored there instead of making network requests), with $mc set to null.
   */
  public function setUp() {
    $this->client = new Parse();
    $this->client->http = new p3k\HTTP\Test(dirname(__FILE__).'/data/');
    $this->client->mc = null;
  }

  /**
   * Sends the given parameters through the controller's parse() method.
   *
   * @param array $params  request parameters, e.g. ['url' => ...]
   * @return Response      the value returned by the controller
   */
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->parse($request, $response);
  }

  /**
   * A story submission yields an entry with author, publish date, title
   * and sanitized HTML content.
   */
  public function testSubmission() {
    $url = 'https://news.ycombinator.com/item?id=14516538';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body, true);

    $this->assertEquals('entry', $data['data']['type']);
    $this->assertEquals('2017-06-08T19:32:12+00:00', $data['data']['published']);
    $this->assertEquals('vkb', $data['data']['author']['name']);
    $this->assertEquals('https://news.ycombinator.com/user?id=vkb', $data['data']['author']['url']);
    $this->assertEquals('What are we doing about Facebook, Google, and the closed internet?', $data['data']['name']);
    // The expected string contains literal newlines, so its continuation
    // lines must stay flush left.
    $this->assertEquals('There have been many, many posts about how toxic advertising and Facebook are (I\'ve written many myself[1][2][3]) for our internet ecosystem today.<p>What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?</p><p>[1]http://veekaybee.github.io/facebook-is-collecting-this/
[2]http://veekaybee.github.io/content-is-dead/
[3] http://veekaybee.github.io/who-is-doing-this-to-my-internet/</p>', $data['data']['content']['html']);
  }

}

+ 11
- 0
tests/data/hacker-news.firebaseio.com/v0_item_14516538.json View File

@ -0,0 +1,11 @@
HTTP/1.1 200 OK
Server: nginx
Date: Thu, 08 Jun 2017 21:28:24 GMT
Content-Type: application/json; charset=utf-8
Content-Length: 949
Connection: keep-alive
Access-Control-Allow-Origin: *
Cache-Control: no-cache
Strict-Transport-Security: max-age=31556926; includeSubDomains; preload
{"by":"vkb","descendants":51,"id":14516538,"kids":[14516923,14517320,14517322,14517224,14516999,14516850,14517290,14516926,14516808,14517088,14517137,14516981,14516706,14517080,14517055,14516805,14516785,14516890,14517104,14516723,14516853,14517094],"score":84,"text":"There have been many, many posts about how toxic advertising and Facebook are (I&#x27;ve written many myself[1][2][3]) for our internet ecosystem today.<p>What projects or companies are you working on to combat filter bubbles, walled gardens, emotional manipulation, and the like, and how can the HN community help you in your goals?<p>[1]http:&#x2F;&#x2F;veekaybee.github.io&#x2F;facebook-is-collecting-this&#x2F;\n[2]http:&#x2F;&#x2F;veekaybee.github.io&#x2F;content-is-dead&#x2F;\n[3] http:&#x2F;&#x2F;veekaybee.github.io&#x2F;who-is-doing-this-to-my-internet&#x2F;","time":1496950332,"title":"What are we doing about Facebook, Google, and the closed internet?","type":"story"}

Loading…
Cancel
Save