Browse Source

parse XKCD comics

skip image alt text for now

closes #34
pull/39/head
Aaron Parecki 7 years ago
parent
commit
63ab3031a3
No known key found for this signature in database GPG Key ID: 276C2817346D6056
8 changed files with 241 additions and 1 deletion
  1. +1
    -0
      composer.json
  2. +1
    -0
      config.production.php
  3. +1
    -0
      config.template.php
  4. +7
    -0
      controllers/Parse.php
  5. +0
    -1
      lib/Formats/Twitter.php
  6. +76
    -0
      lib/Formats/XKCD.php
  7. +13
    -0
      tests/ParseTest.php
  8. +142
    -0
      tests/data/xkcd.com/1810_

+ 1
- 0
composer.json View File

@ -22,6 +22,7 @@
"lib/Formats/Mf2.php", "lib/Formats/Mf2.php",
"lib/Formats/Instagram.php", "lib/Formats/Instagram.php",
"lib/Formats/Twitter.php", "lib/Formats/Twitter.php",
"lib/Formats/XKCD.php",
"lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php" "lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php"
] ]
}, },

+ 1
- 0
config.production.php View File

@ -1,5 +1,6 @@
<?php <?php
class Config { class Config {
public static $base = 'https://xray.p3k.io';
public static $cache = true; public static $cache = true;
public static $admins = [ public static $admins = [
'https://aaronparecki.com/' 'https://aaronparecki.com/'

+ 1
- 0
config.template.php View File

@ -1,5 +1,6 @@
<?php <?php
class Config { class Config {
public static $base = 'http://xray.dev';
public static $cache = false; public static $cache = false;
public static $admins = [ public static $admins = [
'https://you.example.com/' 'https://you.example.com/'

+ 7
- 0
controllers/Parse.php View File

@ -179,6 +179,13 @@ class Parse {
return $this->respond($response, 200, $data); return $this->respond($response, 200, $data);
} }
if($host == 'xkcd.com') {
$data = Formats\XKCD::parse($result['body'], $url);
$data['url'] = $result['url'];
$data['code'] = $result['code'];
return $this->respond($response, 200, $data);
}
// attempt to parse the page as HTML // attempt to parse the page as HTML
$doc = new DOMDocument(); $doc = new DOMDocument();
@$doc->loadHTML(self::toHtmlEntities($result['body'])); @$doc->loadHTML(self::toHtmlEntities($result['body']));

+ 0
- 1
lib/Formats/Twitter.php View File

@ -1,7 +1,6 @@
<?php <?php
namespace XRay\Formats; namespace XRay\Formats;
use DOMDocument, DOMXPath;
use DateTime, DateTimeZone; use DateTime, DateTimeZone;
use Parse; use Parse;

+ 76
- 0
lib/Formats/XKCD.php View File

@ -0,0 +1,76 @@
<?php
namespace XRay\Formats;
use DOMDocument, DOMXPath;
use DateTime, DateTimeZone;
use Parse, Config;
/**
 * Turns the HTML of an xkcd.com comic page into an XRay "entry" structure.
 *
 * The comic title is read from the element with id "ctitle" and the comic
 * image from the single <img> directly inside <div id="comic">. Whenever the
 * page does not have that shape, the result is an "unknown" response instead
 * of a partially filled entry.
 */
class XKCD {

  /**
   * Parse a comic page.
   *
   * @param string $html Raw HTML body of the page.
   * @param string $url  URL reported as the entry URL; also the base used to
   *                     resolve the (possibly protocol-relative) image URL.
   * @return array ['data' => [...entry...]] on success, or
   *               ['data' => ['type' => 'unknown']] when the page cannot be
   *               interpreted as a comic.
   */
  public static function parse($html, $url) {
    list($doc, $xpath) = self::_loadHTML($html);

    if(!$doc)
      return self::_unknown();

    // getElementById() returns null when the title element is absent; bail
    // out instead of dereferencing null.
    $title = $doc->getElementById('ctitle');
    if(!$title)
      return self::_unknown();

    // Exactly one image is expected; anything else is not treated as a comic.
    $images = $xpath->query("//div[@id='comic']/img");
    if(!$images || $images->length !== 1)
      return self::_unknown();

    $img = $images->item(0);

    // Prefer the first srcset candidate, falling back to the plain src.
    $src = self::_firstSrcsetUrl($img->getAttribute('srcset'));
    if($src === null)
      $src = trim($img->getAttribute('src'));

    // Without any image URL there is nothing meaningful to report.
    if($src === '')
      return self::_unknown();

    // The entry is assembled only after validation so that the failure paths
    // above never touch Config or the URL resolver.
    return [
      'data' => [
        'type' => 'entry',
        'url' => $url,
        'author' => [
          'type' => 'card',
          'name' => 'XKCD',
          'photo' => Config::$base.'/images/xkcd.png',
          'url' => 'https://xkcd.com/'
        ],
        'name' => $title->nodeValue,
        'photo' => [\Mf2\resolveUrl($url, $src)]
      ]
    ];
  }

  /**
   * Extract the URL of the first candidate in a srcset attribute value.
   *
   * Candidates are split on commas, as the original implementation did, so a
   * URL that itself contains a comma is not supported.
   *
   * @param string $srcset Raw srcset attribute value (may be empty).
   * @return string|null The first URL, or null when none is present.
   */
  private static function _firstSrcsetUrl($srcset) {
    $candidate = explode(',', $srcset)[0];

    if(preg_match('/\S+/', $candidate, $match))
      return $match[0];

    return null;
  }

  /**
   * Response used whenever the page cannot be parsed as a comic.
   *
   * @return array
   */
  private static function _unknown() {
    return [
      'data' => [
        'type' => 'unknown'
      ]
    ];
  }

  /**
   * Load an HTML string into a DOM document and a matching XPath helper.
   *
   * @param string $html
   * @return array [DOMDocument, DOMXPath] on success, [null, null] on failure.
   */
  private static function _loadHTML($html) {
    // An empty source makes loadHTML() fail outright (a ValueError on PHP 8),
    // so reject it before calling into libxml.
    if(!is_string($html) || trim($html) === '')
      return [null, null];

    // Collect libxml parse warnings internally rather than silencing with @,
    // then restore whatever error mode the caller had.
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    // The DOMDocument object itself is always truthy; the return value of
    // loadHTML() is what signals failure.
    if(!$loaded)
      return [null, null];

    return [$doc, new DOMXPath($doc)];
  }
}

+ 13
- 0
tests/ParseTest.php View File

@ -501,4 +501,17 @@ class ParseTest extends PHPUnit_Framework_TestCase {
$this->assertFalse($data['info']['found_fragment']); $this->assertFalse($data['info']['found_fragment']);
} }
/**
 * Requesting an xkcd comic URL yields an entry carrying the comic's URL,
 * its title, and a photo list that includes the 2x image URL.
 */
public function testXKCD() {
  $comicUrl = 'http://xkcd.com/1810/';

  $response = $this->parse(['url' => $comicUrl]);
  $json = $response->getContent();

  $this->assertEquals(200, $response->getStatusCode());

  $parsed = json_decode($json, true);

  // Core entry fields
  $this->assertEquals('entry', $parsed['data']['type']);
  $this->assertEquals($comicUrl, $parsed['data']['url']);
  $this->assertEquals('Chat Systems', $parsed['data']['name']);

  // The photo list must contain the 2x image
  $this->assertContains(
    'http://imgs.xkcd.com/comics/chat_systems_2x.png',
    $parsed['data']['photo']
  );
}
} }

+ 142
- 0
tests/data/xkcd.com/1810_ View File

@ -0,0 +1,142 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2017 03:29:14 GMT
Content-Type: application/json
Connection: keep-alive
<!DOCTYPE html>
<html>
<head>
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-25700708-7', 'auto');
ga('send', 'pageview');
</script>
<link rel="stylesheet" type="text/css" href="/s/b0dcca.css" title="Default"/>
<title>xkcd: Chat Systems</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge"/>
<link rel="shortcut icon" href="/s/919f27.ico" type="image/x-icon"/>
<link rel="icon" href="/s/919f27.ico" type="image/x-icon"/>
<link rel="alternate" type="application/atom+xml" title="Atom 1.0" href="/atom.xml"/>
<link rel="alternate" type="application/rss+xml" title="RSS 2.0" href="/rss.xml"/>
<script type="text/javascript" src="/s/b66ed7.js" async></script>
<script type="text/javascript" src="/s/1b9456.js" async></script>
</head>
<body>
<div id="topContainer">
<div id="topLeft">
<ul>
<li><a href="/archive">Archive</a></li>
<li><a href="http://what-if.xkcd.com">What If?</a></li>
<li><a href="http://blag.xkcd.com">Blag</a></li>
<li><a href="http://store.xkcd.com/">Store</a></li>
<li><a rel="author" href="/about">About</a></li>
</ul>
</div>
<div id="topRight">
<div id="masthead">
<span><a href="/"><img src="/s/0b7742.png" alt="xkcd.com logo" height="83" width="185"/></a></span>
<span id="slogan">A webcomic of romance,<br/> sarcasm, math, and language.</span>
</div>
<div id="news">
There are <a href="https://store.xkcd.com/collections/featured">four new shirts</a> in <a href="https://store.xkcd.com/">the xkcd store</a>,<br />
along with <a href="https://store.xkcd.com/collections/posters">posters</a> and <a href="https://store.xkcd.com/collections/everything">lots of other stuff</a>!
</div>
</div>
<div id="bgLeft" class="bg box"></div>
<div id="bgRight" class="bg box"></div>
</div>
<div id="middleContainer" class="box">
<div id="ctitle">Chat Systems</div>
<ul class="comicNav">
<li><a href="/1/">|&lt;</a></li>
<li><a rel="prev" href="/1809/" accesskey="p">&lt; Prev</a></li>
<li><a href="//c.xkcd.com/random/comic/">Random</a></li>
<li><a rel="next" href="/1811/" accesskey="n">Next &gt;</a></li>
<li><a href="/">&gt;|</a></li>
</ul>
<div id="comic">
<img src="//imgs.xkcd.com/comics/chat_systems.png" title="I&#39;m one of the few Instagram users who connects solely through the Unix &#39;talk&#39; gateway." alt="Chat Systems" srcset="//imgs.xkcd.com/comics/chat_systems_2x.png 2x"/>
</div>
<ul class="comicNav">
<li><a href="/1/">|&lt;</a></li>
<li><a rel="prev" href="/1809/" accesskey="p">&lt; Prev</a></li>
<li><a href="//c.xkcd.com/random/comic/">Random</a></li>
<li><a rel="next" href="/1811/" accesskey="n">Next &gt;</a></li>
<li><a href="/">&gt;|</a></li>
</ul>
<br />
Permanent link to this comic: https://xkcd.com/1810/<br />
Image URL (for hotlinking/embedding): https://imgs.xkcd.com/comics/chat_systems.png
<div id="transcript" style="display: none"></div>
</div>
<div id="bottom" class="box">
<img src="//imgs.xkcd.com/s/a899e84.jpg" width="520" height="100" alt="Selected Comics" usemap="#comicmap"/>
<map id="comicmap" name="comicmap">
<area shape="rect" coords="0,0,100,100" href="/150/" alt="Grownups"/>
<area shape="rect" coords="104,0,204,100" href="/730/" alt="Circuit Diagram"/>
<area shape="rect" coords="208,0,308,100" href="/162/" alt="Angular Momentum"/>
<area shape="rect" coords="312,0,412,100" href="/688/" alt="Self-Description"/>
<area shape="rect" coords="416,0,520,100" href="/556/" alt="Alternative Energy Revolution"/>
</map>
<div>
<!--
Search comic titles and transcripts:
<script type="text/javascript" src="//www.google.com/jsapi"></script>
<script type="text/javascript">google.load('search', '1');google.setOnLoadCallback(function() {google.search.CustomSearchControl.attachAutoCompletion('012652707207066138651:zudjtuwe28q',document.getElementById('q'),'cse-search-box');});</script>
<form action="//www.google.com/cse" id="cse-search-box">
<div>
<input type="hidden" name="cx" value="012652707207066138651:zudjtuwe28q"/>
<input type="hidden" name="ie" value="UTF-8"/>
<input type="text" name="q" id="q" size="31"/>
<input type="submit" name="sa" value="Search"/>
</div>
</form>
<script type="text/javascript" src="//www.google.com/cse/brand?form=cse-search-box&amp;lang=en"></script>
-->
<a href="/rss.xml">RSS Feed</a> - <a href="/atom.xml">Atom Feed</a>
</div>
<br />
<div id="comicLinks">
Comics I enjoy:<br/>
<a href="http://threewordphrase.com/">Three Word Phrase</a>,
<a href="http://www.smbc-comics.com/">SMBC</a>,
<a href="http://www.qwantz.com">Dinosaur Comics</a>,
<a href="http://oglaf.com/">Oglaf</a> (nsfw),
<a href="http://www.asofterworld.com">A Softer World</a>,
<a href="http://buttersafe.com/">Buttersafe</a>,
<a href="http://pbfcomics.com/">Perry Bible Fellowship</a>,
<a href="http://questionablecontent.net/">Questionable Content</a>,
<a href="http://www.buttercupfestival.com/">Buttercup Festival</a>,
<a href="http://www.mspaintadventures.com/?s=6&p=001901">Homestuck</a>,
<a href="http://www.jspowerhour.com/">Junior Scientist Power Hour</a>
</div>
<br />
<div id="comicLinks">
Other things:<br/>
<a href="http://womenalsoknowstuff.com/">Women Also Know Stuff</a>,
<a href="https://techsolidarity.org/">Tech Solidarity</a>
</div>
<br />
<center>
<div id="footnote" style="width:70%">xkcd.com is best viewed with Netscape Navigator 4.0 or below on a Pentium 3&plusmn;1 emulated in Javascript on an Apple IIGS at a screen resolution of 1024x1. Please enable your ad blockers, disable high-heat drying, and remove your device from Airplane Mode and set it to Boat Mode. For security reasons, please leave caps lock on while browsing.</div>
</center>
<div id="licenseText">
<p>
This work is licensed under a
<a href="http://creativecommons.org/licenses/by-nc/2.5/">Creative Commons Attribution-NonCommercial 2.5 License</a>.
</p><p>
This means you're free to copy and share these comics (but not to sell them). <a rel="license" href="/license.html">More details</a>.</p>
</div>
</div>
</body>
<!-- Layout by Ian Clasbey, davean, and chromakode -->
</html>

Loading…
Cancel
Save