From 63ab3031a374a22195a3d101c2445ae3caeae9d3 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Sat, 15 Apr 2017 10:17:11 -0700 Subject: [PATCH] parse XKCD comics skip image alt text for now closes #34 --- composer.json | 1 + config.production.php | 1 + config.template.php | 1 + controllers/Parse.php | 7 ++ lib/Formats/Twitter.php | 1 - lib/Formats/XKCD.php | 76 ++++++++++++++++++++ tests/ParseTest.php | 13 ++++ tests/data/xkcd.com/1810_ | 142 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 lib/Formats/XKCD.php create mode 100644 tests/data/xkcd.com/1810_ diff --git a/composer.json b/composer.json index 0f60c3b..0ddc7e8 100644 --- a/composer.json +++ b/composer.json @@ -22,6 +22,7 @@ "lib/Formats/Mf2.php", "lib/Formats/Instagram.php", "lib/Formats/Twitter.php", + "lib/Formats/XKCD.php", "lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php" ] }, diff --git a/config.production.php b/config.production.php index 29c42a8..12a7e47 100644 --- a/config.production.php +++ b/config.production.php @@ -1,5 +1,6 @@ respond($response, 200, $data); } + if($host == 'xkcd.com') { + $data = Formats\XKCD::parse($result['body'], $url); + $data['url'] = $result['url']; + $data['code'] = $result['code']; + return $this->respond($response, 200, $data); + } + // attempt to parse the page as HTML $doc = new DOMDocument(); @$doc->loadHTML(self::toHtmlEntities($result['body'])); diff --git a/lib/Formats/Twitter.php b/lib/Formats/Twitter.php index 2c49900..5bc1fe8 100644 --- a/lib/Formats/Twitter.php +++ b/lib/Formats/Twitter.php @@ -1,7 +1,6 @@ 'entry', + 'url' => $url, + 'author' => [ + 'type' => 'card', + 'name' => 'XKCD', + 'photo' => Config::$base.'/images/xkcd.png', + 'url' => 'https://xkcd.com/' + ] + ]; + + $name = $doc->getElementById('ctitle'); + $entry['name'] = $name->nodeValue; + + $photo = $xpath->query("//div[@id='comic']/img"); + if($photo->length != 1) + return self::_unknown(); + + $photo = $photo->item(0); + $img1 = $photo->getAttribute('src'); + $img2 = $photo->getAttribute('srcset'); + if($img2) { + $img2 = explode(',', $img2)[0]; + if(preg_match('/([^ ]+)/', $img2, $match)) { + $img2 = $match[1]; + } + } + + $src = \Mf2\resolveUrl($url, $img2 ?: $img1); + + $entry['photo'] = [$src]; + + $response = [ + 'data' => $entry + ]; + + return $response; + } + + private static function _unknown() { + return [ + 'data' => [ + 'type' => 'unknown' + ] + ]; + } + + private static function _loadHTML($html) { + $doc = new DOMDocument(); + @$doc->loadHTML($html); + + if(!$doc) { + return [null, null]; + } + + $xpath = new DOMXPath($doc); + + return [$doc, $xpath]; + } + +} diff --git a/tests/ParseTest.php b/tests/ParseTest.php index ab2e773..579d07c 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -501,4 +501,17 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertFalse($data['info']['found_fragment']); } + public function testXKCD() { + $url = 'http://xkcd.com/1810/'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + $this->assertEquals('entry', $data['data']['type']); + $this->assertEquals('http://xkcd.com/1810/', $data['data']['url']); + $this->assertEquals('Chat Systems', $data['data']['name']); + $this->assertContains('http://imgs.xkcd.com/comics/chat_systems_2x.png', $data['data']['photo']); + } + } diff --git a/tests/data/xkcd.com/1810_ b/tests/data/xkcd.com/1810_ new file mode 100644 index 0000000..c2d2f87 --- /dev/null +++ b/tests/data/xkcd.com/1810_ @@ -0,0 +1,142 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2017 03:29:14 GMT +Content-Type: application/json +Connection: keep-alive + + + + + + +xkcd: Chat Systems + + + + + + + + + + +
+ +
+
+xkcd.com logo +A webcomic of romance,
sarcasm, math, and language.
+
+
+There are four new shirts in the xkcd store,
+along with posters and lots of other stuff! + +
+
+
+
+
+
+ +
Chat Systems
+ +
+Chat Systems +
+ +
+Permanent link to this comic: https://xkcd.com/1810/
+Image URL (for hotlinking/embedding): https://imgs.xkcd.com/comics/chat_systems.png + +
+
+Selected Comics + +Grownups +Circuit Diagram +Angular Momentum +Self-Description +Alternative Energy Revolution + +
+ +RSS Feed - Atom Feed +
+
+ +
+ +
+
+
xkcd.com is best viewed with Netscape Navigator 4.0 or below on a Pentium 3±1 emulated in Javascript on an Apple IIGS at a screen resolution of 1024x1. Please enable your ad blockers, disable high-heat drying, and remove your device from Airplane Mode and set it to Boat Mode. For security reasons, please leave caps lock on while browsing.
+
+
+

+This work is licensed under a +Creative Commons Attribution-NonCommercial 2.5 License. +

+This means you're free to copy and share these comics (but not to sell them). More details.

+
+
+ + + +