From 1f6de10aba154d1b44e9a24d1491dd7e9a4ddff5 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Sat, 9 Apr 2016 17:52:24 -0700 Subject: [PATCH] add tests for validating URL fields * fields that should be URLs will now be omitted if the value was not a URL, such as when the value is `javascript:alert()` * makes Mf2 class slightly more self-contained by duplicating the URL helper functions into it * fixes tests to not cache responses in memcache --- ...TMLPurifier_AttrDef_HTML_Microformats2.php | 3 +- lib/Formats/Mf2.php | 131 +++++++++++++----- tests/AuthorTest.php | 1 + tests/FeedTest.php | 1 + tests/SanitizeTest.php | 38 +++++ .../h-entry-with-email-author | 28 ++++ .../h-entry-with-javascript-urls | 44 ++++++ 7 files changed, 208 insertions(+), 38 deletions(-) create mode 100644 tests/data/sanitize.example/h-entry-with-email-author create mode 100644 tests/data/sanitize.example/h-entry-with-javascript-urls diff --git a/lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php b/lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php index d3d506f..0d910e5 100644 --- a/lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php +++ b/lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php @@ -1,9 +1,10 @@ [$v]], $u, $http); - if($ref) { - $refs[$u] = $ref['data']; + // If these are not objects, they must be URLs + $set = [ + 'normal' => ['category','invitee'], + 'url' => ['in-reply-to','like-of','repost-of','bookmark-of'] + ]; + foreach($set as $type=>$properties) { + foreach($properties as $p) { + if(array_key_exists($p, $item['properties'])) { + foreach($item['properties'][$p] as $v) { + if(is_string($v) && ($type == 'normal' || self::isURL($v))) { + if(!array_key_exists($p, $data)) $data[$p] = []; + $data[$p][] = $v; + } + elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) { + if(!array_key_exists($p, $data)) $data[$p] = []; + $data[$p][] = $u; + // parse the object and put the result in the "refs" object + $ref = self::parse(['items'=>[$v]], $u, $http); + if($ref) { + $refs[$u] = $ref['data']; + } } } - } - } + } + } } // Determine if the name is distinct from the content @@ -192,6 +209,7 @@ class Mf2 { if($htmlContent && $textContent != $htmlContent) { $data['content']['html'] = $htmlContent; } + // TODO: If no HTML content was included in the post, create HTML by autolinking? } if($author = self::findAuthor($mf2, $item, $http)) @@ -217,20 +235,29 @@ class Mf2 { // Single plaintext values $properties = ['name','summary','url','published','start','end','duration']; foreach($properties as $p) { - if($v = self::getPlaintext($item, $p)) - $data[$p] = $v; + if($v = self::getPlaintext($item, $p)) { + if($p == 'url') { + if(self::isURL($v)) + $data[$p] = $v; + } else { + $data[$p] = $v; + } + } } // Always arrays - $properties = ['photo','video','syndication']; + $properties = ['photo','video','audio','syndication']; foreach($properties as $p) { if(array_key_exists($p, $item['properties'])) { - $data[$p] = []; foreach($item['properties'][$p] as $v) { - if(is_string($v)) + if(is_string($v) && self::isURL($v)) { + if(!array_key_exists($p, $data)) $data[$p] = []; $data[$p][] = $v; - elseif(is_array($v) and array_key_exists('value', $v)) + } + elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) { + if(!array_key_exists($p, $data)) $data[$p] = []; $data[$p][] = $v['value']; + } } } } @@ -243,7 +270,7 @@ class Mf2 { foreach($item['properties'][$p] as $v) { if(is_string($v)) $data[$p][] = $v; - elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url'))) { + elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) { $data[$p][] = $u; // parse the object and put the result in the "refs" object $ref = self::parse(['items'=>[$v]], $u, $http); @@ -325,15 +352,25 @@ class Mf2 { // If there is a matching author URL, use that one $found = false; foreach($item['properties']['url'] as $url) { - $url = \normalize_url($url); - if($url == $authorURL) { - $data['url'] = $url; - $found = true; + if(self::isURL($url)) { + $url = self::normalize_url($url); + if($url == $authorURL) { + $data['url'] = $url; + $found = true; + } } } - if(!$found) $data['url'] = $item['properties']['url'][0]; + if(!$found && self::isURL($item['properties']['url'][0])) { + $data['url'] = $item['properties']['url'][0]; + } } else if($v = self::getPlaintext($item, $p)) { - $data[$p] = $v; + // Make sure the URL property is actually a URL + if($p == 'url' || $p == 'photo') { + if(self::isURL($v)) + $data[$p] = $v; + } else { + $data[$p] = $v; + } } } @@ -481,7 +518,7 @@ class Mf2 { ] ); // Override the allowed classes to only support Microformats2 classes - $def->manager->attrTypes->set('Class', new \HTMLPurifier_AttrDef_HTML_Microformats2()); + $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2()); $purifier = new HTMLPurifier($config); $sanitized = $purifier->purify($html); $sanitized = str_replace(" ","\r",$sanitized); @@ -566,4 +603,24 @@ class Mf2 { return \mf2\Parse($result['body'], $url); } + private static function normalize_url($url) { + $parts = parse_url($url); + if(empty($parts['path'])) + $parts['path'] = '/'; + $parts['host'] = strtolower($parts['host']); + return self::build_url($parts); + } + + private static function build_url($parsed_url) { + $scheme = isset($parsed_url['scheme']) ? $parsed_url['scheme'] . '://' : ''; + $host = isset($parsed_url['host']) ? $parsed_url['host'] : ''; + $port = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : ''; + $user = isset($parsed_url['user']) ? $parsed_url['user'] : ''; + $pass = isset($parsed_url['pass']) ? ':' . $parsed_url['pass'] : ''; + $pass = ($user || $pass) ? "$pass@" : ''; + $path = isset($parsed_url['path']) ? $parsed_url['path'] : ''; + $query = isset($parsed_url['query']) ? '?' . $parsed_url['query'] : ''; + $fragment = isset($parsed_url['fragment']) ? '#' . $parsed_url['fragment'] : ''; + return "$scheme$user$pass$host$port$path$query$fragment"; + } } diff --git a/tests/AuthorTest.php b/tests/AuthorTest.php index 280417d..5f5a20b 100644 --- a/tests/AuthorTest.php +++ b/tests/AuthorTest.php @@ -9,6 +9,7 @@ class AuthorTest extends PHPUnit_Framework_TestCase { public function setUp() { $this->client = new Parse(); $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/'); + $this->client->mc = null; } private function parse($params) { diff --git a/tests/FeedTest.php b/tests/FeedTest.php index 3f52a35..a7e5932 100644 --- a/tests/FeedTest.php +++ b/tests/FeedTest.php @@ -9,6 +9,7 @@ class FeedTest extends PHPUnit_Framework_TestCase { public function setUp() { $this->client = new Parse(); $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/'); + $this->client->mc = null; } private function parse($params) { diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index aff7d42..f7aa4a7 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -9,6 +9,7 @@ class SanitizeTest extends PHPUnit_Framework_TestCase { public function setUp() { $this->client = new Parse(); $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/'); + $this->client->mc = null; } private function parse($params) { @@ -113,4 +114,41 @@ class SanitizeTest extends PHPUnit_Framework_TestCase { $this->assertEquals('This content has some HTML escaped entities such as & ampersand, " quote, escaped <code> HTML tags, an ümlaut, an @at sign.', $data['data']['content']['html']); } + public function testSanitizeJavascriptURLs() { + $url = 'http://sanitize.example/h-entry-with-javascript-urls'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals('entry', $data['data']['type']); + $this->assertEquals('', $data['data']['author']['url']); + $this->assertArrayNotHasKey('url', $data['data']); + $this->assertArrayNotHasKey('photo', $data['data']); + $this->assertArrayNotHasKey('audio', $data['data']); + $this->assertArrayNotHasKey('video', $data['data']); + $this->assertArrayNotHasKey('syndication', $data['data']); + $this->assertArrayNotHasKey('in-reply-to', $data['data']); + $this->assertArrayNotHasKey('like-of', $data['data']); + $this->assertArrayNotHasKey('repost-of', $data['data']); + $this->assertArrayNotHasKey('bookmark-of', $data['data']); + $this->assertEquals('Author', $data['data']['author']['name']); + $this->assertEquals('', $data['data']['author']['photo']); + } + + public function testSanitizeEmailAuthorURL() { + $url = 'http://sanitize.example/h-entry-with-email-author'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('entry', $data->data->type); + $this->assertEquals('', $data->data->author->url); + $this->assertEquals('Author', $data->data->author->name); + $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo); + } + } diff --git a/tests/data/sanitize.example/h-entry-with-email-author b/tests/data/sanitize.example/h-entry-with-email-author new file mode 100644 index 0000000..e379539 --- /dev/null +++ b/tests/data/sanitize.example/h-entry-with-email-author @@ -0,0 +1,28 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+ + + +

Hello World

+ +
+ + + \ No newline at end of file diff --git a/tests/data/sanitize.example/h-entry-with-javascript-urls b/tests/data/sanitize.example/h-entry-with-javascript-urls new file mode 100644 index 0000000..d178814 --- /dev/null +++ b/tests/data/sanitize.example/h-entry-with-javascript-urls @@ -0,0 +1,44 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 + + + + + + Example + + + +
+ + + +

Hello World

+ + attack + attack + attack + attack + attack + attack + attack + attack + attack +
+ attack +
+
+ attack +
+ +
+ + + \ No newline at end of file