From 66a9b1cc9ed071e3eec6d8fcb03f3e0e384b5873 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Sun, 28 Feb 2016 15:25:53 -0800 Subject: [PATCH] sanitize HTML in the entry allow only a basic set of tags, and remove any non-mf2 classes closes #2 --- composer.json | 3 +- ...TMLPurifier_AttrDef_HTML_Microformats2.php | 41 +++++++++++++++++++ lib/Formats/Mf2.php | 19 +++++---- tests/SanitizeTest.php | 15 ++++++- .../sanitize.example/entry-with-mf2-classes | 17 ++++++++ .../sanitize.example/entry-with-valid-tags | 2 + 6 files changed, 86 insertions(+), 11 deletions(-) create mode 100644 lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php create mode 100644 tests/data/sanitize.example/entry-with-mf2-classes diff --git a/composer.json b/composer.json index 3588d01..0addc84 100644 --- a/composer.json +++ b/composer.json @@ -13,7 +13,8 @@ "lib/HTTPCurl.php", "lib/HTTPStream.php", "lib/HTTP.php", - "lib/Formats/Mf2.php" + "lib/Formats/Mf2.php", + "lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php" ] }, "autoload-dev": { diff --git a/lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php b/lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php new file mode 100644 index 0000000..5d19c8e --- /dev/null +++ b/lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php @@ -0,0 +1,41 @@ +getDefinition('HTML')->doctype->name; + if ($name == "XHTML 1.1" || $name == "XHTML 2.0") { + return parent::split($string, $config, $context); + } else { + return preg_split('/\s+/', $string); + } + } + + /** + * @param array $tokens + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return array + */ + protected function filter($tokens, $config, $context) + { + $ret = array(); + foreach ($tokens as $token) { + if(preg_match('/^([hpue]|dt)-.+/', $token)) { + $ret[] = $token; + } + } + return $ret; + } +} diff --git a/lib/Formats/Mf2.php b/lib/Formats/Mf2.php index a1797ed..11a7bf8 100644 --- a/lib/Formats/Mf2.php +++ b/lib/Formats/Mf2.php @@ -39,13 +39,7 
@@ class Mf2 { private static function parseHEntry($mf2, $http) { $data = [ - 'type' => 'entry', - 'author' => [ - 'type' => 'card', - 'name' => null, - 'url' => null, - 'photo' => null - ] + 'type' => 'entry' ]; $refs = []; @@ -119,7 +113,7 @@ class Mf2 { $contentCompare = preg_replace('/\s/','',trim($textContent)); // Check if the name is a prefix of the content - if(strpos($contentCompare, $nameCompare) === 0) { + if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) { $name = null; } } @@ -136,7 +130,8 @@ class Mf2 { } } - $data['author'] = self::findAuthor($mf2, $item, $http); + if($author = self::findAuthor($mf2, $item, $http)) + $data['author'] = $author; $response = [ 'data' => $data @@ -291,6 +286,9 @@ class Mf2 { } + if(!$author['name'] && !$author['photo'] && !$author['url']) + return null; + return $author; } @@ -312,6 +310,7 @@ class Mf2 { 'time', 'blockquote', 'pre', + 'p', 'h1', 'h2', 'h3', @@ -329,6 +328,8 @@ class Mf2 { 'datetime' => 'Text' ] ); + // Override the allowed classes to only support Microformats2 classes + $def->manager->attrTypes->set('Class', new \HTMLPurifier_AttrDef_HTML_Microformats2()); $purifier = new HTMLPurifier($config); return $purifier->purify($html); } diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index 8db456a..c123ee0 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -43,6 +43,7 @@ class SanitizeTest extends PHPUnit_Framework_TestCase { $this->assertContains('
<blockquote>Blockquote tags are okay</blockquote>', $html); $this->assertContains('<pre>preformatted text is okay too', $html, '<pre> missing');
     $this->assertContains('for code examples and such</pre>', $html, '<pre> missing');
+    $this->assertContains('<p>Paragraph tags are allowed</p>', $html, '<p> missing'); $this->assertContains('
<h1>One</h1>', $html, '<h1> missing'); $this->assertContains('
<h2>Two</h2>', $html, '<h2> missing'); $this->assertContains('
<h3>Three</h3>', $html, '<h3> missing'); @@ -61,11 +62,23 @@ class SanitizeTest extends PHPUnit_Framework_TestCase { $html = $data['data']['content']['html']; $this->assertEquals('entry', $data['data']['type']); - $this->assertNotContains('

', $html); $this->assertNotContains('