Browse Source

sanitize HTML in the entry

allow only a basic set of tags, and remove any non-mf2 classes

closes #2
pull/39/head
Aaron Parecki 8 years ago
parent
commit
66a9b1cc9e
6 changed files with 86 additions and 11 deletions
  1. +2
    -1
      composer.json
  2. +41
    -0
      lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php
  3. +10
    -9
      lib/Formats/Mf2.php
  4. +14
    -1
      tests/SanitizeTest.php
  5. +17
    -0
      tests/data/sanitize.example/entry-with-mf2-classes
  6. +2
    -0
      tests/data/sanitize.example/entry-with-valid-tags

+ 2
- 1
composer.json View File

@ -13,7 +13,8 @@
"lib/HTTPCurl.php",
"lib/HTTPStream.php",
"lib/HTTP.php",
"lib/Formats/Mf2.php"
"lib/Formats/Mf2.php",
"lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php"
]
},
"autoload-dev": {

+ 41
- 0
lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php View File

@ -0,0 +1,41 @@
<?php
/**
 * Class-attribute validator for HTML Purifier that keeps only Microformats2
 * class names and rejects every other class token.
 *
 * A token is kept when it consists of one of the prefixes "h-", "p-", "u-",
 * "dt-" or "e-", an optional vendor prefix (lowercase letters/digits followed
 * by a hyphen), and one or more hyphen-separated lowercase words.
 */
class HTMLPurifier_AttrDef_HTML_Microformats2 extends HTMLPurifier_AttrDef_HTML_Nmtokens
{
  /**
   * Anchored pattern for a single Microformats2 class name.
   * The D modifier keeps "$" from matching before a trailing newline.
   */
  const MF2_CLASS_PATTERN = '/^(?:[hpue]|dt)-(?:[a-z0-9]+-)?[a-z]+(?:-[a-z]+)*$/D';

  /**
   * Breaks the raw attribute value into candidate class tokens.
   *
   * For the "XHTML 1.1" and "XHTML 2.0" doctypes the parent's splitter is
   * used; for every other doctype the value is split on runs of whitespace.
   *
   * @param string $string
   * @param HTMLPurifier_Config $config
   * @param HTMLPurifier_Context $context
   * @return array list of candidate tokens (not yet filtered)
   */
  protected function split($string, $config, $context)
  {
    // really, this twiddle should be lazy loaded
    $name = $config->getDefinition('HTML')->doctype->name;
    if ($name === 'XHTML 1.1' || $name === 'XHTML 2.0') {
      return parent::split($string, $config, $context);
    }

    return preg_split('/\s+/', $string);
  }

  /**
   * Keeps the tokens that are Microformats2 class names, in their original
   * order, dropping everything else as well as repeated occurrences.
   *
   * @param array $tokens
   * @param HTMLPurifier_Config $config
   * @param HTMLPurifier_Context $context
   * @return array
   */
  protected function filter($tokens, $config, $context)
  {
    $ret = [];
    foreach ($tokens as $token) {
      // The pattern is anchored at both ends so a valid prefix cannot be used
      // to smuggle arbitrary characters through (e.g. "p-<b>").
      if (preg_match(self::MF2_CLASS_PATTERN, $token) !== 1) {
        continue;
      }
      // Strict comparison: emit each class at most once.
      if (!in_array($token, $ret, true)) {
        $ret[] = $token;
      }
    }
    return $ret;
  }
}

+ 10
- 9
lib/Formats/Mf2.php View File

@ -39,13 +39,7 @@ class Mf2 {
private static function parseHEntry($mf2, $http) {
$data = [
'type' => 'entry',
'author' => [
'type' => 'card',
'name' => null,
'url' => null,
'photo' => null
]
'type' => 'entry'
];
$refs = [];
@ -119,7 +113,7 @@ class Mf2 {
$contentCompare = preg_replace('/\s/','',trim($textContent));
// Check if the name is a prefix of the content
if(strpos($contentCompare, $nameCompare) === 0) {
if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) {
$name = null;
}
}
@ -136,7 +130,8 @@ class Mf2 {
}
}
$data['author'] = self::findAuthor($mf2, $item, $http);
if($author = self::findAuthor($mf2, $item, $http))
$data['author'] = $author;
$response = [
'data' => $data
@ -291,6 +286,9 @@ class Mf2 {
}
if(!$author['name'] && !$author['photo'] && !$author['url'])
return null;
return $author;
}
@ -312,6 +310,7 @@ class Mf2 {
'time',
'blockquote',
'pre',
'p',
'h1',
'h2',
'h3',
@ -329,6 +328,8 @@ class Mf2 {
'datetime' => 'Text'
]
);
// Override the allowed classes to only support Microformats2 classes
$def->manager->attrTypes->set('Class', new \HTMLPurifier_AttrDef_HTML_Microformats2());
$purifier = new HTMLPurifier($config);
return $purifier->purify($html);
}

+ 14
- 1
tests/SanitizeTest.php View File

@ -43,6 +43,7 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
$this->assertContains('<blockquote>Blockquote tags are okay</blockquote>', $html);
$this->assertContains('<pre>preformatted text is okay too', $html, '<pre> missing');
$this->assertContains('for code examples and such</pre>', $html, '<pre> missing');
$this->assertContains('<p>Paragraph tags are allowed</p>', $html, '<p> missing');
$this->assertContains('<h1>One</h1>', $html, '<h1> missing');
$this->assertContains('<h2>Two</h2>', $html, '<h2> missing');
$this->assertContains('<h3>Three</h3>', $html, '<h3> missing');
@ -61,11 +62,23 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
$html = $data['data']['content']['html'];
$this->assertEquals('entry', $data['data']['type']);
$this->assertNotContains('<p>', $html);
$this->assertNotContains('<script>', $html);
$this->assertNotContains('<style>', $html);
$this->assertNotContains('visiblity', $html); // from the CSS
$this->assertNotContains('alert', $html); // from the JS
}
/**
 * Parses the mf2-classes fixture and checks that the sanitized entry HTML
 * carries the heading with only its "p-name" class.
 */
public function testAllowsMF2Classes() {
  $response = $this->parse(['url' => 'http://sanitize.example/entry-with-mf2-classes']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());

  $parsed = json_decode($json, true);
  $entry = $parsed['data'];
  $sanitized = $entry['content']['html'];

  $this->assertEquals('entry', $entry['type']);
  $this->assertContains('<h2 class="p-name">Hello World</h2>', $sanitized);
}
}

+ 17
- 0
tests/data/sanitize.example/entry-with-mf2-classes View File

@ -0,0 +1,17 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<div class="e-content">
<h2 class="p-name header">Hello World</h2>
This content has valid tags that contain microformats classes.
</div>
</body>
</html>

+ 2
- 0
tests/data/sanitize.example/entry-with-valid-tags View File

@ -17,6 +17,8 @@ Connection: keep-alive
<pre>preformatted text is okay too
for code examples and such</pre>
<p>Paragraph tags are allowed</p>
<h1>One</h1>
<h2>Two</h2>
<h3>Three</h3>

Loading…
Cancel
Save