Browse Source

sanitize HTML

Sanitize the HTML returned in the content property, allowing only a common set of HTML tags.

for #2
pull/39/head
Aaron Parecki 9 years ago
parent
commit
241594dcf5
6 changed files with 209 additions and 5 deletions
  1. +2
    -1
      composer.json
  2. +46
    -2
      composer.lock
  3. +43
    -2
      lib/Formats/Mf2.php
  4. +71
    -0
      tests/SanitizeTest.php
  5. +19
    -0
      tests/data/sanitize.example/entry-with-unsafe-tags
  6. +28
    -0
      tests/data/sanitize.example/entry-with-valid-tags

+ 2
- 1
composer.json View File

@ -2,7 +2,8 @@
"require": {
"league/plates": "3.*",
"league/route": "1.*",
"mf2/mf2": "0.2.*"
"mf2/mf2": "0.2.*",
"ezyang/htmlpurifier": "4.*"
},
"autoload": {
"files": [

+ 46
- 2
composer.lock View File

@ -4,9 +4,53 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"hash": "a620639fa69cc332e4d4f49436f81ecc",
"content-hash": "b26a0c76296b944624f36bbb163392ae",
"hash": "9ca7e7a96c33dc8c293a42cbcd4c1d2f",
"content-hash": "c1c0c63887a953998208639cd85555a3",
"packages": [
{
"name": "ezyang/htmlpurifier",
"version": "v4.7.0",
"source": {
"type": "git",
"url": "https://github.com/ezyang/htmlpurifier.git",
"reference": "ae1828d955112356f7677c465f94f7deb7d27a40"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/ezyang/htmlpurifier/zipball/ae1828d955112356f7677c465f94f7deb7d27a40",
"reference": "ae1828d955112356f7677c465f94f7deb7d27a40",
"shasum": ""
},
"require": {
"php": ">=5.2"
},
"type": "library",
"autoload": {
"psr-0": {
"HTMLPurifier": "library/"
},
"files": [
"library/HTMLPurifier.composer.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"LGPL"
],
"authors": [
{
"name": "Edward Z. Yang",
"email": "admin@htmlpurifier.org",
"homepage": "http://ezyang.com"
}
],
"description": "Standards compliant HTML filter written in PHP",
"homepage": "http://htmlpurifier.org/",
"keywords": [
"html"
],
"time": "2015-08-05 01:03:42"
},
{
"name": "ircmaxell/password-compat",
"version": "v1.0.4",

+ 43
- 2
lib/Formats/Mf2.php View File

@ -1,6 +1,8 @@
<?php
namespace XRay\Formats;
use HTMLPurifier, HTMLPurifier_Config;
class Mf2 {
public static function parse($mf2, $url, $http) {
@ -102,8 +104,8 @@ class Mf2 {
$textContent = $content;
} elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) {
if(array_key_exists('html', $content)) {
$textContent = trim(strip_tags($content['html']));
$htmlContent = trim($content['html']);
$htmlContent = trim(self::sanitizeHTML($content['html']));
$textContent = trim(str_replace("&#xD;","\r",strip_tags($htmlContent)));
} else {
$textContent = trim($content['value']);
}
@ -292,6 +294,45 @@ class Mf2 {
return $author;
}
/**
 * Run an HTML fragment through HTML Purifier with a restricted tag whitelist.
 *
 * Only the elements listed in $allowedTags are permitted by the purifier
 * configuration. <time> is registered on the raw HTML definition as a custom
 * inline element with a free-text "datetime" attribute.
 *
 * @param string $html HTML fragment to clean
 * @return string the value returned by HTMLPurifier::purify()
 */
private static function sanitizeHTML($html) {
  $allowedTags = [
    'a',
    'abbr',
    'b',
    'code',
    'del',
    'em',
    'i',
    'img',
    'q',
    'strike',
    'strong',
    'time',
    'blockquote',
    'pre',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
  ];

  $purifierConfig = HTMLPurifier_Config::createDefault();
  // NOTE(review): a null DefinitionImpl is understood to switch off the
  // on-disk definition cache - confirm against the HTML Purifier docs.
  $purifierConfig->set('Cache.DefinitionImpl', null);
  $purifierConfig->set('HTML.AllowedElements', $allowedTags);

  // The raw definition has to be fetched after the options above are set
  // and before the purifier is constructed, so the order here matters.
  $rawDefinition = $purifierConfig->getHTMLDefinition(true);
  $timeAttributes = ['datetime' => 'Text'];
  $rawDefinition->addElement('time', 'Inline', 'Inline', 'Common', $timeAttributes);

  return (new HTMLPurifier($purifierConfig))->purify($html);
}
private static function responseDisplayText($name, $summary, $content) {
// Build a fake h-entry to pass to the comments parser

+ 71
- 0
tests/SanitizeTest.php View File

@ -0,0 +1,71 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * End-to-end checks of the HTML sanitizing applied to an entry's content.
 *
 * Each test requests a URL on sanitize.example through the parser and inspects
 * data.content.html of the decoded JSON response.
 * NOTE(review): p3k\HTTPTest presumably answers those URLs from the fixture
 * files below tests/data/ - confirm against its implementation.
 */
class SanitizeTest extends PHPUnit_Framework_TestCase {

  /**
   * Parser under test, rebuilt in setUp() before every test.
   *
   * Declared explicitly: the class previously declared an unused $http
   * property and assigned $client dynamically, which PHP 8.2 deprecates.
   */
  private $client;

  /**
   * Create the parser and point its HTTP client at the tests/data/ directory.
   */
  public function setUp() {
    $this->client = new Parse();
    $this->client->http = new p3k\HTTPTest(__DIR__.'/data/');
  }

  /**
   * Build a request from the given parameters and run it through the parser.
   *
   * @param array $params request parameters, e.g. ['url' => ...]
   * @return mixed whatever Parse::parse() returns; the tests use it as a Response
   */
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->parse($request, $response);
  }

  /**
   * Every whitelisted element in the fixture must survive sanitizing.
   */
  public function testAllowsWhitelistedTags() {
    $url = 'http://sanitize.example/entry-with-valid-tags';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body, true);

    $html = $data['data']['content']['html'];

    $this->assertEquals('entry', $data['data']['type']);
    $this->assertContains('This content has only valid tags.', $html);
    $this->assertContains('<a href="http://sanitize.example/example">links</a>,', $html, '<a> missing');
    $this->assertContains('<abbr>abbreviations</abbr>,', $html, '<abbr> missing');
    $this->assertContains('<b>bold</b>,', $html, '<b> missing');
    $this->assertContains('<code>inline code</code>,', $html, '<code> missing');
    $this->assertContains('<del>delete</del>,', $html, '<del> missing');
    $this->assertContains('<em>emphasis</em>,', $html, '<em> missing');
    $this->assertContains('<i>italics</i>,', $html, '<i> missing');
    $this->assertContains('<img alt="images are allowed" src="http://sanitize.example/example.jpg" />', $html, '<img> missing');
    $this->assertContains('<q>inline quote</q>,', $html, '<q> missing');
    $this->assertContains('<strike>strikethrough</strike>,', $html, '<strike> missing');
    $this->assertContains('<strong>strong text</strong>,', $html, '<strong> missing');
    $this->assertContains('<time datetime="2016-01-01">time elements</time>', $html, '<time> missing');
    $this->assertContains('<blockquote>Blockquote tags are okay</blockquote>', $html);
    $this->assertContains('<pre>preformatted text is okay too', $html, '<pre> missing');
    $this->assertContains('for code examples and such</pre>', $html, '<pre> missing');
    $this->assertContains('<h1>One</h1>', $html, '<h1> missing');
    $this->assertContains('<h2>Two</h2>', $html, '<h2> missing');
    $this->assertContains('<h3>Three</h3>', $html, '<h3> missing');
    $this->assertContains('<h4>Four</h4>', $html, '<h4> missing');
    $this->assertContains('<h5>Five</h5>', $html, '<h5> missing');
    $this->assertContains('<h6>Six</h6>', $html, '<h6> missing');
  }

  /**
   * Non-whitelisted tags, and the bodies of script/style, must be removed.
   */
  public function testRemovesUnsafeTags() {
    $url = 'http://sanitize.example/entry-with-unsafe-tags';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body, true);

    $html = $data['data']['content']['html'];

    $this->assertEquals('entry', $data['data']['type']);
    $this->assertNotContains('<p>', $html);
    $this->assertNotContains('<script>', $html);
    $this->assertNotContains('<style>', $html);
    // The spelling below intentionally matches the fixture's CSS text.
    $this->assertNotContains('visiblity', $html); // from the CSS
    $this->assertNotContains('alert', $html); // from the JS
  }

}

+ 19
- 0
tests/data/sanitize.example/entry-with-unsafe-tags View File

@ -0,0 +1,19 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<div class="e-content">
<p>This content has a bunch of invalid tags but also some <b>valid ones</b>.</p>
<script>alert('woo')</script>
<style>body { visiblity: hidden }</style>
<p>Hello World</p>
</div>
</body>
</html>

+ 28
- 0
tests/data/sanitize.example/entry-with-valid-tags View File

@ -0,0 +1,28 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<div class="e-content">
This content has only valid tags. <a href="/example">links</a>, <abbr>abbreviations</abbr>, <b>bold</b>, <code>inline code</code>, <del>delete</del>, <em>emphasis</em>, <i>italics</i>, <img src="/example.jpg" alt="images are allowed"> <q>inline quote</q>, <strike>strikethrough</strike>, <strong>strong text</strong>, and <time datetime="2016-01-01">time elements</time> are supported inline elements.
<blockquote>Blockquote tags are okay</blockquote>
<pre>preformatted text is okay too
for code examples and such</pre>
<h1>One</h1>
<h2>Two</h2>
<h3>Three</h3>
<h4>Four</h4>
<h5>Five</h5>
<h6>Six</h6>
</div>
</body>
</html>

Loading…
Cancel
Save