Browse Source

add tests for validating URL fields

* fields that should be URLs will now be omitted if the value was not a URL, such as when the value is `javascript:alert()`
* makes Mf2 class slightly more self-contained by duplicating the URL helper functions into it
* fixes tests to not cache responses in memcache
pull/39/head
Aaron Parecki 8 years ago
parent
commit
1f6de10aba
7 changed files with 208 additions and 38 deletions
  1. +2
    -1
      lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php
  2. +94
    -37
      lib/Formats/Mf2.php
  3. +1
    -0
      tests/AuthorTest.php
  4. +1
    -0
      tests/FeedTest.php
  5. +38
    -0
      tests/SanitizeTest.php
  6. +28
    -0
      tests/data/sanitize.example/h-entry-with-email-author
  7. +44
    -0
      tests/data/sanitize.example/h-entry-with-javascript-urls

+ 2
- 1
lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php View File

@ -1,9 +1,10 @@
<?php
namespace XRay\Formats;
/**
* Allows Microformats2 classes but rejects any others
*/
class HTMLPurifier_AttrDef_HTML_Microformats2 extends HTMLPurifier_AttrDef_HTML_Nmtokens
class HTMLPurifier_AttrDef_HTML_Microformats2 extends \HTMLPurifier_AttrDef_HTML_Nmtokens
{
/**
* @param string $string

+ 94
- 37
lib/Formats/Mf2.php View File

@ -30,7 +30,7 @@ class Mf2 {
if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
if(array_key_exists('url', $item['properties'])) {
$urls = $item['properties']['url'];
$urls = array_map('\normalize_url', $urls);
$urls = array_map('self::normalize_url', $urls);
if(in_array($url, $urls)) {
Parse::debug("mf2:1: Recognized $url as an h-entry because an h-entry on the page matched the URL of the request");
return self::parseAsHEntry($mf2, $item, $http);
@ -60,7 +60,7 @@ class Mf2 {
and array_key_exists('url', $item['properties'])
) {
$urls = $item['properties']['url'];
$urls = array_map('\normalize_url', $urls);
$urls = array_map('self::normalize_url', $urls);
if(in_array($url, $urls)) {
// TODO: check for children h-entrys (like tantek.com), or sibling h-entries (like aaronparecki.com)
// and return the result as a feed instead
@ -79,7 +79,7 @@ class Mf2 {
if($hentrys == 1) {
if($lastSeenEntry) {
$urls = $lastSeenEntry['properties']['url'];
$urls = array_map('\normalize_url', $urls);
$urls = array_map('self::normalize_url', $urls);
if(count($urls) && !in_array($url, $urls)) {
Parse::debug("mf2:5: Recognized $url as an h-feed no h-entrys on the page matched the URL of the request");
return self::parseAsHFeed($mf2, $http);
@ -110,42 +110,59 @@ class Mf2 {
// Single plaintext values
$properties = ['url','published','summary','rsvp'];
foreach($properties as $p) {
if($v = self::getPlaintext($item, $p))
$data[$p] = $v;
if($v = self::getPlaintext($item, $p)) {
if($p == 'url') {
if(self::isURL($v))
$data[$p] = $v;
} else {
$data[$p] = $v;
}
}
}
// Always arrays
$properties = ['photo','video','audio','syndication'];
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
$data[$p] = [];
foreach($item['properties'][$p] as $v) {
if(is_string($v))
if(is_string($v) && self::isURL($v)) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v;
elseif(is_array($v) and array_key_exists('value', $v))
}
elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v['value'];
}
}
}
}
// Always returned as arrays, and may also create external references
$properties = ['in-reply-to','like-of','repost-of','bookmark-of','category','invitee'];
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
$data[$p] = [];
foreach($item['properties'][$p] as $v) {
if(is_string($v))
$data[$p][] = $v;
elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url'))) {
$data[$p][] = $u;
// parse the object and put the result in the "refs" object
$ref = self::parse(['items'=>[$v]], $u, $http);
if($ref) {
$refs[$u] = $ref['data'];
// If these are not objects, they must be URLs
$set = [
'normal' => ['category','invitee'],
'url' => ['in-reply-to','like-of','repost-of','bookmark-of']
];
foreach($set as $type=>$properties) {
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
foreach($item['properties'][$p] as $v) {
if(is_string($v) && ($type == 'normal' || self::isURL($v))) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v;
}
elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $u;
// parse the object and put the result in the "refs" object
$ref = self::parse(['items'=>[$v]], $u, $http);
if($ref) {
$refs[$u] = $ref['data'];
}
}
}
}
}
}
}
}
// Determine if the name is distinct from the content
@ -192,6 +209,7 @@ class Mf2 {
if($htmlContent && $textContent != $htmlContent) {
$data['content']['html'] = $htmlContent;
}
// TODO: If no HTML content was included in the post, create HTML by autolinking?
}
if($author = self::findAuthor($mf2, $item, $http))
@ -217,20 +235,29 @@ class Mf2 {
// Single plaintext values
$properties = ['name','summary','url','published','start','end','duration'];
foreach($properties as $p) {
if($v = self::getPlaintext($item, $p))
$data[$p] = $v;
if($v = self::getPlaintext($item, $p)) {
if($p == 'url') {
if(self::isURL($v))
$data[$p] = $v;
} else {
$data[$p] = $v;
}
}
}
// Always arrays
$properties = ['photo','video','syndication'];
$properties = ['photo','video','audio','syndication'];
foreach($properties as $p) {
if(array_key_exists($p, $item['properties'])) {
$data[$p] = [];
foreach($item['properties'][$p] as $v) {
if(is_string($v))
if(is_string($v) && self::isURL($v)) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v;
elseif(is_array($v) and array_key_exists('value', $v))
}
elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) {
if(!array_key_exists($p, $data)) $data[$p] = [];
$data[$p][] = $v['value'];
}
}
}
}
@ -243,7 +270,7 @@ class Mf2 {
foreach($item['properties'][$p] as $v) {
if(is_string($v))
$data[$p][] = $v;
elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url'))) {
elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
$data[$p][] = $u;
// parse the object and put the result in the "refs" object
$ref = self::parse(['items'=>[$v]], $u, $http);
@ -325,15 +352,25 @@ class Mf2 {
// If there is a matching author URL, use that one
$found = false;
foreach($item['properties']['url'] as $url) {
$url = \normalize_url($url);
if($url == $authorURL) {
$data['url'] = $url;
$found = true;
if(self::isURL($url)) {
$url = self::normalize_url($url);
if($url == $authorURL) {
$data['url'] = $url;
$found = true;
}
}
}
if(!$found) $data['url'] = $item['properties']['url'][0];
if(!$found && self::isURL($item['properties']['url'][0])) {
$data['url'] = $item['properties']['url'][0];
}
} else if($v = self::getPlaintext($item, $p)) {
$data[$p] = $v;
// Make sure the URL property is actually a URL
if($p == 'url' || $p == 'photo') {
if(self::isURL($v))
$data[$p] = $v;
} else {
$data[$p] = $v;
}
}
}
@ -481,7 +518,7 @@ class Mf2 {
]
);
// Override the allowed classes to only support Microformats2 classes
$def->manager->attrTypes->set('Class', new \HTMLPurifier_AttrDef_HTML_Microformats2());
$def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());
$purifier = new HTMLPurifier($config);
$sanitized = $purifier->purify($html);
$sanitized = str_replace("&#xD;","\r",$sanitized);
@ -566,4 +603,24 @@ class Mf2 {
return \mf2\Parse($result['body'], $url);
}
/**
 * Normalizes a URL so that equivalent URLs compare equal as strings.
 *
 * The host is lowercased and an empty path is replaced with "/"; the parts
 * are then reassembled with build_url().
 *
 * Values that cannot be normalized are returned unchanged: non-strings,
 * input that parse_url() rejects, and values with no host component
 * (for example "mailto:" or "javascript:" URIs). This avoids reading an
 * undefined 'host' index and rebuilding a mangled string for such values.
 *
 * @param mixed $url The candidate URL.
 * @return mixed The normalized URL string, or the input unchanged when it has no host.
 */
private static function normalize_url($url) {
  if(!is_string($url))
    return $url;
  $parts = parse_url($url);
  // parse_url() returns false on seriously malformed input, and omits
  // 'host' entirely for hostless schemes.
  if(!is_array($parts) || !isset($parts['host']))
    return $url;
  if(empty($parts['path']))
    $parts['path'] = '/';
  $parts['host'] = strtolower($parts['host']);
  return self::build_url($parts);
}
/**
 * Reassembles a URL string from an array of components shaped like the
 * result of parse_url().
 *
 * Every component is optional; missing components contribute nothing to
 * the output. The "@" separator is emitted whenever a user or a password
 * component is non-empty. The check compares against the empty string
 * rather than relying on truthiness, so a username of "0" still gets its
 * separator.
 *
 * @param array $parsed_url Components keyed by scheme, host, port, user, pass, path, query, fragment.
 * @return string The assembled URL.
 */
private static function build_url($parsed_url) {
  $scheme   = isset($parsed_url['scheme']) ? $parsed_url['scheme'] . '://' : '';
  $host     = isset($parsed_url['host']) ? $parsed_url['host'] : '';
  $port     = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '';
  $user     = isset($parsed_url['user']) ? (string)$parsed_url['user'] : '';
  $pass     = isset($parsed_url['pass']) ? ':' . $parsed_url['pass'] : '';
  // Credentials are followed by "@" only when at least one of them is present.
  $auth     = ($user !== '' || $pass !== '') ? "$user$pass@" : '';
  $path     = isset($parsed_url['path']) ? $parsed_url['path'] : '';
  $query    = isset($parsed_url['query']) ? '?' . $parsed_url['query'] : '';
  $fragment = isset($parsed_url['fragment']) ? '#' . $parsed_url['fragment'] : '';
  return "$scheme$auth$host$port$path$query$fragment";
}
}

+ 1
- 0
tests/AuthorTest.php View File

@ -9,6 +9,7 @@ class AuthorTest extends PHPUnit_Framework_TestCase {
/**
 * Builds a fresh Parse client for each test, backed by the HTTP test double
 * that reads from the data/ directory next to this file.
 */
public function setUp() {
  $fixtures = __DIR__ . '/data/';
  $this->client = new Parse();
  $this->client->http = new p3k\HTTPTest($fixtures);
  // Null out the client's `mc` property (presumably its memcache handle) so responses are not cached.
  $this->client->mc = null;
}
private function parse($params) {

+ 1
- 0
tests/FeedTest.php View File

@ -9,6 +9,7 @@ class FeedTest extends PHPUnit_Framework_TestCase {
/**
 * Builds a fresh Parse client for each test, backed by the HTTP test double
 * that reads from the data/ directory next to this file.
 */
public function setUp() {
  $fixtures = __DIR__ . '/data/';
  $this->client = new Parse();
  $this->client->http = new p3k\HTTPTest($fixtures);
  // Null out the client's `mc` property (presumably its memcache handle) so responses are not cached.
  $this->client->mc = null;
}
private function parse($params) {

+ 38
- 0
tests/SanitizeTest.php View File

@ -9,6 +9,7 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
/**
 * Builds a fresh Parse client for each test, backed by the HTTP test double
 * that reads from the data/ directory next to this file.
 */
public function setUp() {
  $fixtures = __DIR__ . '/data/';
  $this->client = new Parse();
  $this->client->http = new p3k\HTTPTest($fixtures);
  // Null out the client's `mc` property (presumably its memcache handle) so responses are not cached.
  $this->client->mc = null;
}
private function parse($params) {
@ -113,4 +114,41 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('This content has some <i>HTML escaped</i> entities such as &amp; ampersand, " quote, escaped &lt;code&gt; HTML tags, an ümlaut, an @at sign.', $data['data']['content']['html']);
}
/**
 * Parses the h-entry-with-javascript-urls fixture and checks that none of
 * the URL-valued properties survive: the entry-level properties must be
 * absent, and the author's url and photo must be empty, while the author's
 * name is still reported.
 */
public function testSanitizeJavascriptURLs() {
  $response = $this->parse(['url' => 'http://sanitize.example/h-entry-with-javascript-urls']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json, true);

  $this->assertEquals('entry', $parsed['data']['type']);
  $this->assertEquals('', $parsed['data']['author']['url']);

  // Each of these properties must be absent from the entry.
  $omitted = [
    'url', 'photo', 'audio', 'video', 'syndication',
    'in-reply-to', 'like-of', 'repost-of', 'bookmark-of',
  ];
  foreach($omitted as $property) {
    $this->assertArrayNotHasKey($property, $parsed['data']);
  }

  $this->assertEquals('Author', $parsed['data']['author']['name']);
  $this->assertEquals('', $parsed['data']['author']['photo']);
}
/**
 * Parses the h-entry-with-email-author fixture and checks that the author's
 * url is reported as empty while the author's name and photo are still
 * returned.
 */
public function testSanitizeEmailAuthorURL() {
  $response = $this->parse(['url' => 'http://sanitize.example/h-entry-with-email-author']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);

  $this->assertEquals('entry', $parsed->data->type);
  $this->assertEquals('', $parsed->data->author->url);
  $this->assertEquals('Author', $parsed->data->author->name);
  $this->assertEquals('http://sanitize.example/photo.jpg', $parsed->data->author->photo);
}
}

+ 28
- 0
tests/data/sanitize.example/h-entry-with-email-author View File

@ -0,0 +1,28 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Example</title>
</head>
<body>
<div class="h-entry">
<div class="p-author h-card">
<a href="mailto:author@example.com" class="u-url">
<img src="/photo.jpg" class="u-photo">
<span class="p-name">Author</span>
</a>
</div>
<p class="p-name e-content">Hello World</p>
</div>
</body>
</html>

+ 44
- 0
tests/data/sanitize.example/h-entry-with-javascript-urls View File

@ -0,0 +1,44 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Example</title>
</head>
<body>
<div class="h-entry">
<div class="p-author h-card">
<a href="javascript:alert('author href')" class="u-url">
<img src="javascript:alert('author photo')" class="u-photo">
<span class="p-name">Author</span>
</a>
</div>
<p class="p-name e-content">Hello World</p>
<a href="javascript:alert('url')" class="u-url">attack</a>
<a href="javascript:alert('photo')" class="u-photo">attack</a>
<a href="javascript:alert('audio')" class="u-audio">attack</a>
<a href="javascript:alert('video')" class="u-video">attack</a>
<a href="javascript:alert('syndication')" class="u-syndication">attack</a>
<a href="javascript:alert('in-reply-to')" class="u-in-reply-to">attack</a>
<a href="javascript:alert('like-of')" class="u-like-of">attack</a>
<a href="javascript:alert('repost-of')" class="u-repost-of">attack</a>
<a href="javascript:alert('bookmark-of')" class="u-bookmark-of">attack</a>
<div class="p-photo h-card">
<a href="javascript:alert('photo')" class="u-url p-name">attack</a>
</div>
<div class="p-repost-of h-card">
<a href="javascript:alert('repost-of')" class="u-url p-name">attack</a>
</div>
</div>
</body>
</html>

Loading…
Cancel
Save