Browse Source

adds a bunch of broken tests for #52

pull/60/head
Aaron Parecki 3 years ago
parent
commit
bdedef6e1e
No known key found for this signature in database GPG Key ID: 276C2817346D6056
9 changed files with 311 additions and 14 deletions
  1. +9
    -6
      lib/XRay/Formats/Format.php
  2. +26
    -8
      lib/XRay/Formats/Mf2.php
  3. +100
    -0
      tests/SanitizeTest.php
  4. +47
    -0
      tests/data/sanitize.example/cleverdevil
  5. +14
    -0
      tests/data/sanitize.example/entry-with-img-no-implied-photo
  6. +14
    -0
      tests/data/sanitize.example/photo-in-content
  7. +14
    -0
      tests/data/sanitize.example/photo-in-content-empty-alt
  8. +14
    -0
      tests/data/sanitize.example/photo-in-content-with-alt
  9. +73
    -0
      tests/data/sanitize.example/photos-with-alt

+ 9
- 6
lib/XRay/Formats/Format.php View File

@ -34,10 +34,8 @@ abstract class Format implements iFormat {
return [$doc, $xpath];
}
protected static function sanitizeHTML($html) {
$config = HTMLPurifier_Config::createDefault();
$config->set('Cache.DefinitionImpl', null);
$config->set('HTML.AllowedElements', [
protected static function sanitizeHTML($html, $allowImg=true) {
$allowed = [
'a',
'abbr',
'b',
@ -45,7 +43,6 @@ abstract class Format implements iFormat {
'del',
'em',
'i',
'img',
'q',
'strike',
'strong',
@ -62,7 +59,13 @@ abstract class Format implements iFormat {
'ul',
'li',
'ol'
]);
];
if($allowImg)
$allowed[] = 'img';
$config = HTMLPurifier_Config::createDefault();
$config->set('Cache.DefinitionImpl', null);
$config->set('HTML.AllowedElements', $allowed);
$def = $config->getHTMLDefinition(true);
$def->addElement(
'time',

+ 26
- 8
lib/XRay/Formats/Mf2.php View File

@ -220,21 +220,39 @@ class Mf2 extends Format {
$textContent = $content;
} elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) {
if(array_key_exists('html', $content)) {
$htmlContent = trim(self::sanitizeHTML($content['html']));
#$textContent = trim(str_replace("
","\r",strip_tags($htmlContent)));
// Only allow images in the content if there is no photo property set
if(isset($item['properties']['photo']))
$allowImg = false;
else
$allowImg = true;
$htmlContent = trim(self::sanitizeHTML($content['html'], $allowImg));
$textContent = trim(str_replace("
","\r",$content['value']));
} else {
$textContent = trim($content['value']);
}
}
$data = [
'text' => $textContent
];
if($htmlContent && $textContent != $htmlContent) {
$data['html'] = $htmlContent;
if($textContent || $htmlContent) {
$data = [
'text' => $textContent
];
// Only add HTML content if there is actual content.
// If the text content ends up empty, then the HTML should be too
// e.g. <div class="e-content"><a href=""><img src="" class="u-photo"></a></div>
// should not return content of <a href=""></a>
// TODO: still need to remove empty <a> tags when there is other text in the content
if($htmlContent && $textContent && $textContent != $htmlContent) {
$data['html'] = $htmlContent;
}
if(!$data['text'])
return null;
return $data;
} else {
return null;
}
return $data;
}
// Always return arrays, and may contain plaintext content

+ 100
- 0
tests/SanitizeTest.php View File

@ -151,4 +151,104 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo);
}
public function testPhotoInContent() {
  // Regression test for https://github.com/aaronpk/XRay/issues/52
  // The fixture's e-content holds an <img class="u-photo"> with no alt
  // attribute: the image URL is expected in the photo property, while the
  // content text/html are expected to contain no trace of the image.
  $response = $this->parse(['url' => 'http://sanitize.example/photo-in-content']);

  $rawBody = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $decoded = json_decode($rawBody);
  $entry = $decoded->data;

  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals('http://target.example.com/photo.jpg', $entry->photo[0]);
  $this->assertEquals('This is a photo post with an img tag inside the content.', $entry->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $entry->content->html);
}
public function testPhotoInContentEmptyAltAttribute() {
  // Regression test for https://github.com/aaronpk/XRay/issues/52
  // Same scenario as the plain photo-in-content case, but the u-photo
  // image in the fixture carries an empty alt="" attribute.
  $fixtureUrl = 'http://sanitize.example/photo-in-content-empty-alt';
  $result = $this->parse(['url' => $fixtureUrl]);

  $json = $result->getContent();
  $this->assertEquals(200, $result->getStatusCode());
  $post = json_decode($json)->data;

  $this->assertObjectNotHasAttribute('name', $post);
  $this->assertEquals('http://target.example.com/photo.jpg', $post->photo[0]);
  $this->assertEquals('This is a photo post with an img tag inside the content.', $post->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $post->content->html);
}
public function testPhotoInContentWithAlt() {
  // Regression test for https://github.com/aaronpk/XRay/issues/52
  // The u-photo image in the fixture has alt="a photo"; the alt text is
  // expected NOT to leak into the content text or html.
  $request = ['url' => 'http://sanitize.example/photo-in-content-with-alt'];
  $response = $this->parse($request);

  $payload = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($payload);
  $item = $parsed->data;

  $this->assertObjectNotHasAttribute('name', $item);
  $this->assertEquals('http://target.example.com/photo.jpg', $item->photo[0]);
  $this->assertEquals('This is a photo post with an img tag inside the content.', $item->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $item->content->html);
}
public function testPhotoInContentWithNoText() {
  // The fixture's e-content contains only a linked <img class="u-photo">
  // and no text, while the post title lives in a separate p-name element.
  // Expected: name and photo are reported, and no content property at all.
  $response = $this->parse(['url' => 'http://sanitize.example/cleverdevil']);

  $rawBody = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $entry = json_decode($rawBody)->data;

  $this->assertObjectHasAttribute('name', $entry);
  $this->assertEquals('Oh, how well they know me! 🥃', $entry->name);
  $this->assertObjectNotHasAttribute('content', $entry);
  $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $entry->photo[0]);
}
public function testPhotosWithAlt() {
  // https://github.com/microformats/microformats2-parsing/issues/16
  // The fixture has two <a class="u-photo"> links wrapping images with alt
  // text inside an element that is both p-name and e-content. Expected: the
  // alt text is absent from the content text, no name is reported, and both
  // linked originals are listed as photos in document order.
  $expectedText = '🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter';
  $expectedPhotos = [
    'https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg',
    'https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg',
  ];

  $response = $this->parse(['url' => 'http://sanitize.example/photos-with-alt']);
  $rawBody = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $entry = json_decode($rawBody)->data;

  $this->assertEquals($expectedText, $entry->content->text);
  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals($expectedPhotos[0], $entry->photo[0]);
  $this->assertEquals($expectedPhotos[1], $entry->photo[1]);
}
public function testEntryWithImgNoImpliedPhoto() {
  // See https://github.com/microformats/microformats2-parsing/issues/6#issuecomment-357286985
  // and https://github.com/aaronpk/XRay/issues/52#issuecomment-357269683
  // and https://github.com/microformats/microformats2-parsing/issues/16
  //
  // The fixture's <img> sits inside e-content but has no u-photo class, so
  // no photo property is expected and the <img> must survive in the
  // sanitized content HTML.
  $url = 'http://sanitize.example/entry-with-img-no-implied-photo';
  $response = $this->parse(['url' => $url]);
  $body = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $data = json_decode($body);

  // A previous revision also asserted a value for photo[0] right after
  // asserting that the photo property is absent; the two expectations were
  // mutually exclusive, so only the "no photo" expectation is kept.
  $this->assertObjectNotHasAttribute('photo', $data->data);
  $this->assertObjectNotHasAttribute('name', $data->data);
  $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed.', $data->data->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content, which does not have a u-photo class so should not be removed. <img src="http://target.example.com/photo.jpg" alt="a photo">', $data->data->content->html);
}
}

+ 47
- 0
tests/data/sanitize.example/cleverdevil View File

@ -0,0 +1,47 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body>
<div
class="col-md-8 col-md-offset-2 h-entry idno-photos idno-object idno-content">
<div>
<div class="p-author author h-card vcard">
<a href="https://cleverdevil.io/profile/cleverdevil" class="u-url icon-container"><img class="u-photo"
src="https://cleverdevil.io/file/2fa19f964fb8970faaf20b909c69d6cb/thumb.png"/></a>
<a class="p-name fn u-url url"
href="https://cleverdevil.io/profile/cleverdevil">Jonathan LaCour</a>
<a class="u-url" href="https://cleverdevil.io/profile/cleverdevil">
<!-- This is here to force the hand of your MF2 parser --></a>
</div>
<div class="break">&nbsp;</div>
</div>
<div class="datestamp">
<p>
<a class="u-url url" href="https://cleverdevil.io/2018/oh-how-well-they-know-me" rel="permalink">
<time class="dt-published"
datetime="2018-01-11T23:03:32+00:00">January 11, 2018</time>
</a>
</p>
</div>
<div class="idno-body">
<h2 class="photo-title p-name"><a
href="https://cleverdevil.io/2018/oh-how-well-they-know-me">Oh, how well they know me! 🥃</a>
</h2>
<div class="e-content entry-content">
<div class="photo-view">
<a href="https://cleverdevil.io/file/8ff3c164f54ff736bb5fb64d50e1276c/C66353A6-9AAE-46F2-B9BC-1BA7F5574D54.jpeg"
data-original-img="https://cleverdevil.io/file/8ff3c164f54ff736bb5fb64d50e1276c/C66353A6-9AAE-46F2-B9BC-1BA7F5574D54.jpeg"
data-title="Oh, how well they know me! 🥃"
data-footer=""><img src="https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg" class="u-photo" alt="Oh, how well they know me! 🥃" /></a>
</div>
</body>
</html>

+ 14
- 0
tests/data/sanitize.example/entry-with-img-no-implied-photo View File

@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content p-name">This is a photo post with an <code>img</code> tag inside the content, which does not have a u-photo class so should not be removed. <img src="http://target.example.com/photo.jpg" alt="a photo"></p>
</body>
</html>

+ 14
- 0
tests/data/sanitize.example/photo-in-content View File

@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content p-name">This is a photo post with an <code>img</code> tag inside the content. <img class="u-photo" src="http://target.example.com/photo.jpg"></p>
</body>
</html>

+ 14
- 0
tests/data/sanitize.example/photo-in-content-empty-alt View File

@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content p-name">This is a photo post with an <code>img</code> tag inside the content. <img class="u-photo" src="http://target.example.com/photo.jpg" alt=""></p>
</body>
</html>

+ 14
- 0
tests/data/sanitize.example/photo-in-content-with-alt View File

@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content p-name">This is a photo post with an <code>img</code> tag inside the content. <img class="u-photo" src="http://target.example.com/photo.jpg" alt="a photo"></p>
</body>
</html>

+ 73
- 0
tests/data/sanitize.example/photos-with-alt View File

@ -0,0 +1,73 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<body class="post hentry h-entry as-note">
<header><div class="header">
<h1><a href="../" rel="author home">tantek.com</a></h1>
<form class="search" action="https://duckduckgo.com/html/" method="get">
<input type="hidden" name="type" value="list" />
<input type="search" value="site:tantek.com " name="q" />
<button type="submit">Search</button>
</form>
</div></header>
<div class="sidestuff">
<ul class="snav">
<li><a href="003/t2/first-yoga-class-this-year" id="previtem" title="View the previous (older) item in the stream." rel="prev"><abbr>&#x2190;</abbr></a></li><li><a href="005/t1/yesterday-rest-homework-day" id="nextitem" title="View the next (newer) item in the stream" rel="next"><abbr>&#x2192;</abbr></a></li>
</ul>
</div>
<a href="../" class="p-author h-card author-icon" rel="author" title="Tantek Çelik"><img src="../logo.jpg" alt="Tantek Çelik" /></a>
<p class="p-name entry-title e-content entry-content article"><a class="auto-link figure u-photo" href="https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg"><img class="auto-embed" alt="a jpg. " src="https://igx.4sqi.net/img/general/width960/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg"/></a><a class="auto-link figure u-photo" href="https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg"><img class="auto-embed" alt="a jpg. " src="https://igx.4sqi.net/img/general/width960/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg"/></a>🌆 Made it to the first #<span class="p-category auto-tag">NPSF</span> #<span class="p-category auto-tag">earlygang</span> of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #<span class="p-category auto-tag">100PDPD</span><br class="auto-break"/><br class="auto-break"/>#<span class="p-category auto-tag">justshowup</span> #<span class="p-category auto-tag">darknesstodawn</span> #<span class="p-category auto-tag">wakeupthesun</span> #<span class="p-category auto-tag">fromwhereirun</span> #<span class="p-category auto-tag">NovemberProject</span> #<span class="p-category auto-tag">sunrise</span> #<span class="p-category auto-tag">latergram</span> #<span class="p-category auto-tag">nofilter</span></p><span class="info footer">
<span class="dt-published published dt-updated updated">
<time class="value" datetime="21:08-0800">21:08</time> on <time class="value">2018-01-03</time> </span>
<span class="lt">
(ttk.me t4sE3) </span>
using <span class="using">BBEdit</span> </span>
<div class="info footer">
<form action="http://tantek.com/2018/003/t3/first-npsf-earlygang-year"><div>
<label><span class="lt">URL:</span>
<input class="u-url url u-uid uid bookmark" type="url" size="70" style="max-width:100%" value="http://tantek.com/2018/003/t3/first-npsf-earlygang-year" />
</label>
</div></form>
</div>
<div class="info footer">
<a class="u-syndication" rel="syndication" style="float:right; clear:right; margin-left:1em; height:2em"
href="https://www.instagram.com/p/BdhBVjNFi35">
<img src="/icon/instagram.png" style="vertical-align:-30%;margin:-3px -5px -4px 0" alt=""/>
View on Instagram</a>
<a class="u-syndication" rel="syndication" style="float:right; clear:right; margin-left:1em; height:2em"
href="https://www.facebook.com/10103634146088903">
<img src="/icon/facebook.png" style="vertical-align:-30%;margin:-3px -10px -4px 0" alt=""/>
View on Facebook</a>
<a class="u-syndication" rel="syndication" style="float:right; clear:right; margin-left:1em; height:2em"
href="https://www.flickr.com/photos/tantek/24621899937/">
<span style="display:inline-block; margin-top:.1em; font-size:1.8em;vertical-align:-5%">••</span>
View on Flickr</a>
<a class="u-syndication" rel="syndication" style="float:right; clear:right; height:2em"
href="https://twitter.com/t/status/948789001854517248">
<img src="/icon/twitter.png" style="vertical-align:-30%;margin:-3px -6px -4px 0" alt=""/>
View
on Twitter
</a><div style="line-height:2.6em; font-size:1em;">
<indie-action do="like" with="http://tantek.com/2018/003/t3/first-npsf-earlygang-year">
<a class="action favorite" target="_blank" href="https://twitter.com/intent/favorite?tweet_id=948789001854517248"><img src="https://g.twimg.com/dev/documentation/image/like-icon-16.png" style="vertical-align:-15%" alt=""/> Like</a>
</indie-action>
<indie-action do="repost" with="http://tantek.com/2018/003/t3/first-npsf-earlygang-year">
<a class="action reply" target="_blank" href="https://twitter.com/intent/retweet?tweet_id=948789001854517248"><img src="https://g.twimg.com/dev/documentation/image/retweet-icon-16.png" style="vertical-align:0%;margin:0 -0.1em 0 0.5em" alt=""/> Repost</a>
</indie-action>
<indie-action do="reply" with="http://tantek.com/2018/003/t3/first-npsf-earlygang-year">
<a class="action reply" target="_blank" href="https://twitter.com/intent/tweet?in_reply_to=948789001854517248"><img src="https://g.twimg.com/dev/documentation/image/reply-icon-16.png" style="vertical-align:0%;margin:0 -0.1em 0 0.5em" alt=""/> Reply</a>
</indie-action>
</div>
</div>
</li>
</body>
</html>

Loading…
Cancel
Save