Browse Source

fix checking for empty post content

XRay now looks for images inside the HTML and does not consider those empty posts
pull/97/head v1.10.2
Aaron Parecki 4 years ago
parent
commit
aacac198a8
No known key found for this signature in database GPG Key ID: 276C2817346D6056
9 changed files with 165 additions and 10 deletions
  1. +33
    -0
      lib/XRay/Formats/Format.php
  2. +10
    -2
      lib/XRay/Formats/Mf2.php
  3. +65
    -8
      tests/SanitizeTest.php
  4. +11
    -0
      tests/data/sanitize.example/content-is-only-video
  5. +1
    -0
      tests/data/sanitize.example/entry-with-img-no-implied-photo
  6. +11
    -0
      tests/data/sanitize.example/photo-in-content-no-p-with-alt
  7. +12
    -0
      tests/data/sanitize.example/photo-in-content-no-p-with-url-photo
  8. +11
    -0
      tests/data/sanitize.example/photo-in-content-with-alt-no-text
  9. +11
    -0
      tests/data/sanitize.example/photo-in-content-with-p-no-alt

+ 33
- 0
lib/XRay/Formats/Format.php View File

@ -75,6 +75,8 @@ abstract class Format implements iFormat {
}
$def = $config->getHTMLDefinition(true);
// add HTML <time> element
$def->addElement(
'time',
'Inline',
@ -84,6 +86,37 @@ abstract class Format implements iFormat {
'datetime' => 'Text'
]
);
/*
// This isn't working right now, not sure why
// http://developers.whatwg.org/the-video-element.html#the-video-element
$def->addElement(
'video',
'Block',
'Optional: (source, Flow) | (Flow, source) | Flow',
'Common',
[
'src' => 'URI',
'type' => 'Text',
'width' => 'Length',
'height' => 'Length',
'poster' => 'URI',
'preload' => 'Enum#auto,metadata,none',
'controls' => 'Bool',
]
);
$def->addElement(
'source',
'Block',
'Flow',
'Common',
[
'src' => 'URI',
'type' => 'Text',
]
);
*/
// Override the allowed classes to only support Microformats2 classes
$def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());
$purifier = new HTMLPurifier($config);

+ 10
- 2
lib/XRay/Formats/Mf2.php View File

@ -278,7 +278,15 @@ class Mf2 extends Format {
$data['html'] = $htmlContent;
}
if(!$data['text'])
// Also add HTML content if it contains images, video or audio
// TODO: allow video and audio tags in content, then uncomment this
if(strpos($htmlContent, '<img') !== false
/* || strpos($htmlContent, '<video') !== false */
/* || strpos($htmlContent, '<audio') !== false */) {
$data['html'] = $htmlContent;
}
if(!$data['text'] && !isset($data['html']))
return null;
return $data;
@ -419,7 +427,7 @@ class Mf2 extends Format {
// If there is content, always return the plaintext content, and return HTML content if it's different
if($content) {
$content = self::parseHTMLValue('content', $item);
if($content['text']) {
if($content['text'] || $content['html']) {
$data['content']['text'] = $content['text'];
if(isset($content['html']))
$data['content']['html'] = $content['html'];

+ 65
- 8
tests/SanitizeTest.php View File

@ -240,7 +240,7 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
}
public function testPhotoInContentWithNoText() {
public function testPhotoInContentWithNameAndNoText() {
$url = 'http://sanitize.example/cleverdevil';
$response = $this->parse(['url' => $url]);
@ -285,6 +285,67 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
$this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]);
}
public function testPhotoInContentWithNoText() {
  // The e-content in this fixture is only a <p> wrapping an <img> that has alt text.
  // The post must not be treated as empty: the HTML is returned (with the image URL
  // made absolute) while the plaintext value is an empty string.
  $result = $this->parse([
    'url' => 'http://sanitize.example/photo-in-content-with-alt-no-text'
  ]);
  $rawJson = $result->getContent();
  $this->assertEquals(200, $result->getStatusCode());

  $payload = json_decode($rawJson, true);
  $content = $payload['data']['content'];
  $this->assertEquals('<p><img src="http://sanitize.example/photo.jpg" alt="test" /></p>', $content['html']);
  $this->assertEquals('', $content['text']);
}
public function testPhotoInContentWithPNoAlt() {
  // Same shape as the alt-text case, but the fixture's <img> has no alt attribute.
  // The expected HTML nevertheless carries alt="photo.jpg" (presumably filled in
  // during sanitization - confirm), and the plaintext value is an empty string.
  $result = $this->parse([
    'url' => 'http://sanitize.example/photo-in-content-with-p-no-alt'
  ]);
  $rawJson = $result->getContent();
  $this->assertEquals(200, $result->getStatusCode());

  $payload = json_decode($rawJson, true);
  $content = $payload['data']['content'];
  $this->assertEquals('<p><img src="http://sanitize.example/photo.jpg" alt="photo.jpg" /></p>', $content['html']);
  $this->assertEquals('', $content['text']);
}
public function testPhotoInContentNoPWithURLPhoto() {
  // The fixture's e-content holds a bare <img> (no wrapping <p>) and the h-entry
  // also has a u-url. The image-only HTML must be returned with an absolute src,
  // alongside an empty plaintext value.
  $result = $this->parse([
    'url' => 'http://sanitize.example/photo-in-content-no-p-with-url-photo'
  ]);
  $rawJson = $result->getContent();
  $this->assertEquals(200, $result->getStatusCode());

  $payload = json_decode($rawJson, true);
  $content = $payload['data']['content'];
  $this->assertEquals('<img src="http://sanitize.example/photo.jpg" alt="test" />', $content['html']);
  $this->assertEquals('', $content['text']);
}
public function testPhotoInContentNoPWithAlt() {
  // This h-entry has no u-url, so it gets an implied u-photo. That situation should
  // never happen in the wild, so we don't care what the parsed result looks like:
  // only a successful response is checked and the decoded body is left unasserted.
  $result = $this->parse([
    'url' => 'http://sanitize.example/photo-in-content-no-p-with-alt'
  ]);
  $rawJson = $result->getContent();
  $this->assertEquals(200, $result->getStatusCode());
  $payload = json_decode($rawJson, true);
}
/*
// TODO: add support for embedded video and audio tags in html content
public function testContentIsOnlyVideo() {
$url = 'http://sanitize.example/content-is-only-video';
$response = $this->parse(['url' => $url]);
$body = $response->getContent();
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body, true);
print_r($data);
}
*/
public function testPhotosWithAlt() {
// https://github.com/microformats/microformats2-parsing/issues/16
@ -295,17 +356,15 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
$this->assertEquals(200, $response->getStatusCode());
$data = json_decode($body);
#print_r($data->data);
$this->assertEquals('🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD'."\n\n".'#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter', $data->data->content->text);
$this->assertObjectNotHasAttribute('name', $data->data);
$this->assertEquals('https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg', $data->data->photo[0]);
$this->assertEquals('https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg', $data->data->photo[1]);
}
/*
// Commented out until #55 is resolved
// Ignoring this issue for now. This should not happen in the wild.
// https://github.com/aaronpk/XRay/issues/55
// Skipping the implied photo check because in the wild, h-entries should not exist without a u-url, which stops implied parsing.
public function testEntryWithImgNoImpliedPhoto() {
// See https://github.com/microformats/microformats2-parsing/issues/6#issuecomment-357286985
// and https://github.com/aaronpk/XRay/issues/52#issuecomment-357269683
@ -319,11 +378,9 @@ class SanitizeTest extends PHPUnit_Framework_TestCase {
$this->assertObjectNotHasAttribute('photo', $data->data);
$this->assertObjectNotHasAttribute('name', $data->data);
$this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
$this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed.', $data->data->content->text);
$this->assertEquals('This is a photo post with an <code>img</code> tag inside the content, which does not have a u-photo class so should not be removed. <img src="http://target.example.com/photo.jpg" alt="a photo">', $data->data->content->html);
$this->assertEquals('This is a photo post with an <code>img</code> tag inside the content, which does not have a u-photo class so should not be removed. <img src="http://target.example.com/photo.jpg" alt="a photo" />', $data->data->content->html);
}
*/
public function testWhitespaceWithBreakTags() {
$url = 'http://sanitize.example/entry-with-br-tags';

+ 11
- 0
tests/data/sanitize.example/content-is-only-video View File

@ -0,0 +1,11 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<div class="h-entry">
<div class="e-content">
<video src="/video.mp4"></video>
</div>
</div>

+ 1
- 0
tests/data/sanitize.example/entry-with-img-no-implied-photo View File

@ -10,5 +10,6 @@ Connection: keep-alive
</head>
<body class="h-entry">
<p class="e-content p-name">This is a photo post with an <code>img</code> tag inside the content, which does not have a u-photo class so should not be removed. <img src="http://target.example.com/photo.jpg" alt="a photo"></p>
<a href="/permalink" class="u-url">permalink</a>
</body>
</html>

+ 11
- 0
tests/data/sanitize.example/photo-in-content-no-p-with-alt View File

@ -0,0 +1,11 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<div class="h-entry">
<div class="e-content">
<img src="/photo.jpg" alt="test">
</div>
</div>

+ 12
- 0
tests/data/sanitize.example/photo-in-content-no-p-with-url-photo View File

@ -0,0 +1,12 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<div class="h-entry">
<div class="e-content">
<img src="/photo.jpg" alt="test">
</div>
<a href="/permalink" class="u-url">permalink</a>
</div>

+ 11
- 0
tests/data/sanitize.example/photo-in-content-with-alt-no-text View File

@ -0,0 +1,11 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<div class="h-entry">
<div class="e-content">
<p><img src="/photo.jpg" alt="test"></p>
</div>
</div>

+ 11
- 0
tests/data/sanitize.example/photo-in-content-with-p-no-alt View File

@ -0,0 +1,11 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<div class="h-entry">
<div class="e-content">
<p><img src="/photo.jpg"></p>
</div>
</div>

Loading…
Cancel
Save