diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php index fc481ea..dff38d4 100644 --- a/lib/XRay/Formats/Format.php +++ b/lib/XRay/Formats/Format.php @@ -34,10 +34,8 @@ abstract class Format implements iFormat { return [$doc, $xpath]; } - protected static function sanitizeHTML($html) { - $config = HTMLPurifier_Config::createDefault(); - $config->set('Cache.DefinitionImpl', null); - $config->set('HTML.AllowedElements', [ + protected static function sanitizeHTML($html, $allowImg=true) { + $allowed = [ 'a', 'abbr', 'b', @@ -45,7 +43,6 @@ abstract class Format implements iFormat { 'del', 'em', 'i', - 'img', 'q', 'strike', 'strong', @@ -62,7 +59,13 @@ abstract class Format implements iFormat { 'ul', 'li', 'ol' - ]); + ]; + if($allowImg) + $allowed[] = 'img'; + + $config = HTMLPurifier_Config::createDefault(); + $config->set('Cache.DefinitionImpl', null); + $config->set('HTML.AllowedElements', $allowed); $def = $config->getHTMLDefinition(true); $def->addElement( 'time', diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index 7c41e22..675a328 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -220,21 +220,39 @@ class Mf2 extends Format { $textContent = $content; } elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) { if(array_key_exists('html', $content)) { - $htmlContent = trim(self::sanitizeHTML($content['html'])); - #$textContent = trim(str_replace(" ","\r",strip_tags($htmlContent))); + // Only allow images in the content if there is no photo property set + if(isset($item['properties']['photo'])) + $allowImg = false; + else + $allowImg = true; + + $htmlContent = trim(self::sanitizeHTML($content['html'], $allowImg)); $textContent = trim(str_replace(" ","\r",$content['value'])); } else { $textContent = trim($content['value']); } } - $data = [ - 'text' => $textContent - ]; - if($htmlContent && $textContent != $htmlContent) { - $data['html'] = $htmlContent; + if($textContent || $htmlContent) { + $data = [ + 'text' => $textContent + ]; + // Only add HTML content if there is actual content. + // If the text content ends up empty, then the HTML should be too + // e.g.
+ // should not return content of + // TODO: still need to remove empty tags when there is other text in the content + if($htmlContent && $textContent && $textContent != $htmlContent) { + $data['html'] = $htmlContent; + } + + if(!$data['text']) + return null; + + return $data; + } else { + return null; } - return $data; } // Always return arrays, and may contain plaintext content diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index 54fdcf6..a06a84a 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -151,4 +151,104 @@ class SanitizeTest extends PHPUnit_Framework_TestCase { $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo); } + public function testPhotoInContent() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + #print_r($data->data); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentEmptyAltAttribute() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content-empty-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentWithAlt() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content-with-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentWithNoText() { + $url = 'http://sanitize.example/cleverdevil'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectHasAttribute('name', $data->data); + $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name); + $this->assertObjectNotHasAttribute('content', $data->data); + $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $data->data->photo[0]); + } + + public function testPhotosWithAlt() { + // https://github.com/microformats/microformats2-parsing/issues/16 + + $url = 'http://sanitize.example/photos-with-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + #print_r($data->data); + + $this->assertEquals('🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter', $data->data->content->text); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg', $data->data->photo[0]); + $this->assertEquals('https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg', $data->data->photo[1]); + } + + public function testEntryWithImgNoImpliedPhoto() { + // See https://github.com/microformats/microformats2-parsing/issues/6#issuecomment-357286985 + // and https://github.com/aaronpk/XRay/issues/52#issuecomment-357269683 + // and https://github.com/microformats/microformats2-parsing/issues/16 + $url = 'http://sanitize.example/entry-with-img-no-implied-photo'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('photo', $data->data); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed. a photo', $data->data->content->html); + } + } diff --git a/tests/data/sanitize.example/cleverdevil b/tests/data/sanitize.example/cleverdevil new file mode 100644 index 0000000..fe9809d --- /dev/null +++ b/tests/data/sanitize.example/cleverdevil @@ -0,0 +1,47 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +
+
+ +
 
+
+
+

+ + + +

+
+
+

Oh, how well they know me! 🥃 +

+ +
+ +
+ Oh, how well they know me! 🥃 +
+ + diff --git a/tests/data/sanitize.example/entry-with-img-no-implied-photo b/tests/data/sanitize.example/entry-with-img-no-implied-photo new file mode 100644 index 0000000..f788e57 --- /dev/null +++ b/tests/data/sanitize.example/entry-with-img-no-implied-photo @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed. a photo

+ + diff --git a/tests/data/sanitize.example/photo-in-content b/tests/data/sanitize.example/photo-in-content new file mode 100644 index 0000000..7e9258d --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content.

+ + diff --git a/tests/data/sanitize.example/photo-in-content-empty-alt b/tests/data/sanitize.example/photo-in-content-empty-alt new file mode 100644 index 0000000..c55581d --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content-empty-alt @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content.

+ + diff --git a/tests/data/sanitize.example/photo-in-content-with-alt b/tests/data/sanitize.example/photo-in-content-with-alt new file mode 100644 index 0000000..8b95892 --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content-with-alt @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content. a photo

+ + diff --git a/tests/data/sanitize.example/photos-with-alt b/tests/data/sanitize.example/photos-with-alt new file mode 100644 index 0000000..41853cb --- /dev/null +++ b/tests/data/sanitize.example/photos-with-alt @@ -0,0 +1,73 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + +
+

tantek.com

+ + + +
+ +
+ +
+ + +

a jpg. a jpg. 🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD

#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter

+ + on + + (ttk.me t4sE3) + using BBEdit + + + + +