diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php index fc481ea..41637ef 100644 --- a/lib/XRay/Formats/Format.php +++ b/lib/XRay/Formats/Format.php @@ -34,10 +34,8 @@ abstract class Format implements iFormat { return [$doc, $xpath]; } - protected static function sanitizeHTML($html) { - $config = HTMLPurifier_Config::createDefault(); - $config->set('Cache.DefinitionImpl', null); - $config->set('HTML.AllowedElements', [ + protected static function sanitizeHTML($html, $allowImg=true) { + $allowed = [ 'a', 'abbr', 'b', @@ -45,7 +43,6 @@ abstract class Format implements iFormat { 'del', 'em', 'i', - 'img', 'q', 'strike', 'strong', @@ -62,7 +59,13 @@ abstract class Format implements iFormat { 'ul', 'li', 'ol' - ]); + ]; + if($allowImg) + $allowed[] = 'img'; + + $config = HTMLPurifier_Config::createDefault(); + $config->set('Cache.DefinitionImpl', null); + $config->set('HTML.AllowedElements', $allowed); $def = $config->getHTMLDefinition(true); $def->addElement( 'time', @@ -81,8 +84,16 @@ abstract class Format implements iFormat { return trim($sanitized); } + // Return a plaintext version of the input HTML protected static function stripHTML($html) { - return trim(strip_tags($html)); + $config = HTMLPurifier_Config::createDefault(); + $config->set('Cache.DefinitionImpl', null); + $config->set('HTML.AllowedElements', ['br']); + $purifier = new HTMLPurifier($config); + $sanitized = $purifier->purify($html); + $sanitized = str_replace(" ","\r",$sanitized); + $sanitized = html_entity_decode($sanitized); + return trim(str_replace('
',"\n", $sanitized)); } diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index dbb03e0..936091e 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -220,21 +220,40 @@ class Mf2 extends Format { $textContent = $content; } elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) { if(array_key_exists('html', $content)) { - $htmlContent = trim(self::sanitizeHTML($content['html'])); - #$textContent = trim(str_replace(" ","\r",strip_tags($htmlContent))); - $textContent = trim(str_replace(" ","\r",$content['value'])); + // Only allow images in the content if there is no photo property set + if(isset($item['properties']['photo'])) + $allowImg = false; + else + $allowImg = true; + + $htmlContent = trim(self::sanitizeHTML($content['html'], $allowImg)); + #$textContent = trim(str_replace(" ","\r",$content['value'])); + $textContent = trim(self::stripHTML($htmlContent)); } else { $textContent = trim($content['value']); } } - $data = [ - 'text' => $textContent - ]; - if($htmlContent && $textContent != $htmlContent) { - $data['html'] = $htmlContent; + if($textContent || $htmlContent) { + $data = [ + 'text' => $textContent + ]; + // Only add HTML content if there is actual content. + // If the text content ends up empty, then the HTML should be too + // e.g.
+ // should not return content of + // TODO: still need to remove empty tags when there is other text in the content + if($htmlContent && $textContent && $textContent != $htmlContent) { + $data['html'] = $htmlContent; + } + + if(!$data['text']) + return null; + + return $data; + } else { + return null; } - return $data; } // Always return arrays, and may contain plaintext content @@ -321,12 +340,16 @@ class Mf2 extends Format { $textContent = null; $htmlContent = null; - $content = self::parseHTMLValue('content', $item); - if($content) { + $content = self::getHTMLValue($item, 'content'); + + if(is_string($content)) { + $textContent = $content; + } elseif($content) { $htmlContent = array_key_exists('html', $content) ? $content['html'] : null; - $textContent = array_key_exists('text', $content) ? $content['text'] : null; + $textContent = array_key_exists('value', $content) ? $content['value'] : null; } + $checkedname = $name; if($content) { // Trim ellipses from the name $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name); @@ -337,19 +360,29 @@ class Mf2 extends Format { // Check if the name is a prefix of the content if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) { - $name = null; + $checkedname = null; } } - if($name) { - $data['name'] = $name; + if($checkedname) { + $data['name'] = $checkedname; } // If there is content, always return the plaintext content, and return HTML content if it's different if($content) { - $data['content']['text'] = $content['text']; - if(array_key_exists('html', $content)) - $data['content']['html'] = $content['html']; + $content = self::parseHTMLValue('content', $item); + if($content['text']) { + $data['content']['text'] = $content['text']; + if(isset($content['html'])) + $data['content']['html'] = $content['html']; + } else { + // If the content text was blank because the img was removed and that was the only content, + // then put the name back as the name if it was previously set. 
+ // See https://github.com/aaronpk/XRay/issues/57 + if($name) { + $data['name'] = $name; + } + } } } @@ -744,6 +777,20 @@ class Mf2 extends Format { return $fallback; } + private static function getHTMLValue($mf2, $k, $fallback=null) { + // Return an array with html and value if the value is html, otherwise return a string + if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) { + // $mf2['properties'][$k] will always be an array since the input was from the mf2 parser + $value = $mf2['properties'][$k][0]; + if(is_string($value)) { + return $value; + } elseif(isset($value['html'])) { + return $value; + } + } + return $fallback; + } + private static function getPlaintextValues($mf2, $k, $values=[]) { if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) { foreach($mf2['properties'][$k] as $value) { diff --git a/lib/helpers.php b/lib/helpers.php index d3a503f..8a90a9c 100644 --- a/lib/helpers.php +++ b/lib/helpers.php @@ -11,7 +11,8 @@ function normalize_url($url) { $parts = parse_url($url); if(empty($parts['path'])) $parts['path'] = '/'; - $parts['host'] = strtolower($parts['host']); + if(isset($parts['host'])) + $parts['host'] = strtolower($parts['host']); return build_url($parts); } diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index 54fdcf6..9d215f3 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -151,4 +151,149 @@ class SanitizeTest extends PHPUnit_Framework_TestCase { $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo); } + public function testPhotoInContentNoAlt() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', 
$data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInTextContentNoAlt() { + // https://github.com/aaronpk/XRay/issues/56 + + $url = 'http://sanitize.example/photo-in-text-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentEmptyAltAttribute() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content-empty-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentWithAlt() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content-with-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + 
$this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentWithNoText() { + $url = 'http://sanitize.example/cleverdevil'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectHasAttribute('name', $data->data); + $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name); + $this->assertObjectNotHasAttribute('content', $data->data); + $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $data->data->photo[0]); + } + + public function testPhotoWithDupeNameAndAlt1() { + // https://github.com/aaronpk/XRay/issues/57 + $url = 'http://sanitize.example/photo-with-dupe-name-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectHasAttribute('name', $data->data); + $this->assertEquals('Photo caption', $data->data->name); + $this->assertObjectNotHasAttribute('content', $data->data); + $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]); + } + + public function testPhotoWithDupeNameAndAlt2() { + // This is similar to adactio's markup + // https://adactio.com/notes/13301 + $url = 'http://sanitize.example/photo-with-dupe-name-alt-2'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectHasAttribute('content', $data->data); + 
$this->assertEquals('Photo caption', $data->data->content->text); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]); + } + + public function testPhotosWithAlt() { + // https://github.com/microformats/microformats2-parsing/issues/16 + + $url = 'http://sanitize.example/photos-with-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + #print_r($data->data); + + $this->assertEquals('🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter', $data->data->content->text); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg', $data->data->photo[0]); + $this->assertEquals('https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg', $data->data->photo[1]); + } + + public function testEntryWithImgNoImpliedPhoto() { + // See https://github.com/microformats/microformats2-parsing/issues/6#issuecomment-357286985 + // and https://github.com/aaronpk/XRay/issues/52#issuecomment-357269683 + // and https://github.com/microformats/microformats2-parsing/issues/16 + $url = 'http://sanitize.example/entry-with-img-no-implied-photo'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('photo', $data->data); + $this->assertObjectNotHasAttribute('name', $data->data); + 
$this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed. a photo', $data->data->content->html); + } + } diff --git a/tests/data/sanitize.example/cleverdevil b/tests/data/sanitize.example/cleverdevil new file mode 100644 index 0000000..fe9809d --- /dev/null +++ b/tests/data/sanitize.example/cleverdevil @@ -0,0 +1,47 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +
+
+ +
 
+
+
+

+ + + +

+
+
+

Oh, how well they know me! 🥃 +

+ +
+ +
+ Oh, how well they know me! 🥃 +
+ + diff --git a/tests/data/sanitize.example/entry-with-img-no-implied-photo b/tests/data/sanitize.example/entry-with-img-no-implied-photo new file mode 100644 index 0000000..f788e57 --- /dev/null +++ b/tests/data/sanitize.example/entry-with-img-no-implied-photo @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed. a photo

+ + diff --git a/tests/data/sanitize.example/photo-in-content b/tests/data/sanitize.example/photo-in-content new file mode 100644 index 0000000..7e9258d --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content.

+ + diff --git a/tests/data/sanitize.example/photo-in-content-empty-alt b/tests/data/sanitize.example/photo-in-content-empty-alt new file mode 100644 index 0000000..c55581d --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content-empty-alt @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content.

+ + diff --git a/tests/data/sanitize.example/photo-in-content-with-alt b/tests/data/sanitize.example/photo-in-content-with-alt new file mode 100644 index 0000000..8b95892 --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content-with-alt @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content. a photo

+ + diff --git a/tests/data/sanitize.example/photo-in-text-content b/tests/data/sanitize.example/photo-in-text-content new file mode 100644 index 0000000..36dd2b2 --- /dev/null +++ b/tests/data/sanitize.example/photo-in-text-content @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content.

+ + diff --git a/tests/data/sanitize.example/photo-with-dupe-name-alt b/tests/data/sanitize.example/photo-with-dupe-name-alt new file mode 100644 index 0000000..85a203c --- /dev/null +++ b/tests/data/sanitize.example/photo-with-dupe-name-alt @@ -0,0 +1,15 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

Photo caption

+

Photo caption

+ + diff --git a/tests/data/sanitize.example/photo-with-dupe-name-alt-2 b/tests/data/sanitize.example/photo-with-dupe-name-alt-2 new file mode 100644 index 0000000..c5ac1a9 --- /dev/null +++ b/tests/data/sanitize.example/photo-with-dupe-name-alt-2 @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

Photo caption Photo caption

+ + diff --git a/tests/data/sanitize.example/photos-with-alt b/tests/data/sanitize.example/photos-with-alt new file mode 100644 index 0000000..41853cb --- /dev/null +++ b/tests/data/sanitize.example/photos-with-alt @@ -0,0 +1,73 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + +
+

tantek.com

+ + + +
+ +
+ +
+ + +

a jpg. a jpg. 🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD

#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter

+ + on + + (ttk.me t4sE3) + using BBEdit + + + + + diff --git a/tests/data/source.example.com/h-entry-no-content b/tests/data/source.example.com/h-entry-no-content new file mode 100644 index 0000000..da52466 --- /dev/null +++ b/tests/data/source.example.com/h-entry-no-content @@ -0,0 +1,15 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a Post

+ permalink + +