From ea1f054bdc1e1008fc064d7cb3d44352992eb03b Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 2 Feb 2024 07:26:02 -0800 Subject: [PATCH] fix multiline posts with smashed together paragraph tags closes #117 --- lib/XRay/Formats/Format.php | 10 +++++++++- tests/ActivityStreamsTest.php | 2 +- tests/ParseTest.php | 12 ++++++++++++ tests/SanitizeTest.php | 16 +++++++++++++++- tests/data/sanitize.example/entry-with-p-tags | 16 ++++++++++++++++ .../source.example.com/text-content-with-p-tags | 14 ++++++++++++++ 6 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 tests/data/sanitize.example/entry-with-p-tags create mode 100644 tests/data/source.example.com/text-content-with-p-tags diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php index a741552..b059f14 100644 --- a/lib/XRay/Formats/Format.php +++ b/lib/XRay/Formats/Format.php @@ -141,10 +141,18 @@ abstract class Format implements iFormat { $config->set('Cache.DefinitionImpl', null); $config->set('HTML.AllowedElements', ['br']); $purifier = new HTMLPurifier($config); + + // Insert two br tags between smashed together paragraph tags. + // The paragraph tags will be removed by the HTMLPurifier, leaving just the br's, which + // will then be replaced by newlines. + $html = trim(str_replace('

</p><p>', "</p><br><br><p>
", $html)); + $sanitized = $purifier->purify($html); $sanitized = str_replace(" ","\r",$sanitized); $sanitized = html_entity_decode($sanitized); - return trim(str_replace(['
<br>','<br />
'],"\n", $sanitized)); + $sanitized = trim(str_replace(['
<br>','<br />
'],"\n", $sanitized)); + + return $sanitized; } } diff --git a/tests/ActivityStreamsTest.php b/tests/ActivityStreamsTest.php index 27df6c3..2c4c098 100644 --- a/tests/ActivityStreamsTest.php +++ b/tests/ActivityStreamsTest.php @@ -236,7 +236,7 @@ class ActivityStreamsTest extends PHPUnit\Framework\TestCase $this->assertEquals('reply', $data['data']['post-type']); $this->assertEquals('https://toot.cat/@jamey/100471682482196371', $data['data']['url']); $this->assertEquals('2018-07-31T22:30:09+00:00', $data['data']['published']); - $this->assertEquals('@darius Huh, I just have never encountered anyone using the phrase generically like that.But you might consider writing IndieWeb.org-style bots (Atom+WebSub, and optionally WebMention if you want them to be interactive), and then using https://fed.brid.gy/ as an alternative to implementing ActivityPub yourself...', $data['data']['content']['text']); + $this->assertEquals("@darius Huh, I just have never encountered anyone using the phrase generically like that.\n\nBut you might consider writing IndieWeb.org-style bots (Atom+WebSub, and optionally WebMention if you want them to be interactive), and then using https://fed.brid.gy/ as an alternative to implementing ActivityPub yourself...", $data['data']['content']['text']); $this->assertEquals('https://social.tinysubversions.com/users/darius/statuses/100471614681787834', $data['data']['in-reply-to'][0]); $this->assertEquals('Jamey Sharp', $data['data']['author']['name']); $this->assertEquals('https://s3-us-west-2.amazonaws.com/tootcatapril2017/accounts/avatars/000/013/259/original/c904452a8411e4f5.jpg', $data['data']['author']['photo']); diff --git a/tests/ParseTest.php b/tests/ParseTest.php index e793555..bcb274e 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -315,6 +315,18 @@ class ParseTest extends PHPUnit\Framework\TestCase $this->assertEquals('This page has a link to target.example.com and some formatted text but is in a p-content element so is 
plaintext.', $data->data->content->text); } + public function testNewlinesInTextContent() { + $url = 'http://source.example.com/text-content-with-p-tags'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertEquals('mf2+html', $data->{'source-format'}); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals("Hello\nWorld", $data->data->content->text); + } + public function testArticleWithFeaturedImage() { $url = 'http://source.example.com/article-with-featured-image'; diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index 5bf84c1..7ce4e4f 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -424,7 +424,7 @@ class SanitizeTest extends PHPUnit\Framework\TestCase $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed. a photo', $data->data->content->html); } - public function testWhitespaceWithBreakTags() + public function testEntryWithBreakTags() { $url = 'http://sanitize.example/entry-with-br-tags'; $response = $this->parse(['url' => $url]); @@ -437,4 +437,18 @@ class SanitizeTest extends PHPUnit\Framework\TestCase $this->assertEquals("This content has two break tags to indicate a paragraph break.\n\nThis is how tantek's autolinker works.", $data->data->content->text); } + public function testEntryWithParagraphTags() + { + $url = 'http://sanitize.example/entry-with-p-tags'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertEquals('

<p>This is a multiline post separated by paragraph tags with no space between them.</p><p>This is how Mastodon formats HTML.</p>

', $data->data->content->html); + $this->assertEquals("This is a multiline post separated by paragraph tags with no space between them.\n\nThis is how Mastodon formats HTML.", $data->data->content->text); + } + + } diff --git a/tests/data/sanitize.example/entry-with-p-tags b/tests/data/sanitize.example/entry-with-p-tags new file mode 100644 index 0000000..04ad142 --- /dev/null +++ b/tests/data/sanitize.example/entry-with-p-tags @@ -0,0 +1,16 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 02 Mar 2018 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +
+

<p>This is a multiline post separated by paragraph tags with no space between them.</p><p>This is how Mastodon formats HTML.</p>

+
+ + diff --git a/tests/data/source.example.com/text-content-with-p-tags b/tests/data/source.example.com/text-content-with-p-tags new file mode 100644 index 0000000..3fabe2f --- /dev/null +++ b/tests/data/source.example.com/text-content-with-p-tags @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

Hello

World

+ +