You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

440 lines
20 KiB

  1. <?php
  2. use Symfony\Component\HttpFoundation\Request;
  3. use Symfony\Component\HttpFoundation\Response;
  /**
   * Tests how parsed entries are sanitized: allowed/stripped HTML tags, entity
   * escaping, unsafe URL schemes, and the interaction between photos, names and
   * content.
   *
   * Every test requests a `http://sanitize.example/...` URL through a `Parse`
   * client whose `http` member is replaced with a `p3k\HTTP\Test` instance built
   * on this directory's `data/` folder (presumably so responses are served from
   * fixture files instead of the network -- confirm against p3k\HTTP\Test).
   */
  class SanitizeTest extends PHPUnit\Framework\TestCase
  {
      /**
       * Client under test, rebuilt before every test by setUp().
       *
       * Declared explicitly: assigning an undeclared property is deprecated
       * as of PHP 8.2.
       *
       * @var Parse
       */
      private $client;

      /**
       * Builds a fresh client wired to the test HTTP layer.
       */
      public function setUp(): void
      {
          $this->client = new Parse();
          $this->client->http = new p3k\HTTP\Test(__DIR__.'/data/');
          // NOTE(review): `mc` looks like a cache handle being disabled -- confirm.
          $this->client->mc = null;
      }

      /**
       * Runs the client's parse() with the given query parameters.
       *
       * @param array $params Query parameters used to build the Request.
       *
       * @return mixed Whatever the client's parse() returns; the tests treat it
       *               as a Response (getStatusCode() / getContent()).
       */
      private function parse($params)
      {
          $request = new Request($params);
          $response = new Response();
          return $this->client->parse($request, $response);
      }

      /**
       * Parses $url, asserts a 200 status and returns the decoded JSON body.
       *
       * @param string $url         URL to parse.
       * @param bool   $assoc       Passed to json_decode(): true for arrays,
       *                            false for objects.
       * @param array  $extraParams Additional query parameters, appended after
       *                            `url`.
       *
       * @return mixed Decoded JSON body.
       */
      private function fetchDecoded(string $url, bool $assoc, array $extraParams = [])
      {
          $response = $this->parse(['url' => $url] + $extraParams);
          $this->assertEquals(200, $response->getStatusCode());
          return json_decode($response->getContent(), $assoc);
      }

      /**
       * Parses $url and returns the JSON body decoded as an associative array.
       *
       * @param string $url         URL to parse.
       * @param array  $extraParams Additional query parameters.
       *
       * @return mixed Decoded body (an array when the body is a JSON object).
       */
      private function fetchArray(string $url, array $extraParams = [])
      {
          return $this->fetchDecoded($url, true, $extraParams);
      }

      /**
       * Parses $url and returns the JSON body decoded as objects.
       *
       * @param string $url URL to parse.
       *
       * @return mixed Decoded body (a stdClass when the body is a JSON object).
       */
      private function fetchObject(string $url)
      {
          return $this->fetchDecoded($url, false);
      }

      /**
       * Asserts that $object has a property called $name.
       *
       * Used instead of assertObjectHasAttribute(), which is deprecated in
       * PHPUnit 9.6 and removed in PHPUnit 10.
       *
       * @param string $name   Property name.
       * @param object $object Object to inspect.
       */
      private function assertHasProperty(string $name, $object): void
      {
          $this->assertTrue(
              property_exists($object, $name),
              "Failed asserting that object has property \"{$name}\"."
          );
      }

      /**
       * Asserts that $object has no property called $name.
       *
       * Used instead of assertObjectNotHasAttribute(), which is deprecated in
       * PHPUnit 9.6 and removed in PHPUnit 10.
       *
       * @param string $name   Property name.
       * @param object $object Object to inspect.
       */
      private function assertLacksProperty(string $name, $object): void
      {
          $this->assertFalse(
              property_exists($object, $name),
              "Failed asserting that object does not have property \"{$name}\"."
          );
      }

      /**
       * Tags on the allow-list must survive sanitization unchanged.
       */
      public function testAllowsWhitelistedTags(): void
      {
          $data = $this->fetchArray('http://sanitize.example/entry-with-valid-tags');
          $html = $data['data']['content']['html'];

          $this->assertEquals('entry', $data['data']['type']);
          $this->assertStringContainsString('This content has only valid tags.', $html);
          $this->assertStringContainsString('<a href="http://sanitize.example/example">links</a>,', $html, '<a> missing');
          $this->assertStringContainsString('<abbr>abbreviations</abbr>,', $html, '<abbr> missing');
          $this->assertStringContainsString('<b>bold</b>,', $html, '<b> missing');
          $this->assertStringContainsString('<code>inline code</code>,', $html, '<code> missing');
          $this->assertStringContainsString('<del>delete</del>,', $html, '<del> missing');
          $this->assertStringContainsString('<em>emphasis</em>,', $html, '<em> missing');
          $this->assertStringContainsString('<i>italics</i>,', $html, '<i> missing');
          $this->assertStringContainsString('<img src="http://sanitize.example/example.jpg" alt="images are allowed" />', $html, '<img> missing');
          $this->assertStringContainsString('<q>inline quote</q>,', $html, '<q> missing');
          $this->assertStringContainsString('<strike>strikethrough</strike>,', $html, '<strike> missing');
          $this->assertStringContainsString('<strong>strong text</strong>,', $html, '<strong> missing');
          $this->assertStringContainsString('<time datetime="2016-01-01">time elements</time>', $html, '<time> missing');
          $this->assertStringContainsString('<blockquote>Blockquote tags are okay</blockquote>', $html);
          $this->assertStringContainsString('<pre>preformatted text is okay too', $html, '<pre> missing');
          $this->assertStringContainsString('for code examples and such</pre>', $html, '<pre> missing');
          $this->assertStringContainsString('<p>Paragraph tags are allowed</p>', $html, '<p> missing');
          $this->assertStringContainsString('<h1>One</h1>', $html, '<h1> missing');
          $this->assertStringContainsString('<h2>Two</h2>', $html, '<h2> missing');
          $this->assertStringContainsString('<h3>Three</h3>', $html, '<h3> missing');
          $this->assertStringContainsString('<h4>Four</h4>', $html, '<h4> missing');
          $this->assertStringContainsString('<h5>Five</h5>', $html, '<h5> missing');
          $this->assertStringContainsString('<h6>Six</h6>', $html, '<h6> missing');
          $this->assertStringContainsString('<ul>', $html, '<ul> missing');
          $this->assertStringContainsString('<li>One</li>', $html, '<li> missing');
          $this->assertStringContainsString('<p>We should allow<br />break<br />tags too</p>', $html, '<br> missing');
      }

      /**
       * <script>/<style> tags and their contents must not leak into html or text.
       */
      public function testRemovesUnsafeTags(): void
      {
          $data = $this->fetchArray('http://sanitize.example/entry-with-unsafe-tags');
          $html = $data['data']['content']['html'];
          $text = $data['data']['content']['text'];

          $this->assertEquals('entry', $data['data']['type']);
          $this->assertStringNotContainsString('<script>', $html);
          $this->assertStringNotContainsString('<style>', $html);
          // NOTE(review): the needle is spelled 'visiblity'. If the fixture's CSS
          // spells it 'visibility', these two checks can never fail -- verify
          // against the fixture before changing the string.
          $this->assertStringNotContainsString('visiblity', $html); // from the CSS
          $this->assertStringNotContainsString('alert', $html); // from the JS
          $this->assertStringNotContainsString('visiblity', $text);
          $this->assertStringNotContainsString('alert', $text);
      }

      /**
       * Microformats2 class names are kept; other class attributes are dropped.
       */
      public function testAllowsMF2Classes(): void
      {
          $data = $this->fetchArray('http://sanitize.example/entry-with-mf2-classes');
          $html = $data['data']['content']['html'];

          $this->assertEquals('entry', $data['data']['type']);
          $this->assertStringContainsString('<h2 class="p-name">Hello World</h2>', $html);
          $this->assertStringContainsString('<h3>Utility Class</h3>', $html);
      }

      /**
       * The plain-text value must contain decoded entities.
       */
      public function testEscapingHTMLTagsInText(): void
      {
          $data = $this->fetchArray('http://sanitize.example/html-escaping-in-text');

          $this->assertEquals('entry', $data['data']['type']);
          $this->assertEquals('This content has some HTML escaped entities such as & ampersand, " quote, escaped <code> HTML tags, an ümlaut, an @at sign.', $data['data']['content']['text']);
      }

      /**
       * The html value keeps markup-significant characters escaped, while the
       * text value is decoded.
       */
      public function testEscapingHTMLTagsInHTML(): void
      {
          $data = $this->fetchArray('http://sanitize.example/html-escaping-in-html');

          $this->assertEquals('entry', $data['data']['type']);
          $this->assertArrayNotHasKey('name', $data['data']);
          $this->assertEquals('This content has some HTML escaped entities such as & ampersand, " quote, escaped <code> HTML tags, an ümlaut, an @at sign.', $data['data']['content']['text']);
          $this->assertEquals('This content has some <i>HTML escaped</i> entities such as &amp; ampersand, " quote, escaped &lt;code&gt; HTML tags, an ümlaut, an @at sign.', $data['data']['content']['html']);
      }

      /**
       * Iframes are stripped by default and only partially allowed when the
       * `allow-iframe-video` parameter is set.
       */
      public function testAllowIframeVideo(): void
      {
          $url = 'http://sanitize.example/entry-with-iframe-video';

          // Default behaviour: no iframe at all.
          $data = $this->fetchArray($url);
          $html = $data['data']['content']['html'];
          $this->assertStringNotContainsString('<iframe', $html);

          // Opt-in behaviour.
          $data = $this->fetchArray($url, ['allow-iframe-video' => 'true']);
          $html = $data['data']['content']['html'];
          $this->assertStringContainsString('youtube.com', $html);
          $this->assertStringNotContainsString('https://attack-domain.com', $html);
          $this->assertStringNotContainsString('<iframe width="580" height="345"', $html);
      }

      /**
       * URL-valued properties containing javascript: URLs must be blanked or
       * removed.
       */
      public function testSanitizeJavascriptURLs(): void
      {
          $data = $this->fetchArray('http://sanitize.example/h-entry-with-javascript-urls');
          $entry = $data['data'];

          $this->assertEquals('entry', $entry['type']);
          $this->assertEquals('', $entry['author']['url']);

          $removedProperties = [
              'url',
              'photo',
              'audio',
              'video',
              'syndication',
              'in-reply-to',
              'like-of',
              'repost-of',
              'bookmark-of',
          ];
          foreach ($removedProperties as $property) {
              $this->assertArrayNotHasKey($property, $entry);
          }

          $this->assertEquals('Author', $entry['author']['name']);
          $this->assertEquals('', $entry['author']['photo']);
      }

      /**
       * An e-mail style author URL is blanked while name and photo are kept.
       */
      public function testSanitizeEmailAuthorURL(): void
      {
          $data = $this->fetchObject('http://sanitize.example/h-entry-with-email-author');

          $this->assertEquals('entry', $data->data->type);
          $this->assertEquals('', $data->data->author->url);
          $this->assertEquals('Author', $data->data->author->name);
          $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo);
      }

      public function testPhotoInContentNoAlt(): void
      {
          // https://github.com/aaronpk/XRay/issues/52
          $data = $this->fetchObject('http://sanitize.example/photo-in-content');

          $this->assertLacksProperty('name', $data->data);
          $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
          $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text);
          $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
      }

      /*
      // Commented out until #56 is resolved
      // https://github.com/aaronpk/XRay/issues/56
      public function testPhotoInTextContentNoAlt(): void
      {
          $data = $this->fetchObject('http://sanitize.example/photo-in-text-content');

          $this->assertLacksProperty('name', $data->data);
          $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
          $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text);
          $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
      }
      */

      /**
       * Relative image URLs inside content are resolved against the page URL.
       */
      public function testRelativePhotoInContent(): void
      {
          $data = $this->fetchObject('http://sanitize.example/photo-in-content-relative');

          $this->assertStringContainsString('http://sanitize.example/photo1.jpg', $data->data->content->html);
      }

      /**
       * A relative photo property is resolved against the page URL.
       */
      public function testRelativePhotoProperty(): void
      {
          $data = $this->fetchObject('http://sanitize.example/photo-relative');

          $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]);
      }

      public function testPhotoInContentEmptyAltAttribute(): void
      {
          // https://github.com/aaronpk/XRay/issues/52
          $data = $this->fetchObject('http://sanitize.example/photo-in-content-empty-alt');

          $this->assertLacksProperty('name', $data->data);
          $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
          $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text);
          $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
      }

      public function testPhotoInContentWithAlt(): void
      {
          // https://github.com/aaronpk/XRay/issues/52
          $data = $this->fetchObject('http://sanitize.example/photo-in-content-with-alt');

          $this->assertLacksProperty('name', $data->data);
          $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
          $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text);
          $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
      }

      /**
       * A post with a name and a photo but no text yields a name and no content.
       */
      public function testPhotoInContentWithNameAndNoText(): void
      {
          $data = $this->fetchObject('http://sanitize.example/cleverdevil');

          $this->assertHasProperty('name', $data->data);
          $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name);
          $this->assertLacksProperty('content', $data->data);
          $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $data->data->photo[0]);
      }

      public function testPhotoWithDupeNameAndAlt1(): void
      {
          // https://github.com/aaronpk/XRay/issues/57
          $data = $this->fetchObject('http://sanitize.example/photo-with-dupe-name-alt');

          $this->assertHasProperty('name', $data->data);
          $this->assertEquals('Photo caption', $data->data->name);
          $this->assertLacksProperty('content', $data->data);
          $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]);
      }

      public function testPhotoWithDupeNameAndAlt2(): void
      {
          // This is similar to adactio's markup
          // https://adactio.com/notes/13301
          $data = $this->fetchObject('http://sanitize.example/photo-with-dupe-name-alt-2');

          $this->assertHasProperty('content', $data->data);
          $this->assertEquals('Photo caption', $data->data->content->text);
          $this->assertLacksProperty('name', $data->data);
          $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]);
      }

      /**
       * Content that is only an image keeps the image in html and has empty text.
       */
      public function testPhotoInContentWithNoText(): void
      {
          $data = $this->fetchArray('http://sanitize.example/photo-in-content-with-alt-no-text');

          $this->assertEquals('<p><img src="http://sanitize.example/photo.jpg" alt="test" /></p>', $data['data']['content']['html']);
          $this->assertEquals('', $data['data']['content']['text']);
      }

      /**
       * An image with no alt attribute is expected to come back with the
       * alt text "photo.jpg".
       */
      public function testPhotoInContentWithPNoAlt(): void
      {
          $data = $this->fetchArray('http://sanitize.example/photo-in-content-with-p-no-alt');

          $this->assertEquals('<p><img src="http://sanitize.example/photo.jpg" alt="photo.jpg" /></p>', $data['data']['content']['html']);
          $this->assertEquals('', $data['data']['content']['text']);
      }

      public function testPhotoInContentNoPWithURLPhoto(): void
      {
          $data = $this->fetchArray('http://sanitize.example/photo-in-content-no-p-with-url-photo');

          $this->assertEquals('<img src="http://sanitize.example/photo.jpg" alt="test" />', $data['data']['content']['html']);
          $this->assertEquals('', $data['data']['content']['text']);
      }

      public function testPhotoInContentNoPWithAlt(): void
      {
          // This h-entry has no u-url so has an implied u-photo. We don't actually care what happens with it because
          // this should never happen in the wild, so only the 200 status (asserted by the helper) is checked.
          $this->fetchArray('http://sanitize.example/photo-in-content-no-p-with-alt');
      }

      /*
      // TODO: add support for embedded video and audio tags in html content
      public function testContentIsOnlyVideo(): void
      {
          $data = $this->fetchArray('http://sanitize.example/content-is-only-video');
          print_r($data);
      }
      */

      public function testPhotosWithAlt(): void
      {
          // https://github.com/microformats/microformats2-parsing/issues/16
          $data = $this->fetchObject('http://sanitize.example/photos-with-alt');

          $this->assertEquals('🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD'."\n\n".'#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter', $data->data->content->text);
          $this->assertLacksProperty('name', $data->data);
          $this->assertEquals('https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg', $data->data->photo[0]);
          $this->assertEquals('https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg', $data->data->photo[1]);
      }

      // Ignoring this issue for now. This should not happen in the wild.
      // https://github.com/aaronpk/XRay/issues/55
      // Skipping the implied photo check because in the wild, h-entries should not exist without a u-url, which stops implied parsing.
      public function testEntryWithImgNoImpliedPhoto(): void
      {
          // See https://github.com/microformats/microformats2-parsing/issues/6#issuecomment-357286985
          // and https://github.com/aaronpk/XRay/issues/52#issuecomment-357269683
          // and https://github.com/microformats/microformats2-parsing/issues/16
          $data = $this->fetchObject('http://sanitize.example/entry-with-img-no-implied-photo');

          $this->assertLacksProperty('photo', $data->data);
          $this->assertLacksProperty('name', $data->data);
          $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed.', $data->data->content->text);
          $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content, which does not have a u-photo class so should not be removed. <img src="http://target.example.com/photo.jpg" alt="a photo" />', $data->data->content->html);
      }

      /**
       * Two consecutive <br> tags are kept in html and become a blank line in text.
       */
      public function testWhitespaceWithBreakTags(): void
      {
          $data = $this->fetchObject('http://sanitize.example/entry-with-br-tags');

          $this->assertEquals('This content has two break tags to indicate a paragraph break.<br /><br />This is how tantek\'s autolinker works.', $data->data->content->html);
          $this->assertEquals("This content has two break tags to indicate a paragraph break.\n\nThis is how tantek's autolinker works.", $data->data->content->text);
      }
  }