You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

415 lines
19 KiB

8 years ago
  1. <?php
  2. use Symfony\Component\HttpFoundation\Request;
  3. use Symfony\Component\HttpFoundation\Response;
  /**
   * Tests covering HTML sanitization and photo/content handling of parsed entries.
   *
   * Every test requests a URL under http://sanitize.example/ through the Parse
   * client configured in setUp() and asserts on the decoded JSON response body.
   */
  class SanitizeTest extends PHPUnit_Framework_TestCase {

    /**
     * Parse client under test. Declared explicitly so that setUp() does not
     * create a dynamic property (deprecated as of PHP 8.2).
     */
    private $client;

    /**
     * Builds a fresh Parse client for each test.
     *
     * The HTTP layer is replaced with p3k\HTTP\Test pointed at the local
     * "data" directory (presumably so requests are answered from fixture
     * files rather than the network -- confirm against p3k\HTTP\Test).
     */
    public function setUp() {
      $this->client = new Parse();
      $this->client->http = new p3k\HTTP\Test(dirname(__FILE__).'/data/');
      // NOTE(review): "mc" looks like a cache handle that is disabled here -- confirm.
      $this->client->mc = null;
    }

    /**
     * Runs the given parameters through the Parse client.
     *
     * @param array $params Parameters used to build the Request (e.g. ['url' => ...]).
     * @return mixed Whatever Parse::parse() returns; the tests use it as a Response.
     */
    private function parse($params) {
      $request = new Request($params);
      $response = new Response();
      return $this->client->parse($request, $response);
    }

    /** Every tag on the sanitizer's whitelist must be present in the HTML content. */
    public function testAllowsWhitelistedTags() {
      $url = 'http://sanitize.example/entry-with-valid-tags';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $html = $data['data']['content']['html'];

      $this->assertEquals('entry', $data['data']['type']);
      $this->assertContains('This content has only valid tags.', $html);
      $this->assertContains('<a href="http://sanitize.example/example">links</a>,', $html, '<a> missing');
      $this->assertContains('<abbr>abbreviations</abbr>,', $html, '<abbr> missing');
      $this->assertContains('<b>bold</b>,', $html, '<b> missing');
      $this->assertContains('<code>inline code</code>,', $html, '<code> missing');
      $this->assertContains('<del>delete</del>,', $html, '<del> missing');
      $this->assertContains('<em>emphasis</em>,', $html, '<em> missing');
      $this->assertContains('<i>italics</i>,', $html, '<i> missing');
      $this->assertContains('<img src="http://sanitize.example/example.jpg" alt="images are allowed" />', $html, '<img> missing');
      $this->assertContains('<q>inline quote</q>,', $html, '<q> missing');
      $this->assertContains('<strike>strikethrough</strike>,', $html, '<strike> missing');
      $this->assertContains('<strong>strong text</strong>,', $html, '<strong> missing');
      $this->assertContains('<time datetime="2016-01-01">time elements</time>', $html, '<time> missing');
      $this->assertContains('<blockquote>Blockquote tags are okay</blockquote>', $html);
      $this->assertContains('<pre>preformatted text is okay too', $html, '<pre> missing');
      $this->assertContains('for code examples and such</pre>', $html, '<pre> missing');
      $this->assertContains('<p>Paragraph tags are allowed</p>', $html, '<p> missing');
      $this->assertContains('<h1>One</h1>', $html, '<h1> missing');
      $this->assertContains('<h2>Two</h2>', $html, '<h2> missing');
      $this->assertContains('<h3>Three</h3>', $html, '<h3> missing');
      $this->assertContains('<h4>Four</h4>', $html, '<h4> missing');
      $this->assertContains('<h5>Five</h5>', $html, '<h5> missing');
      $this->assertContains('<h6>Six</h6>', $html, '<h6> missing');
      $this->assertContains('<ul>', $html, '<ul> missing');
      $this->assertContains('<li>One</li>', $html, '<li> missing');
      $this->assertContains('<p>We should allow<br />break<br />tags too</p>', $html, '<br> missing');
    }

    /** Script and style tags, and their contents, must not leak into html or text. */
    public function testRemovesUnsafeTags() {
      $url = 'http://sanitize.example/entry-with-unsafe-tags';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $html = $data['data']['content']['html'];
      $text = $data['data']['content']['text'];

      $this->assertEquals('entry', $data['data']['type']);
      $this->assertNotContains('<script>', $html);
      $this->assertNotContains('<style>', $html);
      $this->assertNotContains('visiblity', $html); // from the CSS
      $this->assertNotContains('alert', $html); // from the JS
      $this->assertNotContains('visiblity', $text);
      $this->assertNotContains('alert', $text);
    }

    /** Microformats2 class names are kept while other class attributes are dropped. */
    public function testAllowsMF2Classes() {
      $url = 'http://sanitize.example/entry-with-mf2-classes';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $html = $data['data']['content']['html'];

      $this->assertEquals('entry', $data['data']['type']);
      $this->assertContains('<h2 class="p-name">Hello World</h2>', $html);
      $this->assertContains('<h3>Utility Class</h3>', $html);
    }

    /** The plaintext content must contain decoded entities. */
    public function testEscapingHTMLTagsInText() {
      $url = 'http://sanitize.example/html-escaping-in-text';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $this->assertEquals('entry', $data['data']['type']);
      $this->assertEquals('This content has some HTML escaped entities such as & ampersand, " quote, escaped <code> HTML tags, an ümlaut, an @at sign.', $data['data']['content']['text']);
    }

    /** The html content keeps entities escaped while the text content decodes them. */
    public function testEscapingHTMLTagsInHTML() {
      $url = 'http://sanitize.example/html-escaping-in-html';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $this->assertEquals('entry', $data['data']['type']);
      $this->assertArrayNotHasKey('name', $data['data']);
      $this->assertEquals('This content has some HTML escaped entities such as & ampersand, " quote, escaped <code> HTML tags, an ümlaut, an @at sign.', $data['data']['content']['text']);
      $this->assertEquals('This content has some <i>HTML escaped</i> entities such as &amp; ampersand, " quote, escaped &lt;code&gt; HTML tags, an ümlaut, an @at sign.', $data['data']['content']['html']);
    }

    /** Checks the html content with and without the 'allow-iframe-video' parameter. */
    public function testAllowIframeVideo() {
      $url = 'http://sanitize.example/entry-with-iframe-video';

      // Default request: no 'allow-iframe-video' parameter.
      $response = $this->parse(['url' => $url]);
      $body = $response->getContent();
      $data = json_decode($body, true);
      $html = $data['data']['content']['html'];

      // NOTE(review): this only matches an attribute-less '<iframe>' tag; an
      // iframe carrying attributes would not be caught -- confirm whether
      // '<iframe' was intended.
      $this->assertNotContains('<iframe>', $html);

      // Same URL again with iframe video explicitly allowed.
      $response = $this->parse(['url' => $url, 'allow-iframe-video' => 'true']);
      $body = $response->getContent();
      $data = json_decode($body, true);
      $html = $data['data']['content']['html'];

      $this->assertContains('youtube.com', $html);
      $this->assertNotContains('https://attack-domain.com', $html);
      $this->assertNotContains('<iframe width="580" height="345"', $html);
    }

    /** URL-valued properties from the javascript-urls fixture must be emptied or omitted. */
    public function testSanitizeJavascriptURLs() {
      $url = 'http://sanitize.example/h-entry-with-javascript-urls';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $this->assertEquals('entry', $data['data']['type']);
      $this->assertEquals('', $data['data']['author']['url']);
      $this->assertArrayNotHasKey('url', $data['data']);
      $this->assertArrayNotHasKey('photo', $data['data']);
      $this->assertArrayNotHasKey('audio', $data['data']);
      $this->assertArrayNotHasKey('video', $data['data']);
      $this->assertArrayNotHasKey('syndication', $data['data']);
      $this->assertArrayNotHasKey('in-reply-to', $data['data']);
      $this->assertArrayNotHasKey('like-of', $data['data']);
      $this->assertArrayNotHasKey('repost-of', $data['data']);
      $this->assertArrayNotHasKey('bookmark-of', $data['data']);
      $this->assertEquals('Author', $data['data']['author']['name']);
      $this->assertEquals('', $data['data']['author']['photo']);
    }

    /** The author URL from the email-author fixture is emptied; name and photo are kept. */
    public function testSanitizeEmailAuthorURL() {
      $url = 'http://sanitize.example/h-entry-with-email-author';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertEquals('entry', $data->data->type);
      $this->assertEquals('', $data->data->author->url);
      $this->assertEquals('Author', $data->data->author->name);
      $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo);
    }

    /** A photo img without alt inside the content is reported as photo and left out of the content. */
    public function testPhotoInContentNoAlt() {
      // https://github.com/aaronpk/XRay/issues/52

      $url = 'http://sanitize.example/photo-in-content';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertObjectNotHasAttribute('name', $data->data);
      $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
      $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text);
      $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
    }

    /*
    // Commented out until #56 is resolved
    // https://github.com/aaronpk/XRay/issues/56
    public function testPhotoInTextContentNoAlt() {
      $url = 'http://sanitize.example/photo-in-text-content';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertObjectNotHasAttribute('name', $data->data);
      $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
      $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text);
      $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
    }
    */

    /** The html content must contain the absolute photo URL on the page's host. */
    public function testRelativePhotoInContent() {
      $url = 'http://sanitize.example/photo-in-content-relative';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertContains('http://sanitize.example/photo1.jpg', $data->data->content->html);
    }

    /** The photo property must be an absolute URL on the page's host. */
    public function testRelativePhotoProperty() {
      $url = 'http://sanitize.example/photo-relative';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]);
    }

    /** Same expectations as the no-alt case when the photo has an empty alt attribute. */
    public function testPhotoInContentEmptyAltAttribute() {
      // https://github.com/aaronpk/XRay/issues/52

      $url = 'http://sanitize.example/photo-in-content-empty-alt';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertObjectNotHasAttribute('name', $data->data);
      $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
      $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text);
      $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
    }

    /** Same expectations as the no-alt case when the photo has alt text. */
    public function testPhotoInContentWithAlt() {
      // https://github.com/aaronpk/XRay/issues/52

      $url = 'http://sanitize.example/photo-in-content-with-alt';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertObjectNotHasAttribute('name', $data->data);
      $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]);
      $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text);
      $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $data->data->content->html);
    }

    /** A post with a name and a photo but no text yields name and photo, and no content. */
    public function testPhotoInContentWithNameAndNoText() {
      $url = 'http://sanitize.example/cleverdevil';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertObjectHasAttribute('name', $data->data);
      $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name);
      $this->assertObjectNotHasAttribute('content', $data->data);
      $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $data->data->photo[0]);
    }

    /** For this fixture the caption is reported as name, with no content. */
    public function testPhotoWithDupeNameAndAlt1() {
      // https://github.com/aaronpk/XRay/issues/57
      $url = 'http://sanitize.example/photo-with-dupe-name-alt';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertObjectHasAttribute('name', $data->data);
      $this->assertEquals('Photo caption', $data->data->name);
      $this->assertObjectNotHasAttribute('content', $data->data);
      $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]);
    }

    /** For this fixture the caption is reported as content text, with no name. */
    public function testPhotoWithDupeNameAndAlt2() {
      // This is similar to adactio's markup
      // https://adactio.com/notes/13301
      $url = 'http://sanitize.example/photo-with-dupe-name-alt-2';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertObjectHasAttribute('content', $data->data);
      $this->assertEquals('Photo caption', $data->data->content->text);
      $this->assertObjectNotHasAttribute('name', $data->data);
      $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]);
    }

    /** Content consisting only of an img with alt keeps the img in html and has empty text. */
    public function testPhotoInContentWithNoText() {
      $url = 'http://sanitize.example/photo-in-content-with-alt-no-text';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $this->assertEquals('<p><img src="http://sanitize.example/photo.jpg" alt="test" /></p>', $data['data']['content']['html']);
      $this->assertEquals('', $data['data']['content']['text']);
    }

    /** For the no-alt fixture the html img carries alt="photo.jpg" and the text is empty. */
    public function testPhotoInContentWithPNoAlt() {
      $url = 'http://sanitize.example/photo-in-content-with-p-no-alt';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $this->assertEquals('<p><img src="http://sanitize.example/photo.jpg" alt="photo.jpg" /></p>', $data['data']['content']['html']);
      $this->assertEquals('', $data['data']['content']['text']);
    }

    /** A bare img (no wrapping p) is kept as the html content and the text is empty. */
    public function testPhotoInContentNoPWithURLPhoto() {
      $url = 'http://sanitize.example/photo-in-content-no-p-with-url-photo';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      $this->assertEquals('<img src="http://sanitize.example/photo.jpg" alt="test" />', $data['data']['content']['html']);
      $this->assertEquals('', $data['data']['content']['text']);
    }

    /** Only checks that parsing this fixture succeeds with a 200 status. */
    public function testPhotoInContentNoPWithAlt() {
      // This h-entry has no u-url so has an implied u-photo. we don't actually care what happens with it because
      // this should never happen in the wild

      $url = 'http://sanitize.example/photo-in-content-no-p-with-alt';
      $response = $this->parse(['url' => $url]);

      $this->assertEquals(200, $response->getStatusCode());
    }

    /*
    // TODO: add support for embedded video and audio tags in html content
    public function testContentIsOnlyVideo() {
      $url = 'http://sanitize.example/content-is-only-video';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body, true);

      print_r($data);
    }
    */

    /** Two photos with alt text are both reported, with the expected text and no name. */
    public function testPhotosWithAlt() {
      // https://github.com/microformats/microformats2-parsing/issues/16

      $url = 'http://sanitize.example/photos-with-alt';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertEquals('🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD'."\n\n".'#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter', $data->data->content->text);
      $this->assertObjectNotHasAttribute('name', $data->data);
      $this->assertEquals('https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg', $data->data->photo[0]);
      $this->assertEquals('https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg', $data->data->photo[1]);
    }

    // Ignoring this issue for now. This should not happen in the wild.
    // https://github.com/aaronpk/XRay/issues/55
    // Skipping the implied photo check because in the wild, h-entries should not exist without a u-url, which stops implied parsing.
    /** An img without a u-photo class stays in the html content and yields no photo property. */
    public function testEntryWithImgNoImpliedPhoto() {
      // See https://github.com/microformats/microformats2-parsing/issues/6#issuecomment-357286985
      // and https://github.com/aaronpk/XRay/issues/52#issuecomment-357269683
      // and https://github.com/microformats/microformats2-parsing/issues/16
      $url = 'http://sanitize.example/entry-with-img-no-implied-photo';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertObjectNotHasAttribute('photo', $data->data);
      $this->assertObjectNotHasAttribute('name', $data->data);
      $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed.', $data->data->content->text);
      $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content, which does not have a u-photo class so should not be removed. <img src="http://target.example.com/photo.jpg" alt="a photo" />', $data->data->content->html);
    }

    /** Two consecutive br tags are kept in html and become a blank line in text. */
    public function testWhitespaceWithBreakTags() {
      $url = 'http://sanitize.example/entry-with-br-tags';
      $response = $this->parse(['url' => $url]);

      $body = $response->getContent();
      $this->assertEquals(200, $response->getStatusCode());
      $data = json_decode($body);

      $this->assertEquals('This content has two break tags to indicate a paragraph break.<br /><br />This is how tantek\'s autolinker works.', $data->data->content->html);
      $this->assertEquals("This content has two break tags to indicate a paragraph break.\n\nThis is how tantek's autolinker works.", $data->data->content->text);
    }

  }