You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

756 lines
25 KiB

8 years ago
8 years ago
  1. <?php
  2. namespace p3k\XRay\Formats;
  3. use HTMLPurifier, HTMLPurifier_Config;
  /**
   * Converts the array output of a Microformats2 parser into XRay's simplified
   * response arrays.
   *
   * The only public entry point is parse(). Every other method is a private
   * helper that either builds one object type ("entry", "event", "review",
   * "recipe", "product", "item", "feed", "card") or extracts and validates
   * individual property values.
   */
  class Mf2 {

    /**
     * Chooses the primary object in a parsed Microformats2 document and converts it.
     *
     * Selection order:
     *  1. If the page holds exactly one item, use it.
     *  2. Otherwise use the first item whose normalized "url" property equals $url.
     *  3. Otherwise, if an h-card on the page matches a rel=author URL, use the
     *     first recognized non-h-card item.
     *  4. Otherwise, if more than one h-entry is present, treat the page as a feed.
     *  5. Otherwise, if the first item is an h-feed, treat the page as a feed.
     *  6. Otherwise fall back to the first recognized item on the page.
     *
     * @param array  $mf2  Parsed document; reads the "items" and "rels" keys.
     * @param string $url  URL the document was requested from; compared against
     *                     normalized "url" properties.
     * @param mixed  $http HTTP client, handed through to getURL() which calls ->get($url) on it.
     * @return array|false Response array with a "data" key, or false when nothing was recognized.
     */
    public static function parse($mf2, $url, $http) {
      // Covers both a missing "items" key and an empty list
      if(empty($mf2['items']))
        return false;

      // If there is only one item on the page, just use that
      if(count($mf2['items']) == 1) {
        $item = $mf2['items'][0];

        if(($result = self::parseAsObject($mf2, $item, $http)) !== null)
          return $result;

        if(in_array('h-feed', $item['type']))
          return self::parseAsHFeed($mf2, $http);

        if(in_array('h-card', $item['type']))
          return self::parseAsHCard($item, $http, $url);
      }

      // Check the list of items on the page to see if one matches the URL of the page,
      // and treat as a permalink for that object if so. Otherwise, parse as a feed.
      foreach($mf2['items'] as $item) {
        if(array_key_exists('url', $item['properties'])) {
          $urls = array_map('\p3k\XRay\normalize_url', $item['properties']['url']);
          if(in_array($url, $urls)) {
            if(in_array('h-card', $item['type']))
              return self::parseAsHCard($item, $http, $url);

            // The object matching the page URL wins even when it is not a recognized type
            $result = self::parseAsObject($mf2, $item, $http);
            return $result !== null ? $result : false;
          }
        }
      }

      // Check for an h-card matching rel=author or the author URL of any h-* on the page,
      // and return the h-* object if so
      if(isset($mf2['rels']['author'])) {
        foreach($mf2['items'] as $card) {
          if(in_array('h-card', $card['type']) && array_key_exists('url', $card['properties'])) {
            $urls = array_map('\p3k\XRay\normalize_url', $card['properties']['url']);
            if(count(array_intersect($urls, $mf2['rels']['author'])) > 0) {
              // There is an author h-card on this page
              // Now look for the first h-* object other than an h-card and use that as the object
              foreach($mf2['items'] as $item) {
                if(!in_array('h-card', $item['type'])) {
                  if(($result = self::parseAsObject($mf2, $item, $http)) !== null)
                    return $result;
                }
              }
            }
          }
        }
      }

      // If there was more than one h-entry on the page, treat the whole page as a feed
      $entries = array_filter($mf2['items'], function($item){
        return in_array('h-entry', $item['type']);
      });
      if(count($entries) > 1)
        return self::parseAsHFeed($mf2, $http);

      // If the first item is an h-feed, parse as a feed
      $first = $mf2['items'][0];
      if(in_array('h-feed', $first['type']))
        return self::parseAsHFeed($mf2, $http);

      // Fallback case, but hopefully we have found something before this point:
      // use the first item of any recognized type
      foreach($mf2['items'] as $item) {
        if(($result = self::parseAsObject($mf2, $item, $http)) !== null)
          return $result;
      }

      return false;
    }

    /**
     * Dispatches an item to the parser for its type.
     *
     * Types are tested in a fixed order (h-entry/h-cite, h-event, h-review,
     * h-recipe, h-product, h-item), so an item carrying several types is handled
     * by the first one listed here. h-feed and h-card are deliberately not
     * handled because parse() treats them differently depending on context.
     *
     * @param array $mf2  Whole parsed document, needed for author discovery.
     * @param array $item Item to convert; its "type" list is inspected.
     * @param mixed $http HTTP client passed on to the type parsers.
     * @return array|null Response array, or null when no handled type is present.
     */
    private static function parseAsObject($mf2, $item, $http) {
      $types = $item['type'];

      if(in_array('h-entry', $types) || in_array('h-cite', $types))
        return self::parseAsHEntry($mf2, $item, $http);

      if(in_array('h-event', $types))
        return self::parseAsHEvent($mf2, $item, $http);

      if(in_array('h-review', $types))
        return self::parseAsHReview($mf2, $item, $http);

      if(in_array('h-recipe', $types))
        return self::parseAsHRecipe($mf2, $item, $http);

      if(in_array('h-product', $types))
        return self::parseAsHProduct($mf2, $item, $http);

      if(in_array('h-item', $types))
        return self::parseAsHItem($mf2, $item, $http);

      return null;
    }

    /**
     * Wraps an object's data in the response envelope.
     *
     * @param array $data Converted object.
     * @param array $refs Nested objects keyed by URL; attached as data.refs only when non-empty.
     * @return array Array with a single "data" key.
     */
    private static function buildResponse($data, $refs) {
      $response = [
        'data' => $data
      ];

      if(count($refs)) {
        $response['data']['refs'] = $refs;
      }

      return $response;
    }

    /**
     * Copies the first value of each listed property into $data.
     *
     * @param array $properties    Property names copied as plaintext.
     * @param array $urlProperties Property names copied only when the value passes isURL().
     * @param array $item          Microformats item to read from.
     * @param array $data          Output array, modified in place.
     */
    private static function collectSingleValues($properties, $urlProperties, $item, &$data) {
      foreach($properties as $p) {
        if(($v = self::getPlaintext($item, $p)) !== null) {
          $data[$p] = $v;
        }
      }

      foreach($urlProperties as $p) {
        if(($v = self::getPlaintext($item, $p)) !== null) {
          if(self::isURL($v))
            $data[$p] = $v;
        }
      }
    }

    /**
     * Reads the first value of an e-* style property as text plus optional sanitized HTML.
     *
     * @param string $property Property name to read.
     * @param array  $item     Microformats item to read from.
     * @return array|null Null when the property is absent. Otherwise an array with
     *                    "text" (false when the value was neither a string nor an
     *                    array with a "value" key) and, only when sanitized HTML is
     *                    non-empty and differs from the text, "html".
     */
    private static function parseHTMLValue($property, $item) {
      if(!array_key_exists($property, $item['properties']))
        return null;

      $textContent = false;
      $htmlContent = false;

      $content = $item['properties'][$property][0];
      if(is_string($content)) {
        $textContent = $content;
      } elseif(is_array($content) && array_key_exists('value', $content)) {
        if(array_key_exists('html', $content)) {
          $htmlContent = trim(self::sanitizeHTML($content['html']));
          // The parser's plaintext value is used rather than stripping tags from the HTML
          $textContent = trim(str_replace("&#xD;","\r",$content['value']));
        } else {
          $textContent = trim($content['value']);
        }
      }

      $data = [
        'text' => $textContent
      ];

      if($htmlContent && $textContent != $htmlContent) {
        $data['html'] = $htmlContent;
      }

      return $data;
    }

    /**
     * Collects multi-valued properties into $data as de-duplicated arrays.
     *
     * Values may be plaintext. A nested microformat that has a valid URL is
     * represented by that URL and its converted form is stored in $refs under
     * the same URL; a nested microformat without a URL contributes its "value"
     * string, if it has one.
     *
     * @param array $properties Property names to collect.
     * @param array $item       Microformats item to read from.
     * @param array $data       Output array, modified in place.
     * @param array $refs       Map of URL => converted nested object, modified in place.
     * @param mixed $http       HTTP client passed to parse() for nested objects.
     */
    private static function collectArrayValues($properties, $item, &$data, &$refs, &$http) {
      foreach($properties as $p) {
        if(!array_key_exists($p, $item['properties']))
          continue;

        foreach($item['properties'][$p] as $v) {
          if(is_string($v)) {
            if(!array_key_exists($p, $data)) $data[$p] = [];
            if(!in_array($v, $data[$p]))
              $data[$p][] = $v;
          } elseif(self::isMicroformat($v)) {
            if(($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
              if(!array_key_exists($p, $data)) $data[$p] = [];
              if(!in_array($u, $data[$p]))
                $data[$p][] = $u;
              // parse the object and put the result in the "refs" object
              $ref = self::parse(['items'=>[$v]], $u, $http);
              if($ref) {
                $refs[$u] = $ref['data'];
              }
            } elseif(array_key_exists('value', $v)) {
              // Only fall back to the plaintext value when one exists, so that
              // a null is never appended to the list
              if(!array_key_exists($p, $data)) $data[$p] = [];
              if(!in_array($v['value'], $data[$p]))
                $data[$p][] = $v['value'];
            }
          }
        }
      }
    }

    /**
     * Collects multi-valued properties into $data as arrays of URLs.
     *
     * String values are kept only when they pass isURL(). A nested microformat
     * with a valid URL is represented by that URL and its converted form is
     * stored in $refs. Unlike collectArrayValues(), values are not de-duplicated.
     *
     * @param array $properties Property names to collect.
     * @param array $item       Microformats item to read from.
     * @param array $data       Output array, modified in place.
     * @param array $refs       Map of URL => converted nested object, modified in place.
     * @param mixed $http       HTTP client passed to parse() for nested objects.
     */
    private static function collectArrayURLValues($properties, $item, &$data, &$refs, &$http) {
      foreach($properties as $p) {
        if(!array_key_exists($p, $item['properties']))
          continue;

        foreach($item['properties'][$p] as $v) {
          if(is_string($v) && self::isURL($v)) {
            if(!array_key_exists($p, $data)) $data[$p] = [];
            $data[$p][] = $v;
          }
          elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
            if(!array_key_exists($p, $data)) $data[$p] = [];
            $data[$p][] = $u;
            // parse the object and put the result in the "refs" object
            $ref = self::parse(['items'=>[$v]], $u, $http);
            if($ref) {
              $refs[$u] = $ref['data'];
            }
          }
        }
      }
    }

    /**
     * Sets $data['name'] and $data['content'] for an item.
     *
     * The name is dropped when, after removing a trailing ellipsis and all
     * whitespace, it is a prefix of the plaintext content; in that case it is
     * taken to be a name implied from the content rather than a real title.
     *
     * @param array $item Microformats item to read "name" and "content" from.
     * @param array $data Output array, modified in place.
     */
    private static function determineNameAndContent($item, &$data) {
      $name = self::getPlaintext($item, 'name');
      $content = self::parseHTMLValue('content', $item);

      // Determine if the name is distinct from the content. Skipped when there
      // is no name at all, which also keeps null away from the string functions.
      if($content && $name !== null) {
        // Trim ellipses from the name
        $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name);

        // Remove all whitespace when checking equality
        $nameCompare = preg_replace('/\s/','',trim($name));
        $contentCompare = preg_replace('/\s/','',trim($content['text']));

        // Check if the name is a prefix of the content
        if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) {
          $name = null;
        }
      }

      if($name) {
        $data['name'] = $name;
      }

      // If there is content, always return the plaintext content, and return HTML content if it's different
      if($content) {
        $data['content']['text'] = $content['text'];
        if(array_key_exists('html', $content))
          $data['content']['html'] = $content['html'];
      }
    }

    /**
     * Converts an h-entry (or h-cite) item into an "entry" response.
     *
     * @param array $mf2  Whole parsed document, used for author discovery.
     * @param array $item The item to convert.
     * @param mixed $http HTTP client used for nested objects and author discovery.
     * @return array Response array with a "data" key.
     */
    private static function parseAsHEntry($mf2, $item, $http) {
      $data = [
        'type' => 'entry'
      ];
      $refs = [];

      // Single plaintext and URL values
      self::collectSingleValues(['published','summary','rsvp','swarm-coins'], ['url'], $item, $data);

      // These properties are always returned as arrays and may contain plaintext content
      // First strip leading hashtags from category values if present
      if(array_key_exists('category', $item['properties'])) {
        foreach($item['properties']['category'] as $i=>$c) {
          if(is_string($c))
            $item['properties']['category'][$i] = ltrim($c, '#');
        }
      }
      self::collectArrayValues(['category','invitee'], $item, $data, $refs, $http);

      // These properties are always returned as arrays and always URLs
      // If the value is an h-* object with a URL, the URL is used and a "ref" is added as well
      self::collectArrayURLValues(['photo','video','audio','syndication','in-reply-to','like-of','repost-of','bookmark-of'], $item, $data, $refs, $http);

      self::determineNameAndContent($item, $data);

      if($author = self::findAuthor($mf2, $item, $http))
        $data['author'] = $author;

      return self::buildResponse($data, $refs);
    }

    /**
     * Converts an h-review item into a "review" response.
     *
     * @param array $mf2  Whole parsed document, used for author discovery.
     * @param array $item The item to convert.
     * @param mixed $http HTTP client used for nested objects and author discovery.
     * @return array Response array with a "data" key.
     */
    private static function parseAsHReview($mf2, $item, $http) {
      $data = [
        'type' => 'review'
      ];
      $refs = [];

      self::collectSingleValues(['summary','published','rating','best','worst'], ['url'], $item, $data);

      // Fallback for Mf1 "description" as content. The PHP parser does not properly map this to "content"
      $description = self::parseHTMLValue('description', $item);
      if($description) {
        $data['content'] = $description;
      }

      self::collectArrayValues(['category'], $item, $data, $refs, $http);
      self::collectArrayURLValues(['item'], $item, $data, $refs, $http);

      // Runs after the description fallback so that a real "content" property overrides it
      self::determineNameAndContent($item, $data);

      if($author = self::findAuthor($mf2, $item, $http))
        $data['author'] = $author;

      return self::buildResponse($data, $refs);
    }

    /**
     * Converts an h-recipe item into a "recipe" response.
     *
     * @param array $mf2  Whole parsed document, used for author discovery.
     * @param array $item The item to convert.
     * @param mixed $http HTTP client used for nested objects and author discovery.
     * @return array Response array with a "data" key.
     */
    private static function parseAsHRecipe($mf2, $item, $http) {
      $data = [
        'type' => 'recipe'
      ];
      $refs = [];

      self::collectSingleValues(['name','summary','published','duration','yield','nutrition'], ['url'], $item, $data);

      $instructions = self::parseHTMLValue('instructions', $item);
      if($instructions) {
        $data['instructions'] = $instructions;
      }

      self::collectArrayValues(['category','ingredient'], $item, $data, $refs, $http);
      self::collectArrayURLValues(['photo'], $item, $data, $refs, $http);

      if($author = self::findAuthor($mf2, $item, $http))
        $data['author'] = $author;

      return self::buildResponse($data, $refs);
    }

    /**
     * Converts an h-product item into a "product" response.
     *
     * @param array $mf2  Whole parsed document (unused here; kept for a uniform signature).
     * @param array $item The item to convert.
     * @param mixed $http HTTP client used for nested objects.
     * @return array Response array with a "data" key.
     */
    private static function parseAsHProduct($mf2, $item, $http) {
      $data = [
        'type' => 'product'
      ];
      // Must start as an array: it is passed by reference below and counted afterwards
      $refs = [];

      self::collectSingleValues(['name','identifier','price'], ['url'], $item, $data);

      $description = self::parseHTMLValue('description', $item);
      if($description) {
        $data['description'] = $description;
      }

      self::collectArrayValues(['category','brand'], $item, $data, $refs, $http);
      self::collectArrayURLValues(['photo','video','audio'], $item, $data, $refs, $http);

      return self::buildResponse($data, $refs);
    }

    /**
     * Converts an h-item item into an "item" response.
     *
     * @param array $mf2  Whole parsed document (unused here; kept for a uniform signature).
     * @param array $item The item to convert.
     * @param mixed $http HTTP client used for nested objects.
     * @return array Response array with a "data" key.
     */
    private static function parseAsHItem($mf2, $item, $http) {
      $data = [
        'type' => 'item'
      ];
      // Must start as an array: it is passed by reference below and counted afterwards
      $refs = [];

      self::collectSingleValues(['name'], ['url'], $item, $data);
      self::collectArrayURLValues(['photo','video','audio'], $item, $data, $refs, $http);

      return self::buildResponse($data, $refs);
    }

    /**
     * Converts an h-event item into an "event" response.
     *
     * @param array $mf2  Whole parsed document (unused here; kept for a uniform signature).
     * @param array $item The item to convert.
     * @param mixed $http HTTP client used for nested objects.
     * @return array Response array with a "data" key.
     */
    private static function parseAsHEvent($mf2, $item, $http) {
      $data = [
        'type' => 'event'
      ];
      $refs = [];

      // Single plaintext and URL values
      self::collectSingleValues(['name','summary','published','start','end','duration'], ['url'], $item, $data);

      // These properties are always returned as arrays and may contain plaintext content
      self::collectArrayValues(['category','location','attendee'], $item, $data, $refs, $http);

      // These properties are always returned as arrays and always URLs
      // If the value is an h-* object with a URL, the URL is used and a "ref" is added as well
      self::collectArrayURLValues(['photo','video','audio','syndication'], $item, $data, $refs, $http);

      // If there is a description, always return the plaintext description, and return HTML description if it's different.
      // An empty plaintext description is omitted entirely.
      $description = self::parseHTMLValue('description', $item);
      if($description && $description['text']) {
        $data['description'] = $description;
      }

      return self::buildResponse($data, $refs);
    }

    /**
     * Returns the placeholder response for a feed. Feed parsing is not implemented.
     *
     * @param array $mf2  Parsed document (currently unused).
     * @param mixed $http HTTP client (currently unused).
     * @return array Array with "data" (type "feed", empty author card, a "todo"
     *               note) and an empty "entries" list.
     */
    private static function parseAsHFeed($mf2, $http) {
      $data = [
        'type' => 'feed',
        'author' => [
          'type' => 'card',
          'name' => null,
          'url' => null,
          'photo' => null
        ],
        'todo' => 'Not yet implemented. Please see https://github.com/aaronpk/XRay/issues/1'
      ];

      return [
        'data' => $data,
        'entries' => []
      ];
    }

    /**
     * Converts an h-card item into a "card" response.
     *
     * @param array        $item      The h-card item.
     * @param mixed        $http      HTTP client (currently unused).
     * @param string|false $authorURL When given, the card URL equal to it (after
     *                                normalization) is preferred, and it is used
     *                                as the URL when the card has none. When not
     *                                given and the card has no valid URL, "url"
     *                                ends up as false.
     * @return array Array with a "data" key holding type, name, url and photo.
     */
    private static function parseAsHCard($item, $http, $authorURL=false) {
      $data = [
        'type' => 'card',
        'name' => null,
        'url' => null,
        'photo' => null
      ];

      // The card may have no "url" property at all; treat that as an empty list
      $cardURLs = [];
      if(isset($item['properties']['url']) && is_array($item['properties']['url']))
        $cardURLs = $item['properties']['url'];

      $properties = ['url','name','photo'];
      foreach($properties as $p) {
        if($p == 'url' && $authorURL) {
          // If there is a matching author URL, use that one
          $found = false;
          foreach($cardURLs as $candidate) {
            if(self::isURL($candidate)) {
              $candidate = \p3k\XRay\normalize_url($candidate);
              if($candidate == $authorURL) {
                $data['url'] = $candidate;
                $found = true;
              }
            }
          }
          // Otherwise fall back to the first URL on the card, un-normalized
          if(!$found && isset($cardURLs[0]) && self::isURL($cardURLs[0])) {
            $data['url'] = $cardURLs[0];
          }
        } else if(($v = self::getPlaintext($item, $p)) !== null) {
          // Make sure the URL property is actually a URL
          if($p == 'url' || $p == 'photo') {
            if(self::isURL($v))
              $data[$p] = $v;
          } else {
            $data[$p] = $v;
          }
        }
      }

      // If no URL property was found, use the $authorURL provided
      if(!$data['url'])
        $data['url'] = $authorURL;

      $response = [
        'data' => $data
      ];

      return $response;
    }

    /**
     * Runs author discovery for an item.
     *
     * Follows the steps quoted in the inline comments, which come from
     * http://indiewebcamp.com/authorship. May fetch the author page through
     * getURL().
     *
     * @param array $mf2  Whole parsed document; "items" and "rels" are consulted.
     * @param array $item The item whose author is wanted.
     * @param mixed $http HTTP client used to fetch the author page.
     * @return array|null Card array (type, name, url, photo) or null when no
     *                    author information was found.
     */
    private static function findAuthor($mf2, $item, $http) {
      $author = [
        'type' => 'card',
        'name' => null,
        'url' => null,
        'photo' => null
      ];

      // Author Discovery
      // http://indiewebcamp.com/authorship

      $authorPage = false;
      if(array_key_exists('author', $item['properties'])) {

        // Check if any of the values of the author property are an h-card
        foreach($item['properties']['author'] as $a) {
          if(self::isHCard($a)) {
            // 5.1 "if it has an h-card, use it, exit."
            return self::parseAsHCard($a, $http)['data'];
          } elseif(is_string($a)) {
            if(self::isURL($a)) {
              // 5.2 "otherwise if author property is an http(s) URL, let the author-page have that URL"
              $authorPage = $a;
            } else {
              // 5.3 "otherwise use the author property as the author name, exit"
              // We can only set the name, no h-card or URL was found
              $author['name'] = self::getPlaintext($item, 'author');
              return $author;
            }
          } else {
            // This case is only hit when the author property is an mf2 object that is not an h-card
            $author['name'] = self::getPlaintext($item, 'author');
            return $author;
          }
        }

      }

      // 6. "if no author page was found" ... check for rel-author link
      if(!$authorPage) {
        // Requires at least one rel=author URL, not merely the key being present
        if(isset($mf2['rels']['author'][0]))
          $authorPage = $mf2['rels']['author'][0];
      }

      // 7. "if there is an author-page URL" ...
      if($authorPage) {

        // 7.1 "get the author-page from that URL and parse it for microformats2"
        $authorPageContents = self::getURL($authorPage, $http);

        if($authorPageContents) {
          // The rel-me list belongs to the page, not to any one card, so read it once
          $relMeLinks = isset($authorPageContents['rels']['me']) ? $authorPageContents['rels']['me'] : [];

          foreach($authorPageContents['items'] as $i) {
            if(self::isHCard($i)) {

              // 7.2 "if author-page has 1+ h-card with url == uid == author-page's URL, then use first such h-card, exit."
              if(array_key_exists('url', $i['properties'])
                and in_array($authorPage, $i['properties']['url'])
                and array_key_exists('uid', $i['properties'])
                and in_array($authorPage, $i['properties']['uid'])
              ) {
                return self::parseAsHCard($i, $http, $authorPage)['data'];
              }

              // 7.3 "else if author-page has 1+ h-card with url property which matches the href of a rel-me link on the author-page"
              if(count($relMeLinks) > 0
                and array_key_exists('url', $i['properties'])
                and count(array_intersect($i['properties']['url'], $relMeLinks)) > 0
              ) {
                return self::parseAsHCard($i, $http, $authorPage)['data'];
              }

            }
          }
        }

        // 7.4 "if the h-entry's page has 1+ h-card with url == author-page URL, use first such h-card, exit."
        foreach($mf2['items'] as $i) {
          if(self::isHCard($i)) {

            if(array_key_exists('url', $i['properties'])
              and in_array($authorPage, $i['properties']['url'])
            ) {
              return self::parseAsHCard($i, $http)['data'];
            }

          }
        }

      }

      if(!$author['name'] && !$author['photo'] && !$author['url'])
        return null;

      return $author;
    }

    /**
     * Sanitizes an HTML fragment with HTMLPurifier using a fixed element whitelist.
     *
     * The "time" element is registered with a "datetime" attribute, and the
     * class attribute is validated by HTMLPurifier_AttrDef_HTML_Microformats2.
     *
     * @param string $html Untrusted HTML.
     * @return string Purified HTML with "&#xD;" entities turned back into carriage returns.
     */
    private static function sanitizeHTML($html) {
      $config = HTMLPurifier_Config::createDefault();
      $config->set('Cache.DefinitionImpl', null);
      $config->set('HTML.AllowedElements', [
        'a',
        'abbr',
        'b',
        'code',
        'del',
        'em',
        'i',
        'img',
        'q',
        'strike',
        'strong',
        'time',
        'blockquote',
        'pre',
        'p',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'ul',
        'li',
        'ol'
      ]);
      $def = $config->getHTMLDefinition(true);
      $def->addElement(
        'time',
        'Inline',
        'Inline',
        'Common',
        [
          'datetime' => 'Text'
        ]
      );
      // Override the allowed classes to only support Microformats2 classes
      $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());
      $purifier = new HTMLPurifier($config);
      $sanitized = $purifier->purify($html);
      $sanitized = str_replace("&#xD;","\r",$sanitized);
      return $sanitized;
    }

    /**
     * Reports whether any key of the array is numeric.
     *
     * @param array $arr Array to inspect.
     * @return bool
     */
    private static function hasNumericKeys(array $arr) {
      foreach($arr as $key=>$val)
        if (is_numeric($key))
          return true;
      return false;
    }

    /**
     * Reports whether a value has the shape of a microformat object: an array
     * with no numeric keys, a non-empty "type" and a "properties" key.
     *
     * @param mixed $mf Value to test.
     * @return bool
     */
    private static function isMicroformat($mf) {
      return is_array($mf)
        and !self::hasNumericKeys($mf)
        and !empty($mf['type'])
        and isset($mf['properties']);
    }

    /**
     * Reports whether a value is an array whose "type" list contains "h-card".
     *
     * @param mixed $mf Value to test.
     * @return bool
     */
    private static function isHCard($mf) {
      return is_array($mf)
        and !empty($mf['type'])
        and is_array($mf['type'])
        and in_array('h-card', $mf['type']);
    }

    /**
     * Reports whether a value is a string that looks like an http(s) URL with a
     * dot somewhere after the scheme.
     *
     * @param mixed $string Value to test; anything other than a string yields
     *                      false, which matters because property values may be
     *                      nested microformat arrays.
     * @return bool
     */
    private static function isURL($string) {
      return is_string($string) && preg_match('/^https?:\/\/.+\..+$/', $string) === 1;
    }

    /**
     * Given a microformats item and a property name, returns the plaintext
     * value of the first entry of that property.
     *
     * e.g. {"properties":{"published":["foo"]}} results in "foo"
     *
     * @param array  $mf2      Microformats item.
     * @param string $k        Property name.
     * @param mixed  $fallback Returned when the property is missing or empty, or
     *                         when its first value is neither a string nor a
     *                         microformat with a "value" key.
     * @return mixed The string value, the nested microformat's "value", or $fallback.
     */
    private static function getPlaintext($mf2, $k, $fallback=null) {
      if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) {
        // $mf2['properties'][$k] will always be an array since the input was from the mf2 parser
        $value = $mf2['properties'][$k][0];
        if(is_string($value)) {
          return $value;
        } elseif(self::isMicroformat($value) && array_key_exists('value', $value)) {
          return $value['value'];
        }
      }
      return $fallback;
    }

    /**
     * Fetches a URL with the HTTP client and parses the body for microformats.
     *
     * @param string $url  URL to fetch.
     * @param mixed  $http Object with a get($url) method whose result is read
     *                     as an array with "error" and "body" keys.
     * @return array|null Parsed document, or null when the URL is empty, the
     *                    request reported an error, or the body is empty.
     */
    private static function getURL($url, $http) {
      if(!$url) return null;
      // TODO: consider adding caching here
      $result = $http->get($url);
      // empty() tolerates a result that lacks either key
      if(!empty($result['error']) || empty($result['body'])) {
        return null;
      }
      return \mf2\Parse($result['body'], $url);
    }

  }