You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

600 lines
20 KiB

9 years ago
9 years ago
8 years ago
  1. <?php
  2. namespace XRay\Formats;
  3. use HTMLPurifier, HTMLPurifier_Config;
  4. use Parse;
  5. class Mf2 {
  // Work out which kind of object the page at $url represents and hand it to the matching parser.
  //
  // $mf2  - parsed microformats2 document; only its 'items' list is inspected here
  // $url  - URL of the request, compared against the url property of the items
  // $http - HTTP client, passed through untouched to the parseAs* methods
  //
  // Returns the result of the selected parseAs* method, or false when nothing was recognized.
  public static function parse($mf2, $url, $http) {
    // A document with a missing or empty item list has nothing to recognize
    if(empty($mf2['items']) || !is_array($mf2['items']))
      return false;

    // If there is only one item on the page, just use that
    if(count($mf2['items']) == 1) {
      $item = $mf2['items'][0];
      if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
        Parse::debug("mf2:0: Recognized $url as an h-entry it is the only item on the page");
        return self::parseAsHEntry($mf2, $item, $http);
      }
      if(in_array('h-event', $item['type'])) {
        Parse::debug("mf2:0: Recognized $url as an h-event it is the only item on the page");
        return self::parseAsHEvent($mf2, $item, $http);
      }
    }

    // Check if the list of items is a bunch of h-entrys and return as a feed
    // Unless this page's URL matches one of the entries, then treat it as a permalink
    $hentrys = 0;
    $lastSeenEntry = false;
    foreach($mf2['items'] as $item) {
      if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
        if(array_key_exists('url', $item['properties'])) {
          $urls = self::normalizedItemUrls($item);
          if(in_array($url, $urls)) {
            Parse::debug("mf2:1: Recognized $url as an h-entry because an h-entry on the page matched the URL of the request");
            return self::parseAsHEntry($mf2, $item, $http);
          }
          // Only entries that carry a url property are remembered for the mf2:5 check below
          $lastSeenEntry = $item;
        }
        $hentrys++;
      }
    }

    // If there was more than one h-entry on the page, treat the whole page as a feed
    if($hentrys > 1) {
      Parse::debug("mf2:2: Recognized $url as an h-feed because there are more than one h-entry on the page");
      return self::parseAsHFeed($mf2, $http);
    }

    // If the first item is an h-feed, parse as a feed
    $first = $mf2['items'][0];
    if(in_array('h-feed', $first['type'])) {
      Parse::debug("mf2:3: Recognized $url as an h-feed because the first item is an h-feed");
      return self::parseAsHFeed($mf2, $http);
    }

    // Check each top-level h-card and h-event, and if there is one that matches this URL, the page is an h-card
    foreach($mf2['items'] as $item) {
      if((in_array('h-card', $item['type']) or in_array('h-event', $item['type']))
        and array_key_exists('url', $item['properties'])
      ) {
        $urls = self::normalizedItemUrls($item);
        if(in_array($url, $urls)) {
          // TODO: check for children h-entrys (like tantek.com), or sibling h-entries (like aaronparecki.com)
          // and return the result as a feed instead
          if(in_array('h-card', $item['type'])) {
            Parse::debug("mf2:4: Recognized $url as an h-card because an h-card on the page matched the URL of the request");
            return self::parseAsHCard($item, $http, $url);
          } else {
            Parse::debug("mf2:4: Recognized $url as an h-event because an h-event on the page matched the URL of the request");
            return self::parseAsHEvent($mf2, $item, $http);
          }
        }
      }
    }

    // If there was only one h-entry, but the URL for it is not the same as this page, then treat as a feed
    if($hentrys == 1) {
      if($lastSeenEntry) {
        $urls = self::normalizedItemUrls($lastSeenEntry);
        if(count($urls) && !in_array($url, $urls)) {
          Parse::debug("mf2:5: Recognized $url as an h-feed no h-entrys on the page matched the URL of the request");
          return self::parseAsHFeed($mf2, $http);
        }
      }
    }

    // Fallback case, but hopefully we have found something before this point
    foreach($mf2['items'] as $item) {
      // Otherwise check for an h-entry
      if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
        Parse::debug("mf2:6: $url is falling back to the first h-entry on the page");
        return self::parseAsHEntry($mf2, $item, $http);
      }
    }

    Parse::debug("mf2:E: No object at $url was recognized");
    return false;
  }

  // Return the normalized string values of an item's url property.
  // Non-string values (e.g. nested microformat objects) are skipped because
  // normalize_url() hands its argument to parse_url(), which needs a string.
  // The caller must already have checked that the url property exists.
  private static function normalizedItemUrls($item) {
    $urls = array_filter($item['properties']['url'], 'is_string');
    // An array callable is used because the "self::method" string form is deprecated as of PHP 8.2
    return array_map([__CLASS__, 'normalize_url'], $urls);
  }
  // Convert an h-entry (or h-cite) item into the 'entry' response structure.
  //
  // $mf2  - the whole parsed document, forwarded to findAuthor()
  // $item - the microformat item being converted
  // $http - HTTP client, forwarded to findAuthor() and to nested parse() calls
  //
  // Returns ['data' => ...], plus a 'refs' key when nested objects were parsed.
  private static function parseAsHEntry($mf2, $item, $http) {
    $data = [
      'type' => 'entry'
    ];
    $refs = [];

    // Single plaintext values
    $properties = ['url','published','summary','rsvp'];
    foreach($properties as $p) {
      if(($v = self::getPlaintext($item, $p)) !== null) {
        if($p == 'url') {
          if(self::isURL($v))
            $data[$p] = $v;
        } else {
          $data[$p] = $v;
        }
      }
    }

    // Always arrays
    $properties = ['photo','video','audio','syndication'];
    foreach($properties as $p) {
      if(array_key_exists($p, $item['properties'])) {
        foreach($item['properties'][$p] as $v) {
          if(is_string($v) && self::isURL($v)) {
            if(!array_key_exists($p, $data)) $data[$p] = [];
            $data[$p][] = $v;
          }
          elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) {
            if(!array_key_exists($p, $data)) $data[$p] = [];
            $data[$p][] = $v['value'];
          }
        }
      }
    }

    // Always returned as arrays, and may also create external references
    // If these are not objects, they must be URLs
    $set = [
      'normal' => ['category','invitee'],
      'url' => ['in-reply-to','like-of','repost-of','bookmark-of']
    ];
    foreach($set as $type=>$properties) {
      foreach($properties as $p) {
        if(array_key_exists($p, $item['properties'])) {
          foreach($item['properties'][$p] as $v) {
            if(is_string($v) && ($type == 'normal' || self::isURL($v))) {
              if(!array_key_exists($p, $data)) $data[$p] = [];
              $data[$p][] = $v;
            }
            elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
              if(!array_key_exists($p, $data)) $data[$p] = [];
              $data[$p][] = $u;
              // parse the object and put the result in the "refs" object
              $ref = self::parse(['items'=>[$v]], $u, $http);
              if($ref) {
                $refs[$u] = $ref['data'];
              }
            }
          }
        }
      }
    }

    // Determine if the name is distinct from the content
    $name = self::getPlaintext($item, 'name');
    $content = null;
    $textContent = null;
    $htmlContent = null;
    if(array_key_exists('content', $item['properties'])) {
      $content = $item['properties']['content'][0];
      if(is_string($content)) {
        $textContent = $content;
      } elseif(is_array($content) && array_key_exists('value', $content)) {
        if(array_key_exists('html', $content)) {
          $htmlContent = trim(self::sanitizeHTML($content['html']));
          // The plaintext always comes from the parser's 'value', not from the sanitized HTML
          $textContent = trim(str_replace("&#xD;","\r",$content['value']));
        } else {
          $textContent = trim($content['value']);
        }
      }

      // Skip the comparison entirely when there is no name, so null never reaches preg_replace()/trim()
      if($name !== null) {
        // Trim ellipses from the name
        $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name);

        // Remove all whitespace when checking equality
        $nameCompare = preg_replace('/\s/','',trim($name));
        $contentCompare = preg_replace('/\s/','',trim((string)$textContent));

        // Check if the name is a prefix of the content
        if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) {
          $name = null;
        }
      }
    }

    if($name) {
      $data['name'] = $name;
    }

    // If there is content, always return the plaintext content, and return HTML content if it's different
    if($content) {
      $data['content'] = [
        'text' => $textContent
      ];
      if($htmlContent && $textContent != $htmlContent) {
        $data['content']['html'] = $htmlContent;
      }
      // TODO: If no HTML content was included in the post, create HTML by autolinking?
    }

    if($author = self::findAuthor($mf2, $item, $http))
      $data['author'] = $author;

    $response = [
      'data' => $data
    ];

    if(count($refs)) {
      $response['refs'] = $refs;
    }

    return $response;
  }
  // Convert an h-event item into the 'event' response structure.
  //
  // $mf2  - the whole parsed document (not read by this method)
  // $item - the microformat item being converted
  // $http - HTTP client, forwarded to nested parse() calls
  //
  // Returns ['data' => ...], plus a 'refs' key when nested objects were parsed.
  private static function parseAsHEvent($mf2, $item, $http) {
    $data = [
      'type' => 'event'
    ];
    $refs = [];

    // Single plaintext values
    $properties = ['name','summary','url','published','start','end','duration'];
    foreach($properties as $p) {
      if(($v = self::getPlaintext($item, $p)) !== null) {
        if($p == 'url') {
          if(self::isURL($v))
            $data[$p] = $v;
        } else {
          $data[$p] = $v;
        }
      }
    }

    // Always arrays
    $properties = ['photo','video','audio','syndication'];
    foreach($properties as $p) {
      if(array_key_exists($p, $item['properties'])) {
        foreach($item['properties'][$p] as $v) {
          if(is_string($v) && self::isURL($v)) {
            if(!array_key_exists($p, $data)) $data[$p] = [];
            $data[$p][] = $v;
          }
          elseif(is_array($v) and array_key_exists('value', $v) && self::isURL($v['value'])) {
            if(!array_key_exists($p, $data)) $data[$p] = [];
            $data[$p][] = $v['value'];
          }
        }
      }
    }

    // Always returned as arrays, and may also create external references
    $properties = ['category','location','attendee'];
    foreach($properties as $p) {
      if(array_key_exists($p, $item['properties'])) {
        // The key is created even if none of the values end up being usable
        $data[$p] = [];
        foreach($item['properties'][$p] as $v) {
          if(is_string($v))
            $data[$p][] = $v;
          elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url')) && self::isURL($u)) {
            $data[$p][] = $u;
            // parse the object and put the result in the "refs" object
            $ref = self::parse(['items'=>[$v]], $u, $http);
            if($ref) {
              $refs[$u] = $ref['data'];
            }
          }
        }
      }
    }

    // If there is a description, always return the plaintext description, and return HTML description if it's different
    $textDescription = null;
    $htmlDescription = null;
    if(array_key_exists('description', $item['properties'])) {
      $description = $item['properties']['description'][0];
      if(is_string($description)) {
        $textDescription = $description;
      } elseif(is_array($description) && array_key_exists('value', $description)) {
        if(array_key_exists('html', $description)) {
          $htmlDescription = trim(self::sanitizeHTML($description['html']));
          // The plaintext always comes from the parser's 'value', not from the sanitized HTML
          $textDescription = trim(str_replace("&#xD;","\r",$description['value']));
        } else {
          $textDescription = trim($description['value']);
        }
      }
    }

    if($textDescription) {
      $data['description'] = [
        'text' => $textDescription
      ];
      if($htmlDescription && $textDescription != $htmlDescription) {
        $data['description']['html'] = $htmlDescription;
      }
    }

    $response = [
      'data' => $data
    ];

    if(count($refs)) {
      $response['refs'] = $refs;
    }

    return $response;
  }
  // Placeholder feed parser: neither argument is read, and the result is always
  // the same skeleton with an empty author card and no entries.
  private static function parseAsHFeed($mf2, $http) {
    $emptyAuthor = [
      'type' => 'card',
      'name' => null,
      'url' => null,
      'photo' => null
    ];

    $feed = ['type' => 'feed'];
    $feed['author'] = $emptyAuthor;
    $feed['todo'] = 'Not yet implemented. Please see https://github.com/aaronpk/XRay/issues/1';

    $response = [];
    $response['data'] = $feed;
    $response['entries'] = [];
    return $response;
  }
  // Convert an h-card item into the 'card' response structure.
  //
  // $item      - the h-card microformat item
  // $http      - HTTP client (not used by this method)
  // $authorURL - when given, the card URL that equals it after normalization is preferred
  //
  // Returns ['data' => ['type' => 'card', 'name' => ..., 'url' => ..., 'photo' => ...]];
  // fields that could not be determined stay null.
  private static function parseAsHCard($item, $http, $authorURL=false) {
    $data = [
      'type' => 'card',
      'name' => null,
      'url' => null,
      'photo' => null
    ];

    // The url property may be missing; treat that the same as an empty list
    $urls = [];
    if(isset($item['properties']['url']) && is_array($item['properties']['url']))
      $urls = $item['properties']['url'];

    $properties = ['url','name','photo'];
    foreach($properties as $p) {
      if($p == 'url' && $authorURL) {
        // If there is a matching author URL, use that one
        $found = false;
        foreach($urls as $url) {
          // Values can be nested objects rather than strings; only strings can be URLs
          if(is_string($url) && self::isURL($url)) {
            $url = self::normalize_url($url);
            if($url == $authorURL) {
              $data['url'] = $url;
              $found = true;
            }
          }
        }
        // No match: fall back to the first listed URL, if it is a valid one
        if(!$found && isset($urls[0]) && is_string($urls[0]) && self::isURL($urls[0])) {
          $data['url'] = $urls[0];
        }
      } else if(($v = self::getPlaintext($item, $p)) !== null) {
        // Make sure the URL property is actually a URL
        if($p == 'url' || $p == 'photo') {
          if(is_string($v) && self::isURL($v))
            $data[$p] = $v;
        } else {
          $data[$p] = $v;
        }
      }
    }

    $response = [
      'data' => $data
    ];

    return $response;
  }
  // Determine the author of an item, following the numbered steps quoted in the comments below.
  //
  // $mf2  - the whole parsed document; its 'rels' and 'items' are consulted
  // $item - the item whose author is wanted
  // $http - HTTP client used to fetch the author page
  //
  // Returns a card array ('type', 'name', 'url', 'photo'), or null when no author was found.
  private static function findAuthor($mf2, $item, $http) {
    $author = [
      'type' => 'card',
      'name' => null,
      'url' => null,
      'photo' => null
    ];

    // Author Discovery
    // http://indiewebcamp.com/authorship
    $authorPage = false;
    if(array_key_exists('author', $item['properties'])) {
      // Check if any of the values of the author property are an h-card
      foreach($item['properties']['author'] as $a) {
        if(self::isHCard($a)) {
          // 5.1 "if it has an h-card, use it, exit."
          return self::parseAsHCard($a, $http)['data'];
        } elseif(is_string($a)) {
          if(self::isURL($a)) {
            // 5.2 "otherwise if author property is an http(s) URL, let the author-page have that URL"
            $authorPage = $a;
          } else {
            // 5.3 "otherwise use the author property as the author name, exit"
            // We can only set the name, no h-card or URL was found
            $author['name'] = self::getPlaintext($item, 'author');
            return $author;
          }
        } else {
          // This case is only hit when the author property is an mf2 object that is not an h-card
          $author['name'] = self::getPlaintext($item, 'author');
          return $author;
        }
      }
    }

    // 6. "if no author page was found" ... check for rel-author link
    if(!$authorPage) {
      // !empty() also covers a present-but-empty rel list, which would otherwise be an undefined offset
      if(!empty($mf2['rels']['author'][0]))
        $authorPage = $mf2['rels']['author'][0];
    }

    // 7. "if there is an author-page URL" ...
    if($authorPage) {
      // 7.1 "get the author-page from that URL and parse it for microformats2"
      $authorPageContents = self::getURL($authorPage, $http);
      if($authorPageContents) {
        // The rel-me list is the same for every h-card on the page, so compute it once
        $relMeLinks = (isset($authorPageContents['rels']) && isset($authorPageContents['rels']['me'])) ? $authorPageContents['rels']['me'] : [];

        foreach($authorPageContents['items'] as $i) {
          if(self::isHCard($i)) {
            // 7.2 "if author-page has 1+ h-card with url == uid == author-page's URL, then use first such h-card, exit."
            if(array_key_exists('url', $i['properties'])
              and in_array($authorPage, $i['properties']['url'])
              and array_key_exists('uid', $i['properties'])
              and in_array($authorPage, $i['properties']['uid'])
            ) {
              return self::parseAsHCard($i, $http, $authorPage)['data'];
            }

            // 7.3 "else if author-page has 1+ h-card with url property which matches the href of a rel-me link on the author-page"
            if(count($relMeLinks) > 0
              and array_key_exists('url', $i['properties'])
              and count(array_intersect($i['properties']['url'], $relMeLinks)) > 0
            ) {
              return self::parseAsHCard($i, $http, $authorPage)['data'];
            }
          }
        }
      }

      // 7.4 "if the h-entry's page has 1+ h-card with url == author-page URL, use first such h-card, exit."
      foreach($mf2['items'] as $i) {
        if(self::isHCard($i)) {
          if(array_key_exists('url', $i['properties'])
            and in_array($authorPage, $i['properties']['url'])
          ) {
            return self::parseAsHCard($i, $http)['data'];
          }
        }
      }
    }

    // Nothing above produced a card; an entirely empty author is reported as null
    if(!$author['name'] && !$author['photo'] && !$author['url'])
      return null;

    return $author;
  }
  // Run $html through HTMLPurifier with a fixed element whitelist and return the result.
  // A <time> element with a datetime attribute is registered on the definition, the
  // 'Class' attribute type is replaced with the Microformats2 one, and any "&#xD;"
  // left in the purified output is turned back into a carriage return.
  private static function sanitizeHTML($html) {
    $allowedElements = [
      'a', 'abbr', 'b', 'code', 'del', 'em', 'i', 'img', 'q', 'strike', 'strong', 'time',
      'blockquote', 'pre', 'p',
      'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
      'ul', 'li', 'ol'
    ];
    $timeAttributes = ['datetime' => 'Text'];

    $purifierConfig = HTMLPurifier_Config::createDefault();
    $purifierConfig->set('Cache.DefinitionImpl', null);
    $purifierConfig->set('HTML.AllowedElements', $allowedElements);

    $definition = $purifierConfig->getHTMLDefinition(true);
    $definition->addElement('time', 'Inline', 'Inline', 'Common', $timeAttributes);

    // Override the allowed classes to only support Microformats2 classes
    $definition->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());

    $purifier = new HTMLPurifier($purifierConfig);
    return str_replace("&#xD;", "\r", $purifier->purify($html));
  }
  // Report whether at least one key of $arr is numeric according to is_numeric().
  private static function hasNumericKeys(array $arr) {
    $numericKeys = array_filter(array_keys($arr), 'is_numeric');
    return count($numericKeys) > 0;
  }
  // A value counts as a microformat object when it is an array without numeric keys
  // that has a non-empty 'type' and a set 'properties' entry.
  private static function isMicroformat($mf) {
    if(!is_array($mf))
      return false;
    if(self::hasNumericKeys($mf))
      return false;
    if(empty($mf['type']))
      return false;
    return isset($mf['properties']);
  }
  // True when $mf is an array whose 'type' is a non-empty array containing 'h-card'.
  private static function isHCard($mf) {
    if(!is_array($mf) || empty($mf['type']) || !is_array($mf['type']))
      return false;
    return in_array('h-card', $mf['type']);
  }
  // Check that $string looks like an http(s) URL: "http://" or "https://" followed by
  // at least one character, a dot, and at least one more character.
  // Returns the preg_match() result (1 or 0) for strings, and false for any non-string
  // value, since property values can be nested arrays that preg_match() cannot accept.
  private static function isURL($string) {
    if(!is_string($string))
      return false;
    return preg_match('/^https?:\/\/.+\..+$/', $string);
  }
  // Look up property $k on a microformats item and return its first value as plaintext.
  // e.g.
  // {"properties":{"published":["foo"]}} results in "foo"
  //
  // A string value is returned directly; a nested microformat object yields its
  // 'value' entry. Anything else (or a missing/empty property) yields $fallback.
  private static function getPlaintext($mf2, $k, $fallback=null) {
    // $mf2['properties'][$k] is expected to be an array since the input was from the mf2 parser
    if(empty($mf2['properties'][$k]) || !is_array($mf2['properties'][$k]))
      return $fallback;

    $first = $mf2['properties'][$k][0];
    if(is_string($first))
      return $first;

    if(self::isMicroformat($first) && array_key_exists('value', $first))
      return $first['value'];

    return $fallback;
  }
  // Fetch $url through $http->get() and run the response body through \mf2\Parse().
  // Returns null when $url is empty, or when the response reports an error or has no body.
  private static function getURL($url, $http) {
    if(!$url)
      return null;

    // TODO: consider adding caching here
    $response = $http->get($url);
    $unusable = $response['error'] || !$response['body'];

    return $unusable ? null : \mf2\Parse($response['body'], $url);
  }
  // Normalize a URL for comparison: an empty path becomes "/" and the host is lowercased.
  // All other components are reassembled unchanged by build_url().
  // A URL that parse_url() cannot parse is returned as-is instead of collapsing to "/".
  private static function normalize_url($url) {
    $parts = parse_url($url);

    // parse_url() returns false on seriously malformed URLs
    if(!is_array($parts))
      return $url;

    if(empty($parts['path']))
      $parts['path'] = '/';

    // Relative or otherwise host-less URLs have no host component to lowercase
    if(isset($parts['host']))
      $parts['host'] = strtolower($parts['host']);

    return self::build_url($parts);
  }
  // Reassemble a URL string from the component array produced by parse_url().
  // Components that are not set are left out entirely.
  private static function build_url($parsed_url) {
    $scheme = isset($parsed_url['scheme']) ? $parsed_url['scheme'] . '://' : '';
    $host = isset($parsed_url['host']) ? $parsed_url['host'] : '';
    $port = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '';
    $user = isset($parsed_url['user']) ? $parsed_url['user'] : '';
    $pass = isset($parsed_url['pass']) ? ':' . $parsed_url['pass'] : '';
    // Compare against '' rather than relying on truthiness, so a user of "0" still gets its "@"
    $userinfo = ($user !== '' || $pass !== '') ? "$user$pass@" : '';
    $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
    $query = isset($parsed_url['query']) ? '?' . $parsed_url['query'] : '';
    $fragment = isset($parsed_url['fragment']) ? '#' . $parsed_url['fragment'] : '';
    return "$scheme$userinfo$host$port$path$query$fragment";
  }
  533. }