You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

600 lines
20 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. <?php
  2. namespace XRay\Formats;
  3. use HTMLPurifier, HTMLPurifier_Config;
  4. use Parse;
  /**
   * Converts the array produced by the Microformats2 parser into XRay's
   * simplified "entry", "event", "card" and "feed" response arrays.
   *
   * Only parse() is public; everything else is an internal helper.
   */
  class Mf2 {

    /**
     * Work out what kind of object the page at $url represents and parse it.
     *
     * The checks run in this order: a lone h-entry/h-cite or h-event, an
     * h-entry whose url matches $url, multiple h-entrys (feed), a leading
     * h-feed, an h-card/h-event whose url matches $url, a single h-entry with a
     * different url (feed), and finally the first h-entry on the page.
     *
     * @param array  $mf2  Parsed microformats document; 'items' is inspected here and 'rels' during author discovery
     * @param string $url  URL of the document, compared against the url property of each item
     * @param object $http HTTP client; only its get($url) method is used (to fetch author pages)
     * @return array|false Array with a 'data' key (plus 'refs' or 'entries' where applicable), or false when nothing is recognized
     */
    public static function parse($mf2, $url, $http) {
      if(empty($mf2['items']))
        return false;

      // If there is only one item on the page, just use that
      if(count($mf2['items']) == 1) {
        $item = $mf2['items'][0];
        if(self::isEntry($item)) {
          Parse::debug("mf2:0: Recognized $url as an h-entry it is the only item on the page");
          return self::parseAsHEntry($mf2, $item, $http);
        }
        if(in_array('h-event', $item['type'], true)) {
          Parse::debug("mf2:0: Recognized $url as an h-event it is the only item on the page");
          return self::parseAsHEvent($mf2, $item, $http);
        }
      }

      // Check if the list of items is a bunch of h-entrys and return as a feed
      // Unless this page's URL matches one of the entries, then treat it as a permalink
      $hentrys = 0;
      $lastSeenEntry = false;
      foreach($mf2['items'] as $item) {
        if(!self::isEntry($item))
          continue;
        if(array_key_exists('url', $item['properties'])) {
          $urls = self::normalizeAll($item['properties']['url']);
          if(in_array($url, $urls, true)) {
            Parse::debug("mf2:1: Recognized $url as an h-entry because an h-entry on the page matched the URL of the request");
            return self::parseAsHEntry($mf2, $item, $http);
          }
          $lastSeenEntry = $item;
        }
        $hentrys++;
      }

      // If there was more than one h-entry on the page, treat the whole page as a feed
      if($hentrys > 1) {
        Parse::debug("mf2:2: Recognized $url as an h-feed because there are more than one h-entry on the page");
        return self::parseAsHFeed($mf2, $http);
      }

      // If the first item is an h-feed, parse as a feed
      $first = $mf2['items'][0];
      if(in_array('h-feed', $first['type'], true)) {
        Parse::debug("mf2:3: Recognized $url as an h-feed because the first item is an h-feed");
        return self::parseAsHFeed($mf2, $http);
      }

      // Check each top-level h-card and h-event, and if there is one that matches this URL, the page is an h-card
      foreach($mf2['items'] as $item) {
        $isCard = in_array('h-card', $item['type'], true);
        $isEvent = in_array('h-event', $item['type'], true);
        if(!($isCard || $isEvent) || !array_key_exists('url', $item['properties']))
          continue;
        $urls = self::normalizeAll($item['properties']['url']);
        if(in_array($url, $urls, true)) {
          // TODO: check for children h-entrys (like tantek.com), or sibling h-entries (like aaronparecki.com)
          // and return the result as a feed instead
          if($isCard) {
            Parse::debug("mf2:4: Recognized $url as an h-card because an h-card on the page matched the URL of the request");
            return self::parseAsHCard($item, $http, $url);
          } else {
            Parse::debug("mf2:4: Recognized $url as an h-event because an h-event on the page matched the URL of the request");
            return self::parseAsHEvent($mf2, $item, $http);
          }
        }
      }

      // If there was only one h-entry, but the URL for it is not the same as this page, then treat as a feed
      if($hentrys == 1 && $lastSeenEntry) {
        $urls = self::normalizeAll($lastSeenEntry['properties']['url']);
        if(count($urls) && !in_array($url, $urls, true)) {
          Parse::debug("mf2:5: Recognized $url as an h-feed no h-entrys on the page matched the URL of the request");
          return self::parseAsHFeed($mf2, $http);
        }
      }

      // Fallback case, but hopefully we have found something before this point
      foreach($mf2['items'] as $item) {
        if(self::isEntry($item)) {
          Parse::debug("mf2:6: $url is falling back to the first h-entry on the page");
          return self::parseAsHEntry($mf2, $item, $http);
        }
      }

      Parse::debug("mf2:E: No object at $url was recognized");
      return false;
    }

    /**
     * True when the item's type list contains h-entry or h-cite; both are
     * parsed as entries.
     */
    private static function isEntry($item) {
      return in_array('h-entry', $item['type'], true)
        || in_array('h-cite', $item['type'], true);
    }

    /**
     * Apply normalize_url() to every value of a url property.
     *
     * Uses an array callable because the 'self::method' string form is
     * deprecated as of PHP 8.2.
     */
    private static function normalizeAll(array $urls) {
      return array_map([__CLASS__, 'normalize_url'], $urls);
    }

    /**
     * Read the first plaintext value of each listed property.
     *
     * Properties with no plaintext value are omitted. A 'url' value that does
     * not pass isURL() is dropped rather than returned.
     *
     * @return array Map of property name => value, in the order of $properties
     */
    private static function collectPlaintext($item, array $properties) {
      $values = [];
      foreach($properties as $p) {
        $v = self::getPlaintext($item, $p);
        if($v === null)
          continue;
        if($p == 'url' && !self::isURL($v))
          continue;
        $values[$p] = $v;
      }
      return $values;
    }

    /**
     * Collect every URL value of each listed property.
     *
     * A value may be a plain string or an array carrying the URL under
     * 'value'; anything that does not pass isURL() is skipped. A property with
     * no usable URL is left out of the result entirely.
     *
     * @return array Map of property name => list of URLs
     */
    private static function collectURLs($item, array $properties) {
      $values = [];
      foreach($properties as $p) {
        if(!array_key_exists($p, $item['properties']))
          continue;
        foreach($item['properties'][$p] as $v) {
          if(is_array($v) && array_key_exists('value', $v))
            $v = $v['value'];
          if(is_string($v) && self::isURL($v))
            $values[$p][] = $v;
        }
      }
      return $values;
    }

    /**
     * Resolve a nested microformat that is used as a property value.
     *
     * @return array|null null when $v is not a microformat with a valid url;
     *                    otherwise ['url' => its url, 'data' => parsed data or
     *                    null when parse() did not recognize the object]
     */
    private static function parseNested($v, $http) {
      if(!self::isMicroformat($v))
        return null;
      $u = self::getPlaintext($v, 'url');
      if(!$u || !self::isURL($u))
        return null;
      // parse the object so the caller can put the result in the "refs" object
      $ref = self::parse(['items'=>[$v]], $u, $http);
      return [
        'url' => $u,
        'data' => $ref ? $ref['data'] : null
      ];
    }

    /**
     * Split a content/description value into plaintext and sanitized HTML.
     *
     * A plain string is returned untouched as the text. An array with a
     * 'value' key yields the trimmed value as text and, when an 'html' key is
     * present, the sanitized and trimmed HTML as well.
     *
     * @return array ['text' => string|null, 'html' => string|null]
     */
    private static function textAndHTML($value) {
      $text = null;
      $html = null;
      if(is_string($value)) {
        $text = $value;
      } elseif(is_array($value) && array_key_exists('value', $value)) {
        if(array_key_exists('html', $value)) {
          $html = trim(self::sanitizeHTML($value['html']));
          $text = trim(str_replace("&#xD;","\r",$value['value']));
        } else {
          $text = trim($value['value']);
        }
      }
      return ['text' => $text, 'html' => $html];
    }

    /**
     * Build the 'entry' response for an h-entry or h-cite item.
     *
     * @return array ['data' => entry fields] plus 'refs' when nested objects were parsed
     */
    private static function parseAsHEntry($mf2, $item, $http) {
      $data = ['type' => 'entry'];
      $refs = [];

      // Single plaintext values
      $data = array_merge($data, self::collectPlaintext($item, ['url','published','summary','rsvp']));

      // Always arrays
      $data = array_merge($data, self::collectURLs($item, ['photo','video','audio','syndication']));

      // Always returned as arrays, and may also create external references
      // If these are not objects, they must be URLs
      $set = [
        'normal' => ['category','invitee'],
        'url' => ['in-reply-to','like-of','repost-of','bookmark-of']
      ];
      foreach($set as $type=>$properties) {
        foreach($properties as $p) {
          if(!array_key_exists($p, $item['properties']))
            continue;
          foreach($item['properties'][$p] as $v) {
            if(is_string($v)) {
              if($type == 'normal' || self::isURL($v))
                $data[$p][] = $v;
            } elseif(($nested = self::parseNested($v, $http)) !== null) {
              $data[$p][] = $nested['url'];
              if($nested['data'] !== null)
                $refs[$nested['url']] = $nested['data'];
            }
          }
        }
      }

      // Determine if the name is distinct from the content
      $name = self::getPlaintext($item, 'name');
      $content = null;
      $textContent = null;
      $htmlContent = null;
      if(array_key_exists('content', $item['properties'])) {
        $content = $item['properties']['content'][0];
        $parts = self::textAndHTML($content);
        $textContent = $parts['text'];
        $htmlContent = $parts['html'];

        // Trim ellipses from the name
        if($name !== null)
          $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name);

        // Remove all whitespace when checking equality
        $nameCompare = preg_replace('/\s/','',trim((string)$name));
        $contentCompare = preg_replace('/\s/','',trim((string)$textContent));

        // Check if the name is a prefix of the content
        if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) {
          $name = null;
        }
      }

      if($name) {
        $data['name'] = $name;
      }

      // If there is content, always return the plaintext content, and return HTML content if it's different
      if($content) {
        $data['content'] = [
          'text' => $textContent
        ];
        if($htmlContent && $textContent != $htmlContent) {
          $data['content']['html'] = $htmlContent;
        }
        // TODO: If no HTML content was included in the post, create HTML by autolinking?
      }

      if($author = self::findAuthor($mf2, $item, $http))
        $data['author'] = $author;

      $response = [
        'data' => $data
      ];
      if(count($refs)) {
        $response['refs'] = $refs;
      }
      return $response;
    }

    /**
     * Build the 'event' response for an h-event item.
     *
     * @return array ['data' => event fields] plus 'refs' when nested objects were parsed
     */
    private static function parseAsHEvent($mf2, $item, $http) {
      $data = ['type' => 'event'];
      $refs = [];

      // Single plaintext values
      $data = array_merge($data, self::collectPlaintext($item, ['name','summary','url','published','start','end','duration']));

      // Always arrays
      $data = array_merge($data, self::collectURLs($item, ['photo','video','audio','syndication']));

      // Always returned as arrays, and may also create external references
      foreach(['category','location','attendee'] as $p) {
        if(!array_key_exists($p, $item['properties']))
          continue;
        // Unlike entries, the key is emitted even when no value qualifies
        $data[$p] = [];
        foreach($item['properties'][$p] as $v) {
          if(is_string($v)) {
            $data[$p][] = $v;
          } elseif(($nested = self::parseNested($v, $http)) !== null) {
            $data[$p][] = $nested['url'];
            if($nested['data'] !== null)
              $refs[$nested['url']] = $nested['data'];
          }
        }
      }

      // If there is a description, always return the plaintext description, and return HTML description if it's different
      $textDescription = null;
      $htmlDescription = null;
      if(array_key_exists('description', $item['properties'])) {
        $parts = self::textAndHTML($item['properties']['description'][0]);
        $textDescription = $parts['text'];
        $htmlDescription = $parts['html'];
      }
      if($textDescription) {
        $data['description'] = [
          'text' => $textDescription
        ];
        if($htmlDescription && $textDescription != $htmlDescription) {
          $data['description']['html'] = $htmlDescription;
        }
      }

      $response = [
        'data' => $data
      ];
      if(count($refs)) {
        $response['refs'] = $refs;
      }
      return $response;
    }

    /**
     * Placeholder feed response: feed parsing is not implemented, so this
     * returns an empty author card, a 'todo' note and an empty entries list.
     */
    private static function parseAsHFeed($mf2, $http) {
      $data = [
        'type' => 'feed',
        'author' => [
          'type' => 'card',
          'name' => null,
          'url' => null,
          'photo' => null
        ],
        'todo' => 'Not yet implemented. Please see https://github.com/aaronpk/XRay/issues/1'
      ];
      return [
        'data' => $data,
        'entries' => []
      ];
    }

    /**
     * Build the 'card' response for an h-card item.
     *
     * @param array        $item      The h-card microformat
     * @param object       $http      Unused here; kept for signature parity with the other parsers
     * @param string|false $authorURL When given, the card url that normalizes to this value is preferred
     * @return array ['data' => ['type','name','url','photo']]
     */
    private static function parseAsHCard($item, $http, $authorURL=false) {
      $data = [
        'type' => 'card',
        'name' => null,
        'url' => null,
        'photo' => null
      ];

      foreach(['url','name','photo'] as $p) {
        if($p == 'url' && $authorURL) {
          // If there is a matching author URL, use that one
          $urls = (isset($item['properties']['url']) && is_array($item['properties']['url']))
            ? $item['properties']['url']
            : [];
          $found = false;
          foreach($urls as $url) {
            if(!self::isURL($url))
              continue;
            $url = self::normalize_url($url);
            if($url == $authorURL) {
              $data['url'] = $url;
              $found = true;
            }
          }
          // Otherwise fall back to the first url, provided it is a URL at all
          if(!$found && isset($urls[0]) && self::isURL($urls[0])) {
            $data['url'] = $urls[0];
          }
        } else if(($v = self::getPlaintext($item, $p)) !== null) {
          // Make sure the URL property is actually a URL
          if($p == 'url' || $p == 'photo') {
            if(self::isURL($v))
              $data[$p] = $v;
          } else {
            $data[$p] = $v;
          }
        }
      }

      return [
        'data' => $data
      ];
    }

    /**
     * Author discovery, following the numbered steps of
     * http://indiewebcamp.com/authorship quoted in the comments below.
     *
     * @return array|null Card array, or null when no name, url or photo could be determined
     */
    private static function findAuthor($mf2, $item, $http) {
      $author = [
        'type' => 'card',
        'name' => null,
        'url' => null,
        'photo' => null
      ];

      $authorPage = false;
      if(array_key_exists('author', $item['properties'])) {
        // Check if any of the values of the author property are an h-card
        foreach($item['properties']['author'] as $a) {
          if(self::isHCard($a)) {
            // 5.1 "if it has an h-card, use it, exit."
            return self::parseAsHCard($a, $http)['data'];
          } elseif(is_string($a) && self::isURL($a)) {
            // 5.2 "otherwise if author property is an http(s) URL, let the author-page have that URL"
            $authorPage = $a;
          } else {
            // 5.3 "otherwise use the author property as the author name, exit"
            // Reached for a non-URL string, and for an mf2 object that is not an h-card.
            // We can only set the name, no h-card or URL was found
            $author['name'] = self::getPlaintext($item, 'author');
            return $author;
          }
        }
      }

      // 6. "if no author page was found" ... check for rel-author link
      if(!$authorPage && isset($mf2['rels']['author'][0])) {
        $authorPage = $mf2['rels']['author'][0];
      }

      // 7. "if there is an author-page URL" ...
      if($authorPage) {
        // 7.1 "get the author-page from that URL and parse it for microformats2"
        $authorPageContents = self::getURL($authorPage, $http);

        if($authorPageContents) {
          // The rel-me links do not change per item, so look them up once
          $relMeLinks = isset($authorPageContents['rels']['me']) ? $authorPageContents['rels']['me'] : [];

          foreach($authorPageContents['items'] as $i) {
            if(!self::isHCard($i))
              continue;

            $cardURLs = isset($i['properties']['url']) ? $i['properties']['url'] : [];
            $cardUIDs = isset($i['properties']['uid']) ? $i['properties']['uid'] : [];

            // 7.2 "if author-page has 1+ h-card with url == uid == author-page's URL, then use first such h-card, exit."
            if(in_array($authorPage, $cardURLs, true) && in_array($authorPage, $cardUIDs, true)) {
              return self::parseAsHCard($i, $http, $authorPage)['data'];
            }

            // 7.3 "else if author-page has 1+ h-card with url property which matches the href of a rel-me link on the author-page"
            // Only string urls are compared; array_intersect() would try to stringify nested objects
            $stringURLs = array_filter($cardURLs, 'is_string');
            if(count($relMeLinks) > 0 && count(array_intersect($stringURLs, $relMeLinks)) > 0) {
              return self::parseAsHCard($i, $http, $authorPage)['data'];
            }
          }
        }

        // 7.4 "if the h-entry's page has 1+ h-card with url == author-page URL, use first such h-card, exit."
        foreach($mf2['items'] as $i) {
          if(self::isHCard($i)
            && isset($i['properties']['url'])
            && in_array($authorPage, $i['properties']['url'], true)
          ) {
            return self::parseAsHCard($i, $http)['data'];
          }
        }
      }

      if(!$author['name'] && !$author['photo'] && !$author['url'])
        return null;

      return $author;
    }

    /**
     * Run HTML through HTMLPurifier with a fixed allow-list of elements and
     * with class attributes handled by HTMLPurifier_AttrDef_HTML_Microformats2.
     *
     * @param string $html Untrusted HTML taken from the parsed page
     * @return string Purified HTML with "&#xD;" entities turned back into "\r"
     */
    private static function sanitizeHTML($html) {
      $config = HTMLPurifier_Config::createDefault();
      $config->set('Cache.DefinitionImpl', null);
      $config->set('HTML.AllowedElements', [
        'a',
        'abbr',
        'b',
        'code',
        'del',
        'em',
        'i',
        'img',
        'q',
        'strike',
        'strong',
        'time',
        'blockquote',
        'pre',
        'p',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'ul',
        'li',
        'ol'
      ]);
      // <time> is registered by hand as an inline element with a datetime attribute
      $def = $config->getHTMLDefinition(true);
      $def->addElement(
        'time',
        'Inline',
        'Inline',
        'Common',
        [
          'datetime' => 'Text'
        ]
      );
      // Override the allowed classes to only support Microformats2 classes
      $def->manager->attrTypes->set('Class', new HTMLPurifier_AttrDef_HTML_Microformats2());
      $purifier = new HTMLPurifier($config);
      $sanitized = $purifier->purify($html);
      $sanitized = str_replace("&#xD;","\r",$sanitized);
      return $sanitized;
    }

    /**
     * True when at least one key of $arr is numeric.
     */
    private static function hasNumericKeys(array $arr) {
      foreach($arr as $key=>$val) {
        if(is_numeric($key))
          return true;
      }
      return false;
    }

    /**
     * True when $mf looks like a microformat object: an array without numeric
     * keys that has a non-empty 'type' and a 'properties' entry.
     */
    private static function isMicroformat($mf) {
      return is_array($mf)
        && !self::hasNumericKeys($mf)
        && !empty($mf['type'])
        && isset($mf['properties']);
    }

    /**
     * True when $mf is an array whose 'type' list contains 'h-card'.
     */
    private static function isHCard($mf) {
      return is_array($mf)
        && !empty($mf['type'])
        && is_array($mf['type'])
        && in_array('h-card', $mf['type'], true);
    }

    /**
     * True when $string is a string that starts with http:// or https:// and
     * has a dot somewhere after the scheme. Non-string input (for example a
     * nested microformat array) is never a URL.
     *
     * @return bool
     */
    private static function isURL($string) {
      return is_string($string)
        && preg_match('/^https?:\/\/.+\..+$/', $string) === 1;
    }

    // Given an array of microformats properties and a key name, return the plaintext value
    // at that property
    // e.g.
    // {"properties":{"published":["foo"]}} results in "foo"
    // Only the first value is considered. A nested microformat contributes its
    // 'value' entry; anything else yields $fallback.
    private static function getPlaintext($mf2, $k, $fallback=null) {
      if(!empty($mf2['properties'][$k]) && is_array($mf2['properties'][$k])) {
        $value = $mf2['properties'][$k][0];
        if(is_string($value)) {
          return $value;
        } elseif(self::isMicroformat($value) && array_key_exists('value', $value)) {
          return $value['value'];
        }
      }
      return $fallback;
    }

    /**
     * Fetch $url through the HTTP client and parse the body for microformats.
     *
     * @return array|null Parsed document, or null when $url is empty, the
     *                    client reports an error, or the body is empty
     */
    private static function getURL($url, $http) {
      if(!$url) return null;
      // TODO: consider adding caching here
      $result = $http->get($url);
      // empty() keeps the original truthiness checks without assuming both keys are present
      if(!empty($result['error']) || empty($result['body'])) {
        return null;
      }
      return \mf2\Parse($result['body'], $url);
    }

    /**
     * Normalize a URL for comparison: an empty path becomes "/" and the host
     * is lower-cased. Every other component is reassembled as parsed.
     *
     * Input that is not a string, or that parse_url() rejects, is returned
     * unchanged so it simply fails to match in later comparisons.
     */
    private static function normalize_url($url) {
      if(!is_string($url))
        return $url;
      $parts = parse_url($url);
      if($parts === false)
        return $url;
      if(empty($parts['path']))
        $parts['path'] = '/';
      if(isset($parts['host']))
        $parts['host'] = strtolower($parts['host']);
      return self::build_url($parts);
    }

    /**
     * Reassemble a URL string from the component array produced by parse_url().
     */
    private static function build_url($parsed_url) {
      $scheme   = isset($parsed_url['scheme']) ? $parsed_url['scheme'] . '://' : '';
      $host     = isset($parsed_url['host']) ? $parsed_url['host'] : '';
      $port     = isset($parsed_url['port']) ? ':' . $parsed_url['port'] : '';
      $user     = isset($parsed_url['user']) ? $parsed_url['user'] : '';
      $pass     = isset($parsed_url['pass']) ? ':' . $parsed_url['pass'] : '';
      // The "@" separator is only emitted when there is a user or a password
      $pass     = ($user || $pass) ? "$pass@" : '';
      $path     = isset($parsed_url['path']) ? $parsed_url['path'] : '';
      $query    = isset($parsed_url['query']) ? '?' . $parsed_url['query'] : '';
      $fragment = isset($parsed_url['fragment']) ? '#' . $parsed_url['fragment'] : '';
      return "$scheme$user$pass$host$port$path$query$fragment";
    }
  }