You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

923 lines
32 KiB

6 years ago
6 years ago
8 years ago
  1. <?php
  2. namespace p3k\XRay\Formats;
  3. class Mf2 extends Format {
  4. use Mf2Feed;
  /**
   * Reports whether this format applies to the host of the given URL.
   *
   * @param string $url URL of the document; not inspected
   * @return bool always true
   */
  public static function matches_host($url) {
    return true;
  }
  /**
   * Reports whether this format applies to the given URL.
   *
   * @param string $url URL of the document; not inspected
   * @return bool always true
   */
  public static function matches($url) {
    return true;
  }
  /**
   * Picks the most appropriate microformats2 object from a parsed page and runs
   * the matching parseAsH* parser on it.
   *
   * Stages, in order:
   *   1. $opts['expect'] == 'feed': always parse as a feed
   *   2. only one object is left after ignoring h-breadcrumb and h-cards whose
   *      first URL differs from $url: parse that object
   *   3. an object whose "url" property contains $url: parse that object
   *   4. an h-card on the page matches rel=author: parse the first recognized
   *      object that is not an h-card
   *   5. any h-entry is left, or the first object is an h-feed: parse as a feed
   *   6. the first recognized object on the page
   *
   * @param array  $mf2  parsed document; "items" and "rels" are read
   * @param string $url  URL of the page being parsed
   * @param mixed  $http HTTP client, passed through to the object parsers
   * @param array  $opts options; only "expect" is read here
   * @return mixed the selected parser's result, or false when nothing was recognized
   */
  public static function parse($mf2, $url, $http, $opts=[]) {
    if(!isset($mf2['items']) || count($mf2['items']) == 0)
      return false;

    // If they are expecting a feed, always return a feed or an error
    if(isset($opts['expect']) && $opts['expect'] == 'feed') {
      return self::parseAsHFeed($mf2, $http, $url);
    }

    // Remove h-breadcrumb since we never use it and it causes problems determining
    // whether a page is a feed or permalink
    $mf2['items'] = array_values(array_filter($mf2['items'], function($item){
      return !in_array('h-breadcrumb', $item['type']);
    }));

    $items = $mf2['items'];

    // If there is more than one item on the page, it may be a feed.
    // Remove an h-card if there is one that doesn't match the page URL, then try again.
    // (Don't modify the actual tree, but compare on the modified tree)
    if(count($items) > 1) {
      $items = array_values(array_filter($items, function($item) use($url){
        return !(in_array('h-card', $item['type']) && isset($item['properties']['url'][0]) && $item['properties']['url'][0] != $url);
      }));
    }

    // Kinds tried by the generic stages, in priority order
    $objectKinds = ['entry','event','review','recipe','product','item','app'];

    // If there is only one item left on the page, it's a permalink, and just use that.
    // The order matters for items with several root classes: in this stage h-card
    // ranks between h-item and h-app.
    if(count($items) == 1) {
      $item = $items[0];
      $result = self::parseItemAsFirstKind(['entry','event','review','recipe','product','item','card','app'], $mf2, $item, $http, $url);
      if($result !== null)
        return $result;
      if(in_array('h-feed', $item['type']))
        return self::parseAsHFeed($mf2, $http, $url);
    }

    // Check the list of items on the page to see if one matches the URL of the page,
    // and treat as a permalink for that object if so. In this stage h-card ranks first.
    foreach($mf2['items'] as $item) {
      if(!array_key_exists('url', $item['properties']))
        continue;
      $urls = array_map('\p3k\XRay\normalize_url', $item['properties']['url']);
      if(!in_array($url, $urls))
        continue;
      $result = self::parseItemAsFirstKind(array_merge(['card'], $objectKinds), $mf2, $item, $http, $url);
      if($result !== null)
        return $result;
      if(in_array('h-feed', $item['type']))
        return self::parseAsHFeed($mf2, $http, $url);
      // The object that matched the URL is not a recognized type
      return false;
    }

    // Check for an h-card matching rel=author or the author URL of any h-* on the page,
    // and return the h-* object if so
    if(isset($mf2['rels']['author'])) {
      foreach($mf2['items'] as $card) {
        if(!in_array('h-card', $card['type']) || !array_key_exists('url', $card['properties']))
          continue;
        $urls = \p3k\XRay\normalize_urls($card['properties']['url']);
        if(count(array_intersect($urls, \p3k\XRay\normalize_urls($mf2['rels']['author']))) == 0)
          continue;
        // There is an author h-card on this page.
        // Now look for the first h-* object other than an h-card and use that as the object
        foreach($mf2['items'] as $item) {
          if(in_array('h-card', $item['type']))
            continue;
          $result = self::parseItemAsFirstKind($objectKinds, $mf2, $item, $http, $url);
          if($result !== null)
            return $result;
        }
      }
    }

    // At this point, if there are any h-entrys left on the page, it's probably a feed.
    $entries = array_filter($items, function($item){
      return in_array('h-entry', $item['type']);
    });
    if(count($entries) > 0) {
      return self::parseAsHFeed($mf2, $http, $url);
    }

    // If the first item is an h-feed, parse as a feed.
    // $items can be empty here (e.g. the page only had h-breadcrumbs, or only
    // h-cards for other URLs), so the first element must be checked before use.
    if(isset($items[0]) && in_array('h-feed', $items[0]['type'])) {
      return self::parseAsHFeed($mf2, $http, $url);
    }

    // Fallback case, but hopefully we have found something before this point
    foreach($mf2['items'] as $item) {
      $result = self::parseItemAsFirstKind($objectKinds, $mf2, $item, $http, $url);
      if($result !== null)
        return $result;
    }

    return false;
  }

  /**
   * Runs $item through the parser of the first kind in $kinds whose root class
   * the item declares.
   *
   * @param array  $kinds ordered list drawn from: entry, event, review, recipe,
   *                      product, item, card, app
   * @param array  $mf2   the whole parsed document, passed to the object parser
   * @param array  $item  the mf2 object to parse; its "type" list is inspected
   * @param mixed  $http  HTTP client, passed to the object parser
   * @param string $url   URL of the page being parsed
   * @return array|null the parser's response, or null when $item declares none of $kinds
   */
  private static function parseItemAsFirstKind($kinds, $mf2, $item, $http, $url) {
    // Root classes that select each kind
    $rootClasses = [
      'entry' => ['h-entry', 'h-cite'],
      'event' => ['h-event'],
      'review' => ['h-review'],
      'recipe' => ['h-recipe'],
      'product' => ['h-product'],
      'item' => ['h-item'],
      'card' => ['h-card'],
      'app' => ['h-app', 'h-x-app']
    ];

    foreach($kinds as $kind) {
      $declared = false;
      foreach($rootClasses[$kind] as $class) {
        if(in_array($class, $item['type'])) {
          $declared = true;
          break;
        }
      }
      if(!$declared)
        continue;

      switch($kind) {
        case 'entry':
          return self::parseAsHEntry($mf2, $item, $http, $url);
        case 'event':
          return self::parseAsHEvent($mf2, $item, $http, $url);
        case 'review':
          return self::parseAsHReview($mf2, $item, $http, $url);
        case 'recipe':
          return self::parseAsHRecipe($mf2, $item, $http, $url);
        case 'product':
          return self::parseAsHProduct($mf2, $item, $http, $url);
        case 'item':
          return self::parseAsHItem($mf2, $item, $http, $url);
        case 'card':
          // The page URL doubles as the preferred author URL for the card
          return self::parseAsHCard($item, $http, $url, $url);
        case 'app':
          return self::parseAsHApp($mf2, $item, $http, $url);
      }
    }

    return null;
  }
  /**
   * Copies single-valued properties from $item into $data.
   *
   * Every name in $properties is copied as plaintext when present. Every name in
   * $urlProperties is copied only when the value passes isURL(). The "url"
   * property gets special treatment: with several candidates, one on the same
   * host as $url is preferred (the last such candidate wins), otherwise the
   * first candidate is used.
   *
   * @param array  $properties    names of plaintext properties
   * @param array  $urlProperties names of properties that must hold a URL
   * @param array  $item          mf2 object to read from
   * @param string $url           URL of the page, used for the same-host preference
   * @param array  $data          output array, modified in place
   */
  private static function collectSingleValues($properties, $urlProperties, $item, $url, &$data) {
    foreach($properties as $name) {
      $text = self::getPlaintext($item, $name);
      if($text !== null)
        $data[$name] = $text;
    }

    foreach($urlProperties as $name) {
      if($name != 'url') {
        $candidate = self::getPlaintext($item, $name);
        if($candidate !== null && self::isURL($candidate))
          $data[$name] = $candidate;
        continue;
      }

      $candidates = self::getPlaintextValues($item, 'url');
      if(!$candidates)
        continue;

      if(count($candidates) == 1) {
        if(self::isURL($candidates[0]))
          $data['url'] = $candidates[0];
        continue;
      }

      // Several URLs: remember the last one that lives on the page's own host
      $sameHost = null;
      foreach($candidates as $candidate) {
        if(self::isURL($candidate) && parse_url($candidate, PHP_URL_HOST) == parse_url($url, PHP_URL_HOST))
          $sameHost = $candidate;
      }

      if($sameHost !== null) {
        $data['url'] = $sameHost;
      } elseif(self::isURL($candidates[0])) {
        // Fall back to the first URL if there isn't one on the domain
        $data['url'] = $candidates[0];
      }
    }
  }
  /**
   * Extracts the text (and, when different, sanitized HTML) of the first value
   * of a property.
   *
   * A string value is used as the text directly. An array value with an "html"
   * key is sanitized and then stripped to produce the text. An array value
   * without "html" falls back to its "value" string.
   *
   * @param string $property property name to read
   * @param array  $item     mf2 object to read from
   * @return array|null ['text' => string] plus 'html' when it differs from the
   *                    text, or null when there is no non-empty text
   */
  private static function parseHTMLValue($property, $item) {
    // Also covers a property that is present but has no first value
    if(!isset($item['properties'][$property][0]))
      return null;

    $content = $item['properties'][$property][0];
    $textContent = false;
    $htmlContent = false;

    if(is_string($content)) {
      $textContent = $content;
    } elseif(is_array($content)) {
      if(array_key_exists('html', $content)) {
        // Only allow images in the content if there is no photo property set
        $allowImg = !isset($item['properties']['photo']);
        $htmlContent = trim(self::sanitizeHTML($content['html'], $allowImg));
        $textContent = trim(self::stripHTML($htmlContent));
      } elseif(isset($content['value']) && is_string($content['value'])) {
        $textContent = trim($content['value']);
      }
    }

    // If the text content ends up empty, then the HTML should be too
    // e.g. <div class="e-content"><a href=""><img src="" class="u-photo"></a></div>
    // should not return content of <a href=""></a>
    // TODO: still need to remove empty <a> tags when there is other text in the content
    if(!$textContent)
      return null;

    $data = [
      'text' => $textContent
    ];

    // Only add HTML content if it actually differs from the text
    if($htmlContent && $textContent != $htmlContent) {
      $data['html'] = $htmlContent;
    }

    return $data;
  }
  261. // Always return arrays, and may contain plaintext content
  262. // Nested objects are added to refs and the URL is used as the value if present
  /**
   * Collects multi-valued properties into $data as de-duplicated arrays.
   *
   * String values are stored as they are. A nested mf2 object with a valid URL
   * is stored by that URL and its parsed form is added to $refs under the same
   * URL. A nested object without a URL is stored by its "value", and skipped
   * when it has none. Any other value is ignored.
   *
   * @param array $properties property names to collect
   * @param array $item       mf2 object to read from
   * @param array $data       output array, modified in place
   * @param array $refs       map of URL => parsed nested object, modified in place
   * @param mixed $http       HTTP client, passed on to parse()
   */
  private static function collectArrayValues($properties, $item, &$data, &$refs, &$http) {
    foreach($properties as $p) {
      if(!array_key_exists($p, $item['properties']))
        continue;

      foreach($item['properties'][$p] as $v) {
        $u = null;

        if(is_string($v)) {
          $value = $v;
        } elseif(self::isMicroformat($v)) {
          $candidate = self::getPlaintext($v, 'url');
          if($candidate && self::isURL($candidate)) {
            $u = $candidate;
            $value = $u;
          } elseif(isset($v['value'])) {
            $value = $v['value'];
          } else {
            // isMicroformat() does not guarantee a "value" key; nothing usable to store
            continue;
          }
        } else {
          continue;
        }

        if(!array_key_exists($p, $data)) $data[$p] = [];
        if(!in_array($value, $data[$p]))
          $data[$p][] = $value;

        // Nested objects with a URL are also parsed and exposed as a ref
        if($u !== null) {
          $ref = self::parse(['items'=>[$v]], $u, $http);
          if($ref) {
            $refs[$u] = $ref['data'];
          }
        }
      }
    }
  }
  /**
   * Builds a simplified card from the first value of a property holding an
   * embedded h-card (used for "checkin").
   *
   * @param string $property property name to read
   * @param array  $item     mf2 object to read from
   * @param mixed  $http     HTTP client; not used here
   * @return array|false ['type' => 'card', ...] or false when the property is
   *                     missing or is neither a URL string nor an h-card
   */
  private static function parseEmbeddedHCard($property, $item, &$http) {
    if(!isset($item['properties'][$property][0]))
      return false;

    $mf2 = $item['properties'][$property][0];

    // A bare URL becomes a card that only carries that URL
    if(is_string($mf2)) {
      if(self::isURL($mf2)) {
        return [
          'type' => 'card',
          'url' => $mf2
        ];
      }
      return false;
    }

    if(self::isMicroformat($mf2) && in_array('h-card', $mf2['type'])) {
      $hcard = [
        'type' => 'card',
      ];
      $properties = ['name','latitude','longitude','locality','region','country','url'];
      foreach($properties as $p) {
        $v = self::getPlaintext($mf2, $p);
        // "0" is falsy in PHP but is a valid latitude/longitude, so keep it explicitly
        if($v || $v === '0') {
          $hcard[$p] = $v;
        }
      }
      return $hcard;
    }

    return false;
  }
  /**
   * Collects multi-valued URL properties into $data as de-duplicated lists.
   *
   * String values are kept only when they pass isURL(). A nested mf2 object
   * with a valid URL is stored by that URL and its parsed form is added to
   * $refs under the same URL. Everything else is ignored.
   *
   * @param array $properties property names to collect
   * @param array $item       mf2 object to read from
   * @param array $data       output array, modified in place
   * @param array $refs       map of URL => parsed nested object, modified in place
   * @param mixed $http       HTTP client, passed on to parse()
   */
  private static function collectArrayURLValues($properties, $item, &$data, &$refs, &$http) {
    // Property names that received at least one value, used as a set
    $touched = [];

    foreach($properties as $p) {
      if(!array_key_exists($p, $item['properties']))
        continue;

      foreach($item['properties'][$p] as $v) {
        if(is_string($v) && self::isURL($v)) {
          if(!array_key_exists($p, $data)) $data[$p] = [];
          $data[$p][] = $v;
          $touched[$p] = true;
        } elseif(self::isMicroformat($v) && ($u = self::getPlaintext($v, 'url')) && self::isURL($u)) {
          if(!array_key_exists($p, $data)) $data[$p] = [];
          $data[$p][] = $u;
          $touched[$p] = true;
          // parse the object and put the result in the "refs" object
          $ref = self::parse(['items'=>[$v]], $u, $http);
          if($ref) {
            $refs[$u] = $ref['data'];
          }
        }
      }
    }

    // Remove duplicate values. array_unique() keeps the original keys, which
    // leaves gaps; re-index so the result stays a list (and JSON-encodes as one).
    foreach(array_keys($touched) as $key) {
      $data[$key] = array_values(array_unique($data[$key]));
    }
  }
  /**
   * Sets $data['name'] and $data['content'] from the item's "name" and
   * "content" properties.
   *
   * The name is dropped when, ignoring whitespace and a trailing ellipsis, it
   * is a prefix of the content text. If the content ends up with no text, the
   * name (with the ellipsis trimmed) is restored.
   *
   * @param array $item mf2 object to read from
   * @param array $data output array, modified in place
   */
  private static function determineNameAndContent($item, &$data) {
    // Determine if the name is distinct from the content
    $name = self::getPlaintext($item, 'name');
    $content = self::getHTMLValue($item, 'content');

    $textContent = null;
    if(is_string($content)) {
      $textContent = $content;
    } elseif($content) {
      $textContent = array_key_exists('value', $content) ? $content['value'] : null;
    }

    $checkedname = $name;
    if($content) {
      // Trim ellipses from the name
      if($name !== null)
        $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name);

      // Remove all whitespace when checking equality.
      // The casts keep null out of trim(), which newer PHP versions deprecate.
      $nameCompare = preg_replace('/\s/','',trim((string)$name));
      $contentCompare = preg_replace('/\s/','',trim((string)$textContent));

      // Check if the name is a prefix of the content
      if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) {
        $checkedname = null;
      }
    }

    if($checkedname) {
      $data['name'] = $checkedname;
    }

    // If there is content, always return the plaintext content, and return HTML content if it's different
    if($content) {
      $parsed = self::parseHTMLValue('content', $item);
      // parseHTMLValue() returns null when no text is left, so check before indexing
      if($parsed && $parsed['text']) {
        // Replace the whole entry so an "html" key left by an earlier fallback
        // cannot be combined with this text
        $data['content'] = ['text' => $parsed['text']];
        if(isset($parsed['html']))
          $data['content']['html'] = $parsed['html'];
      } elseif($name) {
        // If the content text was blank because the img was removed and that was the only content,
        // then put the name back as the name if it was previously set.
        // See https://github.com/aaronpk/XRay/issues/57
        $data['name'] = $name;
      }
    }
  }
  /**
   * Converts an h-entry (or h-cite) object into an "entry" response.
   *
   * @param array  $mf2  the whole parsed document, used for author discovery
   * @param array  $item the h-entry object
   * @param mixed  $http HTTP client, passed to the helpers that may fetch or recurse
   * @param string $url  URL of the page being parsed
   * @return array ['data' => [...]], with 'refs' inside data when nested objects were found
   */
  private static function parseAsHEntry($mf2, $item, $http, $url) {
    $refs = [];
    $data = ['type' => 'entry'];

    // Single plaintext and URL values
    self::collectSingleValues(
      ['published','summary','rsvp','swarm-coins'],
      ['url','featured','follow-of'],
      $item, $url, $data
    );

    if(isset($data['rsvp'])) {
      $data['rsvp'] = strtolower($data['rsvp']);
    }

    // Strip leading hashtags from plaintext category values before collecting them
    if(array_key_exists('category', $item['properties'])) {
      foreach($item['properties']['category'] as $index => $category) {
        if(is_string($category)) {
          $item['properties']['category'][$index] = ltrim($category, '#');
        }
      }
    }

    // Always arrays, may contain plaintext
    self::collectArrayValues(['category','invitee'], $item, $data, $refs, $http);

    // Always arrays of URLs; nested h-* objects with a URL also produce a ref
    self::collectArrayURLValues(
      ['photo','video','audio','syndication','in-reply-to','like-of','repost-of','bookmark-of','quotation-of'],
      $item, $data, $refs, $http
    );

    // Hack to make quotation-of a single value
    if(isset($data['quotation-of'])) {
      $data['quotation-of'] = $data['quotation-of'][0];
    }

    self::determineNameAndContent($item, $data);

    $author = self::findAuthor($mf2, $item, $http, $url);
    if($author) {
      $data['author'] = $author;
    }

    $checkin = self::parseEmbeddedHCard('checkin', $item, $http);
    if($checkin) {
      $data['checkin'] = $checkin;
    }

    $data['post-type'] = \p3k\XRay\PostType::discover($data);

    if(count($refs)) {
      $data['refs'] = $refs;
    }

    return ['data' => $data];
  }
  /**
   * Converts an h-review object into a "review" response.
   *
   * @param array  $mf2  the whole parsed document, used for author discovery
   * @param array  $item the h-review object
   * @param mixed  $http HTTP client, passed to the helpers that may fetch or recurse
   * @param string $url  URL of the page being parsed
   * @return array ['data' => [...]], with 'refs' inside data when nested objects were found
   */
  private static function parseAsHReview($mf2, $item, $http, $url) {
    $refs = [];
    $data = ['type' => 'review'];

    self::collectSingleValues(['summary','published','rating','best','worst'], ['url'], $item, $url, $data);

    // Fallback for Mf1 "description" as content. The PHP parser does not properly map this to "content"
    $fallbackContent = self::parseHTMLValue('description', $item);
    if($fallbackContent) {
      $data['content'] = $fallbackContent;
    }

    self::collectArrayValues(['category'], $item, $data, $refs, $http);
    self::collectArrayURLValues(['item'], $item, $data, $refs, $http);
    self::determineNameAndContent($item, $data);

    $reviewer = self::findAuthor($mf2, $item, $http, $url);
    if($reviewer) {
      $data['author'] = $reviewer;
    }

    $data['post-type'] = \p3k\XRay\PostType::discover($data);

    if(count($refs)) {
      $data['refs'] = $refs;
    }

    return ['data' => $data];
  }
  /**
   * Converts an h-recipe object into a "recipe" response.
   *
   * @param array  $mf2  the whole parsed document, used for author discovery
   * @param array  $item the h-recipe object
   * @param mixed  $http HTTP client, passed to the helpers that may fetch or recurse
   * @param string $url  URL of the page being parsed
   * @return array ['data' => [...]], with 'refs' inside data when nested objects were found
   */
  private static function parseAsHRecipe($mf2, $item, $http, $url) {
    $refs = [];
    $data = ['type' => 'recipe'];

    self::collectSingleValues(
      ['name','summary','published','duration','yield','nutrition'],
      ['url'],
      $item, $url, $data
    );

    $steps = self::parseHTMLValue('instructions', $item);
    if($steps) {
      $data['instructions'] = $steps;
    }

    self::collectArrayValues(['category','ingredient'], $item, $data, $refs, $http);
    self::collectArrayURLValues(['photo'], $item, $data, $refs, $http);

    $cook = self::findAuthor($mf2, $item, $http, $url);
    if($cook) {
      $data['author'] = $cook;
    }

    $data['post-type'] = \p3k\XRay\PostType::discover($data);

    if(count($refs)) {
      $data['refs'] = $refs;
    }

    return ['data' => $data];
  }
  /**
   * Converts an h-product object into a "product" response.
   *
   * @param array  $mf2  the whole parsed document; not read here
   * @param array  $item the h-product object
   * @param mixed  $http HTTP client, passed to the helpers that may recurse
   * @param string $url  URL of the page being parsed
   * @return array ['data' => [...]], with 'refs' inside data when nested objects were found
   */
  private static function parseAsHProduct($mf2, $item, $http, $url) {
    $refs = [];
    $data = ['type' => 'product'];

    self::collectSingleValues(['name','identifier','price'], ['url'], $item, $url, $data);

    $about = self::parseHTMLValue('description', $item);
    if($about) {
      $data['description'] = $about;
    }

    self::collectArrayValues(['category','brand'], $item, $data, $refs, $http);
    self::collectArrayURLValues(['photo','video','audio'], $item, $data, $refs, $http);

    if(count($refs)) {
      $data['refs'] = $refs;
    }

    return ['data' => $data];
  }
  /**
   * Converts an h-item object into an "item" response.
   *
   * @param array  $mf2  the whole parsed document; not read here
   * @param array  $item the h-item object
   * @param mixed  $http HTTP client, passed to the helpers that may recurse
   * @param string $url  URL of the page being parsed
   * @return array ['data' => [...]], with 'refs' inside data when nested objects were found
   */
  private static function parseAsHItem($mf2, $item, $http, $url) {
    $refs = [];
    $data = ['type' => 'item'];

    self::collectSingleValues(['name'], ['url'], $item, $url, $data);
    self::collectArrayURLValues(['photo','video','audio'], $item, $data, $refs, $http);

    if(count($refs)) {
      $data['refs'] = $refs;
    }

    return ['data' => $data];
  }
  /**
   * Converts an h-app (or h-x-app) object into an "app" response.
   *
   * The "url" falls back to the page URL. "redirect-uri" merges the item's
   * property with the page's rel="redirect_uri" links, de-duplicated.
   *
   * @param array  $mf2  the whole parsed document; "rels" is read
   * @param array  $item the h-app object
   * @param mixed  $http HTTP client, passed to the helpers that may recurse
   * @param string $url  URL of the page being parsed
   * @return array ['data' => [...]]
   */
  private static function parseAsHApp($mf2, $item, $http, $url) {
    $data = [
      'type' => 'app'
    ];
    // Collected by collectArrayURLValues() but not part of the app response.
    // Initialised explicitly like in the sibling parsers, instead of relying on
    // the by-reference argument to create the variable.
    $refs = [];

    self::collectSingleValues(['name'], ['url','logo'], $item, $url, $data);
    self::collectArrayURLValues(['redirect-uri'], $item, $data, $refs, $http);

    if(!isset($data['url']))
      $data['url'] = $url;

    if(isset($mf2['rels']['redirect_uri'])) {
      if(!isset($data['redirect-uri'])) $data['redirect-uri'] = [];
      $data['redirect-uri'] = array_merge($data['redirect-uri'], $mf2['rels']['redirect_uri']);
    }

    if(isset($data['redirect-uri'])) {
      $data['redirect-uri'] = array_values(array_unique($data['redirect-uri']));
    }

    $response = [
      'data' => $data
    ];

    return $response;
  }
  /**
   * Converts an h-event object into an "event" response.
   *
   * @param array  $mf2  the whole parsed document; not read here
   * @param array  $item the h-event object
   * @param mixed  $http HTTP client, passed to the helpers that may recurse
   * @param string $url  URL of the page being parsed
   * @return array ['data' => [...]], with 'refs' inside data when nested objects were found
   */
  private static function parseAsHEvent($mf2, $item, $http, $url) {
    $refs = [];
    $data = ['type' => 'event'];

    // Single plaintext and URL values
    self::collectSingleValues(
      ['name','summary','published','start','end','duration'],
      ['url'],
      $item, $url, $data
    );

    // Always arrays, may contain plaintext
    self::collectArrayValues(['category','location','attendee'], $item, $data, $refs, $http);

    // Always arrays of URLs; nested h-* objects with a URL also produce a ref
    self::collectArrayURLValues(['photo','video','audio','syndication'], $item, $data, $refs, $http);

    // Prefer "content"; fall back to looking for "description"
    $body = self::parseHTMLValue('content', $item);
    if(!$body) {
      $body = self::parseHTMLValue('description', $item);
    }
    if($body) {
      $data['content'] = $body;
    }

    $data['post-type'] = \p3k\XRay\PostType::discover($data);

    if(count($refs)) {
      $data['refs'] = $refs;
    }

    return ['data' => $data];
  }
  /**
   * Converts an h-card object into a "card" response with name, url and photo.
   *
   * When $authorURL is given, the card URL that normalizes to the same value
   * is preferred; otherwise the first card URL is used. If the card yields no
   * URL at all, $authorURL itself is stored as the url.
   *
   * @param array        $item      the h-card object
   * @param mixed        $http      HTTP client; not used here
   * @param string       $url       URL of the page being parsed; not used here
   * @param string|false $authorURL preferred URL for the card, if any
   * @return array ['data' => ['type' => 'card', 'name' => ..., 'url' => ..., 'photo' => ...]]
   */
  private static function parseAsHCard($item, $http, $url, $authorURL=false) {
    $data = [
      'type' => 'card',
      'name' => null,
      'url' => null,
      'photo' => null
    ];

    $properties = ['url','name','photo'];

    foreach($properties as $p) {
      if($p == 'url' && $authorURL) {
        // The card may have no url property at all, so never index it blindly
        $candidates = [];
        if(isset($item['properties']['url']) && is_array($item['properties']['url']))
          $candidates = $item['properties']['url'];

        // If there is a matching author URL, use that one
        $found = false;
        $wanted = \p3k\XRay\normalize_url($authorURL);
        foreach($candidates as $candidate) {
          // Nested objects can appear as url values; only strings can be URLs
          if(is_string($candidate) && self::isURL($candidate)) {
            $normalized = \p3k\XRay\normalize_url($candidate);
            if($normalized == $wanted) {
              $data['url'] = $normalized;
              $found = true;
            }
          }
        }

        if(!$found && isset($candidates[0]) && is_string($candidates[0]) && self::isURL($candidates[0])) {
          $data['url'] = $candidates[0];
        }
      } else if(($v = self::getPlaintext($item, $p)) !== null) {
        // Make sure the URL property is actually a URL
        if($p == 'url' || $p == 'photo') {
          if(self::isURL($v))
            $data[$p] = $v;
        } else {
          $data[$p] = $v;
        }
      }
    }

    // If no URL property was found, use the $authorURL provided
    if(!$data['url'])
      $data['url'] = $authorURL;

    $response = [
      'data' => $data
    ];

    return $response;
  }
  /**
   * Finds the author of $item, following the IndieWeb authorship algorithm
   * (http://indiewebcamp.com/authorship) plus two feed-specific fallbacks.
   *
   * May fetch the author page through getURL() when the author is given as a
   * URL or a rel=author link.
   *
   * @param array  $mf2  the whole parsed document; "items" and "rels" are read
   * @param array  $item the object whose author is wanted
   * @param mixed  $http HTTP client used to fetch the author page
   * @param string $url  URL of the page being parsed, passed on to parseAsHCard()
   * @return array|null a "card" array, or null when no author was found
   */
  private static function findAuthor($mf2, $item, $http, $url) {
    $author = [
      'type' => 'card',
      'name' => null,
      'url' => null,
      'photo' => null
    ];

    $authorPage = false;
    if(array_key_exists('author', $item['properties'])) {
      // Check if any of the values of the author property are an h-card
      foreach($item['properties']['author'] as $a) {
        if(self::isHCard($a)) {
          // 5.1 "if it has an h-card, use it, exit."
          return self::parseAsHCard($a, $http, $url)['data'];
        } elseif(is_string($a)) {
          if(self::isURL($a)) {
            // 5.2 "otherwise if author property is an http(s) URL, let the author-page have that URL"
            $authorPage = $a;
          } else {
            // 5.3 "otherwise use the author property as the author name, exit"
            // We can only set the name, no h-card or URL was found
            $author['name'] = self::getPlaintext($item, 'author');
            return $author;
          }
        } else {
          // This case is only hit when the author property is an mf2 object that is not an h-card
          $author['name'] = self::getPlaintext($item, 'author');
          return $author;
        }
      }
    }

    // 6. "if no author page was found" ... check for rel-author link
    if(!$authorPage) {
      if(isset($mf2['rels']) && isset($mf2['rels']['author']))
        $authorPage = $mf2['rels']['author'][0];
    }

    // 7. "if there is an author-page URL" ...
    if($authorPage) {
      $normalizedAuthorPage = \p3k\XRay\normalize_url($authorPage);

      // 7.1 "get the author-page from that URL and parse it for microformats2"
      $authorPageContents = self::getURL($authorPage, $http);

      if($authorPageContents) {
        // The rel-me links are the same for every card on the author page
        $relMeLinks = [];
        if(isset($authorPageContents['rels']) && isset($authorPageContents['rels']['me']))
          $relMeLinks = $authorPageContents['rels']['me'];

        foreach($authorPageContents['items'] as $i) {
          if(!self::isHCard($i))
            continue;

          // 7.2 "if author-page has 1+ h-card with url == uid == author-page's URL, then use first such h-card, exit."
          if(array_key_exists('url', $i['properties'])
            and in_array($normalizedAuthorPage, \p3k\XRay\normalize_urls($i['properties']['url']))
            and array_key_exists('uid', $i['properties'])
            and in_array($normalizedAuthorPage, \p3k\XRay\normalize_urls($i['properties']['uid']))
          ) {
            return self::parseAsHCard($i, $http, $url, $authorPage)['data'];
          }

          // 7.3 "else if author-page has 1+ h-card with url property which matches the href of a rel-me link on the author-page"
          if(count($relMeLinks) > 0
            and array_key_exists('url', $i['properties'])
            and count(array_intersect(\p3k\XRay\normalize_urls($i['properties']['url']), \p3k\XRay\normalize_urls($relMeLinks))) > 0
          ) {
            return self::parseAsHCard($i, $http, $url, $authorPage)['data'];
          }
        }
      }

      // 7.4 "if the h-entry's page has 1+ h-card with url == author-page URL, use first such h-card, exit."
      foreach($mf2['items'] as $i) {
        if(self::isHCard($i)
          and array_key_exists('url', $i['properties'])
          and in_array($normalizedAuthorPage, \p3k\XRay\normalize_urls($i['properties']['url']))
        ) {
          return self::parseAsHCard($i, $http, $url)['data'];
        }

        // Also check the "author" property
        // (for finding the author of an h-feed's children when the author is the p-author property of the h-feed)
        if(isset($i['properties']['author'])) {
          foreach($i['properties']['author'] as $ic) {
            if(self::isHCard($ic)
              and array_key_exists('url', $ic['properties'])
              and in_array($normalizedAuthorPage, \p3k\XRay\normalize_urls($ic['properties']['url']))
            ) {
              return self::parseAsHCard($ic, $http, $url)['data'];
            }
          }
        }
      }
    }

    // The below is not yet in the authorship algorithm.

    // If the top object is an h-feed, check for an author property there
    if(isset($mf2['items'][0]['type'][0]) && in_array('h-feed', $mf2['items'][0]['type'])) {
      if(isset($mf2['items'][0]['properties']['author'][0])) {
        $potentialAuthor = $mf2['items'][0]['properties']['author'][0];
        // The feed's author may be a plain string (a name or URL); isHCard()
        // checks for an array before looking at its "type"
        if(self::isHCard($potentialAuthor)) {
          return self::parseAsHCard($potentialAuthor, $http, $url)['data'];
        }
      }
    }

    // If still no author is found, and this page is a feed (list of h-*),
    // then use the first h-card in the list of items.
    $items = array_filter($mf2['items'], function($item){
      return !in_array('h-card', $item['type']);
    });
    if(count($items) > 1) {
      $card = self::_findFirstOfType($mf2, 'h-card');
      if($card) {
        return self::parseAsHCard($card, $http, $url)['data'];
      }
    }

    // Every path that fills in $author returns immediately, so reaching this
    // point means nothing was found.
    return null;
  }
  /**
   * Reports whether any key of the array is numeric.
   *
   * @param array $arr array to inspect
   * @return bool true as soon as one key satisfies is_numeric(), false otherwise
   */
  private static function hasNumericKeys(array $arr) {
    foreach(array_keys($arr) as $key) {
      if(is_numeric($key)) {
        return true;
      }
    }
    return false;
  }
  /**
   * Reports whether a value looks like an mf2 object: an array without numeric
   * keys that has a non-empty "type" and a "properties" entry.
   *
   * @param mixed $mf value to inspect
   * @return bool
   */
  private static function isMicroformat($mf) {
    if(!is_array($mf)) {
      return false;
    }
    if(self::hasNumericKeys($mf)) {
      return false;
    }
    return !empty($mf['type']) && isset($mf['properties']);
  }
  /**
   * Reports whether a value is an array whose "type" list contains "h-card".
   *
   * @param mixed $mf value to inspect
   * @return bool
   */
  private static function isHCard($mf) {
    if(!is_array($mf) || empty($mf['type']) || !is_array($mf['type'])) {
      return false;
    }
    return in_array('h-card', $mf['type']);
  }
  /**
   * Reports whether a value is an http(s) URL whose remainder contains a dot.
   *
   * Non-string input (null, nested mf2 arrays, ...) is rejected up front so it
   * never reaches preg_match().
   *
   * @param mixed $string value to test
   * @return int|false 1 when it matches, 0 when it does not (or is not a
   *                   string), false if preg_match() itself fails
   */
  private static function isURL($string) {
    if(!is_string($string))
      return 0;
    return preg_match('/^https?:\/\/.+\..+$/', $string);
  }
  734. // Given an array of microformats properties and a key name, return the plaintext value
  735. // at that property
  736. // e.g.
  737. // {"properties":{"published":["foo"]}} results in "foo"
  /**
   * Returns the plaintext of the first value of property $k.
   *
   * A string value is returned directly; a nested mf2 object yields its
   * "value" entry. e.g. {"properties":{"published":["foo"]}} results in "foo"
   *
   * @param array  $mf2      mf2 object to read from
   * @param string $k        property name
   * @param mixed  $fallback returned when the property is missing or unusable
   * @return mixed the plaintext value or $fallback
   */
  private static function getPlaintext($mf2, $k, $fallback=null) {
    if(empty($mf2['properties'][$k]) || !is_array($mf2['properties'][$k])) {
      return $fallback;
    }

    // $mf2['properties'][$k] will always be an array since the input was from the mf2 parser
    $first = $mf2['properties'][$k][0];

    if(is_string($first)) {
      return $first;
    }
    if(self::isMicroformat($first) && array_key_exists('value', $first)) {
      return $first['value'];
    }

    return $fallback;
  }
  /**
   * Returns the first value of property $k when it is a string or an array
   * carrying an "html" entry.
   *
   * @param array  $mf2      mf2 object to read from
   * @param string $k        property name
   * @param mixed  $fallback returned when the property is missing or unusable
   * @return mixed the string, the array containing "html", or $fallback
   */
  private static function getHTMLValue($mf2, $k, $fallback=null) {
    if(empty($mf2['properties'][$k]) || !is_array($mf2['properties'][$k])) {
      return $fallback;
    }

    // $mf2['properties'][$k] will always be an array since the input was from the mf2 parser
    $first = $mf2['properties'][$k][0];

    $usable = is_string($first) || isset($first['html']);
    return $usable ? $first : $fallback;
  }
  /**
   * Returns the plaintext of every value of property $k, appended to $values.
   *
   * String values are taken directly; nested mf2 objects contribute their
   * "value" entry; anything else is skipped.
   *
   * @param array  $mf2    mf2 object to read from
   * @param string $k      property name
   * @param array  $values initial list to append to
   * @return array the extended list
   */
  private static function getPlaintextValues($mf2, $k, $values=[]) {
    if(empty($mf2['properties'][$k]) || !is_array($mf2['properties'][$k])) {
      return $values;
    }

    foreach($mf2['properties'][$k] as $entry) {
      if(is_string($entry)) {
        $values[] = $entry;
        continue;
      }
      if(self::isMicroformat($entry) && array_key_exists('value', $entry)) {
        $values[] = $entry['value'];
      }
    }

    return $values;
  }
  /**
   * Fetches a URL with the given client and parses the body for microformats2.
   *
   * @param string $url  URL to fetch
   * @param mixed  $http client exposing get($url); its result is read as an
   *                     array with "error" and "body" entries
   * @return mixed the result of \mf2\Parse(), or null when the URL or client is
   *               missing, the request reported an error, or the body is empty
   */
  private static function getURL($url, $http) {
    $canFetch = $url && $http;
    if(!$canFetch) {
      return null;
    }

    // TODO: consider adding caching here
    $response = $http->get($url);

    $usable = !$response['error'] && $response['body'];
    return $usable ? \mf2\Parse($response['body'], $url) : null;
  }
  784. }