You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

414 lines
12 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. <?php
  2. namespace XRay\Formats;
  3. use HTMLPurifier, HTMLPurifier_Config;
  4. class Mf2 {
  5. public static function parse($mf2, $url, $http) {
  6. if($item = $mf2['items'][0]) {
  7. // If the first item is a feed, the page is a feed
  8. if(in_array('h-feed', $item['type'])) {
  9. return self::parseHFeed($mf2, $http);
  10. }
  11. // Check each top-level h-card, and if there is one that matches this URL, the page is an h-card
  12. foreach($mf2['items'] as $i) {
  13. if(in_array('h-card', $i['type'])
  14. and array_key_exists('url', $i['properties'])
  15. ) {
  16. $urls = $i['properties']['url'];
  17. $urls = array_map('\normalize_url', $urls);
  18. if(in_array($url, $urls)) {
  19. // TODO: check for children h-entrys (like tantek.com), or sibling h-entries (like aaronparecki.com)
  20. // and return the result as a feed instead
  21. return self::parseHCard($i, $http, $url);
  22. }
  23. }
  24. }
  25. // Otherwise check for an h-entry
  26. if(in_array('h-entry', $item['type']) || in_array('h-cite', $item['type'])) {
  27. return self::parseHEntry($mf2, $http);
  28. }
  29. }
  30. return false;
  31. }
  32. private static function parseHEntry($mf2, $http) {
  33. $data = [
  34. 'type' => 'entry',
  35. 'author' => [
  36. 'type' => 'card',
  37. 'name' => null,
  38. 'url' => null,
  39. 'photo' => null
  40. ]
  41. ];
  42. $refs = [];
  43. $item = $mf2['items'][0];
  44. // Single plaintext values
  45. $properties = ['url','published','summary','rsvp'];
  46. foreach($properties as $p) {
  47. if($v = self::getPlaintext($item, $p))
  48. $data[$p] = $v;
  49. }
  50. // Always arrays
  51. $properties = ['photo','video','syndication'];
  52. foreach($properties as $p) {
  53. if(array_key_exists($p, $item['properties'])) {
  54. $data[$p] = [];
  55. foreach($item['properties'][$p] as $v) {
  56. if(is_string($v))
  57. $data[$p][] = $v;
  58. elseif(is_array($v) and array_key_exists('value', $v))
  59. $data[$p][] = $v['value'];
  60. }
  61. }
  62. }
  63. // Always returned as arrays, and may also create external references
  64. $properties = ['in-reply-to','like-of','repost-of','category'];
  65. foreach($properties as $p) {
  66. if(array_key_exists($p, $item['properties'])) {
  67. $data[$p] = [];
  68. foreach($item['properties'][$p] as $v) {
  69. if(is_string($v))
  70. $data[$p][] = $v;
  71. elseif(self::isMicroformat($v) && ($u=self::getPlaintext($v, 'url'))) {
  72. $data[$p][] = $u;
  73. // parse the object and put the result in the "refs" object
  74. $ref = self::parse(['items'=>[$v]], $u, $http);
  75. if($ref) {
  76. $refs[$u] = $ref['data'];
  77. }
  78. }
  79. }
  80. }
  81. }
  82. // Determine if the name is distinct from the content
  83. $name = self::getPlaintext($item, 'name');
  84. $content = null;
  85. $textContent = null;
  86. $htmlContent = null;
  87. if(array_key_exists('content', $item['properties'])) {
  88. $content = $item['properties']['content'][0];
  89. if(is_string($content)) {
  90. $textContent = $content;
  91. } elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) {
  92. if(array_key_exists('html', $content)) {
  93. $htmlContent = trim(self::sanitizeHTML($content['html']));
  94. $textContent = trim(str_replace("&#xD;","\r",strip_tags($htmlContent)));
  95. } else {
  96. $textContent = trim($content['value']);
  97. }
  98. }
  99. // Trim ellipses from the name
  100. $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name);
  101. // Remove all whitespace when checking equality
  102. $nameCompare = preg_replace('/\s/','',trim($name));
  103. $contentCompare = preg_replace('/\s/','',trim($textContent));
  104. // Check if the name is a prefix of the content
  105. if(strpos($contentCompare, $nameCompare) === 0) {
  106. $name = null;
  107. }
  108. }
  109. if($name) {
  110. $data['name'] = $name;
  111. }
  112. if($content) {
  113. $data['content'] = [
  114. 'text' => $textContent
  115. ];
  116. if($textContent != $htmlContent) {
  117. $data['content']['html'] = $htmlContent;
  118. }
  119. }
  120. $data['author'] = self::findAuthor($mf2, $item, $http);
  121. $response = [
  122. 'data' => $data
  123. ];
  124. if(count($refs)) {
  125. $response['refs'] = $refs;
  126. }
  127. return $response;
  128. }
  129. private static function parseHFeed($mf2, $http) {
  130. $data = [
  131. 'type' => 'feed',
  132. 'author' => [
  133. 'type' => 'card',
  134. 'name' => null,
  135. 'url' => null,
  136. 'photo' => null
  137. ],
  138. 'items' => [],
  139. 'todo' => 'Not yet implemented. Please see https://github.com/aaronpk/XRay/issues/1'
  140. ];
  141. return [
  142. 'data' => $data
  143. ];
  144. }
  145. private static function parseHCard($item, $http, $authorURL=false) {
  146. $data = [
  147. 'type' => 'card',
  148. 'name' => null,
  149. 'url' => null,
  150. 'photo' => null
  151. ];
  152. $properties = ['url','name','photo'];
  153. foreach($properties as $p) {
  154. if($p == 'url' && $authorURL) {
  155. // If there is a matching author URL, use that one
  156. $found = false;
  157. foreach($item['properties']['url'] as $url) {
  158. $url = \normalize_url($url);
  159. if($url == $authorURL) {
  160. $data['url'] = $url;
  161. $found = true;
  162. }
  163. }
  164. if(!$found) $data['url'] = $item['properties']['url'][0];
  165. } else if($v = self::getPlaintext($item, $p)) {
  166. $data[$p] = $v;
  167. }
  168. }
  169. $response = [
  170. 'data' => $data
  171. ];
  172. return $response;
  173. }
  174. private static function findAuthor($mf2, $item, $http) {
  175. $author = [
  176. 'type' => 'card',
  177. 'name' => null,
  178. 'url' => null,
  179. 'photo' => null
  180. ];
  181. // Author Discovery
  182. // http://indiewebcamp.com/authorship
  183. $authorPage = false;
  184. if(array_key_exists('author', $item['properties'])) {
  185. // Check if any of the values of the author property are an h-card
  186. foreach($item['properties']['author'] as $a) {
  187. if(self::isHCard($a)) {
  188. // 5.1 "if it has an h-card, use it, exit."
  189. return self::parseHCard($a, $http)['data'];
  190. } elseif(is_string($a)) {
  191. if(self::isURL($a)) {
  192. // 5.2 "otherwise if author property is an http(s) URL, let the author-page have that URL"
  193. $authorPage = $a;
  194. } else {
  195. // 5.3 "otherwise use the author property as the author name, exit"
  196. // We can only set the name, no h-card or URL was found
  197. $author['name'] = self::getPlaintext($item, 'author');
  198. return $author;
  199. }
  200. } else {
  201. // This case is only hit when the author property is an mf2 object that is not an h-card
  202. $author['name'] = self::getPlaintext($item, 'author');
  203. return $author;
  204. }
  205. }
  206. }
  207. // 6. "if no author page was found" ... check for rel-author link
  208. if(!$authorPage) {
  209. if(isset($mf2['rels']) && isset($mf2['rels']['author']))
  210. $authorPage = $mf2['rels']['author'][0];
  211. }
  212. // 7. "if there is an author-page URL" ...
  213. if($authorPage) {
  214. // 7.1 "get the author-page from that URL and parse it for microformats2"
  215. $authorPageContents = self::getURL($authorPage, $http);
  216. if($authorPageContents) {
  217. foreach($authorPageContents['items'] as $i) {
  218. if(self::isHCard($i)) {
  219. // 7.2 "if author-page has 1+ h-card with url == uid == author-page's URL, then use first such h-card, exit."
  220. if(array_key_exists('url', $i['properties'])
  221. and in_array($authorPage, $i['properties']['url'])
  222. and array_key_exists('uid', $i['properties'])
  223. and in_array($authorPage, $i['properties']['uid'])
  224. ) {
  225. return self::parseHCard($i, $http, $authorPage)['data'];
  226. }
  227. // 7.3 "else if author-page has 1+ h-card with url property which matches the href of a rel-me link on the author-page"
  228. $relMeLinks = (isset($authorPageContents['rels']) && isset($authorPageContents['rels']['me'])) ? $authorPageContents['rels']['me'] : [];
  229. if(count($relMeLinks) > 0
  230. and array_key_exists('url', $i['properties'])
  231. and count(array_intersect($i['properties']['url'], $relMeLinks)) > 0
  232. ) {
  233. return self::parseHCard($i, $http, $authorPage)['data'];
  234. }
  235. }
  236. }
  237. }
  238. // 7.4 "if the h-entry's page has 1+ h-card with url == author-page URL, use first such h-card, exit."
  239. foreach($mf2['items'] as $i) {
  240. if(self::isHCard($i)) {
  241. if(array_key_exists('url', $i['properties'])
  242. and in_array($authorPage, $i['properties']['url'])
  243. ) {
  244. return self::parseHCard($i, $http)['data'];
  245. }
  246. }
  247. }
  248. }
  249. return $author;
  250. }
  251. private static function sanitizeHTML($html) {
  252. $config = HTMLPurifier_Config::createDefault();
  253. $config->set('Cache.DefinitionImpl', null);
  254. $config->set('HTML.AllowedElements', [
  255. 'a',
  256. 'abbr',
  257. 'b',
  258. 'code',
  259. 'del',
  260. 'em',
  261. 'i',
  262. 'img',
  263. 'q',
  264. 'strike',
  265. 'strong',
  266. 'time',
  267. 'blockquote',
  268. 'pre',
  269. 'h1',
  270. 'h2',
  271. 'h3',
  272. 'h4',
  273. 'h5',
  274. 'h6',
  275. ]);
  276. $def = $config->getHTMLDefinition(true);
  277. $def->addElement(
  278. 'time',
  279. 'Inline',
  280. 'Inline',
  281. 'Common',
  282. [
  283. 'datetime' => 'Text'
  284. ]
  285. );
  286. $purifier = new HTMLPurifier($config);
  287. return $purifier->purify($html);
  288. }
  289. private static function responseDisplayText($name, $summary, $content) {
  290. // Build a fake h-entry to pass to the comments parser
  291. $input = [
  292. 'type' => ['h-entry'],
  293. 'properties' => [
  294. 'name' => [trim($name)],
  295. 'summary' => [trim($summary)],
  296. 'content' => [trim($content)]
  297. ]
  298. ];
  299. if(!trim($name))
  300. unset($input['properties']['name']);
  301. if(!trim($summary))
  302. unset($input['properties']['summary']);
  303. $result = \IndieWeb\comments\parse($input, false, 1024, 4);
  304. return [
  305. 'name' => trim($result['name']),
  306. 'content' => $result['text']
  307. ];
  308. }
  309. private static function hasNumericKeys(array $arr) {
  310. foreach($arr as $key=>$val)
  311. if (is_numeric($key))
  312. return true;
  313. return false;
  314. }
  315. private static function isMicroformat($mf) {
  316. return is_array($mf)
  317. and !self::hasNumericKeys($mf)
  318. and !empty($mf['type'])
  319. and isset($mf['properties']);
  320. }
  321. private static function isHCard($mf) {
  322. return is_array($mf)
  323. and !empty($mf['type'])
  324. and is_array($mf['type'])
  325. and in_array('h-card', $mf['type']);
  326. }
  327. private static function isURL($string) {
  328. return preg_match('/^https?:\/\/.+\..+$/', $string);
  329. }
  330. // Given an array of microformats properties and a key name, return the plaintext value
  331. // at that property
  332. // e.g.
  333. // {"properties":{"published":["foo"]}} results in "foo"
  334. private static function getPlaintext($mf2, $k, $fallback=null) {
  335. if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) {
  336. // $mf2['properties'][$v] will always be an array since the input was from the mf2 parser
  337. $value = $mf2['properties'][$k][0];
  338. if(is_string($value)) {
  339. return $value;
  340. } elseif(self::isMicroformat($value) && array_key_exists('value', $value)) {
  341. return $value['value'];
  342. }
  343. }
  344. return $fallback;
  345. }
  346. private static function getURL($url, $http) {
  347. if(!$url) return null;
  348. // TODO: consider adding caching here
  349. $result = $http->get($url);
  350. if($result['error'] || !$result['body']) {
  351. return null;
  352. }
  353. return \mf2\Parse($result['body'], $url);
  354. }
  355. }