You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

244 lines
8.0 KiB

  1. <?php
  2. namespace p3k\XRay;
  3. use p3k\XRay\Formats;
  4. use DOMDocument, DOMXPath;
  5. class Parser {
  6. private $http;
  7. public function __construct($http) {
  8. $this->http = $http;
  9. }
  10. public function parse($http_response, $opts=[]) {
  11. $allowIframeVideo = isset($opts['allowIframeVideo']) ? $opts['allowIframeVideo'] : false;
  12. allow_iframe_video($allowIframeVideo);
  13. $document = $this->parse_document($http_response, $opts);
  14. // If a target parameter was provided, make sure a link to it exists in the parsed document
  15. if(!isset($document['error']) && !empty($opts['target'])) {
  16. if(isset($document['data']['type']) && $document['data']['type'] == 'unknown') {
  17. if(isset($document['html'])) {
  18. // Couldn't parse the page, check for the link manually assuming HTML content
  19. $found = $this->_findLinkInHTML($opts['target'], $document['html']);
  20. } else {
  21. // Ignore this check for any non-HTML documents since this will be uncommon anyway
  22. $found = false;
  23. }
  24. $error_description = 'The source document does not have a link to the target URL';
  25. } else {
  26. $found = $this->_findLinkInTree($opts['target'], $document['data']);
  27. $error_description = 'The Microformats at the source URL do not contain a link to the target URL. Check the source URL in a Microformats parser such as php.microformats.io';
  28. if(!$found && isset($document['html'])) {
  29. // If no link was found in the parsed mf2 tree, check for a link in the HTML
  30. $found = $this->_findLinkInHTML($opts['target'], $document['html']);
  31. // If there is a link, and if the HTML document has no mf2, then downgrade to a regular mention
  32. if($found) {
  33. $mf2Data = Formats\HTML::parse($this->http, $http_response, ['include-mf1'=>false]);
  34. if(isset($mf2Data['data']['type']) && $mf2Data['data']['type'] == 'unknown') {
  35. // Since the link was found in the HTML, but not in the parsed tree, it shouldn't return the parsed document
  36. $document['data'] = [
  37. 'type' => 'unknown'
  38. ];
  39. } else {
  40. // Otherwise, the document did have mf2, but the link wasn't in it (checked earlier), so set found=false
  41. $found = false;
  42. }
  43. }
  44. }
  45. }
  46. if(!$found) {
  47. return [
  48. 'error' => 'no_link_found',
  49. 'error_description' => $error_description,
  50. 'code' => isset($document['code']) ? $document['code'] : 200,
  51. 'url' => $document['url'],
  52. 'debug' => $document['data']
  53. ];
  54. }
  55. }
  56. // If the HTML parser couldn't parse the page it returns the full HTML for checking the target above,
  57. // but we don't want to return that in the output so remove it here
  58. unset($document['html']);
  59. return $document;
  60. }
  61. public function parse_document($http_response, $opts=[]) {
  62. if(isset($opts['timeout']))
  63. $this->http->set_timeout($opts['timeout']);
  64. if(isset($opts['max_redirects']))
  65. $this->http->set_max_redirects($opts['max_redirects']);
  66. // Check if the URL matches a special parser
  67. $url = $http_response['url'];
  68. if(Formats\Instagram::matches($url)) {
  69. return Formats\Instagram::parse($this->http, $http_response, $opts);
  70. }
  71. if(Formats\GitHub::matches($url)) {
  72. return Formats\GitHub::parse($http_response);
  73. }
  74. if(Formats\Twitter::matches($url)) {
  75. return Formats\Twitter::parse($http_response);
  76. }
  77. if(Formats\Facebook::matches($url)) {
  78. return Formats\Facebook::parse($http_response);
  79. }
  80. if(Formats\XKCD::matches($url)) {
  81. return Formats\XKCD::parse($http_response);
  82. }
  83. if(Formats\Hackernews::matches($url)) {
  84. return Formats\Hackernews::parse($http_response);
  85. }
  86. $body = $http_response['body'];
  87. // Check if an mf2 JSON object was passed in
  88. if(is_array($body) && isset($body['items'])) {
  89. $data = Formats\Mf2::parse($http_response, $this->http, $opts);
  90. if($data == false) {
  91. $data = [
  92. 'data' => [
  93. 'type' => 'unknown',
  94. ]
  95. ];
  96. }
  97. $data['source-format'] = 'mf2+json';
  98. return $data;
  99. }
  100. // Check if an ActivityStreams JSON object was passed in
  101. if(Formats\ActivityStreams::is_as2_json($body)) {
  102. $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
  103. $data['source-format'] = 'activity+json';
  104. return $data;
  105. }
  106. if(is_string($body) && substr($body, 0, 5) == '<?xml') {
  107. return Formats\XML::parse($http_response);
  108. }
  109. if(is_string($body)) {
  110. // Some feeds don't start with <?xml
  111. $begin = trim(substr($body, 0, 40));
  112. if(substr($begin, 0, 4) == '<rss') {
  113. return Formats\XML::parse($http_response);
  114. }
  115. }
  116. if(is_string($body) && substr($body, 0, 1) == '{') {
  117. $parsed = json_decode($body, true);
  118. if($parsed && isset($parsed['version']) && $parsed['version'] == 'https://jsonfeed.org/version/1') {
  119. $http_response['body'] = $parsed;
  120. return Formats\JSONFeed::parse($http_response);
  121. } elseif($parsed && isset($parsed['items'][0]['type']) && isset($parsed['items'][0]['properties'])) {
  122. // Check if an mf2 JSON string was passed in
  123. $http_response['body'] = $parsed;
  124. $data = Formats\Mf2::parse($http_response, $this->http, $opts);
  125. $data['source-format'] = 'mf2+json';
  126. return $data;
  127. } elseif($parsed && Formats\ActivityStreams::is_as2_json($parsed)) {
  128. // Check if an ActivityStreams JSON string was passed in
  129. $http_response['body'] = $parsed;
  130. $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
  131. $data['source-format'] = 'activity+json';
  132. return $data;
  133. }
  134. }
  135. // No special parsers matched, parse for Microformats now
  136. $data = Formats\HTML::parse($this->http, $http_response, $opts);
  137. if(!isset($data['source-format']) && isset($data['type']) && $data['type'] != 'unknown')
  138. $data['source-format'] = 'mf2+html';
  139. return $data;
  140. }
  141. private function _findLinkInTree($link, $document) {
  142. if(!$document)
  143. return false;
  144. if(is_string($document) || is_numeric($document)) {
  145. return $document == $link;
  146. }
  147. if(is_array($document)) {
  148. foreach($document as $key=>$value) {
  149. if($key === 'html') {
  150. $found = $this->_findLinkInHTML($link, $value);
  151. if($found) {
  152. return true;
  153. }
  154. } else {
  155. $found = $this->_findLinkInTree($link, $value);
  156. if($found) {
  157. return true;
  158. }
  159. }
  160. }
  161. return false;
  162. }
  163. throw new Exception('Unexpected value in tree');
  164. }
  165. private function _findLinkInHTML($link, $html) {
  166. $doc = new DOMDocument();
  167. @$doc->loadHTML(self::_toHtmlEntities($html));
  168. if(!$doc)
  169. return false;
  170. $xpath = new DOMXPath($doc);
  171. return self::_findLinksInDOMDocument($xpath, $link);
  172. }
  173. private static function _findLinksInDOMDocument(&$xpath, $target) {
  174. $found = [];
  175. self::_xPathFindNodeWithAttribute($xpath, 'a', 'href', function($u) use($target, &$found){
  176. if($u == $target) {
  177. $found[$u] = null;
  178. }
  179. });
  180. self::_xPathFindNodeWithAttribute($xpath, 'img', 'src', function($u) use($target, &$found){
  181. if($u == $target) {
  182. $found[$u] = null;
  183. }
  184. });
  185. self::_xPathFindNodeWithAttribute($xpath, 'video', 'src', function($u) use($target, &$found){
  186. if($u == $target) {
  187. $found[$u] = null;
  188. }
  189. });
  190. self::_xPathFindNodeWithAttribute($xpath, 'audio', 'src', function($u) use($target, &$found){
  191. if($u == $target) {
  192. $found[$u] = null;
  193. }
  194. });
  195. return $found;
  196. }
  197. private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
  198. foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) {
  199. $v = $el->getAttribute($attr);
  200. $callback($v);
  201. }
  202. }
  203. private static function _toHtmlEntities($input) {
  204. return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
  205. }
  206. }