You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

223 lines
7.0 KiB

  1. <?php
  2. namespace p3k\XRay;
  3. use p3k\XRay\Formats;
  4. use DOMDocument, DOMXPath;
  /**
   * Turns an already-fetched HTTP response into a parsed document array by
   * dispatching to the format-specific parsers under Formats\, and optionally
   * verifies that the parsed document links to a given target URL.
   */
  class Parser {
    /** HTTP client handed to the format parsers and configured from $opts. */
    private $http;

    /**
     * @param mixed $http HTTP client. This class calls set_timeout() and
     *                    set_max_redirects() on it and passes it through to
     *                    the format parsers that need to make requests.
     */
    public function __construct($http) {
      $this->http = $http;
    }

    /**
     * Parses the response and, when $opts['target'] is set, checks that the
     * result contains a link to that target.
     *
     * @param array $http_response Response array; 'url' and 'body' are read here.
     * @param array $opts          Options: 'target', 'timeout', 'max_redirects'
     *                             (the full array is also forwarded to the format parsers).
     * @return array The parsed document (without its 'html' key), or an error
     *               array with 'error' => 'no_link_found' when the target link
     *               is missing.
     */
    public function parse($http_response, $opts=[]) {
      $document = $this->parse_document($http_response, $opts);

      // If a target parameter was provided, make sure a link to it exists in the parsed document
      if(!isset($document['error']) && !empty($opts['target'])) {
        $data = isset($document['data']) ? $document['data'] : null;

        if(isset($data['type']) && $data['type'] === 'unknown') {
          // Couldn't parse the page: check for the link manually assuming HTML content.
          // Non-HTML documents are not checked since this will be uncommon anyway.
          $found = isset($document['html'])
            ? $this->_findLinkInHTML($opts['target'], $document['html'])
            : false;
          $error_description = 'The source document does not have a link to the target URL';
        } else {
          $found = $this->_findLinkInTree($opts['target'], $data);
          $error_description = 'The Microformats at the source URL do not contain a link to the target URL. Check the source URL in a Microformats parser such as php.microformats.io';
        }

        if(!$found) {
          return [
            'error' => 'no_link_found',
            'error_description' => $error_description,
            'code' => isset($document['code']) ? $document['code'] : 200,
            'url' => isset($document['url']) ? $document['url'] : null,
            'debug' => $data
          ];
        }
      }

      // If the HTML parser couldn't parse the page it returns the full HTML for checking the target above,
      // but we don't want to return that in the output so remove it here
      unset($document['html']);

      return $document;
    }

    /**
     * Picks the parser matching the response and returns its result.
     *
     * Order of checks: URL-specific parsers, mf2 JSON passed as an array,
     * ActivityStreams JSON, XML/RSS, JSON passed as a string (JSON Feed, mf2,
     * ActivityStreams), and finally the HTML/Microformats parser.
     *
     * @param array $http_response Response array; 'url' and 'body' are read here.
     * @param array $opts          Options; 'timeout' and 'max_redirects' are applied
     *                             to the HTTP client before parsing.
     * @return mixed Whatever the selected format parser returns.
     */
    public function parse_document($http_response, $opts=[]) {
      if(isset($opts['timeout']))
        $this->http->set_timeout($opts['timeout']);
      if(isset($opts['max_redirects']))
        $this->http->set_max_redirects($opts['max_redirects']);

      // Check if the URL matches a special parser
      $url = $http_response['url'];

      if(Formats\Instagram::matches($url)) {
        return Formats\Instagram::parse($this->http, $http_response, $opts);
      }
      if(Formats\GitHub::matches($url)) {
        return Formats\GitHub::parse($http_response);
      }
      if(Formats\Twitter::matches($url)) {
        return Formats\Twitter::parse($http_response);
      }
      if(Formats\Facebook::matches($url)) {
        return Formats\Facebook::parse($http_response);
      }
      if(Formats\XKCD::matches($url)) {
        return Formats\XKCD::parse($http_response);
      }
      if(Formats\Hackernews::matches($url)) {
        return Formats\Hackernews::parse($http_response);
      }

      $body = $http_response['body'];

      // Check if an mf2 JSON object was passed in
      if(is_array($body) && isset($body['items'])) {
        return $this->_parseMf2Json($http_response, $opts);
      }

      // Check if an ActivityStreams JSON object was passed in
      if(Formats\ActivityStreams::is_as2_json($body)) {
        $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
        $data['source-format'] = 'activity+json';
        return $data;
      }

      if(is_string($body)) {
        if(substr($body, 0, 5) === '<?xml') {
          return Formats\XML::parse($http_response);
        }

        // Some feeds don't start with an XML declaration, so also look for
        // an <rss element within the first 40 bytes
        $begin = trim(substr($body, 0, 40));
        if(substr($begin, 0, 4) === '<rss') {
          return Formats\XML::parse($http_response);
        }

        if(substr($body, 0, 1) === '{') {
          $parsed = json_decode($body, true);

          if($parsed) {
            if(isset($parsed['version']) && $parsed['version'] == 'https://jsonfeed.org/version/1') {
              $http_response['body'] = $parsed;
              return Formats\JSONFeed::parse($http_response);
            }

            if(isset($parsed['items'][0]['type']) && isset($parsed['items'][0]['properties'])) {
              // An mf2 JSON string was passed in
              $http_response['body'] = $parsed;
              return $this->_parseMf2Json($http_response, $opts);
            }

            if(Formats\ActivityStreams::is_as2_json($parsed)) {
              // An ActivityStreams JSON string was passed in
              $http_response['body'] = $parsed;
              $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
              $data['source-format'] = 'activity+json';
              return $data;
            }
          }
        }
      }

      // No special parsers matched, parse for Microformats now
      $data = Formats\HTML::parse($this->http, $http_response, $opts);

      // NOTE(review): elsewhere in this class the type lives at $data['data']['type'];
      // this top-level 'type' check may never match. Confirm against Formats\HTML::parse.
      if(!isset($data['source-format']) && isset($data['type']) && $data['type'] != 'unknown')
        $data['source-format'] = 'mf2+html';

      return $data;
    }

    /**
     * Runs the mf2 JSON parser and tags the result with its source format.
     * A falsy parser result is replaced by an "unknown" document so both the
     * array and the JSON-string entry points behave the same way.
     *
     * @param array $http_response Response whose 'body' is decoded mf2 JSON.
     * @param array $opts          Options forwarded to the parser.
     * @return array
     */
    private function _parseMf2Json($http_response, $opts) {
      $data = Formats\Mf2::parse($http_response, $this->http, $opts);

      if(!$data) {
        $data = [
          'data' => [
            'type' => 'unknown',
          ]
        ];
      }

      $data['source-format'] = 'mf2+json';
      return $data;
    }

    /**
     * Recursively searches a parsed document for a value equal to $link.
     * Values stored under an 'html' key are searched as HTML markup instead
     * of being compared as plain strings.
     *
     * @param string $link     URL to look for.
     * @param mixed  $document Scalar leaf or (nested) array.
     * @return bool
     * @throws \Exception When the tree contains a value that is neither scalar nor array.
     */
    private function _findLinkInTree($link, $document) {
      // Covers null, false, 0, '', '0' and empty arrays
      if(!$document)
        return false;

      // A boolean flag can never be a link
      if(is_bool($document))
        return false;

      if(is_string($document) || is_numeric($document)) {
        // Strict string comparison: a loose == would treat numeric-looking
        // strings such as '1e3' and '1000' as the same link
        return (string)$document === (string)$link;
      }

      if(is_array($document)) {
        foreach($document as $key=>$value) {
          $found = ($key === 'html')
            ? $this->_findLinkInHTML($link, $value)
            : $this->_findLinkInTree($link, $value);

          if($found) {
            return true;
          }
        }
        return false;
      }

      // Fully qualified: an unqualified name would resolve inside this file's namespace
      throw new \Exception('Unexpected value in tree');
    }

    /**
     * Checks whether an HTML fragment contains an a/img/video/audio element
     * pointing at $link.
     *
     * @param string $link URL to look for.
     * @param mixed  $html HTML markup; anything that is not a non-blank string yields false.
     * @return bool
     */
    private function _findLinkInHTML($link, $html) {
      // loadHTML() rejects empty input, and there is nothing to find in it anyway
      if(!is_string($html) || trim($html) === '')
        return false;

      // Collect libxml parse errors internally instead of emitting warnings
      $previous = libxml_use_internal_errors(true);

      $doc = new DOMDocument();
      $loaded = $doc->loadHTML(self::_toHtmlEntities($html));

      libxml_clear_errors();
      libxml_use_internal_errors($previous);

      if(!$loaded)
        return false;

      $xpath = new DOMXPath($doc);
      return count(self::_findLinksInDOMDocument($xpath, $link)) > 0;
    }

    /**
     * Collects the attribute values equal to $target from a[href], img[src],
     * video[src] and audio[src].
     *
     * @param DOMXPath $xpath  XPath object for the document to search.
     * @param string   $target URL to look for.
     * @return array Matching URLs as keys (values are null); empty when nothing matched.
     */
    private static function _findLinksInDOMDocument(&$xpath, $target) {
      $found = [];
      $wanted = (string)$target;

      $collect = function($u) use($wanted, &$found) {
        if($u === $wanted) {
          $found[$u] = null;
        }
      };

      $sources = [
        ['a', 'href'],
        ['img', 'src'],
        ['video', 'src'],
        ['audio', 'src'],
      ];

      foreach($sources as $source) {
        self::_xPathFindNodeWithAttribute($xpath, $source[0], $source[1], $collect);
      }

      return $found;
    }

    /**
     * Calls $callback with the value of $attr for every $node element that
     * has that attribute.
     *
     * @param DOMXPath $xpath
     * @param string   $node     Element name.
     * @param string   $attr     Attribute name.
     * @param callable $callback Receives the attribute value.
     */
    private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
      foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) {
        $callback($el->getAttribute($attr));
      }
    }

    /**
     * Re-encodes non-ASCII characters as numeric HTML entities so that
     * DOMDocument::loadHTML() does not have to guess the input encoding.
     *
     * @param string $input
     * @return string
     */
    private static function _toHtmlEntities($input) {
      $encoding = mb_detect_encoding($input);

      // Detection can fail; treat the input as UTF-8 in that case
      if($encoding === false) {
        $encoding = 'UTF-8';
      }

      if($encoding !== 'UTF-8') {
        $input = mb_convert_encoding($input, 'UTF-8', $encoding);
      }

      // Encode every code point from U+0080 upwards; ASCII is left untouched
      return mb_encode_numericentity($input, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
    }
  }