You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

221 lines
6.7 KiB

  1. <?php
  2. namespace p3k\XRay;
  3. use p3k\XRay\Formats;
  4. use DOMDocument, DOMXPath;
  5. class Parser {
  6. private $http;
  7. public function __construct($http) {
  8. $this->http = $http;
  9. }
  10. public function parse($http_response, $opts=[]) {
  11. $document = $this->parse_document($http_response, $opts);
  12. // If a target parameter was provided, make sure a link to it exists in the parsed document
  13. if(!isset($document['error']) && !empty($opts['target'])) {
  14. if(isset($document['data']['type']) && $document['data']['type'] == 'unknown') {
  15. if(isset($document['html'])) {
  16. // Couldn't parse the page, check for the link manually assuming HTML content
  17. $found = $this->_findLinkInHTML($opts['target'], $document['html']);
  18. } else {
  19. // Ignore this check for any non-HTML documents since this will be uncommon anyway
  20. $found = false;
  21. }
  22. } else {
  23. $found = $this->_findLinkInTree($opts['target'], $document['data']);
  24. }
  25. if(!$found) {
  26. return [
  27. 'error' => 'no_link_found',
  28. 'error_description' => 'The source document does not have a link to the target URL',
  29. 'code' => isset($document['code']) ? $document['code'] : 200,
  30. 'url' => $document['url'],
  31. 'debug' => $document['data']
  32. ];
  33. }
  34. }
  35. // If the HTML parser couldn't parse the page it returns the full HTML for checking the target above,
  36. // but we don't want to return that in the out put so remove it here
  37. unset($document['html']);
  38. return $document;
  39. }
  40. public function parse_document($http_response, $opts=[]) {
  41. if(isset($opts['timeout']))
  42. $this->http->set_timeout($opts['timeout']);
  43. if(isset($opts['max_redirects']))
  44. $this->http->set_max_redirects($opts['max_redirects']);
  45. // Check if the URL matches a special parser
  46. $url = $http_response['url'];
  47. if(Formats\Instagram::matches($url)) {
  48. return Formats\Instagram::parse($this->http, $http_response, $opts);
  49. }
  50. if(Formats\GitHub::matches($url)) {
  51. return Formats\GitHub::parse($http_response);
  52. }
  53. if(Formats\Twitter::matches($url)) {
  54. return Formats\Twitter::parse($http_response);
  55. }
  56. if(Formats\Facebook::matches($url)) {
  57. return Formats\Facebook::parse($http_response);
  58. }
  59. if(Formats\XKCD::matches($url)) {
  60. return Formats\XKCD::parse($http_response);
  61. }
  62. if(Formats\Hackernews::matches($url)) {
  63. return Formats\Hackernews::parse($http_response);
  64. }
  65. $body = $http_response['body'];
  66. // Check if an mf2 JSON object was passed in
  67. if(is_array($body) && isset($body['items'])) {
  68. $data = Formats\Mf2::parse($http_response, $this->http, $opts);
  69. if($data == false) {
  70. $data = [
  71. 'data' => [
  72. 'type' => 'unknown',
  73. ]
  74. ];
  75. }
  76. $data['source-format'] = 'mf2+json';
  77. return $data;
  78. }
  79. // Check if an ActivityStreams JSON object was passed in
  80. if(Formats\ActivityStreams::is_as2_json($body)) {
  81. $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
  82. $data['source-format'] = 'activity+json';
  83. return $data;
  84. }
  85. if(is_string($body) && substr($body, 0, 5) == '<?xml') {
  86. return Formats\XML::parse($http_response);
  87. }
  88. if(is_string($body)) {
  89. // Some feeds don't start with <?xml
  90. $begin = trim(substr($body, 0, 40));
  91. if(substr($begin, 0, 4) == '<rss') {
  92. return Formats\XML::parse($http_response);
  93. }
  94. }
  95. if(is_string($body) && substr($body, 0, 1) == '{') {
  96. $parsed = json_decode($body, true);
  97. if($parsed && isset($parsed['version']) && $parsed['version'] == 'https://jsonfeed.org/version/1') {
  98. $http_response['body'] = $parsed;
  99. return Formats\JSONFeed::parse($http_response);
  100. } elseif($parsed && isset($parsed['items'][0]['type']) && isset($parsed['items'][0]['properties'])) {
  101. // Check if an mf2 JSON string was passed in
  102. $http_response['body'] = $parsed;
  103. $data = Formats\Mf2::parse($http_response, $this->http, $opts);
  104. $data['source-format'] = 'mf2+json';
  105. return $data;
  106. } elseif($parsed && Formats\ActivityStreams::is_as2_json($parsed)) {
  107. // Check if an ActivityStreams JSON string was passed in
  108. $http_response['body'] = $parsed;
  109. $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
  110. $data['source-format'] = 'activity+json';
  111. return $data;
  112. }
  113. }
  114. // No special parsers matched, parse for Microformats now
  115. $data = Formats\HTML::parse($this->http, $http_response, $opts);
  116. if(!isset($data['source-format']) && isset($data['type']) && $data['type'] != 'unknown')
  117. $data['source-format'] = 'mf2+html';
  118. return $data;
  119. }
  120. private function _findLinkInTree($link, $document) {
  121. if(!$document)
  122. return false;
  123. if(is_string($document) || is_numeric($document)) {
  124. return $document == $link;
  125. }
  126. if(is_array($document)) {
  127. foreach($document as $key=>$value) {
  128. if($key === 'html') {
  129. $found = $this->_findLinkInHTML($link, $value);
  130. if($found) {
  131. return true;
  132. }
  133. } else {
  134. $found = $this->_findLinkInTree($link, $value);
  135. if($found) {
  136. return true;
  137. }
  138. }
  139. }
  140. return false;
  141. }
  142. throw new Exception('Unexpected value in tree');
  143. }
  144. private function _findLinkInHTML($link, $html) {
  145. $doc = new DOMDocument();
  146. @$doc->loadHTML(self::_toHtmlEntities($html));
  147. if(!$doc)
  148. return false;
  149. $xpath = new DOMXPath($doc);
  150. return self::_findLinksInDOMDocument($xpath, $link);
  151. }
  152. private static function _findLinksInDOMDocument(&$xpath, $target) {
  153. $found = [];
  154. self::_xPathFindNodeWithAttribute($xpath, 'a', 'href', function($u) use($target, &$found){
  155. if($u == $target) {
  156. $found[$u] = null;
  157. }
  158. });
  159. self::_xPathFindNodeWithAttribute($xpath, 'img', 'src', function($u) use($target, &$found){
  160. if($u == $target) {
  161. $found[$u] = null;
  162. }
  163. });
  164. self::_xPathFindNodeWithAttribute($xpath, 'video', 'src', function($u) use($target, &$found){
  165. if($u == $target) {
  166. $found[$u] = null;
  167. }
  168. });
  169. self::_xPathFindNodeWithAttribute($xpath, 'audio', 'src', function($u) use($target, &$found){
  170. if($u == $target) {
  171. $found[$u] = null;
  172. }
  173. });
  174. return $found;
  175. }
  176. private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
  177. foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) {
  178. $v = $el->getAttribute($attr);
  179. $callback($v);
  180. }
  181. }
  182. private static function _toHtmlEntities($input) {
  183. return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
  184. }
  185. }