http = $http; // NOTE(review): this copy of the file begins mid-statement; the class header and the start of the constructor are missing
  }

  /**
   * Parse an HTTP response into a document and, when a target URL is supplied,
   * verify that the parsed document links to it.
   *
   * @param array $http_response Response data; forwarded unchanged to parse_document().
   * @param array $opts          Options. 'target' enables the link check; the whole array is
   *                             also forwarded to parse_document().
   * @return array The parsed document with its 'html' key removed, or an array with
   *               'error' => 'no_link_found' (plus 'error_description', 'code', 'url', 'debug')
   *               when a target was given and no link to it was found.
   */
  public function parse($http_response, $opts=[]) {
    $document = $this->parse_document($http_response, $opts);

    // If a target parameter was provided, make sure a link to it exists in the parsed document
    if(!isset($document['error']) && !empty($opts['target'])) {

      if(isset($document['data']['type']) && $document['data']['type'] == 'unknown') {
        if(isset($document['html'])) {
          // Couldn't parse the page, check for the link manually assuming HTML content
          $found = $this->_findLinkInHTML($opts['target'], $document['html']);
        } else {
          // No raw HTML is available to search, so the link is treated as not found
          // (this yields the no_link_found error below)
          $found = false;
        }
        $error_description = 'The source document does not have a link to the target URL';
      } else {
        $found = $this->_findLinkInTree($opts['target'], $document['data']);
        $error_description = 'The Microformats at the source URL do not contain a link to the target URL. Check the source URL in a Microformats parser such as php.microformats.io';
      }

      if(!$found) {
        return [
          'error' => 'no_link_found',
          'error_description' => $error_description,
          // Fall back to 200 when the document carries no status code of its own
          'code' => isset($document['code']) ? $document['code'] : 200,
          'url' => $document['url'],
          'debug' => $document['data']
        ];
      }
    }

    // If the HTML parser couldn't parse the page it returns the full HTML for checking the target above,
    // but we don't want to return that in the output so remove it here
    unset($document['html']);

    return $document;
  }

  /**
   * Pick a format-specific parser for the response and return its result.
   *
   * Order of checks: URL-specific parsers first, then structured bodies (mf2 JSON,
   * ActivityStreams JSON), then string bodies, and finally the generic HTML parser.
   *
   * @param array $http_response Response data; the 'url' and 'body' keys are read here.
   * @param array $opts          Options. 'timeout' and 'max_redirects' are applied to the HTTP
   *                             client; the array is also passed to the format parsers.
   * @return mixed Whatever the selected format parser returns; several branches add a
   *               'source-format' key to that result.
   */
  public function parse_document($http_response, $opts=[]) {
    if(isset($opts['timeout']))
      $this->http->set_timeout($opts['timeout']);
    if(isset($opts['max_redirects']))
      $this->http->set_max_redirects($opts['max_redirects']);

    // Check if the URL matches a special parser
    $url = $http_response['url'];

    if(Formats\Instagram::matches($url)) {
      return Formats\Instagram::parse($this->http, $http_response, $opts);
    }

    if(Formats\GitHub::matches($url)) {
      return Formats\GitHub::parse($http_response);
    }

    if(Formats\Twitter::matches($url)) {
      return Formats\Twitter::parse($http_response);
    }

    if(Formats\Facebook::matches($url)) {
      return Formats\Facebook::parse($http_response);
    }

    if(Formats\XKCD::matches($url)) {
      return Formats\XKCD::parse($http_response);
    }

    if(Formats\Hackernews::matches($url)) {
      return Formats\Hackernews::parse($http_response);
    }

    $body = $http_response['body'];

    // Check if an mf2 JSON object was passed in
    if(is_array($body) && isset($body['items'])) {
      $data = Formats\Mf2::parse($http_response, $this->http, $opts);
      if($data == false) {
        // The mf2 parser produced nothing usable; report an unknown document instead
        $data = [
          'data' => [
            'type' => 'unknown',
          ]
        ];
      }
      $data['source-format'] = 'mf2+json';
      return $data;
    }

    // Check if an ActivityStreams JSON object was passed in
    if(Formats\ActivityStreams::is_as2_json($body)) {
      $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
      $data['source-format'] = 'activity+json';
      return $data;
    }

    // NOTE(review): the statement below is corrupted in this copy of the file. The string literal
    // after "==" is never closed and the code between it and "http, $opts);" is missing; the
    // elseif further down reads $parsed, which is never assigned in the visible text.
    // Presumably a markup-stripping step removed everything between a "<" and a ">" - confirm.
    // The damaged text is kept exactly as found: restore it from a pristine copy, do not guess.
    if(is_string($body) && substr($body, 0, 5) == 'http, $opts);
        $data['source-format'] = 'mf2+json';
        return $data;
      } elseif($parsed && Formats\ActivityStreams::is_as2_json($parsed)) {
        // Check if an ActivityStreams JSON string was passed in
        $http_response['body'] = $parsed;
        $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
        $data['source-format'] = 'activity+json';
        return $data;
      }
    }

    // No special parsers matched, parse for Microformats now
    $data = Formats\HTML::parse($this->http, $http_response, $opts);
    // NOTE(review): this tests $data['type'], while parse() tests $document['data']['type'].
    // Confirm which level the HTML parser puts "type" on; if it is nested under 'data',
    // this condition never holds and 'mf2+html' is never set here.
    if(!isset($data['source-format']) && isset($data['type']) && $data['type'] != 'unknown')
      $data['source-format'] = 'mf2+html';
    return $data;
  }

  /**
   * Recursively search a parsed document for a value equal to $link.
   *
   * Strings and numeric values are compared to $link with loose equality. Arrays are walked
   * depth-first; a value stored under the key 'html' is searched as markup via
   * _findLinkInHTML() instead of being recursed into.
   *
   * @param mixed $link     The URL to look for.
   * @param mixed $document The tree, or sub-tree, to search.
   * @return bool True when a match was found, false otherwise (including for any falsy value).
   * @throws Exception For a truthy value that is not a string, a number or an array.
   */
  private function _findLinkInTree($link, $document) {
    if(!$document)
      return false;

    if(is_string($document) || is_numeric($document)) {
      return $document == $link;
    }

    if(is_array($document)) {
      foreach($document as $key=>$value) {
        if($key === 'html') {
          $found = $this->_findLinkInHTML($link, $value);
          if($found) {
            return true;
          }
        } else {
          $found = $this->_findLinkInTree($link, $value);
          if($found) {
            return true;
          }
        }
      }
      return false;
    }

    throw new Exception('Unexpected value in tree');
  }

  /**
   * Search an HTML string for an element that references $link.
   *
   * @param mixed $link The URL to look for.
   * @param mixed $html The markup to search.
   * @return array|false The matches from _findLinksInDOMDocument() (empty when there are none),
   *                     or false when there is no markup to search or it could not be loaded.
   *                     Callers only test the result for truthiness.
   */
  private function _findLinkInHTML($link, $html) {
    // Nothing to search. This also keeps an empty source away from DOMDocument::loadHTML(),
    // which raises an error for it on PHP 8 that the @ operator cannot suppress.
    if(!is_string($html) || $html === '')
      return false;

    $markup = self::_toHtmlEntities($html);
    if(!is_string($markup) || $markup === '')
      return false;

    $doc = new DOMDocument();
    // @ silences the warnings libxml emits for malformed real-world markup.
    // loadHTML() reports failure through its return value; $doc itself is always an object,
    // so testing $doc (as this code used to) could never detect a failed load.
    if(!@$doc->loadHTML($markup))
      return false;

    $xpath = new DOMXPath($doc);
    return self::_findLinksInDOMDocument($xpath, $link);
  }

  /**
   * Collect the attribute values in a DOM document that equal $target.
   *
   * Looks at a[href], img[src], video[src] and audio[src], in that order.
   *
   * @param DOMXPath $xpath  XPath evaluator bound to the document to search.
   * @param mixed    $target The URL to look for.
   * @return array Matching values as keys (each mapped to null); empty when nothing matched.
   */
  private static function _findLinksInDOMDocument($xpath, $target) {
    $found = [];

    // One collector shared by every element type; records values loosely equal to the target
    $collect = function($u) use($target, &$found){
      if($u == $target) {
        $found[$u] = null;
      }
    };

    // Element name / attribute name pairs that can carry a link to the target
    $sources = [
      ['a', 'href'],
      ['img', 'src'],
      ['video', 'src'],
      ['audio', 'src'],
    ];

    foreach($sources as $source) {
      self::_xPathFindNodeWithAttribute($xpath, $source[0], $source[1], $collect);
    }

    return $found;
  }

  /**
   * Call $callback with the value of $attr for every $node element that has that attribute.
   *
   * @param DOMXPath $xpath    XPath evaluator bound to the document to search.
   * @param string   $node     Element name, inserted into the XPath expression as-is.
   * @param string   $attr     Attribute name, inserted into the XPath expression as-is.
   * @param callable $callback Receives each attribute value as its only argument.
   */
  private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
    foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) {
      $v = $el->getAttribute($attr);
      $callback($v);
    }
  }

  /**
   * Convert a string to the 'HTML-ENTITIES' encoding so that DOMDocument::loadHTML()
   * is not handed raw multibyte characters.
   *
   * NOTE(review): the 'HTML-ENTITIES' target is deprecated in recent PHP releases (8.2+
   * according to the mbstring documentation); plan a replacement before it is removed.
   *
   * @param string $input Text in an encoding that mb_detect_encoding() is asked to identify.
   * @return mixed The result of mb_convert_encoding().
   */
  private static function _toHtmlEntities($input) {
    $encoding = mb_detect_encoding($input);

    if($encoding === false) {
      // Detection failed. Passing false on as the source encoding is an error, so let
      // mbstring assume its internal encoding instead.
      return mb_convert_encoding($input, 'HTML-ENTITIES');
    }

    return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
  }

}