You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

221 lines
6.7 KiB

<?php
namespace p3k\XRay;
use p3k\XRay\Formats;
use DOMDocument, DOMXPath;
/**
 * Dispatches an already-fetched HTTP response to the matching format parser
 * and, when a target URL is supplied, verifies that the parsed document
 * actually links to that target.
 */
class Parser {
  /**
   * HTTP client handed to the format parsers that need it.
   * This class itself only calls set_timeout() and set_max_redirects() on it.
   */
  private $http;

  /**
   * @param object $http HTTP client; must provide set_timeout() and
   *                     set_max_redirects() when those options are used.
   */
  public function __construct($http) {
    $this->http = $http;
  }

  /**
   * Parse a response and optionally confirm it links to $opts['target'].
   *
   * @param array $http_response Response array; the 'url' and 'body' keys are read.
   * @param array $opts          Options: 'target', 'timeout', 'max_redirects'
   *                             (also passed through to the format parsers).
   * @return array The parsed document without its internal 'html' key, or a
   *               'no_link_found' error array when a target was given but no
   *               link to it was found.
   */
  public function parse($http_response, $opts=[]) {
    $document = $this->parse_document($http_response, $opts);

    // If a target parameter was provided, make sure a link to it exists in the parsed document
    if(!isset($document['error']) && !empty($opts['target'])) {
      // Guard the lookup: not every parser result is guaranteed to carry a 'data' key
      $data = isset($document['data']) ? $document['data'] : null;

      if(isset($data['type']) && $data['type'] === 'unknown') {
        // Couldn't parse the page: check for the link manually assuming HTML content.
        // Non-HTML documents are skipped since this will be uncommon anyway.
        $found = isset($document['html'])
          ? $this->_findLinkInHTML($opts['target'], $document['html'])
          : false;
      } else {
        $found = $this->_findLinkInTree($opts['target'], $data);
      }

      if(!$found) {
        return [
          'error' => 'no_link_found',
          'error_description' => 'The source document does not have a link to the target URL',
          'code' => isset($document['code']) ? $document['code'] : 200,
          'url' => isset($document['url']) ? $document['url'] : null,
          'debug' => $data
        ];
      }
    }

    // If the HTML parser couldn't parse the page it returns the full HTML for checking the target above,
    // but we don't want to return that in the output so remove it here
    unset($document['html']);
    return $document;
  }

  /**
   * Pick a format parser for the response and return its result.
   *
   * URL-specific parsers are tried first, then the body is sniffed for
   * mf2 JSON, ActivityStreams JSON, XML/RSS and JSON Feed; anything else is
   * handed to the HTML (Microformats) parser.
   *
   * @param array $http_response Response array; the 'url' and 'body' keys are read.
   * @param array $opts          Options; 'timeout' and 'max_redirects' are
   *                             applied to the HTTP client before parsing.
   * @return array Result of whichever format parser handled the response.
   */
  public function parse_document($http_response, $opts=[]) {
    if(isset($opts['timeout']))
      $this->http->set_timeout($opts['timeout']);
    if(isset($opts['max_redirects']))
      $this->http->set_max_redirects($opts['max_redirects']);

    // Check if the URL matches a special parser
    $url = $http_response['url'];

    if(Formats\Instagram::matches($url)) {
      return Formats\Instagram::parse($this->http, $http_response, $opts);
    }
    if(Formats\GitHub::matches($url)) {
      return Formats\GitHub::parse($http_response);
    }
    if(Formats\Twitter::matches($url)) {
      return Formats\Twitter::parse($http_response);
    }
    if(Formats\Facebook::matches($url)) {
      return Formats\Facebook::parse($http_response);
    }
    if(Formats\XKCD::matches($url)) {
      return Formats\XKCD::parse($http_response);
    }
    if(Formats\Hackernews::matches($url)) {
      return Formats\Hackernews::parse($http_response);
    }

    $body = $http_response['body'];

    // Check if an mf2 JSON object was passed in
    if(is_array($body) && isset($body['items'])) {
      return $this->_parseMf2Json($http_response, $opts);
    }

    // Check if an ActivityStreams JSON object was passed in
    if(Formats\ActivityStreams::is_as2_json($body)) {
      return $this->_parseActivityStreams($http_response, $opts);
    }

    if(is_string($body)) {
      if(substr($body, 0, 5) == '<?xml') {
        return Formats\XML::parse($http_response);
      }

      // Some feeds don't start with an XML declaration
      $begin = trim(substr($body, 0, 40));
      if(substr($begin, 0, 4) == '<rss') {
        return Formats\XML::parse($http_response);
      }

      if(substr($body, 0, 1) == '{') {
        $parsed = json_decode($body, true);
        if($parsed) {
          if(isset($parsed['version']) && $parsed['version'] == 'https://jsonfeed.org/version/1') {
            $http_response['body'] = $parsed;
            return Formats\JSONFeed::parse($http_response);
          }
          if(isset($parsed['items'][0]['type']) && isset($parsed['items'][0]['properties'])) {
            // An mf2 JSON string was passed in
            $http_response['body'] = $parsed;
            return $this->_parseMf2Json($http_response, $opts);
          }
          if(Formats\ActivityStreams::is_as2_json($parsed)) {
            // An ActivityStreams JSON string was passed in
            $http_response['body'] = $parsed;
            return $this->_parseActivityStreams($http_response, $opts);
          }
        }
      }
    }

    // No special parsers matched, parse for Microformats now
    $data = Formats\HTML::parse($this->http, $http_response, $opts);

    if(!isset($data['source-format'])) {
      // Elsewhere in this class the parsed type lives under ['data']['type'];
      // the flat ['type'] shape is still honoured for backward compatibility.
      // NOTE(review): confirm which shape Formats\HTML::parse actually returns.
      if(isset($data['data']['type'])) {
        $type = $data['data']['type'];
      } elseif(isset($data['type'])) {
        $type = $data['type'];
      } else {
        $type = null;
      }
      if($type !== null && $type != 'unknown') {
        $data['source-format'] = 'mf2+html';
      }
    }

    return $data;
  }

  /**
   * Run the mf2 JSON parser and tag the result, substituting an "unknown"
   * document when the parser reports failure with a falsy value.
   */
  private function _parseMf2Json($http_response, $opts) {
    $data = Formats\Mf2::parse($http_response, $this->http, $opts);
    if(!$data) {
      $data = [
        'data' => [
          'type' => 'unknown',
        ]
      ];
    }
    $data['source-format'] = 'mf2+json';
    return $data;
  }

  /**
   * Run the ActivityStreams parser and tag the result with its source format.
   */
  private function _parseActivityStreams($http_response, $opts) {
    $data = Formats\ActivityStreams::parse($http_response, $this->http, $opts);
    $data['source-format'] = 'activity+json';
    return $data;
  }

  /**
   * Recursively search a parsed document for a value equal to $link.
   *
   * String values stored under an 'html' key are searched as HTML markup;
   * every other value is compared as a string or descended into.
   *
   * @param string $link     URL to look for.
   * @param mixed  $document Scalar or (nested) array to search.
   * @return bool True when the link was found.
   * @throws \Exception When a truthy value is neither string, numeric nor array.
   */
  private function _findLinkInTree($link, $document) {
    if(!$document)
      return false;

    if(is_string($document) || is_numeric($document)) {
      // Compare as strings so a numeric leaf can never loosely match a URL
      return (string)$document === (string)$link;
    }

    if(is_array($document)) {
      foreach($document as $key => $value) {
        $found = ($key === 'html' && is_string($value))
          ? $this->_findLinkInHTML($link, $value)
          : $this->_findLinkInTree($link, $value);
        if($found) {
          return true;
        }
      }
      return false;
    }

    // Fully qualified: an unqualified name would resolve inside the file's namespace
    throw new \Exception('Unexpected value in tree');
  }

  /**
   * Check whether an HTML fragment references $link from an a/img/video/audio element.
   *
   * @param string $link URL to look for.
   * @param mixed  $html HTML markup; non-string or blank input yields false.
   * @return bool True when at least one matching attribute was found.
   */
  private function _findLinkInHTML($link, $html) {
    // loadHTML() rejects empty input (ValueError on PHP 8), so bail out early
    if(!is_string($html) || trim($html) === '') {
      return false;
    }

    $doc = new DOMDocument();

    // Collect libxml parse warnings internally instead of silencing with @
    $previous = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML(self::_toHtmlEntities($html));
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if(!$loaded) {
      return false;
    }

    $xpath = new DOMXPath($doc);
    return count(self::_findLinksInDOMDocument($xpath, $link)) > 0;
  }

  /**
   * Collect attribute values equal to $target from a[href], img[src],
   * video[src] and audio[src].
   *
   * @param DOMXPath $xpath  XPath bound to the document to search.
   * @param string   $target URL to look for.
   * @return array Matching URLs as keys (values are null); empty when none matched.
   */
  private static function _findLinksInDOMDocument($xpath, $target) {
    $target = (string)$target;
    $found = [];

    $sources = [
      'a' => 'href',
      'img' => 'src',
      'video' => 'src',
      'audio' => 'src',
    ];

    foreach($sources as $node => $attr) {
      self::_xPathFindNodeWithAttribute($xpath, $node, $attr, function($u) use($target, &$found) {
        if($u === $target) {
          $found[$u] = null;
        }
      });
    }

    return $found;
  }

  /**
   * Invoke $callback with the value of $attr for every $node element that has that attribute.
   */
  private static function _xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
    foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) {
      $callback($el->getAttribute($attr));
    }
  }

  /**
   * Rewrite every non-ASCII character as a numeric HTML entity so that
   * DOMDocument::loadHTML() does not misread the input encoding.
   *
   * @param string $input Markup in any encoding mb_detect_encoding() can identify.
   * @return string ASCII-only markup.
   */
  private static function _toHtmlEntities($input) {
    $encoding = mb_detect_encoding($input);
    if($encoding === false) {
      // Detection failed: fall back to UTF-8 rather than passing false on
      $encoding = 'UTF-8';
    }
    $utf8 = mb_convert_encoding($input, 'UTF-8', $encoding);

    // Replaces the 'HTML-ENTITIES' conversion target, which is deprecated since PHP 8.2
    return mb_encode_numericentity($utf8, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
  }
}