Refactors into a library that can be used separately from the API (pull/39/head, v1.1.0)
@ -1,4 +1,5 @@ | |||
.DS_Store | |||
config.php | |||
vendor/ | |||
XRay-*.json | |||
php_errors.log | |||
XRay-*.json |
@ -1,7 +1,21 @@ | |||
Copyright 2016 by Aaron Parecki | |||
MIT License | |||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at | |||
Copyright (c) 2017 Aaron Parecki | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. | |||
The above copyright notice and this permission notice shall be included in all | |||
copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. |
@ -1,36 +1,39 @@ | |||
{ | |||
"name": "p3k/xray", | |||
"type": "library", | |||
"license": "MIT", | |||
"homepage": "https://github.com/aaronpk/XRay", | |||
"description": "X-Ray returns structured data from any URL", | |||
"require": { | |||
"league/plates": "3.*", | |||
"league/route": "1.*", | |||
"mf2/mf2": "~0.3", | |||
"ezyang/htmlpurifier": "4.*", | |||
"indieweb/link-rel-parser": "0.1.*", | |||
"dg/twitter-php": "^3.6", | |||
"dg/twitter-php": "3.6.*", | |||
"p3k/timezone": "*", | |||
"cebe/markdown": "~1.1.1" | |||
"p3k/http": "0.1.*", | |||
"cebe/markdown": "1.1.*" | |||
}, | |||
"autoload": { | |||
"psr-4": { | |||
"p3k\\XRay\\": "lib/XRay" | |||
}, | |||
"files": [ | |||
"lib/helpers.php", | |||
"controllers/Main.php", | |||
"controllers/Parse.php", | |||
"controllers/Token.php", | |||
"controllers/Rels.php", | |||
"controllers/Certbot.php", | |||
"lib/HTTPCurl.php", | |||
"lib/HTTPStream.php", | |||
"lib/HTTP.php", | |||
"lib/Formats/Mf2.php", | |||
"lib/Formats/Instagram.php", | |||
"lib/Formats/GitHub.php", | |||
"lib/Formats/Twitter.php", | |||
"lib/Formats/XKCD.php", | |||
"lib/Formats/HTMLPurifier_AttrDef_HTML_Microformats2.php" | |||
"lib/XRay.php" | |||
] | |||
}, | |||
"require-dev": { | |||
"league/plates": "3.*", | |||
"league/route": "1.*", | |||
"phpunit/phpunit": "4.8.*" | |||
}, | |||
"autoload-dev": { | |||
"files": [ | |||
"lib/HTTPTest.php" | |||
"controllers/Main.php", | |||
"controllers/Parse.php", | |||
"controllers/Token.php", | |||
"controllers/Rels.php", | |||
"controllers/Certbot.php" | |||
] | |||
} | |||
} |
@ -1,122 +0,0 @@ | |||
<?php | |||
namespace XRay\Formats; | |||
use DateTime, DateTimeZone; | |||
use Parse, Config; | |||
use cebe\markdown\GithubMarkdown; | |||
/**
 * Converts GitHub repositories, issues, pull requests and issue comments
 * into XRay's entry/repo structure using the GitHub API JSON.
 */
class GitHub {

  /**
   * Builds the XRay representation of a GitHub URL.
   *
   * The URL is always inspected first, because the entry type, the issue/pull
   * number and the in-reply-to URL are derived from it even when the API JSON
   * is supplied by the caller instead of being fetched.
   *
   * @param object      $http  HTTP client exposing get($url, $headers); only used when $json is empty
   * @param string      $url   The github.com URL being parsed
   * @param array       $creds Currently unused by this method
   * @param string|null $json  Optional GitHub API response body; when given, no request is made
   * @return array [result|null, body|null, code] where result is ['data' => entry].
   *               code is the HTTP status of the API request, null when the JSON
   *               was supplied by the caller, and 0 when the URL or JSON is unusable.
   */
  public static function parse($http, $url, $creds, $json=null) {
    $pull = null;
    $issue = null;

    // Transform the GitHub URL to an API request
    if(preg_match('~https://github.com/([^/]+)/([^/]+)/pull/(\d+)$~', $url, $match)) {
      $type = 'pull';
      $org = $match[1];
      $repo = $match[2];
      $pull = $match[3];
      $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/pulls/'.$pull;
    } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)$~', $url, $match)) {
      $type = 'issue';
      $org = $match[1];
      $repo = $match[2];
      $issue = $match[3];
      $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/issues/'.$issue;
    } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)$~', $url, $match)) {
      $type = 'repo';
      $org = $match[1];
      $repo = $match[2];
      $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo;
    } elseif(preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)#issuecomment-(\d+)~', $url, $match)) {
      $type = 'comment';
      $org = $match[1];
      $repo = $match[2];
      $issue = $match[3];
      $comment = $match[4];
      $apiurl = 'https://api.github.com/repos/'.$org.'/'.$repo.'/issues/comments/'.$comment;
    } else {
      return [null, null, 0];
    }

    if(!$json) {
      $response = $http->get($apiurl, ['User-Agent: XRay ('.Config::$base.')']);
      if($response['code'] != 200) {
        return [null, $response['body'], $response['code']];
      }
      // Keep the fetched body so it can be returned alongside the parsed result
      $json = $response['body'];
      $code = $response['code'];
    } else {
      // No HTTP request was made, so there is no status code to report
      $code = null;
    }

    $data = json_decode($json, true);
    if(!$data) {
      return [null, null, 0];
    }

    // Start building the h-entry
    $entry = array(
      'type' => ($type == 'repo' ? 'repo' : 'entry'),
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    );

    // Repositories describe their author under "owner", everything else under "user"
    $authorkey = ($type == 'repo') ? 'owner' : 'user';

    $entry['author']['name'] = $data[$authorkey]['login'];
    $entry['author']['photo'] = $data[$authorkey]['avatar_url'];
    $entry['author']['url'] = $data[$authorkey]['html_url'];

    if($type == 'pull') {
      $entry['name'] = '#' . $pull . ' ' . $data['title'];
    } elseif($type == 'issue') {
      $entry['name'] = '#' . $issue . ' ' . $data['title'];
    } elseif($type == 'repo') {
      $entry['name'] = $data['name'];
    }

    if($type == 'repo') {
      if(!empty($data['description']))
        $entry['summary'] = $data['description'];
    }

    if($type != 'repo' && !empty($data['body'])) {
      $parser = new GithubMarkdown();
      $entry['content'] = [
        'text' => $data['body'],
        'html' => $parser->parse($data['body'])
      ];
    }

    if($type == 'comment') {
      $entry['in-reply-to'] = ['https://github.com/'.$org.'/'.$repo.'/issues/'.$issue];
    }

    if(!empty($data['labels'])) {
      $entry['category'] = array_map(function($l){
        return $l['name'];
      }, $data['labels']);
    }

    $entry['published'] = $data['created_at'];

    $r = [
      'data' => $entry
    ];

    return [$r, $json, $code];
  }

}
@ -1,56 +0,0 @@ | |||
<?php | |||
namespace p3k; | |||
/**
 * Thin front end that forwards each request to a freshly created transport
 * object (HTTPCurl or HTTPStream), chosen per URL by should_follow_redirects().
 */
class HTTP {

  public $timeout = 4;
  public $max_redirects = 8;

  /**
   * Sends a GET request through the transport selected for $url.
   */
  public function get($url, $headers=[]) {
    return $this->_transport($url)->get($url, $headers);
  }

  /**
   * Sends a POST request with $body through the transport selected for $url.
   */
  public function post($url, $body, $headers=[]) {
    return $this->_transport($url)->post($url, $body, $headers);
  }

  /**
   * Sends a HEAD request through the transport selected for $url.
   */
  public function head($url) {
    return $this->_transport($url)->head($url);
  }

  // Instantiates the transport for $url and copies this object's settings onto it
  private function _transport($url) {
    $name = $this->_class($url);
    $transport = new $name($url);
    $transport->timeout = $this->timeout;
    $transport->max_redirects = $this->max_redirects;
    return $transport;
  }

  // Returns the transport class name: the stream transport when redirects
  // should not be followed for $url, the cURL transport otherwise
  private function _class($url) {
    return should_follow_redirects($url) ? 'p3k\HTTPCurl' : 'p3k\HTTPStream';
  }

  /**
   * Flattens a parsed header array (name => string, or name => list of
   * strings) back into raw "Name: value" lines and hands them to
   * \IndieWeb\http_rels().
   */
  public static function link_rels($header_array) {
    $raw = '';
    foreach($header_array as $name => $value) {
      $values = is_string($value) ? [$value] : $value;
      foreach($values as $v) {
        $raw .= $name . ': ' . $v . "\r\n";
      }
    }
    return \IndieWeb\http_rels($raw);
  }
}
@ -1,127 +0,0 @@ | |||
<?php | |||
namespace p3k; | |||
/**
 * HTTP transport built on the cURL extension.
 *
 * Every verb returns an array with the keys code, headers, error,
 * error_description, error_code and url; get() and post() additionally
 * include body.
 */
class HTTPCurl {

  public $timeout = 4;
  public $max_redirects = 8;

  /**
   * Performs a GET request.
   *
   * @param string $url
   * @param array  $headers Raw "Name: value" request header lines
   * @return array
   */
  public function get($url, $headers=[]) {
    $ch = curl_init($url);
    $this->_set_curlopts($ch, $url);
    if($headers)
      curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    return $this->_execute($ch, true);
  }

  /**
   * Performs a POST request with the given body.
   *
   * @param string       $url
   * @param string|array $body    Passed to CURLOPT_POSTFIELDS unchanged
   * @param array        $headers Raw "Name: value" request header lines
   * @return array
   */
  public function post($url, $body, $headers=[]) {
    $ch = curl_init($url);
    $this->_set_curlopts($ch, $url);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
    if($headers)
      curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    return $this->_execute($ch, true);
  }

  /**
   * Performs a HEAD request. The result has no body key.
   *
   * @param string $url
   * @return array
   */
  public function head($url) {
    $ch = curl_init($url);
    $this->_set_curlopts($ch, $url);
    curl_setopt($ch, CURLOPT_NOBODY, true);
    return $this->_execute($ch, false);
  }

  // Runs the prepared handle, builds the result array and releases the handle
  private function _execute($ch, $include_body) {
    $response = curl_exec($ch);
    if($response === false) {
      // Transfer failed; the error keys below describe why
      $response = '';
    }

    if($include_body) {
      $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
      $raw_headers = substr($response, 0, $header_size);
      $body = substr($response, $header_size);
    } else {
      // With CURLOPT_NOBODY the whole response consists of headers
      $raw_headers = $response;
    }

    $errno = curl_errno($ch);

    $result = array(
      'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
      'headers' => self::parse_headers(trim($raw_headers)),
    );
    if($include_body) {
      $result['body'] = $body;
    }
    $result['error'] = self::error_string_from_code($errno);
    $result['error_description'] = curl_error($ch);
    $result['error_code'] = $errno;
    $result['url'] = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

    // All information has been read from the handle, so free it
    curl_close($ch);

    return $result;
  }

  // Applies the options shared by every request
  private function _set_curlopts($ch, $url) {
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    // Special-case appspot.com URLs to not follow redirects.
    // https://cloud.google.com/appengine/docs/php/urlfetch/
    if(should_follow_redirects($url)) {
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
      curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
    } else {
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    }
    curl_setopt($ch, CURLOPT_TIMEOUT_MS, round($this->timeout * 1000));
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 2000);
  }

  /**
   * Maps a cURL error number to a short error identifier.
   *
   * @param int $code Value returned by curl_errno()
   * @return string Empty string for 0, 'unknown' for unmapped codes
   */
  public static function error_string_from_code($code) {
    switch($code) {
      case 0:
        return '';
      case CURLE_COULDNT_RESOLVE_HOST:
        return 'dns_error';
      case CURLE_COULDNT_CONNECT:
        return 'connect_error';
      case CURLE_OPERATION_TIMEDOUT:
        return 'timeout';
      case CURLE_SSL_CONNECT_ERROR:
        return 'ssl_error';
      case CURLE_SSL_CERTPROBLEM:
        return 'ssl_cert_error';
      case CURLE_SSL_CIPHER:
        return 'ssl_unsupported_cipher';
      case CURLE_SSL_CACERT:
        return 'ssl_cert_error';
      case CURLE_TOO_MANY_REDIRECTS:
        return 'too_many_redirects';
      default:
        return 'unknown';
    }
  }

  /**
   * Parses a raw header block into an array keyed by header name.
   *
   * Folded continuation lines are joined, names are normalised to
   * Title-Case, and a header that occurs more than once becomes a list of
   * its values in order of appearance. Lines without a ": " separator
   * (such as the status line) are ignored.
   *
   * @param string $headers CRLF-separated header lines
   * @return array name => string, or name => list of strings
   */
  public static function parse_headers($headers) {
    $retVal = array();
    $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $headers));
    foreach($fields as $field) {
      if(!preg_match('/([^:]+): (.+)/m', $field, $match)) {
        continue;
      }
      $name = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) {
        return strtoupper($m[0]);
      }, strtolower(trim($match[1])));
      $value = trim($match[2]);
      // If there's already a value set for the header name being returned, turn it into an array and add the new value
      if(isset($retVal[$name])) {
        if(!is_array($retVal[$name]))
          $retVal[$name] = array($retVal[$name]);
        $retVal[$name][] = $value;
      } else {
        $retVal[$name] = $value;
      }
    }
    return $retVal;
  }
}
@ -1,138 +0,0 @@ | |||
<?php | |||
namespace p3k; | |||
/**
 * HTTP transport built on PHP's http:// stream wrapper (file_get_contents).
 *
 * Every verb returns an array with the keys code, headers, body, error,
 * error_description and url.
 */
class HTTPStream {

  public $timeout = 4;
  public $max_redirects = 8;

  /**
   * Error handler that turns PHP errors into ErrorException so that
   * file_get_contents() failures can be caught. Errors excluded by the
   * current error_reporting() level are ignored.
   */
  public static function exception_error_handler($severity, $message, $file, $line) {
    if (!(error_reporting() & $severity)) {
      // This error code is not included in error_reporting
      return;
    }
    throw new \ErrorException($message, 0, $severity, $file, $line);
  }

  /**
   * Performs a GET request.
   *
   * @param string $url
   * @param array  $headers Raw "Name: value" request header lines
   * @return array
   */
  public function get($url, $headers=[]) {
    $context = $this->_stream_context('GET', $url, false, $headers);
    return $this->_fetch($url, $context);
  }

  /**
   * Performs a POST request with the given body.
   *
   * @param string $url
   * @param string $body
   * @param array  $headers Raw "Name: value" request header lines
   * @return array
   */
  public function post($url, $body, $headers=[]) {
    $context = $this->_stream_context('POST', $url, $body, $headers);
    return $this->_fetch($url, $context);
  }

  /**
   * Performs a HEAD request.
   *
   * @param string $url
   * @return array
   */
  public function head($url) {
    $context = $this->_stream_context('HEAD', $url);
    return $this->_fetch($url, $context);
  }

  // Executes the request and converts the outcome into the result array.
  // The exception-throwing error handler is installed only for the duration
  // of the request and removed afterwards, so it never leaks into the caller.
  private function _fetch($url, $context) {
    $error = false;

    set_error_handler(array(__CLASS__, 'exception_error_handler'));
    try {
      $body = file_get_contents($url, false, $context);
      // This sets $http_response_header
      // see http://php.net/manual/en/reserved.variables.httpresponseheader.php
    } catch(\Exception $e) {
      $body = false;
      $http_response_header = [];
      $description = str_replace('file_get_contents(): ', '', $e->getMessage());
      $code = 'unknown';

      if(preg_match('/getaddrinfo failed/', $description)) {
        $code = 'dns_error';
        $description = str_replace('php_network_getaddresses: ', '', $description);
      }

      if(preg_match('/timed out|request failed/', $description)) {
        $code = 'timeout';
      }

      if(preg_match('/certificate/', $description)) {
        $code = 'ssl_error';
      }

      $error = [
        'description' => $description,
        'code' => $code
      ];
    }
    restore_error_handler();

    // The request can fail without raising an exception when the warning is
    // excluded by error_reporting; in that case no headers were recorded
    if(!isset($http_response_header)) {
      $http_response_header = [];
    }

    return array(
      'code' => self::parse_response_code($http_response_header),
      'headers' => self::parse_headers($http_response_header),
      'body' => $body,
      'error' => $error ? $error['code'] : false,
      'error_description' => $error ? $error['description'] : false,
      'url' => $url,
    );
  }

  // Builds the stream context carrying method, timeout, body, headers and redirect policy
  private function _stream_context($method, $url, $body=false, $headers=[]) {
    $options = [
      'method' => $method,
      'timeout' => $this->timeout,
      'ignore_errors' => true,
    ];

    if($body) {
      $options['content'] = $body;
    }

    if($headers) {
      $options['header'] = implode("\r\n", $headers);
    }

    // Special-case appspot.com URLs to not follow redirects.
    // https://cloud.google.com/appengine/docs/php/urlfetch/
    if(should_follow_redirects($url)) {
      $options['follow_location'] = 1;
      $options['max_redirects'] = $this->max_redirects;
    } else {
      $options['follow_location'] = 0;
    }

    return stream_context_create(['http' => $options]);
  }

  /**
   * Extracts the status code from a list of response header lines.
   *
   * @param array $headers Header lines as found in $http_response_header
   * @return string|false The code of the last status line, or false if none is present
   */
  public static function parse_response_code($headers) {
    // When a response is a redirect, we want to find the last occurrence of the HTTP code
    $code = false;
    foreach($headers as $field) {
      if(preg_match('/HTTP\/\d\.\d (\d+)/', $field, $match)) {
        $code = $match[1];
      }
    }
    return $code;
  }

  /**
   * Converts a list of header lines into an array keyed by header name.
   *
   * Names are normalised to Title-Case and a header that occurs more than
   * once becomes a list of its values in order of appearance. Lines without
   * a ": " separator (such as status lines) are ignored.
   *
   * @param array $headers Header lines as found in $http_response_header
   * @return array name => string, or name => list of strings
   */
  public static function parse_headers($headers) {
    $retVal = array();
    foreach($headers as $field) {
      if(!preg_match('/([^:]+): (.+)/m', $field, $match)) {
        continue;
      }
      $name = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) {
        return strtoupper($m[0]);
      }, strtolower(trim($match[1])));
      $value = trim($match[2]);
      // If there's already a value set for the header name being returned, turn it into an array and add the new value
      if(isset($retVal[$name])) {
        if(!is_array($retVal[$name]))
          $retVal[$name] = array($retVal[$name]);
        $retVal[$name][] = $value;
      } else {
        $retVal[$name] = $value;
      }
    }
    return $retVal;
  }
}
@ -1,92 +0,0 @@ | |||
<?php | |||
namespace p3k; | |||
/**
 * Test double for the HTTP transport: instead of making network requests it
 * reads canned raw HTTP responses from files below a test data directory.
 *
 * The file name is derived from the URL by dropping the scheme and replacing
 * the slashes inside the path with underscores. When no such file exists,
 * "404.response.txt" is used.
 */
class HTTPTest extends HTTPCurl {

  private $_testDataPath;
  private $_redirects_remaining;

  /**
   * @param string $testDataPath Directory prefix that file names are appended to
   */
  public function __construct($testDataPath) {
    $this->_testDataPath = $testDataPath;
  }

  /**
   * Returns the canned response for $url; the fragment is removed before the lookup.
   */
  public function get($url, $headers=[]) {
    $this->_redirects_remaining = $this->max_redirects;
    $parts = parse_url($url);
    unset($parts['fragment']);
    $url = \build_url($parts);
    return $this->_read_file($url);
  }

  /**
   * Returns the canned response for $url; $body and $headers are ignored.
   */
  public function post($url, $body, $headers=[]) {
    // Start with a fresh redirect budget, otherwise any redirect is reported as too many
    $this->_redirects_remaining = $this->max_redirects;
    return $this->_read_file($url);
  }

  /**
   * Returns the canned response for $url without its body.
   */
  public function head($url) {
    $this->_redirects_remaining = $this->max_redirects;
    $response = $this->_read_file($url);
    return array(
      'code' => $response['code'],
      'headers' => $response['headers'],
      'error' => '',
      'error_description' => '',
      'url' => $response['url']
    );
  }

  // Loads and parses the response file for $url, following Location headers
  // until the redirect budget is used up
  private function _read_file($url) {
    $parts = parse_url($url);
    if(!empty($parts['path'])) {
      $parts['path'] = '/'.str_replace('/','_',substr($parts['path'],1));
      $url = \build_url($parts);
    }

    $filename = $this->_testDataPath.preg_replace('/https?:\/\//', '', $url);
    if(!file_exists($filename)) {
      $filename = $this->_testDataPath.'404.response.txt';
    }
    $response = file_get_contents($filename);

    // Split only at the first blank line so blank lines inside the body are preserved
    $split = explode("\r\n\r\n", $response, 2);
    if(count($split) < 2) {
      throw new \Exception("Invalid file contents in test data, check that newlines are CRLF: $url");
    }
    list($headers, $body) = $split;

    // Stays null when the file does not start with an HTTP/1.1 status line
    $code = null;
    if(preg_match('/HTTP\/1\.1 (\d+)/', $headers, $match)) {
      $code = $match[1];
    }
    $headers = preg_replace('/HTTP\/1\.1 \d+ .+/', '', $headers);
    $parsedHeaders = self::parse_headers($headers);

    if(array_key_exists('Location', $parsedHeaders)) {
      $effectiveUrl = \mf2\resolveUrl($url, $parsedHeaders['Location']);
      if($this->_redirects_remaining > 0) {
        $this->_redirects_remaining--;
        return $this->_read_file($effectiveUrl);
      } else {
        return [
          'code' => 0,
          'headers' => $parsedHeaders,
          'body' => $body,
          'error' => 'too_many_redirects',
          'error_description' => '',
          'url' => $effectiveUrl
        ];
      }
    } else {
      $effectiveUrl = $url;
    }

    return array(
      'code' => $code,
      'headers' => $parsedHeaders,
      'body' => $body,
      // A fixture can simulate a transport error through an X-Test-Error header
      'error' => (isset($parsedHeaders['X-Test-Error']) ? $parsedHeaders['X-Test-Error'] : ''),
      'error_description' => '',
      'url' => $effectiveUrl
    );
  }

}
@ -0,0 +1,42 @@ | |||
<?php | |||
namespace p3k; | |||
/**
 * Library entry point: fetches and/or parses a URL and returns the
 * structured result produced by XRay\Parser.
 */
class XRay {
  public $http;

  public function __construct() {
    $this->http = new HTTP();
  }

  /**
   * Returns whatever XRay\Rels::parse() produces for the URL.
   */
  public function rels($url, $opts=[]) {
    return (new XRay\Rels($this->http))->parse($url, $opts);
  }

  /**
   * Parses a document.
   *
   * Two call styles are accepted:
   *   parse($url)  or  parse($url, $opts)   - the URL is fetched first
   *   parse($url, $body)  or  parse($url, $body, $opts) - $body is parsed as given
   *
   * A fetch error is returned unchanged. Otherwise the parser result is
   * returned with 'url' and 'code' added, and with 'original' removed unless
   * the 'include_original' option is truthy.
   */
  public function parse($url, $opts_or_body=false, $opts_for_body=[]) {
    $body_was_given = $opts_or_body && !is_array($opts_or_body);

    if($body_was_given) {
      $body = $opts_or_body;
      $opts = $opts_for_body;
      $code = null;
    } else {
      $fetcher = new XRay\Fetcher($this->http);
      $fetched = $fetcher->fetch($url, $opts_or_body);
      if(!empty($fetched['error'])) {
        return $fetched;
      }
      $body = $fetched['body'];
      $url = $fetched['url'];
      $code = $fetched['code'];
      $opts = is_array($opts_or_body) ? $opts_or_body : $opts_for_body;
    }

    $parser = new XRay\Parser($this->http);
    $result = $parser->parse($body, $url, $opts);

    if(empty($opts['include_original'])) {
      unset($result['original']);
    }

    $result['url'] = $url;
    // A code reported by the parser takes precedence over the fetch code
    if(!isset($result['code'])) {
      $result['code'] = $code;
    }

    return $result;
  }
}
@ -0,0 +1,169 @@ | |||
<?php | |||
namespace p3k\XRay; | |||
/**
 * Retrieves the raw document for a URL.
 *
 * Twitter and GitHub URLs are delegated to their API-based fetchers; every
 * other URL is requested over HTTP. A successful fetch returns
 * ['url' => ..., 'body' => ..., 'code' => ...]; failures return an array
 * containing 'error' and 'error_description'.
 */
class Fetcher {
  private $http;

  /**
   * @param object $http HTTP client used for requests
   */
  public function __construct($http) {
    $this->http = $http;
  }

  /**
   * Fetches the given URL.
   *
   * Options read here: timeout, max_redirects, token (sent as a Bearer
   * token), plus the Twitter and GitHub credentials used by the delegated
   * fetchers.
   *
   * @param string      $url
   * @param array|false $opts
   * @return array
   */
  public function fetch($url, $opts=[]) {
    if($opts == false) $opts = [];

    if(isset($opts['timeout']))
      $this->http->set_timeout($opts['timeout']);
    if(isset($opts['max_redirects']))
      $this->http->set_max_redirects($opts['max_redirects']);

    // Attempt some basic URL validation
    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return [
        'error_code' => 400,
        'error' => 'invalid_url',
        'error_description' => 'Only http and https URLs are supported'
      ];
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return [
        'error_code' => 400,
        'error' => 'invalid_url',
        'error_description' => 'The URL provided was not valid'
      ];
    }

    $url = normalize_url($url);

    // Check if this is a Twitter URL and use the API
    if(Formats\Twitter::matches_host($url)) {
      return $this->_fetch_tweet($url, $opts);
    }

    // Transform the HTML GitHub URL into an GitHub API request and fetch the API response
    if(Formats\GitHub::matches_host($url)) {
      return $this->_fetch_github($url, $opts);
    }

    // All other URLs are fetched normally

    // Special-case appspot.com URLs to not follow redirects.
    // https://cloud.google.com/appengine/docs/php/urlfetch/
    if(!should_follow_redirects($url)) {
      $this->http->set_max_redirects(0);
      $this->http->set_transport(new \p3k\HTTP\Stream());
    } else {
      $this->http->set_transport(new \p3k\HTTP\Curl());
    }

    $headers = [];
    if(isset($opts['token']))
      $headers[] = 'Authorization: Bearer ' . $opts['token'];

    $result = $this->http->get($url, $headers);

    if($result['error']) {
      return [
        'error' => $result['error'],
        'error_description' => $result['error_description'],
        'url' => $result['url'],
        'code' => $result['code'],
      ];
    }

    if(trim($result['body']) == '') {
      if($result['code'] == 410) {
        // 410 Gone responses are valid and should not return an error.
        // Return the regular success shape with the empty body.
        return [
          'url' => $result['url'],
          'body' => $result['body'],
          'code' => $result['code'],
        ];
      }

      return [
        'error' => 'no_content',
        'error_description' => 'We did not get a response body when fetching the URL',
        'url' => $result['url'],
        'code' => $result['code']
      ];
    }

    // Check for HTTP 401/403
    if($result['code'] == 401) {
      return [
        'error' => 'unauthorized',
        'error_description' => 'The URL returned "HTTP 401 Unauthorized"',
        'url' => $result['url'],
        'code' => $result['code']
      ];
    }
    if($result['code'] == 403) {
      return [
        'error' => 'forbidden',
        'error_description' => 'The URL returned "HTTP 403 Forbidden"',
        'url' => $result['url'],
        'code' => $result['code']
      ];
    }

    // If the original URL had a fragment, include it in the final URL
    if(($fragment=parse_url($url, PHP_URL_FRAGMENT)) && !parse_url($result['url'], PHP_URL_FRAGMENT)) {
      $result['url'] .= '#'.$fragment;
    }

    return [
      'url' => $result['url'],
      'body' => $result['body'],
      'code' => $result['code'],
    ];
  }

  // Fetches a tweet through Formats\Twitter::fetch(); all four credentials are required
  private function _fetch_tweet($url, $opts) {
    $fields = ['twitter_api_key','twitter_api_secret','twitter_access_token','twitter_access_token_secret'];
    $creds = $this->_pick($opts, $fields);

    if(count($creds) < 4) {
      return [
        'error_code' => 400,
        'error' => 'missing_parameters',
        'error_description' => 'All 4 Twitter credentials must be included in the request'
      ];
    }

    $tweet = Formats\Twitter::fetch($url, $creds);
    if(!$tweet) {
      // No exception object is available at this point, so report a fixed message
      return [
        'error' => 'twitter_error',
        'error_description' => 'The tweet could not be retrieved from the Twitter API'
      ];
    }

    return [
      'url' => $url,
      'body' => $tweet,
      'code' => 200,
    ];
  }

  // Fetches a GitHub URL through Formats\GitHub::fetch(); the access token is optional
  private function _fetch_github($url, $opts) {
    $creds = $this->_pick($opts, ['github_access_token']);
    return Formats\GitHub::fetch($this->http, $url, $creds);
  }

  // Returns the subset of $opts whose keys are listed in $fields and are set
  private function _pick($opts, $fields) {
    $picked = [];
    foreach($fields as $f) {
      if(isset($opts[$f]))
        $picked[$f] = $opts[$f];
    }
    return $picked;
  }

}
@ -0,0 +1,36 @@ | |||
<?php | |||
namespace p3k\XRay\Formats; | |||
use DOMDocument, DOMXPath; | |||
/**
 * Contract implemented by every format handler.
 */
interface iFormat {

  // Whether the handler is responsible for the host of the given URL
  public static function matches_host($url);

  // Whether the handler supports the given URL
  public static function matches($url);

}

/**
 * Base class providing helpers shared by the format handlers.
 */
abstract class Format implements iFormat {

  /**
   * Returns the result used when a document cannot be interpreted.
   *
   * @return array ['data' => ['type' => 'unknown']]
   */
  protected static function _unknown() {
    return [
      'data' => [
        'type' => 'unknown'
      ]
    ];
  }

  /**
   * Parses an HTML string into a DOM document and an XPath object for it.
   *
   * @param string $html
   * @return array [DOMDocument, DOMXPath], or [null, null] when the input is
   *               empty or could not be loaded
   */
  protected static function _loadHTML($html) {
    // Empty input cannot be parsed (DOMDocument::loadHTML rejects it outright)
    if(!is_string($html) || trim($html) === '') {
      return [null, null];
    }

    // Collect parser diagnostics internally rather than emitting warnings,
    // and restore the previous setting afterwards
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    // The return value of loadHTML() signals failure; $doc itself is always an object
    if(!$loaded) {
      return [null, null];
    }

    $xpath = new DOMXPath($doc);

    return [$doc, $xpath];
  }

}
@ -0,0 +1,166 @@ | |||
<?php | |||
namespace p3k\XRay\Formats; | |||
use DateTime, DateTimeZone; | |||
use Config; | |||
use cebe\markdown\GithubMarkdown; | |||
/**
 * Format handler for GitHub repositories, issues, pull requests and issue
 * comments. Documents are fetched from the GitHub API and converted into
 * XRay's entry/repo structure.
 */
class GitHub extends Format {

  /**
   * @param string $url
   * @return bool True when the URL's host is github.com
   */
  public static function matches_host($url) {
    $host = parse_url($url, PHP_URL_HOST);
    return $host == 'github.com';
  }

  /**
   * @param string $url
   * @return bool True when the URL is one of the supported GitHub URL forms
   */
  public static function matches($url) {
    // Reuse the URL patterns in extract_url_parts() so the two cannot drift apart
    return self::extract_url_parts($url) !== false;
  }

  // Splits a supported GitHub URL into type, org, repo, the type-specific
  // identifiers and the matching API URL. Returns false for any other URL.
  private static function extract_url_parts($url) {
    $api = 'https://api.github.com/repos/';

    if(preg_match('~https://github.com/([^/]+)/([^/]+)/pull/(\d+)$~', $url, $match)) {
      return [
        'type' => 'pull',
        'org' => $match[1],
        'repo' => $match[2],
        'pull' => $match[3],
        'apiurl' => $api.$match[1].'/'.$match[2].'/pulls/'.$match[3],
      ];
    }

    if(preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)$~', $url, $match)) {
      return [
        'type' => 'issue',
        'org' => $match[1],
        'repo' => $match[2],
        'issue' => $match[3],
        'apiurl' => $api.$match[1].'/'.$match[2].'/issues/'.$match[3],
      ];
    }

    if(preg_match('~https://github.com/([^/]+)/([^/]+)$~', $url, $match)) {
      return [
        'type' => 'repo',
        'org' => $match[1],
        'repo' => $match[2],
        'apiurl' => $api.$match[1].'/'.$match[2],
      ];
    }

    if(preg_match('~https://github.com/([^/]+)/([^/]+)/issues/(\d+)#issuecomment-(\d+)~', $url, $match)) {
      return [
        'type' => 'comment',
        'org' => $match[1],
        'repo' => $match[2],
        'issue' => $match[3],
        'comment' => $match[4],
        'apiurl' => $api.$match[1].'/'.$match[2].'/issues/comments/'.$match[4],
      ];
    }

    return false;
  }

  /**
   * Requests the GitHub API document that corresponds to the URL.
   *
   * @param object $http  HTTP client exposing get($url, $headers)
   * @param string $url   The github.com URL
   * @param array  $creds May contain 'github_access_token', sent as a Bearer token
   * @return array ['url', 'body', 'code'] on success, otherwise an array with
   *               'error' and 'error_description'
   */
  public static function fetch($http, $url, $creds) {
    $parts = self::extract_url_parts($url);

    if(!$parts) {
      return [
        'error' => 'unsupported_url',
        'error_description' => 'This GitHub URL is not supported',
        'error_code' => 400,
      ];
    }

    $headers = [];
    if(isset($creds['github_access_token'])) {
      $headers[] = 'Authorization: Bearer ' . $creds['github_access_token'];
    }

    $response = $http->get($parts['apiurl'], $headers);
    if($response['code'] != 200) {
      return [
        'error' => 'github_error',
        'error_description' => $response['body'],
        'code' => $response['code'],
      ];
    }

    return [
      'url' => $url,
      'body' => $response['body'],
      'code' => $response['code'],
    ];
  }

  /**
   * Converts a GitHub API response into XRay's structure.
   *
   * Fields missing from the JSON are reported as null (or an empty title)
   * rather than raising notices.
   *
   * @param string $json GitHub API response body
   * @param string $url  The github.com URL the JSON belongs to
   * @return array ['data' => entry, 'original' => $json], or the "unknown"
   *               result when the JSON or the URL cannot be used
   */
  public static function parse($json, $url) {
    $data = json_decode($json, true);

    // Anything other than a non-empty JSON object/array cannot describe an entry
    if(!$data || !is_array($data))
      return self::_unknown();

    $parts = self::extract_url_parts($url);
    if(!$parts)
      return self::_unknown();

    $type = $parts['type'];

    // Start building the h-entry
    $entry = array(
      'type' => ($type == 'repo' ? 'repo' : 'entry'),
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    );

    // Repositories describe their author under "owner", everything else under "user"
    $authorkey = ($type == 'repo') ? 'owner' : 'user';
    $author = (isset($data[$authorkey]) && is_array($data[$authorkey])) ? $data[$authorkey] : [];

    foreach(['name' => 'login', 'photo' => 'avatar_url', 'url' => 'html_url'] as $prop => $field) {
      $entry['author'][$prop] = isset($author[$field]) ? $author[$field] : null;
    }

    $title = isset($data['title']) ? $data['title'] : '';

    if($type == 'pull') {
      $entry['name'] = '#' . $parts['pull'] . ' ' . $title;
    } elseif($type == 'issue') {
      $entry['name'] = '#' . $parts['issue'] . ' ' . $title;
    } elseif($type == 'repo') {
      $entry['name'] = isset($data['name']) ? $data['name'] : null;
    }

    if($type == 'repo') {
      if(!empty($data['description']))
        $entry['summary'] = $data['description'];
    }

    if($type != 'repo' && !empty($data['body'])) {
      $parser = new GithubMarkdown();
      $entry['content'] = [
        'text' => $data['body'],
        'html' => $parser->parse($data['body'])
      ];
    }

    if($type == 'comment') {
      $entry['in-reply-to'] = ['https://github.com/'.$parts['org'].'/'.$parts['repo'].'/issues/'.$parts['issue']];
    }

    if(!empty($data['labels'])) {
      $entry['category'] = array_map(function($l){
        return $l['name'];
      }, $data['labels']);
    }

    $entry['published'] = isset($data['created_at']) ? $data['created_at'] : null;

    return [
      'data' => $entry,
      'original' => $json
    ];
  }

}
@ -0,0 +1,132 @@ | |||
<?php | |||
namespace p3k\XRay\Formats; | |||
use HTMLPurifier, HTMLPurifier_Config; | |||
use DOMDocument, DOMXPath; | |||
use p3k\XRay\Formats; | |||
/**
 * Catch-all format: accepts every URL and extracts Microformats2 data from
 * an HTML document.
 */
class HTML extends Format {

  // Element/attribute pairs that are searched when verifying that the
  // document links to the requested target URL
  private static $linkAttributes = [
    'a' => 'href',
    'img' => 'src',
    'video' => 'src',
    'audio' => 'src',
  ];

  // This format matches any host and any URL
  public static function matches_host($url) { return true; }
  public static function matches($url) { return true; }

  /**
   * Parses an HTML document and returns the Microformats2 data found in it.
   *
   * @param mixed  $http HTTP client; only handed through to Formats\Mf2::parse()
   * @param string $html The document body
   * @param string $url  The URL of the document; a fragment restricts parsing
   *                     to the element with that id when such an element exists
   * @param array  $opts Optional settings. 'target': a URL that must be
   *                     referenced by an a[href], img[src], video[src] or
   *                     audio[src] attribute in the document.
   * @return array Either an array with 'error' and 'error_description' keys,
   *               or a result with 'data' and 'url' plus, when available,
   *               'code', 'info' and 'original'.
   */
  public static function parse($http, $html, $url, $opts=[]) {
    $result = [
      'data' => [
        'type' => 'unknown',
      ],
      'url' => $url,
    ];

    // attempt to parse the page as HTML
    $doc = new DOMDocument();
    $encoded = self::toHtmlEntities($html);
    // A DOMDocument object is always truthy, so the outcome of loadHTML()
    // has to be checked instead. Empty input is rejected up front because
    // loadHTML() refuses it (and throws on PHP 8).
    if(!is_string($encoded) || $encoded === '' || !@$doc->loadHTML($encoded)) {
      return [
        'error' => 'invalid_content',
        'error_description' => 'The document could not be parsed as HTML'
      ];
    }

    $xpath = new DOMXPath($doc);

    // Check for meta http equiv and replace the status code if present
    foreach($xpath->query('//meta[translate(@http-equiv,\'STATUS\',\'status\')=\'status\']') as $el) {
      $equivStatus = ''.$el->getAttribute('content');
      if($equivStatus && preg_match('/^(\d+)/', $equivStatus, $match)) {
        $result['code'] = (int)$match[1];
      }
    }

    // If a target parameter was provided, make sure a link to it exists on the page
    if(isset($opts['target'])) {
      $target = $opts['target'];
      if(!$target || !self::documentLinksTo($xpath, $target)) {
        return [
          'error' => 'no_link_found',
          'error_description' => 'The source document does not have a link to the target URL',
          'code' => isset($result['code']) ? $result['code'] : 200,
          'url' => $url
        ];
      }
    }

    // If the URL has a fragment ID, find the DOM starting at that node and parse it instead
    $fragment = parse_url($url, PHP_URL_FRAGMENT);
    $foundFragment = false;
    if($fragment) {
      $fragElement = self::xPathGetElementById($xpath, $fragment);
      if($fragElement) {
        $html = $doc->saveHTML($fragElement);
        $foundFragment = true;
      }
    }

    // Now start pulling in the data from the page. Start by looking for microformats2
    $mf2 = \mf2\Parse($html, $url);

    if($mf2 && count($mf2['items']) > 0) {
      $data = Formats\Mf2::parse($mf2, $url, $http);
      // Only merge when the Mf2 parser produced an array, so the default
      // result is never clobbered by a non-array return value
      if($data && is_array($data)) {
        $result = array_merge($result, $data);
        if($fragment) {
          $result['info'] = [
            'found_fragment' => $foundFragment
          ];
        }
        $result['original'] = $html;
        $result['url'] = $url; // this will be the effective URL after following redirects
      }
    }

    return $result;
  }

  /**
   * Converts the input to its HTML-entity representation so that
   * DOMDocument::loadHTML() receives non-ASCII characters as entities.
   *
   * NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated as of
   * PHP 8.2; it is kept here to preserve the existing output.
   *
   * @param string $input
   * @return string|false The converted text (false if the conversion fails)
   */
  private static function toHtmlEntities($input) {
    $input = (string)$input;
    // mb_detect_encoding() returns false when no candidate encoding fits;
    // fall back to UTF-8 rather than passing false on as an encoding name
    $encoding = mb_detect_encoding($input);
    if($encoding === false) {
      $encoding = 'UTF-8';
    }
    return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
  }

  /**
   * Reports whether any of the link-bearing attributes in the document is
   * exactly equal to the target URL.
   *
   * @param DOMXPath $xpath
   * @param string   $target
   * @return bool
   */
  private static function documentLinksTo($xpath, $target) {
    $found = false;
    foreach(self::$linkAttributes as $node => $attr) {
      self::xPathFindNodeWithAttribute($xpath, $node, $attr, function($u) use($target, &$found){
        if($u === $target) {
          $found = true;
        }
      });
      if($found) {
        return true;
      }
    }
    return false;
  }

  /**
   * Invokes the callback with the attribute value of every $node element
   * that carries the $attr attribute.
   *
   * @param DOMXPath $xpath
   * @param string   $node     Element name
   * @param string   $attr     Attribute name
   * @param callable $callback Receives the attribute value
   */
  private static function xPathFindNodeWithAttribute($xpath, $node, $attr, $callback) {
    foreach($xpath->query('//'.$node.'[@'.$attr.']') as $el) {
      $v = $el->getAttribute($attr);
      $callback($v);
    }
  }

  /**
   * Returns the last element in the document whose id attribute equals $id,
   * or null when there is none.
   *
   * The id comes from the URL fragment, so it is compared in PHP instead of
   * being interpolated into the XPath expression, where a quote character
   * would break the query.
   *
   * @param DOMXPath $xpath
   * @param string   $id
   * @return DOMElement|null
   */
  private static function xPathGetElementById($xpath, $id) {
    $id = (string)$id;
    $element = null;
    foreach($xpath->query('//*[@id]') as $el) {
      if($el->getAttribute('id') === $id) {
        $element = $el;
      }
    }
    return $element;
  }

}
@ -1,5 +1,5 @@ | |||
<?php | |||
namespace XRay\Formats; | |||
namespace p3k\XRay\Formats; | |||
/** | |||
* Allows Microformats2 classes but rejects any others |
@ -0,0 +1,41 @@ | |||
<?php | |||
namespace p3k\XRay; | |||
use p3k\XRay\Formats; | |||
/**
 * Hands an already-fetched document to the first format parser that claims
 * its URL, falling back to the generic HTML parser.
 */
class Parser {
  private $http;

  /**
   * @param mixed $http HTTP client; this class calls set_timeout() and
   *                    set_max_redirects() on it and passes it to the
   *                    Instagram and HTML parsers
   */
  public function __construct($http) {
    $this->http = $http;
  }

  /**
   * @param string $body The document body
   * @param string $url  The URL the body belongs to
   * @param array  $opts Optional 'timeout' and 'max_redirects' values for the
   *                     HTTP client; the whole array is also forwarded to the
   *                     HTML parser
   * @return mixed Whatever the selected format parser returns
   */
  public function parse($body, $url, $opts=[]) {
    // Forward the recognised options to the HTTP client
    $setters = [
      'timeout' => 'set_timeout',
      'max_redirects' => 'set_max_redirects',
    ];
    foreach($setters as $option => $setter) {
      if(isset($opts[$option])) {
        $this->http->{$setter}($opts[$option]);
      }
    }

    // Instagram is checked first; it is the only special parser that is
    // given the HTTP client
    if(Formats\Instagram::matches($url)) {
      return Formats\Instagram::parse($this->http, $body, $url);
    }

    // The remaining special parsers share one signature and are tried in order
    foreach(['GitHub', 'Twitter', 'XKCD'] as $format) {
      $class = 'p3k\\XRay\\Formats\\'.$format;
      if($class::matches($url)) {
        return $class::parse($body, $url);
      }
    }

    // No special parsers matched, parse for Microformats now
    return Formats\HTML::parse($this->http, $body, $url, $opts);
  }

}
@ -0,0 +1,63 @@ | |||
<?php | |||
namespace p3k\XRay; | |||
/**
 * Fetches a URL and collects its rel values from both the HTTP response and
 * the document itself.
 */
class Rels {
  private $http;

  /**
   * @param mixed $http HTTP client; this class calls set_timeout(),
   *                    set_max_redirects() and get() on it
   */
  public function __construct($http) {
    $this->http = $http;
  }

  /**
   * Fetches $url and returns every rel value found, with relative URLs
   * resolved against the URL reported by the HTTP client.
   *
   * @param string $url  An http or https URL
   * @param array  $opts Optional 'timeout' and 'max_redirects' values for the
   *                     HTTP client
   * @return array Either an array with 'error' and 'error_description' keys,
   *               or an array with 'url', 'code' and 'rels'. 'rels' maps each
   *               rel name to a list of URLs and is an empty object when no
   *               rels were found.
   */
  public function parse($url, $opts=[]) {
    if(isset($opts['timeout']))
      $this->http->set_timeout($opts['timeout']);
    if(isset($opts['max_redirects']))
      $this->http->set_max_redirects($opts['max_redirects']);

    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return [
        'error' => 'invalid_url',
        'error_description' => 'Only http and https URLs are supported'
      ];
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return [
        'error' => 'invalid_url',
        'error_description' => 'The URL provided was not valid'
      ];
    }

    $url = normalize_url($url);

    $result = $this->http->get($url);

    $html = $result['body'];
    $mf2 = \mf2\Parse($html, $result['url']);

    // Start from the rels reported by the HTTP client, tolerating a result
    // without that key
    $rels = isset($result['rels']) && is_array($result['rels']) ? $result['rels'] : [];

    // Append the document's values to each rel. array_merge() on the two
    // maps would replace the existing values whenever the document declares
    // the same rel name.
    if(isset($mf2['rels']) && is_array($mf2['rels'])) {
      foreach($mf2['rels'] as $rel=>$values) {
        $existing = isset($rels[$rel]) ? (array)$rels[$rel] : [];
        $rels[$rel] = array_merge($existing, (array)$values);
      }
    }

    // Resolve all relative URLs, then drop values that became identical
    foreach($rels as $rel=>$values) {
      $resolved = [];
      foreach((array)$values as $value) {
        $resolved[] = \mf2\resolveUrl($result['url'], $value);
      }
      $rels[$rel] = array_values(array_unique($resolved));
    }

    // Use an object so that an empty set is not an empty array
    if(count($rels) == 0)
      $rels = new \StdClass;

    return [
      'url' => $result['url'],
      'code' => $result['code'],
      'rels' => $rels
    ];
  }

}