@ -0,0 +1,14 @@ | |||
<?php | |||
use Symfony\Component\HttpFoundation\Request; | |||
use Symfony\Component\HttpFoundation\Response; | |||
/**
 * Controller for the landing page.
 */
class Main {

  /**
   * Renders the 'index' view and stores the result as the response body.
   *
   * @param Request  $request  Incoming request (not read by this action).
   * @param Response $response Response object to fill in.
   * @return Response The same response object, with its content set.
   */
  public function index(Request $request, Response $response) {
    $viewData = [
      'title' => 'Percolator'
    ];
    $html = view('index', $viewData);
    $response->setContent($html);
    return $response;
  }

}
@ -0,0 +1,113 @@ | |||
<?php | |||
use Symfony\Component\HttpFoundation\Request; | |||
use Symfony\Component\HttpFoundation\Response; | |||
/**
 * JSON endpoint that fetches a URL and, when a `target` parameter is given,
 * verifies that the fetched document contains a link to that target.
 */
class Parse {

  /**
   * HTTP client used to fetch the source URL. Public so a different
   * implementation can be assigned after construction.
   */
  public $http;

  public function __construct() {
    $this->http = new p3k\HTTP();
  }

  /**
   * Fills in the response with a status code, optional extra headers and a
   * JSON-encoded body.
   *
   * @param Response $response Response object to populate.
   * @param int      $code     HTTP status code.
   * @param mixed    $params   Data to JSON-encode as the body.
   * @param array    $headers  Extra headers (name => value).
   * @return Response The populated response.
   */
  private function respond(Response $response, $code, $params, $headers=[]) {
    $response->setStatusCode($code);
    foreach($headers as $k=>$v) {
      $response->headers->set($k, $v);
    }
    // Set last so the content type cannot be overridden by $headers
    $response->headers->set('Content-Type', 'application/json');
    $response->setContent(json_encode($params));
    return $response;
  }

  /**
   * Sends a 400 response with the error structure used by this endpoint.
   *
   * @param Response $response    Response object to populate.
   * @param string   $error       Machine-readable error code.
   * @param string   $description Human-readable explanation.
   * @return Response The populated response.
   */
  private function respondError(Response $response, $error, $description) {
    return $this->respond($response, 400, [
      'type' => 'error',
      'error' => $error,
      'error_description' => $description
    ]);
  }

  /**
   * Converts the input to HTML entities, using the detected encoding of the
   * input as the source encoding.
   *
   * @param string $input Raw document body.
   * @return string|false Converted string, or whatever mb_convert_encoding returns on failure.
   */
  private static function toHtmlEntities($input) {
    // mb_detect_encoding() returns false when it cannot decide; fall back to
    // UTF-8 instead of passing false on as the source encoding.
    $encoding = mb_detect_encoding($input) ?: 'UTF-8';
    return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
  }

  /**
   * Fetches the `url` request parameter and parses the result as HTML.
   * If a `target` parameter is present, the document must contain an
   * <a href> whose value is exactly the target.
   *
   * Responds with 400 and an error object on any failure, otherwise with
   * 200 and the requested URL.
   *
   * @param Request  $request  Request carrying `url` and optionally `target`.
   * @param Response $response Response object to populate.
   * @return Response The populated response.
   */
  public function parse(Request $request, Response $response) {
    $url = $request->get('url');

    if(!$url) {
      return $this->respondError($response, 'missing_url', 'Provide a URL to fetch');
    }

    // Attempt some basic URL validation
    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return $this->respondError($response, 'invalid_url', 'Only http and https URLs are supported');
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return $this->respondError($response, 'invalid_url', 'The URL provided was not valid');
    }

    // Now fetch the URL and check for any curl errors
    $result = $this->http->get($url);

    if($result['error']) {
      return $this->respondError($response, $result['error'], $result['error_description']);
    }

    // Attempt to parse the page as HTML. The DOMDocument object itself is
    // always truthy, so success has to be judged from loadHTML()'s return
    // value. Empty input is rejected up front because loadHTML() refuses it.
    $doc = new DOMDocument();
    $html = self::toHtmlEntities($result['body']);
    if(!is_string($html) || $html === '' || !@$doc->loadHTML($html)) {
      return $this->respondError($response, 'invalid_content', 'The document could not be parsed as HTML');
    }

    // If a target parameter was provided, make sure a link to it exists on the page
    $target = $request->get('target');
    if($target) {
      $xpath = new DOMXPath($doc);

      $found = false;
      foreach($xpath->query('//a[@href]') as $link) {
        // Use a separate variable here: $url must keep the requested URL,
        // since it is reported back in the success response below.
        if($link->getAttribute('href') === $target) {
          $found = true;
          break;
        }
      }

      if(!$found) {
        return $this->respondError($response, 'no_link_found', 'The source document does not have a link to the target URL');
      }
    }

    return $this->respond($response, 200, [
      'url' => $url,
    ]);
  }

}
@ -1,10 +0,0 @@ | |||
<?php | |||
use Symfony\Component\HttpFoundation\Request; | |||
use Symfony\Component\HttpFoundation\Response; | |||
// Landing page: render the 'index' view into the response body.
$router->addRoute('GET', '/', function(Request $request, Response $response) {
  $viewData = [
    'title' => 'Percolator'
  ];
  $html = view('index', $viewData);
  $response->setContent($html);
  return $response;
});
@ -0,0 +1,115 @@ | |||
<?php | |||
namespace p3k; | |||
/**
 * Small curl wrapper. Each request method returns an array with the keys
 * 'code', 'headers', 'error', 'error_description' and 'error_code';
 * get() and post() additionally return 'body'.
 */
class HTTP {

  /** Value passed to CURLOPT_TIMEOUT for every request. */
  public $timeout = 3;

  /** Value passed to CURLOPT_MAXREDIRS for every request. */
  public $max_redirects = 8;

  /**
   * Performs a GET request, following redirects.
   *
   * @param string $url URL to fetch.
   * @return array Result array including 'body'.
   */
  public function get($url) {
    $ch = $this->init($url);
    return $this->execute($ch, true);
  }

  /**
   * Performs a POST request, following redirects.
   *
   * @param string       $url     URL to post to.
   * @param string|array $body    Value passed to CURLOPT_POSTFIELDS.
   * @param array        $headers Request header lines for CURLOPT_HTTPHEADER.
   * @return array Result array including 'body'.
   */
  public function post($url, $body, $headers=array()) {
    $ch = $this->init($url);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    return $this->execute($ch, true);
  }

  /**
   * Performs a HEAD request, following redirects.
   *
   * @param string $url URL to request.
   * @return array Result array without a 'body' key.
   */
  public function head($url) {
    $ch = $this->init($url);
    curl_setopt($ch, CURLOPT_NOBODY, true);
    return $this->execute($ch, false);
  }

  /**
   * Creates a curl handle with the options shared by all request methods,
   * so the redirect limit and timeout are applied consistently.
   */
  private function init($url) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
    return $ch;
  }

  /**
   * Runs the prepared request, builds the result array and closes the handle.
   *
   * @param resource $ch        Prepared curl handle.
   * @param bool     $with_body Whether the response should be split into headers and body.
   */
  private function execute($ch, $with_body) {
    $response = curl_exec($ch);
    if($response === false) {
      // curl_exec() returns false on failure; the error keys below carry the details
      $response = '';
    }

    $errno = curl_errno($ch);
    $result = array(
      'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
    );

    if($with_body) {
      $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
      $result['headers'] = self::parse_headers(trim(substr($response, 0, $header_size)));
      $result['body'] = substr($response, $header_size);
    } else {
      $result['headers'] = self::parse_headers(trim($response));
    }

    $result['error'] = self::error_string_from_code($errno);
    $result['error_description'] = curl_error($ch);
    $result['error_code'] = $errno;

    // All information has been read from the handle; release it
    curl_close($ch);

    return $result;
  }

  /**
   * Maps a curl error number to a short error string.
   *
   * @param int $code curl error number.
   * @return string '' for 0, a specific string for the handled codes, 'unknown' otherwise.
   */
  public static function error_string_from_code($code) {
    switch($code) {
      case 0:
        return '';
      case CURLE_COULDNT_RESOLVE_HOST:
        return 'dns_error';
      case CURLE_COULDNT_CONNECT:
        return 'connect_error';
      case CURLE_OPERATION_TIMEDOUT:
        return 'timeout';
      case CURLE_SSL_CONNECT_ERROR:
        return 'ssl_error';
      case CURLE_SSL_CERTPROBLEM:
        return 'ssl_cert_error';
      case CURLE_SSL_CIPHER:
        return 'ssl_unsupported_cipher';
      case CURLE_SSL_CACERT:
        return 'ssl_cert_error';
      case CURLE_TOO_MANY_REDIRECTS:
        return 'too_many_redirects';
      default:
        return 'unknown';
    }
  }

  /**
   * Parses a CRLF-separated block of header lines into an array keyed by
   * header name. Names are normalised to Title-Case; a header that occurs
   * more than once yields an array of its values in order of appearance.
   * Lines that do not look like "Name: value" are ignored.
   *
   * @param string $headers Raw header text.
   * @return array Header name => string value, or array of string values.
   */
  public static function parse_headers($headers) {
    $retVal = array();
    // Join folded lines (CRLF followed by a space or tab) onto the previous line
    $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $headers));
    foreach($fields as $field) {
      if(preg_match('/([^:]+): (.+)/m', $field, $match)) {
        // Lowercase the name, then uppercase the first letter of each
        // part separated by a space, tab or hyphen
        $name = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) {
          return strtoupper($m[0]);
        }, strtolower(trim($match[1])));
        $value = trim($match[2]);
        // If there's already a value set for the header name being returned, turn it into an array and add the new value
        if(isset($retVal[$name])) {
          if(!is_array($retVal[$name]))
            $retVal[$name] = array($retVal[$name]);
          $retVal[$name][] = $value;
        } else {
          $retVal[$name] = $value;
        }
      }
    }
    return $retVal;
  }

}
@ -0,0 +1,58 @@ | |||
<?php | |||
namespace p3k; | |||
/**
 * Test double for HTTP that reads responses from files on disk instead of
 * making network requests. The file for a URL is the test data path
 * followed by the URL with its "http://" or "https://" removed.
 */
class HTTPTest extends HTTP {

  /** Directory prefix (as given to the constructor) under which response files are looked up. */
  private $_testDataPath;

  /**
   * @param string $testDataPath Prefix prepended to the file name derived from each URL.
   */
  public function __construct($testDataPath) {
    $this->_testDataPath = $testDataPath;
  }

  /** Returns the stored response for the URL. */
  public function get($url) {
    return $this->_read_file($url);
  }

  /** Returns the stored response for the URL; $body and $headers are ignored. */
  public function post($url, $body, $headers=array()) {
    return $this->_read_file($url);
  }

  /** Returns the stored response for the URL without its body. */
  public function head($url) {
    $response = $this->_read_file($url);
    return array(
      'code' => $response['code'],
      'headers' => $response['headers'],
      'error' => '',
      'error_description' => '',
      'error_code' => 0,
    );
  }

  /**
   * Loads and parses the response file for a URL, falling back to
   * '404.response.txt' when no file exists for it.
   *
   * @param string $url URL being requested.
   * @return array Result array shaped like the one returned by HTTP::get().
   * @throws \Exception When the file cannot be read or has no CRLF CRLF header/body separator.
   */
  private function _read_file($url) {
    $filename = $this->_testDataPath.preg_replace('/https?:\/\//', '', $url);
    if(!file_exists($filename)) {
      $filename = $this->_testDataPath.'404.response.txt';
    }
    $response = file_get_contents($filename);
    if($response === false) {
      throw new \Exception("Unable to read test data file: $filename");
    }

    // Split on the first blank line only, so the body may itself contain blank lines
    $split = explode("\r\n\r\n", $response, 2);
    if(count($split) != 2) {
      throw new \Exception("Invalid file contents in test data, check that newlines are CRLF: $url");
    }
    list($headers, $body) = $split;

    // Default to 0 when the status line is missing so 'code' is always defined
    $code = 0;
    if(preg_match('/HTTP\/1\.1 (\d+)/', $headers, $match)) {
      // Integer, like the value HTTP reads from curl_getinfo()
      $code = (int)$match[1];
    }

    // Drop the status line so only header fields are parsed
    $headers = preg_replace('/HTTP\/1\.1 \d+ .+/', '', $headers);

    return array(
      'code' => $code,
      'headers' => self::parse_headers($headers),
      'body' => $body,
      'error' => '',
      'error_description' => '',
      'error_code' => 0,
    );
  }

}
@ -0,0 +1 @@ | |||
extension = "curl.so" |
@ -0,0 +1,33 @@ | |||
<?php | |||
use Symfony\Component\HttpFoundation\Request; | |||
use Symfony\Component\HttpFoundation\Response; | |||
/**
 * Exercises p3k\HTTP::get() against live endpoints on nghttp2.org and checks
 * the 'error' string it reports.
 */
class FetchTest extends PHPUnit_Framework_TestCase {

  private $http;

  public function setUp() {
    $this->http = new p3k\HTTP();
  }

  // A 2 second delay with a 1 second timeout should be reported as a timeout
  public function testTimeout() {
    $this->http->timeout = 1;
    $result = $this->http->get('https://nghttp2.org/httpbin/delay/2');
    $this->assertEquals('timeout', $result['error']);
  }

  // Three redirects with a limit of one should be reported as too many redirects
  public function testRedirectLimit() {
    $this->http->max_redirects = 1;
    $result = $this->http->get('https://nghttp2.org/httpbin/redirect/3');
    $this->assertEquals('too_many_redirects', $result['error']);
  }

  // A plain successful request should report an empty error string
  public function testNoError() {
    $result = $this->http->get('https://nghttp2.org/httpbin/ip');
    $this->assertEquals('', $result['error']);
  }

}
@ -0,0 +1,64 @@ | |||
<?php | |||
use Symfony\Component\HttpFoundation\Request; | |||
use Symfony\Component\HttpFoundation\Response; | |||
/**
 * Tests for Parse::parse(), using p3k\HTTPTest so responses are read from
 * the local 'data' directory instead of the network.
 */
class ParseTest extends PHPUnit_Framework_TestCase {

  /** Parse instance under test, created in setUp(). */
  private $client;

  public function setUp() {
    $this->client = new Parse();
    $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/');
  }

  /**
   * Runs Parse::parse() with the given query parameters.
   *
   * @param array $params Query parameters for the request.
   * @return Response The response returned by the controller.
   */
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->parse($request, $response);
  }

  public function testMissingURL() {
    $response = $this->parse([]);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('missing_url', $data->error);
  }

  public function testInvalidURL() {
    $url = 'ftp://example.com/foo';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('invalid_url', $data->error);
  }

  public function testTargetNotFound() {
    // Same source document as testTargetFound, so the test covers a page
    // that has links, just not one to the requested target.
    $url = 'http://source.example.com/basictest';
    $response = $this->parse(['url' => $url, 'target' => 'http://example.net']);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('no_link_found', $data->error);
  }

  public function testTargetFound() {
    $url = 'http://source.example.com/basictest';
    $response = $this->parse(['url' => $url, 'target' => 'http://target.example.com']);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body);
    // The success body has no 'type' or 'error' keys, so read them
    // defensively rather than touching undefined properties.
    $type = isset($data->type) ? $data->type : null;
    $error = isset($data->error) ? $data->error : null;
    $this->assertNotEquals('error', $type);
    $this->assertNotEquals('no_link_found', $error);
  }

}
@ -0,0 +1,3 @@ | |||
HTTP/1.1 404 Not Found | |||
The page was not found. |
@ -0,0 +1,14 @@ | |||
HTTP/1.1 200 OK | |||
Server: Apache | |||
Date: Wed, 09 Dec 2015 03:29:14 GMT | |||
Content-Type: text/html; charset=utf-8 | |||
Connection: keep-alive | |||
<html> | |||
<head> | |||
<title>Test</title> | |||
</head> | |||
<body class="h-entry"> | |||
<p class="e-content">This page has links to <a href="http://target.example.com">target.example.com</a> and <a href="http://target2.example.com">target2.example.com</a>.</p> | |||
</body> | |||
</html> |