@ -0,0 +1,14 @@ | |||||
<?php | |||||
use Symfony\Component\HttpFoundation\Request; | |||||
use Symfony\Component\HttpFoundation\Response; | |||||
/**
 * Controller for the site's landing page.
 */
class Main {

  /**
   * Renders the 'index' view and stores the result as the response content.
   *
   * @param Request  $request  Incoming request; not read by this action.
   * @param Response $response Response object to fill in.
   * @return Response The response object that was passed in.
   */
  public function index(Request $request, Response $response) {
    $templateVars = [
      'title' => 'Percolator'
    ];
    $html = view('index', $templateVars);
    $response->setContent($html);
    return $response;
  }

}
@ -0,0 +1,113 @@ | |||||
<?php | |||||
use Symfony\Component\HttpFoundation\Request; | |||||
use Symfony\Component\HttpFoundation\Response; | |||||
/**
 * JSON endpoint that fetches a remote page and, optionally, verifies that the
 * page contains a link to a given target URL.
 */
class Parse {

  /**
   * HTTP client used to fetch pages. Public so it can be swapped out after
   * construction; the constructor installs a p3k\HTTP instance.
   */
  public $http;

  public function __construct() {
    $this->http = new p3k\HTTP();
  }

  /**
   * Fills in the response with a status code, extra headers and a JSON body.
   *
   * @param Response $response Response object to populate.
   * @param int      $code     HTTP status code to send.
   * @param array    $params   Data that is JSON-encoded into the body.
   * @param array    $headers  Extra headers as name => value.
   * @return Response The response object that was passed in.
   */
  private function respond(Response $response, $code, $params, $headers=[]) {
    $response->setStatusCode($code);
    foreach($headers as $k=>$v) {
      $response->headers->set($k, $v);
    }
    // Set last so the caller-supplied headers cannot override the JSON content type
    $response->headers->set('Content-Type', 'application/json');
    $response->setContent(json_encode($params));
    return $response;
  }

  /**
   * Converts the input to the 'HTML-ENTITIES' encoding before it is handed to
   * DOMDocument.
   *
   * @param string $input Raw document body.
   * @return string The converted document.
   */
  private static function toHtmlEntities($input) {
    $encoding = mb_detect_encoding($input);
    if($encoding === false) {
      // Detection failed; fall back to ISO-8859-1, which can decode any byte
      // sequence, instead of passing `false` on as the source encoding.
      $encoding = 'ISO-8859-1';
    }
    return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
  }

  /**
   * Fetches the page given in the `url` request parameter and parses it as HTML.
   * If a `target` parameter is present, the page must contain an <a> element
   * whose href is exactly that value.
   *
   * Every failure is answered with status 400 and a JSON object holding
   * `type`, `error` and `error_description`; success is answered with status
   * 200 and a JSON object holding the requested `url`.
   *
   * @param Request  $request  Incoming request; reads `url` and `target`.
   * @param Response $response Response object to populate.
   * @return Response The populated response.
   */
  public function parse(Request $request, Response $response) {
    $url = $request->get('url');

    if(!$url || !is_string($url)) {
      return $this->respond($response, 400, [
        'type' => 'error',
        'error' => 'missing_url',
        'error_description' => 'Provide a URL to fetch'
      ]);
    }

    // Attempt some basic URL validation
    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return $this->respond($response, 400, [
        'type' => 'error',
        'error' => 'invalid_url',
        'error_description' => 'Only http and https URLs are supported'
      ]);
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return $this->respond($response, 400, [
        'type' => 'error',
        'error' => 'invalid_url',
        'error_description' => 'The URL provided was not valid'
      ]);
    }

    // Now fetch the URL and check for any curl errors
    $result = $this->http->get($url);

    if($result['error']) {
      return $this->respond($response, 400, [
        'type' => 'error',
        'error' => $result['error'],
        'error_description' => $result['error_description']
      ]);
    }

    // Attempt to parse the page as HTML. The DOMDocument object itself is
    // always truthy, so success has to be read from loadHTML()'s return value.
    // An empty body is rejected up front because loadHTML() does not accept it.
    $body = isset($result['body']) ? (string)$result['body'] : '';
    $doc = new DOMDocument();
    // @ keeps markup warnings from real-world HTML out of the JSON response
    $loaded = ($body !== '') && @$doc->loadHTML(self::toHtmlEntities($body));

    if(!$loaded) {
      return $this->respond($response, 400, [
        'type' => 'error',
        'error' => 'invalid_content',
        'error_description' => 'The document could not be parsed as HTML'
      ]);
    }

    // If a target parameter was provided, make sure a link to it exists on the page
    $target = $request->get('target');
    if($target) {
      $xpath = new DOMXPath($doc);
      $found = false;

      foreach($xpath->query('//a[@href]') as $link) {
        // Uses its own variable so the requested $url is not overwritten
        if($link->getAttribute('href') === $target) {
          $found = true;
          break;
        }
      }

      if(!$found) {
        return $this->respond($response, 400, [
          'type' => 'error',
          'error' => 'no_link_found',
          'error_description' => 'The source document does not have a link to the target URL'
        ]);
      }
    }

    return $this->respond($response, 200, [
      'url' => $url,
    ]);
  }

}
@ -1,10 +0,0 @@ | |||||
<?php | |||||
use Symfony\Component\HttpFoundation\Request; | |||||
use Symfony\Component\HttpFoundation\Response; | |||||
// Landing page: render the 'index' view into the response body.
$router->addRoute('GET', '/', function(Request $request, Response $response) {
  $templateVars = [
    'title' => 'Percolator'
  ];
  $html = view('index', $templateVars);
  $response->setContent($html);
  return $response;
});
@ -0,0 +1,115 @@ | |||||
<?php | |||||
namespace p3k; | |||||
/**
 * Small cURL wrapper. Each request method returns an array describing the
 * response together with a normalized error identifier.
 */
class HTTP {

  /** Value passed to CURLOPT_TIMEOUT (seconds) for every request. */
  public $timeout = 3;

  /** Value passed to CURLOPT_MAXREDIRS for every request. */
  public $max_redirects = 8;

  /**
   * Performs a GET request, following redirects.
   *
   * @param string $url URL to fetch.
   * @return array Keys: code, headers, body, error, error_description, error_code.
   */
  public function get($url) {
    $ch = $this->init_handle($url);
    return self::run($ch, true);
  }

  /**
   * Performs a POST request, following redirects.
   *
   * @param string       $url     URL to post to.
   * @param string|array $body    Value handed to CURLOPT_POSTFIELDS.
   * @param array        $headers Request header lines handed to CURLOPT_HTTPHEADER.
   * @return array Keys: code, headers, body, error, error_description, error_code.
   */
  public function post($url, $body, $headers=array()) {
    // init_handle() also applies the redirect limit, which POST requests
    // previously did not have.
    $ch = $this->init_handle($url);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    return self::run($ch, true);
  }

  /**
   * Performs a HEAD request, following redirects.
   *
   * @param string $url URL to request.
   * @return array Keys: code, headers, error, error_description, error_code.
   */
  public function head($url) {
    $ch = $this->init_handle($url);
    curl_setopt($ch, CURLOPT_NOBODY, true);
    return self::run($ch, false);
  }

  /**
   * Creates a cURL handle carrying the options shared by every request method.
   */
  private function init_handle($url) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
    return $ch;
  }

  /**
   * Executes a prepared handle and assembles the result array.
   */
  private static function run($ch, $with_body) {
    $response = curl_exec($ch);
    if(!is_string($response)) {
      // curl_exec() reports failure with false; normalize it so the string
      // functions below always receive a string.
      $response = '';
    }

    $result = array(
      'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
    );

    if($with_body) {
      // CURLOPT_HEADER puts the headers in front of the body; split at the reported size
      $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
      $result['headers'] = self::parse_headers(trim((string)substr($response, 0, $header_size)));
      $result['body'] = (string)substr($response, $header_size);
    } else {
      $result['headers'] = self::parse_headers(trim($response));
    }

    $errno = curl_errno($ch);
    $result['error'] = self::error_string_from_code($errno);
    $result['error_description'] = curl_error($ch);
    $result['error_code'] = $errno;

    return $result;
  }

  /**
   * Maps a cURL error number to a short error identifier.
   *
   * @param int $code Value returned by curl_errno().
   * @return string Empty string for 0, 'unknown' for any code not listed.
   */
  public static function error_string_from_code($code) {
    switch($code) {
      case 0:
        return '';
      case CURLE_COULDNT_RESOLVE_HOST:
        return 'dns_error';
      case CURLE_COULDNT_CONNECT:
        return 'connect_error';
      case CURLE_OPERATION_TIMEDOUT:
        return 'timeout';
      case CURLE_SSL_CONNECT_ERROR:
        return 'ssl_error';
      case CURLE_SSL_CERTPROBLEM:
        return 'ssl_cert_error';
      case CURLE_SSL_CIPHER:
        return 'ssl_unsupported_cipher';
      case CURLE_SSL_CACERT:
        return 'ssl_cert_error';
      case CURLE_TOO_MANY_REDIRECTS:
        return 'too_many_redirects';
      default:
        return 'unknown';
    }
  }

  /**
   * Parses a raw header block into an array keyed by header name.
   *
   * Names are normalized to the form "Content-Type". A header that occurs more
   * than once yields an array of its values in order of appearance. Lines that
   * do not contain "name: value" (such as the status line) are skipped.
   *
   * @param string $headers Header lines separated by CRLF.
   * @return array Header name => string value, or array of string values.
   */
  public static function parse_headers($headers) {
    $retVal = array();
    // Join folded (continuation) lines onto the line they belong to before splitting
    $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $headers));
    foreach($fields as $field) {
      if(!preg_match('/([^:]+): (.+)/m', $field, $match)) {
        continue;
      }
      // Uppercase the first letter and every letter after a tab, space or hyphen
      $name = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) {
        return strtoupper($m[0]);
      }, strtolower(trim($match[1])));
      // Trimmed for every occurrence so repeated headers match single ones
      $value = trim($match[2]);
      // If there's already a value set for the header name being returned, turn it into an array and add the new value
      if(isset($retVal[$name])) {
        if(!is_array($retVal[$name])) {
          $retVal[$name] = array($retVal[$name]);
        }
        $retVal[$name][] = $value;
      } else {
        $retVal[$name] = $value;
      }
    }
    return $retVal;
  }

}
@ -0,0 +1,58 @@ | |||||
<?php | |||||
namespace p3k; | |||||
/**
 * Test double for HTTP that reads canned responses from files instead of
 * making network requests.
 */
class HTTPTest extends HTTP {

  /** Directory prefix prepended to the scheme-less URL to locate a response file. */
  private $_testDataPath;

  /**
   * @param string $testDataPath Path prefix of the response files; it is
   *                             concatenated directly with the file name.
   */
  public function __construct($testDataPath) {
    $this->_testDataPath = $testDataPath;
  }

  public function get($url) {
    return $this->_read_file($url);
  }

  public function post($url, $body, $headers=array()) {
    return $this->_read_file($url);
  }

  public function head($url) {
    $response = $this->_read_file($url);
    return array(
      'code' => $response['code'],
      'headers' => $response['headers'],
      'error' => '',
      'error_description' => '',
      'error_code' => 0
    );
  }

  /**
   * Loads the canned response for a URL.
   *
   * The file name is the URL with its http:// or https:// prefix removed; when
   * no such file exists, '404.response.txt' is used instead.
   *
   * @param string $url URL being "requested".
   * @return array Keys: code, headers, body, error, error_description, error_code.
   * @throws \Exception When the file cannot be read or is not a valid response.
   */
  private function _read_file($url) {
    $filename = $this->_testDataPath.preg_replace('/https?:\/\//', '', $url);
    if(!file_exists($filename)) {
      $filename = $this->_testDataPath.'404.response.txt';
    }

    $response = file_get_contents($filename);
    if($response === false) {
      throw new \Exception("Unable to read test data file for: $url");
    }

    // Split at the first blank line only, so a body may itself contain blank lines
    $split = explode("\r\n\r\n", $response, 2);
    if(count($split) != 2) {
      throw new \Exception("Invalid file contents in test data, check that newlines are CRLF: $url");
    }
    list($headers, $body) = $split;

    if(!preg_match('/HTTP\/1\.1 (\d+)/', $headers, $match)) {
      throw new \Exception("Missing HTTP/1.1 status line in test data: $url");
    }
    // Integer, like the status code the real client reads from cURL
    $code = (int)$match[1];

    $headers = preg_replace('/HTTP\/1\.1 \d+ .+/', '', $headers);

    return array(
      'code' => $code,
      'headers' => self::parse_headers($headers),
      'body' => $body,
      'error' => '',
      'error_description' => '',
      'error_code' => 0
    );
  }

}
@ -0,0 +1 @@ | |||||
extension = "curl.so" |
@ -0,0 +1,33 @@ | |||||
<?php | |||||
use Symfony\Component\HttpFoundation\Request; | |||||
use Symfony\Component\HttpFoundation\Response; | |||||
/**
 * Exercises p3k\HTTP against the live httpbin instance at nghttp2.org and
 * checks the error identifier it reports.
 */
class FetchTest extends PHPUnit_Framework_TestCase {

  private $http;

  public function setUp() {
    $this->http = new p3k\HTTP();
  }

  // A 2 second delay with a 1 second timeout must be reported as 'timeout'
  public function testTimeout() {
    $this->http->timeout = 1;
    $result = $this->http->get('https://nghttp2.org/httpbin/delay/2');
    $this->assertEquals('timeout', $result['error']);
  }

  // Three redirects with a limit of one must be reported as 'too_many_redirects'
  public function testRedirectLimit() {
    $this->http->max_redirects = 1;
    $result = $this->http->get('https://nghttp2.org/httpbin/redirect/3');
    $this->assertEquals('too_many_redirects', $result['error']);
  }

  // A plain successful request reports an empty error string
  public function testNoError() {
    $result = $this->http->get('https://nghttp2.org/httpbin/ip');
    $this->assertEquals('', $result['error']);
  }

}
@ -0,0 +1,64 @@ | |||||
<?php | |||||
use Symfony\Component\HttpFoundation\Request; | |||||
use Symfony\Component\HttpFoundation\Response; | |||||
/**
 * Tests the Parse endpoint using canned responses from the data/ directory
 * in place of real HTTP requests.
 */
class ParseTest extends PHPUnit_Framework_TestCase {

  /** Parse instance under test, with its HTTP client replaced by p3k\HTTPTest. */
  private $client;

  public function setUp() {
    $this->client = new Parse();
    $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/');
  }

  /**
   * Runs Parse::parse() with the given query parameters.
   *
   * @param array $params Query parameters for the request.
   * @return Response The response produced by the endpoint.
   */
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->parse($request, $response);
  }

  public function testMissingURL() {
    $response = $this->parse([]);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('missing_url', $data->error);
  }

  public function testInvalidURL() {
    $url = 'ftp://example.com/foo';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('invalid_url', $data->error);
  }

  public function testTargetNotFound() {
    // Must name the real fixture; a misspelled path falls back to the 404
    // fixture and the test would pass without ever reading the page's links.
    $url = 'http://source.example.com/basictest';
    $response = $this->parse(['url' => $url, 'target' => 'http://example.net']);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('no_link_found', $data->error);
  }

  public function testTargetFound() {
    $url = 'http://source.example.com/basictest';
    $response = $this->parse(['url' => $url, 'target' => 'http://target.example.com']);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body);
    // A success payload carries no `type`/`error` members, so guard the reads
    // with isset() instead of touching undefined properties.
    $this->assertFalse(isset($data->type) && $data->type == 'error');
    $this->assertFalse(isset($data->error) && $data->error == 'no_link_found');
  }

}
@ -0,0 +1,3 @@ | |||||
HTTP/1.1 404 Not Found | |||||
The page was not found. |
@ -0,0 +1,14 @@ | |||||
HTTP/1.1 200 OK | |||||
Server: Apache | |||||
Date: Wed, 09 Dec 2015 03:29:14 GMT | |||||
Content-Type: text/html; charset=utf-8 | |||||
Connection: keep-alive | |||||
<html> | |||||
<head> | |||||
<title>Test</title> | |||||
</head> | |||||
<body class="h-entry"> | |||||
<p class="e-content">This page has links to <a href="http://target.example.com">target.example.com</a> and <a href="http://target2.example.com">target2.example.com</a>.</p> | |||||
</body> | |||||
</html> |