| @ -0,0 +1,14 @@ | |||||
| <?php | |||||
| use Symfony\Component\HttpFoundation\Request; | |||||
| use Symfony\Component\HttpFoundation\Response; | |||||
/**
 * Controller for the site's landing page.
 */
class Main {

  /**
   * Passes the result of rendering the "index" view to the response.
   *
   * @param Request  $request  Incoming request; this action does not read it.
   * @param Response $response Response object whose content is set here.
   * @return Response The same response instance that was passed in.
   */
  public function index(Request $request, Response $response) {
    $viewData = [
      'title' => 'Percolator'
    ];
    $content = view('index', $viewData);
    $response->setContent($content);

    return $response;
  }

}
| @ -0,0 +1,113 @@ | |||||
| <?php | |||||
| use Symfony\Component\HttpFoundation\Request; | |||||
| use Symfony\Component\HttpFoundation\Response; | |||||
/**
 * Fetches a URL, loads it as HTML and, when a "target" parameter is given,
 * verifies that the fetched page links to that target. Every outcome is
 * reported as a JSON response.
 */
class Parse {

  /**
   * HTTP client used to fetch the source URL. Public so that callers can
   * swap in another implementation after construction.
   */
  public $http;

  public function __construct() {
    $this->http = new p3k\HTTP();
  }

  /**
   * Fills in the response with a status code, optional headers and a JSON body.
   *
   * @param Response $response Response to populate.
   * @param int      $code     HTTP status code.
   * @param array    $params   Data that is JSON-encoded into the body.
   * @param array    $headers  Extra headers as name => value.
   * @return Response The populated response.
   */
  private function respond(Response $response, $code, $params, $headers=[]) {
    $response->setStatusCode($code);
    foreach($headers as $k=>$v) {
      $response->headers->set($k, $v);
    }
    // Set last so that a caller-supplied header cannot override the content type
    $response->headers->set('Content-Type', 'application/json');
    $response->setContent(json_encode($params));
    return $response;
  }

  /**
   * Shortcut for the 400 error payload shared by every failure path.
   *
   * @param Response $response    Response to populate.
   * @param string   $error       Machine-readable error identifier.
   * @param string   $description Human-readable explanation.
   * @return Response The populated response.
   */
  private function respondError(Response $response, $error, $description) {
    return $this->respond($response, 400, [
      'type' => 'error',
      'error' => $error,
      'error_description' => $description
    ]);
  }

  /**
   * Converts the input to its HTML-entity representation so that the
   * result handed to DOMDocument::loadHTML() is plain ASCII.
   *
   * NOTE(review): the 'HTML-ENTITIES' target encoding is deprecated as of
   * PHP 8.2; it is kept here to preserve the existing behavior.
   *
   * @param string $input Raw document body.
   * @return string The converted document.
   */
  private static function toHtmlEntities($input) {
    $encoding = mb_detect_encoding($input);
    if(!$encoding) {
      // Detection returns false when no candidate encoding matches;
      // fall back to UTF-8 rather than passing false on as an encoding name.
      $encoding = 'UTF-8';
    }
    return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
  }

  /**
   * Validates the "url" parameter, fetches it and parses the body as HTML.
   * If a "target" parameter is present, the page must contain an <a href>
   * whose value is exactly the target.
   *
   * @param Request  $request  Reads the "url" and optional "target" parameters.
   * @param Response $response Response to populate.
   * @return Response 200 with the requested URL on success, 400 with an
   *                  error payload otherwise.
   */
  public function parse(Request $request, Response $response) {
    $url = $request->get('url');

    if(!$url) {
      return $this->respondError($response, 'missing_url', 'Provide a URL to fetch');
    }

    // Attempt some basic URL validation
    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return $this->respondError($response, 'invalid_url', 'Only http and https URLs are supported');
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return $this->respondError($response, 'invalid_url', 'The URL provided was not valid');
    }

    // Now fetch the URL and check for any curl errors
    $result = $this->http->get($url);

    if($result['error']) {
      return $this->respondError($response, $result['error'], $result['error_description']);
    }

    // Attempt to parse the page as HTML. The DOMDocument object itself is
    // always truthy, so success has to be read from loadHTML()'s return value.
    // An empty body is rejected up front because loadHTML() cannot load it.
    $body = (string)$result['body'];
    $doc = new DOMDocument();
    $loaded = ($body !== '') && @$doc->loadHTML(self::toHtmlEntities($body));

    if(!$loaded) {
      return $this->respondError($response, 'invalid_content', 'The document could not be parsed as HTML');
    }

    // If a target parameter was provided, make sure a link to it exists on the page
    $target = $request->get('target');
    if($target) {
      $xpath = new DOMXPath($doc);

      $found = false;
      foreach($xpath->query('//a[@href]') as $link) {
        // Use a separate variable for the href so that $url keeps the
        // requested URL for the success response below.
        if($link->getAttribute('href') === $target) {
          $found = true;
          break;
        }
      }

      if(!$found) {
        return $this->respondError($response, 'no_link_found', 'The source document does not have a link to the target URL');
      }
    }

    return $this->respond($response, 200, [
      'url' => $url,
    ]);
  }

}
| @ -1,10 +0,0 @@ | |||||
| <?php | |||||
| use Symfony\Component\HttpFoundation\Request; | |||||
| use Symfony\Component\HttpFoundation\Response; | |||||
// Register the handler for GET requests to the site root: it sets the
// response content to the result of rendering the "index" view.
$router->addRoute('GET', '/', function(Request $request, Response $response) {
  $viewData = [
    'title' => 'Percolator'
  ];
  $content = view('index', $viewData);
  $response->setContent($content);

  return $response;
});
| @ -0,0 +1,115 @@ | |||||
| <?php | |||||
| namespace p3k; | |||||
/**
 * Small cURL-based HTTP client.
 *
 * get(), post() and head() all return an array with the keys 'code',
 * 'headers', 'error', 'error_description' and 'error_code'; get() and
 * post() additionally include 'body'.
 */
class HTTP {

  /** Value handed to CURLOPT_TIMEOUT. */
  public $timeout = 3;

  /** Value handed to CURLOPT_MAXREDIRS. */
  public $max_redirects = 8;

  /**
   * Performs a GET request, following redirects.
   *
   * @param string $url URL to fetch.
   * @return array Result array including 'body'.
   */
  public function get($url) {
    $ch = curl_init($url);
    $this->set_common_options($ch);
    return self::execute($ch, true);
  }

  /**
   * Performs a POST request, following redirects.
   *
   * @param string       $url     URL to post to.
   * @param string|array $body    Value handed to CURLOPT_POSTFIELDS.
   * @param array        $headers Value handed to CURLOPT_HTTPHEADER.
   * @return array Result array including 'body'.
   */
  public function post($url, $body, $headers=array()) {
    $ch = curl_init($url);
    // Applies the redirect limit as well, which get() and head() already enforced
    $this->set_common_options($ch);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    return self::execute($ch, true);
  }

  /**
   * Performs a HEAD request, following redirects.
   *
   * @param string $url URL to query.
   * @return array Result array without a 'body' key.
   */
  public function head($url) {
    $ch = curl_init($url);
    $this->set_common_options($ch);
    curl_setopt($ch, CURLOPT_NOBODY, true);
    return self::execute($ch, false);
  }

  /**
   * Applies the options shared by every request type to a cURL handle.
   */
  private function set_common_options($ch) {
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
  }

  /**
   * Runs a prepared cURL handle and builds the result array.
   *
   * @param resource $ch        Configured cURL handle.
   * @param bool     $with_body Whether to split the response into headers and body.
   * @return array Result array as described on the class.
   */
  private static function execute($ch, $with_body) {
    $response = curl_exec($ch);
    if(!is_string($response)) {
      // curl_exec() returns false on failure; treat that as an empty
      // response so the string functions below receive a string.
      $response = '';
    }

    $errno = curl_errno($ch);
    $result = array(
      'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
    );

    if($with_body) {
      // CURLOPT_HEADER puts the headers in front of the body; split them apart
      $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
      $result['headers'] = self::parse_headers(trim(substr($response, 0, $header_size)));
      $result['body'] = (string)substr($response, $header_size);
    } else {
      $result['headers'] = self::parse_headers(trim($response));
    }

    $result['error'] = self::error_string_from_code($errno);
    $result['error_description'] = curl_error($ch);
    $result['error_code'] = $errno;

    return $result;
  }

  /**
   * Maps a cURL error number to a short error identifier.
   *
   * @param int $code Value returned by curl_errno().
   * @return string Empty string for 0, 'unknown' for unmapped codes.
   */
  public static function error_string_from_code($code) {
    switch($code) {
      case 0:
        return '';
      case CURLE_COULDNT_RESOLVE_HOST:
        return 'dns_error';
      case CURLE_COULDNT_CONNECT:
        return 'connect_error';
      case CURLE_OPERATION_TIMEDOUT:
        return 'timeout';
      case CURLE_SSL_CONNECT_ERROR:
        return 'ssl_error';
      case CURLE_SSL_CERTPROBLEM:
      case CURLE_SSL_CACERT:
        return 'ssl_cert_error';
      case CURLE_SSL_CIPHER:
        return 'ssl_unsupported_cipher';
      case CURLE_TOO_MANY_REDIRECTS:
        return 'too_many_redirects';
      default:
        return 'unknown';
    }
  }

  /**
   * Parses a raw CRLF-separated header block into an array keyed by
   * header name in Title-Case form (e.g. "content-type" => "Content-Type").
   *
   * Lines without a "Name: value" shape, such as the status line, are skipped.
   * A header that appears more than once yields an array of its values.
   *
   * @param string $headers Raw header text.
   * @return array Header name => trimmed value, or list of trimmed values.
   */
  public static function parse_headers($headers) {
    $retVal = array();
    // Join folded continuation lines (CRLF followed by space or tab) onto the previous line
    $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $headers));
    foreach($fields as $field) {
      if(!preg_match('/([^:]+): (.+)/m', $field, $match)) {
        continue;
      }

      // Lowercase the name, then uppercase the first letter of each word
      $name = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) {
        return strtoupper($m[0]);
      }, strtolower(trim($match[1])));
      $value = trim($match[2]);

      // If there's already a value set for the header name being returned, turn it into an array and add the new value
      if(isset($retVal[$name])) {
        if(!is_array($retVal[$name])) {
          $retVal[$name] = array($retVal[$name]);
        }
        $retVal[$name][] = $value;
      } else {
        $retVal[$name] = $value;
      }
    }
    return $retVal;
  }

}
| @ -0,0 +1,58 @@ | |||||
| <?php | |||||
| namespace p3k; | |||||
/**
 * Stand-in for HTTP that serves responses from files on disk instead of
 * making network requests. A URL maps to a file by stripping the
 * "http://" or "https://" prefix and appending the rest to the data path.
 */
class HTTPTest extends HTTP {

  /** Directory prefix prepended to the URL-derived file name. */
  private $_testDataPath;

  /**
   * @param string $testDataPath Directory prefix of the response files; it is
   *                             concatenated directly with the file name.
   */
  public function __construct($testDataPath) {
    $this->_testDataPath = $testDataPath;
  }

  /**
   * @param string $url URL whose stored response should be returned.
   * @return array Result array read from the matching file.
   */
  public function get($url) {
    return $this->_read_file($url);
  }

  /**
   * The body and headers are ignored; the stored response for $url is returned.
   *
   * @return array Result array read from the matching file.
   */
  public function post($url, $body, $headers=array()) {
    return $this->_read_file($url);
  }

  /**
   * @param string $url URL whose stored response should be returned.
   * @return array Result array without a 'body' key.
   */
  public function head($url) {
    $response = $this->_read_file($url);
    return array(
      'code' => $response['code'],
      'headers' => $response['headers'],
      'error' => '',
      'error_description' => '',
      'error_code' => 0,
    );
  }

  /**
   * Loads and parses the stored response for a URL, falling back to
   * 404.response.txt when no file matches.
   *
   * @param string $url URL to look up.
   * @return array Result array with the same keys as HTTP::get().
   * @throws \Exception When the file cannot be read or is not a valid stored response.
   */
  private function _read_file($url) {
    $filename = $this->_testDataPath.preg_replace('/https?:\/\//', '', $url);
    // is_file() rather than file_exists(), so a URL that maps to a directory gets the 404 response too
    if(!is_file($filename)) {
      $filename = $this->_testDataPath.'404.response.txt';
    }

    $response = file_get_contents($filename);
    if($response === false) {
      throw new \Exception("Unable to read test data file: $filename");
    }

    // Split on the first blank line only, so that bodies may contain blank lines
    $split = explode("\r\n\r\n", $response, 2);
    if(count($split) != 2) {
      throw new \Exception("Invalid file contents in test data, check that newlines are CRLF: $url");
    }
    list($headers, $body) = $split;

    if(!preg_match('/HTTP\/1\.1 (\d+)/', $headers, $match)) {
      throw new \Exception("Invalid file contents in test data, missing HTTP/1.1 status line: $url");
    }
    // Integer, like the value the parent class gets from curl_getinfo()
    $code = (int)$match[1];

    // Drop the status line so only header fields are handed to the parser
    $headers = preg_replace('/HTTP\/1\.1 \d+ .+/', '', $headers);

    return array(
      'code' => $code,
      'headers' => self::parse_headers($headers),
      'body' => $body,
      'error' => '',
      'error_description' => '',
      // Same key set as the parent class returns
      'error_code' => 0,
    );
  }

}
| @ -0,0 +1 @@ | |||||
| extension = "curl.so" | |||||
| @ -0,0 +1,33 @@ | |||||
| <?php | |||||
| use Symfony\Component\HttpFoundation\Request; | |||||
| use Symfony\Component\HttpFoundation\Response; | |||||
/**
 * Exercises p3k\HTTP against live httpbin endpoints, so these tests
 * need network access.
 */
class FetchTest extends PHPUnit_Framework_TestCase {

  private $http;

  // Give every test a fresh client so changed settings do not carry over
  public function setUp() {
    $this->http = new p3k\HTTP();
  }

  // A one-second timeout against an endpoint that delays for two seconds
  public function testTimeout() {
    $this->http->timeout = 1;
    $result = $this->http->get('https://nghttp2.org/httpbin/delay/2');
    $this->assertEquals('timeout', $result['error']);
  }

  // Allow a single redirect against an endpoint that redirects three times
  public function testRedirectLimit() {
    $this->http->max_redirects = 1;
    $result = $this->http->get('https://nghttp2.org/httpbin/redirect/3');
    $this->assertEquals('too_many_redirects', $result['error']);
  }

  // A plain request with default settings reports an empty error string
  public function testNoError() {
    $result = $this->http->get('https://nghttp2.org/httpbin/ip');
    $this->assertEquals('', $result['error']);
  }

}
| @ -0,0 +1,64 @@ | |||||
| <?php | |||||
| use Symfony\Component\HttpFoundation\Request; | |||||
| use Symfony\Component\HttpFoundation\Response; | |||||
/**
 * Tests Parse::parse() with the HTTP client replaced by p3k\HTTPTest, which
 * is constructed with the data/ directory next to this file.
 */
class ParseTest extends PHPUnit_Framework_TestCase {

  /** Parse instance under test. */
  private $client;

  public function setUp() {
    $this->client = new Parse();
    $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/');
  }

  /**
   * Runs the parser with the given query parameters.
   *
   * @param array $params Query parameters for the request.
   * @return Response The response produced by Parse::parse().
   */
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->parse($request, $response);
  }

  public function testMissingURL() {
    $response = $this->parse([]);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('missing_url', $data->error);
  }

  public function testInvalidURL() {
    $url = 'ftp://example.com/foo';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('invalid_url', $data->error);
  }

  public function testTargetNotFound() {
    // Same source page as testTargetFound, but with a target it does not link to
    $url = 'http://source.example.com/basictest';
    $response = $this->parse(['url' => $url, 'target' => 'http://example.net']);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('no_link_found', $data->error);
  }

  public function testTargetFound() {
    $url = 'http://source.example.com/basictest';
    $response = $this->parse(['url' => $url, 'target' => 'http://target.example.com']);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body);
    // A success payload does not carry the error fields, so read them
    // defensively instead of touching undefined properties.
    $type = isset($data->type) ? $data->type : null;
    $error = isset($data->error) ? $data->error : null;
    $this->assertNotEquals('error', $type);
    $this->assertNotEquals('no_link_found', $error);
  }

}
| @ -0,0 +1,3 @@ | |||||
| HTTP/1.1 404 Not Found | |||||
| The page was not found. | |||||
| @ -0,0 +1,14 @@ | |||||
| HTTP/1.1 200 OK | |||||
| Server: Apache | |||||
| Date: Wed, 09 Dec 2015 03:29:14 GMT | |||||
| Content-Type: text/html; charset=utf-8 | |||||
| Connection: keep-alive | |||||
| <html> | |||||
| <head> | |||||
| <title>Test</title> | |||||
| </head> | |||||
| <body class="h-entry"> | |||||
| <p class="e-content">This page has links to <a href="http://target.example.com">target.example.com</a> and <a href="http://target2.example.com">target2.example.com</a>.</p> | |||||
| </body> | |||||
| </html> | |||||