| @ -0,0 +1,14 @@ | |||
| <?php | |||
| use Symfony\Component\HttpFoundation\Request; | |||
| use Symfony\Component\HttpFoundation\Response; | |||
/**
 * Controller for the application's landing page.
 */
class Main {

    /**
     * Renders the 'index' view and stores the result as the response content.
     *
     * @param Request  $request  the incoming request (not read by this action)
     * @param Response $response the response object to fill in
     * @return Response the $response that was passed in
     */
    public function index(Request $request, Response $response) {
        $page = view('index', [
            'title' => 'Percolator'
        ]);
        $response->setContent($page);

        return $response;
    }

}
| @ -0,0 +1,113 @@ | |||
| <?php | |||
| use Symfony\Component\HttpFoundation\Request; | |||
| use Symfony\Component\HttpFoundation\Response; | |||
/**
 * JSON endpoint that fetches a URL, parses the response as HTML and, when a
 * `target` parameter is supplied, verifies that the page links to that target.
 *
 * Every outcome is reported as a JSON body; failures use HTTP 400 with the
 * keys `type` ("error"), `error` and `error_description`.
 */
class Parse {

    /**
     * HTTP client used to fetch the page. Public so that it can be swapped
     * out (the test suite assigns a p3k\HTTPTest instance here).
     */
    public $http;

    public function __construct() {
        $this->http = new p3k\HTTP();
    }

    /**
     * Fills in $response with a status code, extra headers and a JSON body.
     *
     * @param Response $response response object to populate
     * @param int      $code     HTTP status code
     * @param mixed    $params   value to JSON-encode as the body
     * @param array    $headers  additional header name => value pairs
     * @return Response the populated $response
     */
    private function respond(Response $response, $code, $params, $headers=[]) {
        $response->setStatusCode($code);
        foreach($headers as $k=>$v) {
            $response->headers->set($k, $v);
        }
        // Set last so that a caller-supplied Content-Type can never override it
        $response->headers->set('Content-Type', 'application/json');
        $response->setContent(json_encode($params));
        return $response;
    }

    /**
     * Shorthand for the 400 error document shared by all failure paths.
     *
     * @param Response $response    response object to populate
     * @param string   $error       short machine-readable error identifier
     * @param string   $description human-readable explanation
     * @return Response the populated $response
     */
    private function respondError(Response $response, $error, $description) {
        return $this->respond($response, 400, [
            'type' => 'error',
            'error' => $error,
            'error_description' => $description
        ]);
    }

    /**
     * Converts non-ASCII characters to HTML entities so that DOMDocument does
     * not have to guess the document encoding.
     *
     * NOTE(review): the 'HTML-ENTITIES' target of mb_convert_encoding() is
     * deprecated as of PHP 8.2 - revisit when the runtime is upgraded.
     *
     * @param string $input raw document body
     * @return string|false the converted document
     */
    private static function toHtmlEntities($input) {
        $encoding = mb_detect_encoding($input);
        if($encoding === false) {
            // Detection failed; assume UTF-8 instead of passing false along
            $encoding = 'UTF-8';
        }
        return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
    }

    /**
     * Handles a parse request.
     *
     * Request parameters:
     *   url    (required) http/https URL of the page to fetch
     *   target (optional) URL that must appear as the href of an <a> element
     *
     * NOTE(review): the URL is user-supplied and fetched server-side; only the
     * scheme and the presence of a host are validated here.
     *
     * @param Request  $request  incoming request
     * @param Response $response response object to populate
     * @return Response 200 with `url` on success, 400 with an error document otherwise
     */
    public function parse(Request $request, Response $response) {
        $url = $request->get('url');

        if(!$url) {
            return $this->respondError($response, 'missing_url', 'Provide a URL to fetch');
        }

        // A parameter such as url[]=x arrives as an array and cannot be parsed as a URL
        if(!is_string($url)) {
            return $this->respondError($response, 'invalid_url', 'The URL provided was not valid');
        }

        // Attempt some basic URL validation
        $scheme = parse_url($url, PHP_URL_SCHEME);
        if(!in_array($scheme, ['http','https'], true)) {
            return $this->respondError($response, 'invalid_url', 'Only http and https URLs are supported');
        }

        $host = parse_url($url, PHP_URL_HOST);
        if(!$host) {
            return $this->respondError($response, 'invalid_url', 'The URL provided was not valid');
        }

        // Now fetch the URL and check for any curl errors
        $result = $this->http->get($url);
        if($result['error']) {
            return $this->respondError($response, $result['error'], $result['error_description']);
        }

        // Attempt to parse the page as HTML. An empty document is rejected up
        // front because DOMDocument::loadHTML() does not accept an empty string.
        $body = isset($result['body']) ? (string)$result['body'] : '';
        $html = ($body === '') ? '' : self::toHtmlEntities($body);

        $doc = new DOMDocument();
        $loaded = false;
        if(is_string($html) && $html !== '') {
            // Collect libxml's complaints about sloppy markup internally
            // instead of emitting PHP warnings, then restore the old setting.
            $previous = libxml_use_internal_errors(true);
            $loaded = $doc->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if(!$loaded) {
            return $this->respondError($response, 'invalid_content', 'The document could not be parsed as HTML');
        }

        // If a target parameter was provided, make sure a link to it exists on the page
        $target = $request->get('target');
        if($target) {
            $xpath = new DOMXPath($doc);
            $found = false;

            // Use a dedicated variable for each href: $url must keep holding
            // the requested URL, which is echoed back in the success response.
            foreach($xpath->query('//a[@href]') as $link) {
                if($link->getAttribute('href') === $target) {
                    $found = true;
                    break;
                }
            }

            if(!$found) {
                return $this->respondError($response, 'no_link_found', 'The source document does not have a link to the target URL');
            }
        }

        return $this->respond($response, 200, [
            'url' => $url,
        ]);
    }

}
| @ -1,10 +0,0 @@ | |||
| <?php | |||
| use Symfony\Component\HttpFoundation\Request; | |||
| use Symfony\Component\HttpFoundation\Response; | |||
// GET / : render the 'index' view and hand it back as the response content.
$router->addRoute('GET', '/', function(Request $request, Response $response) {
    $page = view('index', [
        'title' => 'Percolator'
    ]);
    $response->setContent($page);

    return $response;
});
| @ -0,0 +1,115 @@ | |||
| <?php | |||
| namespace p3k; | |||
/**
 * Small cURL-based HTTP client.
 *
 * The request methods never throw on transfer failure; they return an array:
 *   'code'              => HTTP status code reported by cURL
 *   'headers'           => parsed response headers, see parse_headers()
 *   'body'              => response body (get() and post() only)
 *   'error'             => short error identifier, '' when cURL reported no error
 *   'error_description' => cURL's error message
 *   'error_code'        => cURL's error number (0 when there was no error)
 */
class HTTP {

    // Passed to cURL as CURLOPT_TIMEOUT
    public $timeout = 3;

    // Passed to cURL as CURLOPT_MAXREDIRS
    public $max_redirects = 8;

    /**
     * Performs a GET request, following redirects.
     *
     * @param string $url
     * @return array result array including 'body'
     */
    public function get($url) {
        $ch = curl_init($url);
        return $this->_execute($ch, true);
    }

    /**
     * Performs a POST request, following redirects.
     *
     * @param string       $url
     * @param string|array $body    passed to cURL as CURLOPT_POSTFIELDS
     * @param array        $headers passed to cURL as CURLOPT_HTTPHEADER
     * @return array result array including 'body'
     */
    public function post($url, $body, $headers=array()) {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        return $this->_execute($ch, true);
    }

    /**
     * Performs a HEAD request, following redirects.
     *
     * @param string $url
     * @return array result array without a 'body' key
     */
    public function head($url) {
        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_NOBODY, true);
        return $this->_execute($ch, false);
    }

    /**
     * Applies the options shared by every request, runs the transfer and
     * builds the result array.
     *
     * @param resource $ch       cURL handle with the method-specific options already set
     * @param bool     $has_body whether the response is split into headers and body
     * @return array
     */
    private function _execute($ch, $has_body) {
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        // Applied to every method so that POST redirects are bounded as well
        curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);

        $response = curl_exec($ch);
        if(!is_string($response)) {
            // curl_exec() returns false on failure; normalize so the string
            // functions below always operate on (and return) strings
            $response = '';
        }

        $result = array(
            'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
        );

        if($has_body) {
            // With CURLOPT_HEADER the header block precedes the body in the output
            $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $result['headers'] = self::parse_headers(trim((string)substr($response, 0, $header_size)));
            $result['body'] = (string)substr($response, $header_size);
        } else {
            $result['headers'] = self::parse_headers(trim($response));
        }

        $error_code = curl_errno($ch);
        $result['error'] = self::error_string_from_code($error_code);
        $result['error_description'] = curl_error($ch);
        $result['error_code'] = $error_code;

        return $result;
    }

    /**
     * Maps a cURL error number to a short error identifier.
     *
     * @param int $code cURL error number
     * @return string '' for 0 (no error), 'unknown' for unmapped codes
     */
    public static function error_string_from_code($code) {
        switch($code) {
            case 0:
                return '';
            case CURLE_COULDNT_RESOLVE_HOST:
                return 'dns_error';
            case CURLE_COULDNT_CONNECT:
                return 'connect_error';
            case CURLE_OPERATION_TIMEDOUT:
                return 'timeout';
            case CURLE_SSL_CONNECT_ERROR:
                return 'ssl_error';
            case CURLE_SSL_CERTPROBLEM:
            case CURLE_SSL_CACERT:
                return 'ssl_cert_error';
            case CURLE_SSL_CIPHER:
                return 'ssl_unsupported_cipher';
            case CURLE_TOO_MANY_REDIRECTS:
                return 'too_many_redirects';
            default:
                return 'unknown';
        }
    }

    /**
     * Parses a block of CRLF-separated header lines.
     *
     * Header names are normalized to Title-Case ("content-type" becomes
     * "Content-Type"). A header that occurs once maps to a string; a header
     * that occurs several times maps to an array of its values in order of
     * appearance. Lines that do not look like "Name: value" (for example the
     * status line) are skipped.
     *
     * @param string $headers raw header block
     * @return array header name => string, or array of strings for repeated headers
     */
    public static function parse_headers($headers) {
        $retVal = array();

        // Join folded (continuation) lines onto the line they belong to
        $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $headers));

        foreach($fields as $field) {
            if(!preg_match('/([^:]+): (.+)/m', $field, $match)) {
                continue;
            }

            // Upper-case the first letter of each space/tab/hyphen-separated word
            $name = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) {
                return strtoupper($m[0]);
            }, strtolower(trim($match[1])));
            $value = trim($match[2]);

            if(!isset($retVal[$name])) {
                $retVal[$name] = $value;
                continue;
            }

            // If there's already a value set for the header name being returned,
            // turn it into an array and add the new value
            if(!is_array($retVal[$name])) {
                $retVal[$name] = array($retVal[$name]);
            }
            $retVal[$name][] = $value;
        }

        return $retVal;
    }

}
| @ -0,0 +1,58 @@ | |||
| <?php | |||
| namespace p3k; | |||
/**
 * Test double for HTTP that serves canned responses from files instead of
 * touching the network.
 *
 * A URL is mapped to a file by stripping its http:// or https:// prefix and
 * appending the rest to the test data path. Each file holds a raw HTTP/1.1
 * response: status line, headers, a blank line, then the body, with CRLF
 * line endings. URLs without a matching file are served 404.response.txt.
 */
class HTTPTest extends HTTP {

    private $_testDataPath;

    /**
     * @param string $testDataPath directory prefix for the response files;
     *                             the URL remainder is appended to it verbatim
     */
    public function __construct($testDataPath) {
        $this->_testDataPath = $testDataPath;
    }

    public function get($url) {
        return $this->_read_file($url);
    }

    public function post($url, $body, $headers=array()) {
        return $this->_read_file($url);
    }

    public function head($url) {
        $response = $this->_read_file($url);
        // A HEAD result carries no body
        unset($response['body']);
        return $response;
    }

    /**
     * Loads and parses the canned response for a URL.
     *
     * @param string $url
     * @return array result array with the same keys the parent class returns
     * @throws \Exception when the file cannot be read or is not a valid HTTP/1.1 response
     */
    private function _read_file($url) {
        $filename = $this->_testDataPath.preg_replace('/https?:\/\//', '', $url);
        if(!file_exists($filename)) {
            $filename = $this->_testDataPath.'404.response.txt';
        }

        $response = file_get_contents($filename);
        if($response === false) {
            throw new \Exception("Unable to read test data file: $filename");
        }

        // Split at the FIRST blank line only, so that a body which itself
        // contains blank lines is kept intact
        $split = explode("\r\n\r\n", $response, 2);
        if(count($split) != 2) {
            throw new \Exception("Invalid file contents in test data, check that newlines are CRLF: $url");
        }
        list($headers, $body) = $split;

        if(!preg_match('/HTTP\/1\.1 (\d+)/', $headers, $match)) {
            throw new \Exception("Missing HTTP/1.1 status line in test data: $url");
        }
        // Integer, matching what the parent class reports from cURL
        $code = (int)$match[1];

        $headers = preg_replace('/HTTP\/1\.1 \d+ .+/', '', $headers);

        return array(
            'code' => $code,
            'headers' => self::parse_headers($headers),
            'body' => $body,
            'error' => '',
            'error_description' => '',
            'error_code' => 0,
        );
    }

}
| @ -0,0 +1 @@ | |||
| extension = "curl.so" | |||
| @ -0,0 +1,33 @@ | |||
| <?php | |||
| use Symfony\Component\HttpFoundation\Request; | |||
| use Symfony\Component\HttpFoundation\Response; | |||
/**
 * Checks the error reporting of p3k\HTTP::get() by requesting httpbin
 * endpoints under https://nghttp2.org/httpbin/.
 */
class FetchTest extends PHPUnit_Framework_TestCase {

    private $http;

    public function setUp() {
        $this->http = new p3k\HTTP();
    }

    /**
     * A one second timeout against the delay/2 endpoint is reported as 'timeout'.
     */
    public function testTimeout() {
        $this->http->timeout = 1;
        $result = $this->http->get('https://nghttp2.org/httpbin/delay/2');

        $this->assertEquals('timeout', $result['error']);
    }

    /**
     * Allowing one redirect against the redirect/3 endpoint is reported as
     * 'too_many_redirects'.
     */
    public function testRedirectLimit() {
        $this->http->max_redirects = 1;
        $result = $this->http->get('https://nghttp2.org/httpbin/redirect/3');

        $this->assertEquals('too_many_redirects', $result['error']);
    }

    /**
     * A request with the default settings reports an empty error string.
     */
    public function testNoError() {
        $result = $this->http->get('https://nghttp2.org/httpbin/ip');

        $this->assertEquals('', $result['error']);
    }

}
| @ -0,0 +1,64 @@ | |||
| <?php | |||
| use Symfony\Component\HttpFoundation\Request; | |||
| use Symfony\Component\HttpFoundation\Response; | |||
/**
 * Tests the Parse controller against canned HTTP responses: the controller's
 * HTTP client is replaced with p3k\HTTPTest reading from the data/ directory
 * next to this file.
 */
class ParseTest extends PHPUnit_Framework_TestCase {

    // The Parse controller under test (assigned in setUp)
    private $client;

    public function setUp() {
        $this->client = new Parse();
        $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/');
    }

    /**
     * Runs Parse::parse() with the given query parameters.
     *
     * @param array $params query parameters for the request
     * @return Response
     */
    private function parse($params) {
        $request = new Request($params);
        $response = new Response();
        return $this->client->parse($request, $response);
    }

    public function testMissingURL() {
        $response = $this->parse([]);

        $body = $response->getContent();
        $this->assertEquals(400, $response->getStatusCode());
        $data = json_decode($body);
        $this->assertEquals('error', $data->type);
        $this->assertEquals('missing_url', $data->error);
    }

    public function testInvalidURL() {
        $url = 'ftp://example.com/foo';
        $response = $this->parse(['url' => $url]);

        $body = $response->getContent();
        $this->assertEquals(400, $response->getStatusCode());
        $data = json_decode($body);
        $this->assertEquals('error', $data->type);
        $this->assertEquals('invalid_url', $data->error);
    }

    public function testTargetNotFound() {
        // Uses the real fixture, so the check runs against a page that has
        // links - just none pointing at the requested target
        $url = 'http://source.example.com/basictest';
        $response = $this->parse(['url' => $url, 'target' => 'http://example.net']);

        $body = $response->getContent();
        $this->assertEquals(400, $response->getStatusCode());
        $data = json_decode($body);
        $this->assertEquals('error', $data->type);
        $this->assertEquals('no_link_found', $data->error);
    }

    public function testTargetFound() {
        $url = 'http://source.example.com/basictest';
        $response = $this->parse(['url' => $url, 'target' => 'http://target.example.com']);

        $body = $response->getContent();
        $this->assertEquals(200, $response->getStatusCode());
        $data = json_decode($body);
        // A success document is not required to carry these keys, so guard
        // the property reads instead of triggering undefined-property notices
        $this->assertNotEquals('error', isset($data->type) ? $data->type : null);
        $this->assertNotEquals('no_link_found', isset($data->error) ? $data->error : null);
    }

}
| @ -0,0 +1,3 @@ | |||
| HTTP/1.1 404 Not Found | |||
| The page was not found. | |||
| @ -0,0 +1,14 @@ | |||
| HTTP/1.1 200 OK | |||
| Server: Apache | |||
| Date: Wed, 09 Dec 2015 03:29:14 GMT | |||
| Content-Type: text/html; charset=utf-8 | |||
| Connection: keep-alive | |||
| <html> | |||
| <head> | |||
| <title>Test</title> | |||
| </head> | |||
| <body class="h-entry"> | |||
| <p class="e-content">This page has links to <a href="http://target.example.com">target.example.com</a> and <a href="http://target2.example.com">target2.example.com</a>.</p> | |||
| </body> | |||
| </html> | |||