From 85c3ce7b3380be4d7de76743ebe51758f037fd26 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 19 Feb 2016 15:13:28 -0800 Subject: [PATCH] starting the parse function, with tests --- composer.json | 10 ++- controllers/Main.php | 14 +++ controllers/Parse.php | 113 +++++++++++++++++++++++ controllers/controllers.php | 10 --- lib/HTTP.php | 115 ++++++++++++++++++++++++ lib/HTTPTest.php | 58 ++++++++++++ php.ini | 1 + public/index.php | 20 ++++- tests/FetchTest.php | 33 +++++++ tests/ParseTest.php | 64 +++++++++++++ tests/data/404.response.txt | 3 + tests/data/source.example.com/basictest | 14 +++ 12 files changed, 441 insertions(+), 14 deletions(-) create mode 100644 controllers/Main.php create mode 100644 controllers/Parse.php delete mode 100644 controllers/controllers.php create mode 100644 lib/HTTP.php create mode 100644 lib/HTTPTest.php create mode 100644 php.ini create mode 100644 tests/FetchTest.php create mode 100644 tests/ParseTest.php create mode 100644 tests/data/404.response.txt create mode 100644 tests/data/source.example.com/basictest diff --git a/composer.json b/composer.json index df980de..d6c8533 100644 --- a/composer.json +++ b/composer.json @@ -6,7 +6,15 @@ }, "autoload": { "files": [ - "lib/helpers.php" + "lib/helpers.php", + "controllers/Main.php", + "controllers/Parse.php", + "lib/HTTP.php" + ] + }, + "autoload-dev": { + "files": [ + "lib/HTTPTest.php" ] } } diff --git a/controllers/Main.php b/controllers/Main.php new file mode 100644 index 0000000..2228b2d --- /dev/null +++ b/controllers/Main.php @@ -0,0 +1,14 @@ +setContent(view('index', [ + 'title' => 'Percolator' + ])); + return $response; + } + +} diff --git a/controllers/Parse.php b/controllers/Parse.php new file mode 100644 index 0000000..d9d1314 --- /dev/null +++ b/controllers/Parse.php @@ -0,0 +1,113 @@ +http = new p3k\HTTP(); + } + + private function respond(Response $response, $code, $params, $headers=[]) { + $response->setStatusCode($code); + foreach($headers as $k=>$v) { + $response->headers->set($k, $v); + } + $response->headers->set('Content-Type', 'application/json'); + $response->setContent(json_encode($params)); + return $response; + } + + private static function toHtmlEntities($input) { + return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input)); + } + + public function parse(Request $request, Response $response) { + + $url = $request->get('url'); + + if(!$url) { + return $this->respond($response, 400, [ + 'type' => 'error', + 'error' => 'missing_url', + 'error_description' => 'Provide a URL to fetch' + ]); + } + + // Attempt some basic URL validation + $scheme = parse_url($url, PHP_URL_SCHEME); + if(!in_array($scheme, ['http','https'])) { + return $this->respond($response, 400, [ + 'type' => 'error', + 'error' => 'invalid_url', + 'error_description' => 'Only http and https URLs are supported' + ]); + } + + $host = parse_url($url, PHP_URL_HOST); + if(!$host) { + return $this->respond($response, 400, [ + 'type' => 'error', + 'error' => 'invalid_url', + 'error_description' => 'The URL provided was not valid' + ]); + } + + // Now fetch the URL and check for any curl errors + $result = $this->http->get($url); + + if($result['error']) { + return $this->respond($response, 400, [ + 'type' => 'error', + 'error' => $result['error'], + 'error_description' => $result['error_description'] + ]); + } + + // attempt to parse the page as HTML + $doc = new DOMDocument(); + @$doc->loadHTML(self::toHtmlEntities($result['body'])); + + if(!$doc) { + return $this->respond($response, 400, [ + 'type' => 'error', + 'error' => 'invalid_content', + 'error_description' => 'The document could not be parsed as HTML' + ]); + } + + // If a target parameter was provided, make sure a link to it exists on the page + if($target=$request->get('target')) { + $xpath = new DOMXPath($doc); + + $found = []; + foreach($xpath->query('//a[@href]') as $href) { + $url = $href->getAttribute('href'); + + if($target) { + # target parameter was provided + if($url == $target) { + $found[$url] = null; + } + } + } + + if(!$found) { + return $this->respond($response, 400, [ + 'type' => 'error', + 'error' => 'no_link_found', + 'error_description' => 'The source document does not have a link to the target URL' + ]); + } + } + + + + return $this->respond($response, 200, [ + 'url' => $url, + ]); + } + +} diff --git a/controllers/controllers.php b/controllers/controllers.php deleted file mode 100644 index 2a0d2e5..0000000 --- a/controllers/controllers.php +++ /dev/null @@ -1,10 +0,0 @@ -addRoute('GET', '/', function(Request $request, Response $response) { - $response->setContent(view('index', [ - 'title' => 'Percolator' - ])); - return $response; -}); diff --git a/lib/HTTP.php b/lib/HTTP.php new file mode 100644 index 0000000..63be97e --- /dev/null +++ b/lib/HTTP.php @@ -0,0 +1,115 @@ +max_redirects); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + $response = curl_exec($ch); + $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE); + return array( + 'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE), + 'headers' => self::parse_headers(trim(substr($response, 0, $header_size))), + 'body' => substr($response, $header_size), + 'error' => self::error_string_from_code(curl_errno($ch)), + 'error_description' => curl_error($ch), + 'error_code' => curl_errno($ch), + ); + } + + public function post($url, $body, $headers=array()) { + $ch = curl_init($url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $body); + curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_HEADER, true); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + $response = curl_exec($ch); + $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE); + return array( + 'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE), + 'headers' => self::parse_headers(trim(substr($response, 0, $header_size))), + 'body' => substr($response, $header_size), + 'error' => self::error_string_from_code(curl_errno($ch)), + 'error_description' => curl_error($ch), + 'error_code' => curl_errno($ch), + ); + } + + public function head($url) { + $ch = curl_init($url); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_HEADER, true); + curl_setopt($ch, CURLOPT_NOBODY, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); + $response = curl_exec($ch); + return array( + 'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE), + 'headers' => self::parse_headers(trim($response)), + 'error' => self::error_string_from_code(curl_errno($ch)), + 'error_description' => curl_error($ch), + 'error_code' => curl_errno($ch), + ); + } + + public static function error_string_from_code($code) { + switch($code) { + case 0: + return ''; + case CURLE_COULDNT_RESOLVE_HOST: + return 'dns_error'; + case CURLE_COULDNT_CONNECT: + return 'connect_error'; + case CURLE_OPERATION_TIMEDOUT: + return 'timeout'; + case CURLE_SSL_CONNECT_ERROR: + return 'ssl_error'; + case CURLE_SSL_CERTPROBLEM: + return 'ssl_cert_error'; + case CURLE_SSL_CIPHER: + return 'ssl_unsupported_cipher'; + case CURLE_SSL_CACERT: + return 'ssl_cert_error'; + case CURLE_TOO_MANY_REDIRECTS: + return 'too_many_redirects'; + default: + return 'unknown'; + } + } + + public static function parse_headers($headers) { + $retVal = array(); + $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $headers)); + foreach($fields as $field) { + if(preg_match('/([^:]+): (.+)/m', $field, $match)) { + $match[1] = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) { + return strtoupper($m[0]); + }, strtolower(trim($match[1]))); + // If there's already a value set for the header name being returned, turn it into an array and add the new value + $match[1] = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) { + return strtoupper($m[0]); + }, strtolower(trim($match[1]))); + if(isset($retVal[$match[1]])) { + if(!is_array($retVal[$match[1]])) + $retVal[$match[1]] = array($retVal[$match[1]]); + $retVal[$match[1]][] = $match[2]; + } else { + $retVal[$match[1]] = trim($match[2]); + } + } + } + return $retVal; + } +} diff --git a/lib/HTTPTest.php b/lib/HTTPTest.php new file mode 100644 index 0000000..85337bb --- /dev/null +++ b/lib/HTTPTest.php @@ -0,0 +1,58 @@ +_testDataPath = $testDataPath; + } + + public function get($url) { + return $this->_read_file($url); + } + + public function post($url, $body, $headers=array()) { + return $this->_read_file($url); + } + + public function head($url) { + $response = $this->_read_file($url); + return array( + 'code' => $response['code'], + 'headers' => $response['headers'], + 'error' => '', + 'error_description' => '' + ); + } + + private function _read_file($url) { + $filename = $this->_testDataPath.preg_replace('/https?:\/\//', '', $url); + if(!file_exists($filename)) { + $filename = $this->_testDataPath.'404.response.txt'; + } + $response = file_get_contents($filename); + + $split = explode("\r\n\r\n", $response); + if(count($split) != 2) { + throw new \Exception("Invalid file contents in test data, check that newlines are CRLF: $url"); + } + list($headers, $body) = $split; + + if(preg_match('/HTTP\/1\.1 (\d+)/', $headers, $match)) { + $code = $match[1]; + } + + $headers = preg_replace('/HTTP\/1\.1 \d+ .+/', '', $headers); + + return array( + 'code' => $code, + 'headers' => self::parse_headers($headers), + 'body' => $body, + 'error' => '', + 'error_description' => '' + ); + } + +} diff --git a/php.ini b/php.ini new file mode 100644 index 0000000..3281396 --- /dev/null +++ b/php.ini @@ -0,0 +1 @@ +extension = "curl.so" \ No newline at end of file diff --git a/public/index.php b/public/index.php index d68e2d6..899a7bf 100644 --- a/public/index.php +++ b/public/index.php @@ -8,9 +8,23 @@ use Symfony\Component\HttpFoundation\Response; $router = new League\Route\RouteCollection; $templates = new League\Plates\Engine(dirname(__FILE__).'/../views'); -include('controllers/controllers.php'); +$router->addRoute('GET', '/', 'Main::index'); +$router->addRoute('GET', '/parse', 'Parse::parse'); $dispatcher = $router->getDispatcher(); $request = Request::createFromGlobals(); -$response = $dispatcher->dispatch($request->getMethod(), $request->getPathInfo()); -$response->send(); + +try { + $response = $dispatcher->dispatch($request->getMethod(), $request->getPathInfo()); + $response->send(); +} catch(League\Route\Http\Exception\NotFoundException $e) { + $response = new Response; + $response->setStatusCode(404); + $response->setContent("Not Found\n"); + $response->send(); +} catch(League\Route\Http\Exception\MethodNotAllowedException $e) { + $response = new Response; + $response->setStatusCode(405); + $response->setContent("Method not allowed\n"); + $response->send(); +} diff --git a/tests/FetchTest.php b/tests/FetchTest.php new file mode 100644 index 0000000..e3c2e4a --- /dev/null +++ b/tests/FetchTest.php @@ -0,0 +1,33 @@ +http = new p3k\HTTP(); + } + + public function testTimeout() { + $url = 'https://nghttp2.org/httpbin/delay/2'; + $this->http->timeout = 1; + $response = $this->http->get($url); + $this->assertEquals('timeout', $response['error']); + } + + public function testRedirectLimit() { + $url = 'https://nghttp2.org/httpbin/redirect/3'; + $this->http->max_redirects = 1; + $response = $this->http->get($url); + $this->assertEquals('too_many_redirects', $response['error']); + } + + public function testNoError() { + $url = 'https://nghttp2.org/httpbin/ip'; + $response = $this->http->get($url); + $this->assertEquals('', $response['error']); + } + +} \ No newline at end of file diff --git a/tests/ParseTest.php b/tests/ParseTest.php new file mode 100644 index 0000000..1360440 --- /dev/null +++ b/tests/ParseTest.php @@ -0,0 +1,64 @@ +client = new Parse(); + $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/'); + } + + private function parse($params) { + $request = new Request($params); + $response = new Response(); + return $this->client->parse($request, $response); + } + + public function testMissingURL() { + $response = $this->parse([]); + + $body = $response->getContent(); + $this->assertEquals(400, $response->getStatusCode()); + $data = json_decode($body); + $this->assertEquals('error', $data->type); + $this->assertEquals('missing_url', $data->error); + } + + public function testInvalidURL() { + $url = 'ftp://example.com/foo'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(400, $response->getStatusCode()); + $data = json_decode($body); + $this->assertEquals('error', $data->type); + $this->assertEquals('invalid_url', $data->error); + } + + public function testTargetNotFound() { + $url = 'http://source.example.com/baseictest'; + $response = $this->parse(['url' => $url, 'target' => 'http://example.net']); + + $body = $response->getContent(); + $this->assertEquals(400, $response->getStatusCode()); + $data = json_decode($body); + $this->assertEquals('error', $data->type); + $this->assertEquals('no_link_found', $data->error); + } + + public function testTargetFound() { + $url = 'http://source.example.com/basictest'; + $response = $this->parse(['url' => $url, 'target' => 'http://target.example.com']); + + $body = $response->getContent(); + print_r($body); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertNotEquals('error', $data->type); + $this->assertNotEquals('no_link_found', $data->error); + } + +} \ No newline at end of file diff --git a/tests/data/404.response.txt b/tests/data/404.response.txt new file mode 100644 index 0000000..f0bd083 --- /dev/null +++ b/tests/data/404.response.txt @@ -0,0 +1,3 @@ +HTTP/1.1 404 Not Found + +The page was not found. diff --git a/tests/data/source.example.com/basictest b/tests/data/source.example.com/basictest new file mode 100644 index 0000000..b22f10b --- /dev/null +++ b/tests/data/source.example.com/basictest @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This page has links to target.example.com and target2.example.com.

+ +