Browse Source

starting the parse function, with tests

pull/39/head
Aaron Parecki 8 years ago
parent
commit
85c3ce7b33
12 changed files with 441 additions and 14 deletions
  1. +9
    -1
      composer.json
  2. +14
    -0
      controllers/Main.php
  3. +113
    -0
      controllers/Parse.php
  4. +0
    -10
      controllers/controllers.php
  5. +115
    -0
      lib/HTTP.php
  6. +58
    -0
      lib/HTTPTest.php
  7. +1
    -0
      php.ini
  8. +17
    -3
      public/index.php
  9. +33
    -0
      tests/FetchTest.php
  10. +64
    -0
      tests/ParseTest.php
  11. +3
    -0
      tests/data/404.response.txt
  12. +14
    -0
      tests/data/source.example.com/basictest

+ 9
- 1
composer.json View File

@ -6,7 +6,15 @@
},
"autoload": {
"files": [
"lib/helpers.php"
"lib/helpers.php",
"controllers/Main.php",
"controllers/Parse.php",
"lib/HTTP.php"
]
},
"autoload-dev": {
"files": [
"lib/HTTPTest.php"
]
}
}

+ 14
- 0
controllers/Main.php View File

@ -0,0 +1,14 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * Controller for the application's landing page.
 */
class Main {

  /**
   * Renders the 'index' view and stores the result as the response content.
   *
   * @param Request  $request  The incoming request (not read by this action).
   * @param Response $response The response object to fill in.
   * @return Response The same response object that was passed in.
   */
  public function index(Request $request, Response $response) {
    $page = view('index', ['title' => 'Percolator']);
    $response->setContent($page);

    return $response;
  }

}

+ 113
- 0
controllers/Parse.php View File

@ -0,0 +1,113 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * Controller for the /parse endpoint.
 *
 * Fetches the URL given in the "url" request parameter, parses the body as
 * HTML and, when a "target" parameter is present, verifies that the page
 * contains a link to that target. All responses are JSON.
 */
class Parse {

  /**
   * HTTP client used to fetch the source URL. It is public so that callers
   * (for example tests) can swap in another object with the same get() method.
   */
  public $http;

  public function __construct() {
    $this->http = new p3k\HTTP();
  }

  /**
   * Fills in the response with a status code, optional headers and a JSON body.
   *
   * @param Response $response The response object to fill in.
   * @param int      $code     HTTP status code.
   * @param array    $params   Data to JSON-encode as the response body.
   * @param array    $headers  Extra headers as name => value.
   * @return Response The same response object.
   */
  private function respond(Response $response, $code, $params, $headers=[]) {
    $response->setStatusCode($code);
    foreach($headers as $k=>$v) {
      $response->headers->set($k, $v);
    }
    // Set last so that a Content-Type passed in $headers cannot override it
    $response->headers->set('Content-Type', 'application/json');
    $response->setContent(json_encode($params));
    return $response;
  }

  /**
   * Sends a 400 response carrying the standard error payload.
   *
   * @param Response $response    The response object to fill in.
   * @param string   $error       Machine-readable error identifier.
   * @param string   $description Human-readable explanation.
   * @return Response The same response object.
   */
  private function error(Response $response, $error, $description) {
    return $this->respond($response, 400, [
      'type' => 'error',
      'error' => $error,
      'error_description' => $description
    ]);
  }

  /**
   * Converts non-ASCII characters to HTML entities so that DOMDocument does
   * not have to guess the document's character encoding.
   *
   * @param string $input Raw document body.
   * @return string|false The converted text, or false if conversion failed.
   */
  private static function toHtmlEntities($input) {
    $encoding = mb_detect_encoding($input);
    if(!$encoding) {
      // Detection can fail on unusual byte sequences; fall back to UTF-8
      // rather than passing false on as the source encoding.
      $encoding = 'UTF-8';
    }
    return mb_convert_encoding($input, 'HTML-ENTITIES', $encoding);
  }

  /**
   * Reports whether the document contains an <a href> equal to the target.
   *
   * @param DOMDocument $doc    The parsed source document.
   * @param string      $target The URL to look for (exact string match).
   * @return bool
   */
  private function hasLinkTo(DOMDocument $doc, $target) {
    $xpath = new DOMXPath($doc);
    foreach($xpath->query('//a[@href]') as $link) {
      if($link->getAttribute('href') === $target) {
        return true;
      }
    }
    return false;
  }

  /**
   * Handles a parse request.
   *
   * Request parameters:
   *  - url:    required, the http(s) URL to fetch
   *  - target: optional, a URL that must be linked to from the fetched page
   *
   * @param Request  $request  The incoming request.
   * @param Response $response The response object to fill in.
   * @return Response 400 with an error payload on failure, otherwise 200
   *                  with the requested URL.
   */
  public function parse(Request $request, Response $response) {
    $url = $request->get('url');

    if(!$url) {
      return $this->error($response, 'missing_url', 'Provide a URL to fetch');
    }

    // Attempt some basic URL validation
    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return $this->error($response, 'invalid_url', 'Only http and https URLs are supported');
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return $this->error($response, 'invalid_url', 'The URL provided was not valid');
    }

    // Now fetch the URL and check for any curl errors
    $result = $this->http->get($url);
    if($result['error']) {
      return $this->error($response, $result['error'], $result['error_description']);
    }

    // Attempt to parse the page as HTML. The DOMDocument object itself is
    // always truthy, so success has to be judged from loadHTML()'s return
    // value; an empty document is rejected up front because loadHTML()
    // cannot parse it.
    $body = $result['body'];
    $html = is_string($body) ? self::toHtmlEntities($body) : '';
    $doc = new DOMDocument();
    if(!is_string($html) || trim($html) === '' || !@$doc->loadHTML($html)) {
      return $this->error($response, 'invalid_content', 'The document could not be parsed as HTML');
    }

    // If a target parameter was provided, make sure a link to it exists on the page
    $target = $request->get('target');
    if($target && !$this->hasLinkTo($doc, $target)) {
      return $this->error($response, 'no_link_found', 'The source document does not have a link to the target URL');
    }

    // $url is still the URL that was requested; link hrefs are never assigned to it
    return $this->respond($response, 200, [
      'url' => $url,
    ]);
  }

}

+ 0
- 10
controllers/controllers.php View File

@ -1,10 +0,0 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
// Home page route: on GET / set the response content to the result of
// view('index', ...) with the page title 'Percolator', then return the response.
$router->addRoute('GET', '/', function(Request $request, Response $response) {
$response->setContent(view('index', [
'title' => 'Percolator'
]));
return $response;
});

+ 115
- 0
lib/HTTP.php View File

@ -0,0 +1,115 @@
<?php
namespace p3k;
/**
 * Minimal curl-based HTTP client.
 *
 * Every request method returns an array with the keys:
 *  - code:              HTTP status code reported by curl
 *  - headers:           response headers as parsed by parse_headers()
 *  - body:              response body (get() and post() only)
 *  - error:             short error identifier, '' when there was no error
 *  - error_description: curl's error message
 *  - error_code:        curl's numeric error code
 */
class HTTP {

  /** Value passed to CURLOPT_TIMEOUT for every request. */
  public $timeout = 3;

  /** Value passed to CURLOPT_MAXREDIRS for every request. */
  public $max_redirects = 8;

  /**
   * Performs a GET request, following redirects.
   *
   * @param string $url
   * @return array See the class description for the result shape.
   */
  public function get($url) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
    return self::_result($ch, curl_exec($ch), true);
  }

  /**
   * Performs a POST request, following redirects.
   *
   * @param string       $url
   * @param string|array $body    Passed to CURLOPT_POSTFIELDS.
   * @param array        $headers Passed to CURLOPT_HTTPHEADER.
   * @return array See the class description for the result shape.
   */
  public function post($url, $body, $headers=array()) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // Apply the same redirect limit as get() and head()
    curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
    return self::_result($ch, curl_exec($ch), true);
  }

  /**
   * Performs a HEAD request, following redirects.
   *
   * @param string $url
   * @return array The result array without the 'body' key.
   */
  public function head($url) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_NOBODY, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, $this->max_redirects);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
    return self::_result($ch, curl_exec($ch), false);
  }

  /**
   * Builds the result array from a finished curl transfer.
   *
   * @param resource     $ch        The curl handle the transfer ran on.
   * @param string|false $response  Return value of curl_exec().
   * @param bool         $with_body Whether the response has a body that must
   *                                be split off from the headers.
   * @return array
   */
  private static function _result($ch, $response, $with_body) {
    // curl_exec() returns false on failure; treat that as an empty response
    // so the string functions below always receive a string.
    if($response === false) {
      $response = '';
    }

    $result = array(
      'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
    );

    if($with_body) {
      // CURLOPT_HEADER puts the headers in front of the body
      $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
      $result['headers'] = self::parse_headers(trim((string)substr($response, 0, $header_size)));
      $result['body'] = (string)substr($response, $header_size);
    } else {
      $result['headers'] = self::parse_headers(trim($response));
    }

    $errno = curl_errno($ch);
    $result['error'] = self::error_string_from_code($errno);
    $result['error_description'] = curl_error($ch);
    $result['error_code'] = $errno;

    return $result;
  }

  /**
   * Maps a curl error number to a short error identifier.
   *
   * @param int $code A CURLE_* error number; 0 means no error.
   * @return string '' for no error, 'unknown' for unmapped codes.
   */
  public static function error_string_from_code($code) {
    switch($code) {
      case 0:
        return '';
      case CURLE_COULDNT_RESOLVE_HOST:
        return 'dns_error';
      case CURLE_COULDNT_CONNECT:
        return 'connect_error';
      case CURLE_OPERATION_TIMEDOUT:
        return 'timeout';
      case CURLE_SSL_CONNECT_ERROR:
        return 'ssl_error';
      case CURLE_SSL_CERTPROBLEM:
        return 'ssl_cert_error';
      case CURLE_SSL_CIPHER:
        return 'ssl_unsupported_cipher';
      case CURLE_SSL_CACERT:
        return 'ssl_cert_error';
      case CURLE_TOO_MANY_REDIRECTS:
        return 'too_many_redirects';
      default:
        return 'unknown';
    }
  }

  /**
   * Parses a CRLF-separated block of header lines.
   *
   * Header names are normalised to Title-Case ("content-type" becomes
   * "Content-Type"). A header that occurs once maps to a string; a header
   * that occurs several times maps to an array of its values in order.
   * Lines that do not look like "Name: value" are ignored.
   *
   * @param string $headers
   * @return array Header name => string or array of strings.
   */
  public static function parse_headers($headers) {
    $retVal = array();
    // Join folded continuation lines (CRLF followed by whitespace) onto the previous line
    $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $headers));
    foreach($fields as $field) {
      if(preg_match('/([^:]+): (.+)/m', $field, $match)) {
        // Upper-case the first letter of the name and of each part after a tab, space or hyphen
        $name = preg_replace_callback('/(?<=^|[\x09\x20\x2D])./', function($m) {
          return strtoupper($m[0]);
        }, strtolower(trim($match[1])));
        $value = trim($match[2]);
        // If there's already a value set for the header name being returned, turn it into an array and add the new value
        if(isset($retVal[$name])) {
          if(!is_array($retVal[$name]))
            $retVal[$name] = array($retVal[$name]);
          $retVal[$name][] = $value;
        } else {
          $retVal[$name] = $value;
        }
      }
    }
    return $retVal;
  }

}

+ 58
- 0
lib/HTTPTest.php View File

@ -0,0 +1,58 @@
<?php
namespace p3k;
/**
 * Drop-in replacement for HTTP that serves canned responses from disk
 * instead of making network requests.
 *
 * The file for a URL is the test data path followed by the URL with its
 * http:// or https:// prefix removed. Each file holds a raw HTTP response:
 * status line, headers, a blank line, then the body, using CRLF line endings.
 * When no file exists for the URL, 404.response.txt is served instead.
 */
class HTTPTest extends HTTP {

  private $_testDataPath;

  /**
   * @param string $testDataPath Directory holding the response files; it is
   *                             concatenated directly with the file name.
   */
  public function __construct($testDataPath) {
    $this->_testDataPath = $testDataPath;
  }

  /**
   * @param string $url
   * @return array Same shape as HTTP::get().
   */
  public function get($url) {
    return $this->_read_file($url);
  }

  /**
   * The body and headers arguments are ignored; the response depends only on the URL.
   *
   * @param string $url
   * @return array Same shape as HTTP::post().
   */
  public function post($url, $body, $headers=array()) {
    return $this->_read_file($url);
  }

  /**
   * @param string $url
   * @return array Same shape as HTTP::head() (no 'body' key).
   */
  public function head($url) {
    $response = $this->_read_file($url);
    return array(
      'code' => $response['code'],
      'headers' => $response['headers'],
      'error' => '',
      'error_description' => '',
      'error_code' => 0
    );
  }

  /**
   * Loads and parses the canned response for a URL.
   *
   * @param string $url
   * @return array
   * @throws \Exception When the file cannot be read or is not a valid raw HTTP response.
   */
  private function _read_file($url) {
    $filename = $this->_testDataPath.preg_replace('/https?:\/\//', '', $url);
    // is_file() rather than file_exists(): a URL without a path maps to a
    // directory, which exists but cannot be read as a response.
    if(!is_file($filename)) {
      $filename = $this->_testDataPath.'404.response.txt';
    }
    $response = file_get_contents($filename);
    if($response === false) {
      throw new \Exception("Unable to read test data file: $filename");
    }

    // Split on the first blank line only, so the body may itself contain blank lines
    $split = explode("\r\n\r\n", $response, 2);
    if(count($split) != 2) {
      throw new \Exception("Invalid file contents in test data, check that newlines are CRLF: $url");
    }
    list($headers, $body) = $split;

    // The status line is mandatory; without it there is no status code to report
    if(!preg_match('/^HTTP\/\d+(?:\.\d+)? (\d+)[^\r\n]*/', $headers, $match)) {
      throw new \Exception("Invalid file contents in test data, missing HTTP status line: $url");
    }
    $code = (int)$match[1];
    // Drop the status line, leaving only the header fields
    $headers = (string)substr($headers, strlen($match[0]));

    return array(
      'code' => $code,
      'headers' => self::parse_headers($headers),
      'body' => $body,
      'error' => '',
      'error_description' => '',
      'error_code' => 0
    );
  }

}

+ 1
- 0
php.ini View File

@ -0,0 +1 @@
extension = "curl.so"

+ 17
- 3
public/index.php View File

@ -8,9 +8,23 @@ use Symfony\Component\HttpFoundation\Response;
// Create the router and the template engine; the engine is given the ../views directory next to this file
$router = new League\Route\RouteCollection;
$templates = new League\Plates\Engine(dirname(__FILE__).'/../views');
include('controllers/controllers.php');
// Route GET / and GET /parse to the controller methods named by these strings
$router->addRoute('GET', '/', 'Main::index');
$router->addRoute('GET', '/parse', 'Parse::parse');
$dispatcher = $router->getDispatcher();
$request = Request::createFromGlobals();
// NOTE(review): this dispatch/send pair is repeated inside the try block below, so as
// written the request would be dispatched twice and this first call is not guarded
// by the catch clauses - confirm which copy is meant to remain.
$response = $dispatcher->dispatch($request->getMethod(), $request->getPathInfo());
$response->send();
// Dispatch the request and turn the router's exceptions into plain-text 404 / 405 responses
try {
$response = $dispatcher->dispatch($request->getMethod(), $request->getPathInfo());
$response->send();
} catch(League\Route\Http\Exception\NotFoundException $e) {
// No route matched the request path
$response = new Response;
$response->setStatusCode(404);
$response->setContent("Not Found\n");
$response->send();
} catch(League\Route\Http\Exception\MethodNotAllowedException $e) {
// A route matched the path but not the request method
$response = new Response;
$response->setStatusCode(405);
$response->setContent("Method not allowed\n");
$response->send();
}

+ 33
- 0
tests/FetchTest.php View File

@ -0,0 +1,33 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * Tests for p3k\HTTP::get() that run against live endpoints under
 * https://nghttp2.org/httpbin, so they need network access.
 */
class FetchTest extends PHPUnit_Framework_TestCase {

  private $http;

  /**
   * Gives every test a fresh client with default settings.
   */
  public function setUp() {
    $this->http = new p3k\HTTP();
  }

  /**
   * Fetching /delay/2 with the timeout set to 1 must report a 'timeout' error.
   */
  public function testTimeout() {
    $this->http->timeout = 1;
    $result = $this->http->get('https://nghttp2.org/httpbin/delay/2');

    $this->assertEquals('timeout', $result['error']);
  }

  /**
   * Fetching /redirect/3 with max_redirects set to 1 must report 'too_many_redirects'.
   */
  public function testRedirectLimit() {
    $this->http->max_redirects = 1;
    $result = $this->http->get('https://nghttp2.org/httpbin/redirect/3');

    $this->assertEquals('too_many_redirects', $result['error']);
  }

  /**
   * A plain fetch of /ip with default settings must report an empty error string.
   */
  public function testNoError() {
    $result = $this->http->get('https://nghttp2.org/httpbin/ip');

    $this->assertEquals('', $result['error']);
  }

}

+ 64
- 0
tests/ParseTest.php View File

@ -0,0 +1,64 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * Tests for the Parse controller. The controller's HTTP client is replaced
 * with p3k\HTTPTest so that responses come from files under tests/data/.
 */
class ParseTest extends PHPUnit_Framework_TestCase {

  /** The Parse controller under test. */
  private $client;

  /**
   * Creates the controller and points its HTTP client at the test data directory.
   */
  public function setUp() {
    $this->client = new Parse();
    $this->client->http = new p3k\HTTPTest(dirname(__FILE__).'/data/');
  }

  /**
   * Runs the parse action with the given request parameters.
   *
   * @param array $params Query parameters for the request.
   * @return Response
   */
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->parse($request, $response);
  }

  /**
   * A request without a url parameter is rejected with missing_url.
   */
  public function testMissingURL() {
    $response = $this->parse([]);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('missing_url', $data->error);
  }

  /**
   * A URL with a scheme other than http or https is rejected with invalid_url.
   */
  public function testInvalidURL() {
    $url = 'ftp://example.com/foo';
    $response = $this->parse(['url' => $url]);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('invalid_url', $data->error);
  }

  /**
   * A source page that does not link to the target is rejected with no_link_found.
   */
  public function testTargetNotFound() {
    // Use the real fixture page, so the check runs against a document that
    // has links, none of which is the target.
    $url = 'http://source.example.com/basictest';
    $response = $this->parse(['url' => $url, 'target' => 'http://example.net']);

    $body = $response->getContent();
    $this->assertEquals(400, $response->getStatusCode());
    $data = json_decode($body);
    $this->assertEquals('error', $data->type);
    $this->assertEquals('no_link_found', $data->error);
  }

  /**
   * A source page that links to the target is accepted.
   */
  public function testTargetFound() {
    $url = 'http://source.example.com/basictest';
    $response = $this->parse(['url' => $url, 'target' => 'http://target.example.com']);

    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    $data = json_decode($body);
    // A success payload need not carry "type" or "error" at all, so read them
    // defensively instead of accessing properties that may not exist.
    $type = isset($data->type) ? $data->type : null;
    $error = isset($data->error) ? $data->error : null;
    $this->assertNotEquals('error', $type);
    $this->assertNotEquals('no_link_found', $error);
  }

}

+ 3
- 0
tests/data/404.response.txt View File

@ -0,0 +1,3 @@
HTTP/1.1 404 Not Found
The page was not found.

+ 14
- 0
tests/data/source.example.com/basictest View File

@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content">This page has links to <a href="http://target.example.com">target.example.com</a> and <a href="http://target2.example.com">target2.example.com</a>.</p>
</body>
</html>

Loading…
Cancel
Save