You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

128 lines
3.5 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. <?php
  2. use Symfony\Component\HttpFoundation\Request;
  3. use Symfony\Component\HttpFoundation\Response;
  4. use XRay\Formats;
  5. class Parse {
  6. public $http;
  7. public function __construct() {
  8. $this->http = new p3k\HTTP();
  9. }
  10. private function respond(Response $response, $code, $params, $headers=[]) {
  11. $response->setStatusCode($code);
  12. foreach($headers as $k=>$v) {
  13. $response->headers->set($k, $v);
  14. }
  15. $response->headers->set('Content-Type', 'application/json');
  16. $response->setContent(json_encode($params));
  17. return $response;
  18. }
  19. private static function toHtmlEntities($input) {
  20. return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
  21. }
  22. public function parse(Request $request, Response $response) {
  23. if($request->get('timeout')) {
  24. // We might make 2 HTTP requests, so each request gets half the desired timeout
  25. $this->http->timeout = $request->get('timeout') / 2;
  26. }
  27. $url = $request->get('url');
  28. if(!$url) {
  29. return $this->respond($response, 400, [
  30. 'error' => 'missing_url',
  31. 'error_description' => 'Provide a URL to fetch'
  32. ]);
  33. }
  34. // Attempt some basic URL validation
  35. $scheme = parse_url($url, PHP_URL_SCHEME);
  36. if(!in_array($scheme, ['http','https'])) {
  37. return $this->respond($response, 400, [
  38. 'error' => 'invalid_url',
  39. 'error_description' => 'Only http and https URLs are supported'
  40. ]);
  41. }
  42. $host = parse_url($url, PHP_URL_HOST);
  43. if(!$host) {
  44. return $this->respond($response, 400, [
  45. 'error' => 'invalid_url',
  46. 'error_description' => 'The URL provided was not valid'
  47. ]);
  48. }
  49. // Now fetch the URL and check for any curl errors
  50. $result = $this->http->get($url);
  51. if($result['error']) {
  52. return $this->respond($response, 400, [
  53. 'error' => $result['error'],
  54. 'error_description' => $result['error_description']
  55. ]);
  56. }
  57. // attempt to parse the page as HTML
  58. $doc = new DOMDocument();
  59. @$doc->loadHTML(self::toHtmlEntities($result['body']));
  60. if(!$doc) {
  61. return $this->respond($response, 400, [
  62. 'error' => 'invalid_content',
  63. 'error_description' => 'The document could not be parsed as HTML'
  64. ]);
  65. }
  66. // If a target parameter was provided, make sure a link to it exists on the page
  67. if($target=$request->get('target')) {
  68. $xpath = new DOMXPath($doc);
  69. $found = [];
  70. foreach($xpath->query('//a[@href]') as $href) {
  71. $url = $href->getAttribute('href');
  72. if($target) {
  73. # target parameter was provided
  74. if($url == $target) {
  75. $found[$url] = null;
  76. }
  77. }
  78. }
  79. if(!$found) {
  80. return $this->respond($response, 400, [
  81. 'error' => 'no_link_found',
  82. 'error_description' => 'The source document does not have a link to the target URL'
  83. ]);
  84. }
  85. }
  86. // Now start pulling in the data from the page. Start by looking for microformats2
  87. $mf2 = mf2\Parse($result['body'], $url);
  88. if($mf2 && count($mf2['items']) > 0) {
  89. $data = Formats\Mf2::parse($mf2, $url, $this->http);
  90. if($data) {
  91. return $this->respond($response, 200, [
  92. 'data' => $data,
  93. ]);
  94. }
  95. }
  96. // TODO: look for other content like OEmbed or other known services later
  97. return $this->respond($response, 400, [
  98. 'error' => 'no_content',
  99. 'error_description' => 'No usable content could be found at the given URL'
  100. ]);
  101. }
  102. }