You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

130 lines
3.5 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. <?php
  2. use Symfony\Component\HttpFoundation\Request;
  3. use Symfony\Component\HttpFoundation\Response;
  4. use XRay\Formats;
  /**
   * Controller that fetches a URL, optionally verifies that the page links to a
   * given target, and responds with the microformats2-derived data as JSON.
   *
   * Every response produced by this class is JSON. Failures are reported with
   * HTTP 400 and an object of the form
   * {"error": "<code>", "error_description": "<text>"}.
   */
  class Parse {

    /**
     * HTTP client used to fetch the requested URL. It is also handed to
     * Formats\Mf2::parse(). Its `timeout` property is set from the request
     * when a usable `timeout` parameter is supplied.
     */
    public $http;

    public function __construct() {
      $this->http = new p3k\HTTP();
    }

    /**
     * Fill in the response with a status code, headers and a JSON-encoded body.
     *
     * @param Response $response Response object to populate
     * @param int      $code     HTTP status code
     * @param mixed    $params   Value to JSON-encode as the response body
     * @param array    $headers  Extra headers as name => value
     * @return Response The same response object that was passed in
     */
    private function respond(Response $response, $code, $params, $headers=[]) {
      $response->setStatusCode($code);
      foreach($headers as $name => $value) {
        $response->headers->set($name, $value);
      }
      // Set after the caller's headers so the body is always labelled as JSON
      $response->headers->set('Content-Type', 'application/json');
      $response->setContent(json_encode($params));
      return $response;
    }

    /**
     * Shortcut for a 400 response carrying an error code and description.
     *
     * @param Response $response    Response object to populate
     * @param mixed    $error       Value for the `error` property
     * @param mixed    $description Value for the `error_description` property
     * @return Response
     */
    private function respondError(Response $response, $error, $description) {
      return $this->respond($response, 400, [
        'error' => $error,
        'error_description' => $description
      ]);
    }

    /**
     * Replace every non-ASCII character with a numeric HTML entity, so that
     * DOMDocument::loadHTML() does not have to guess the input encoding.
     *
     * mb_encode_numericentity() is used instead of the 'HTML-ENTITIES'
     * pseudo-encoding of mb_convert_encoding(), which is deprecated as of
     * PHP 8.2. ASCII characters (including &, < and >) are left untouched.
     *
     * @param string $input Raw markup
     * @return string Markup with characters >= U+0080 written as &#NNN;
     */
    private static function toHtmlEntities($input) {
      $encoding = mb_detect_encoding($input);
      if($encoding === false) {
        // Detection can fail; fall back to UTF-8 rather than passing `false` as an encoding
        $encoding = 'UTF-8';
      }
      // Map: code points 0x80..0x10FFFF, no offset, full 21-bit mask
      return mb_encode_numericentity($input, [0x80, 0x10FFFF, 0, 0x1FFFFF], $encoding);
    }

    /**
     * Check whether the document contains an <a href> whose href attribute is
     * exactly the target string.
     *
     * @param DOMDocument $doc    Parsed document to search
     * @param mixed       $target Value of the `target` request parameter
     * @return bool
     */
    private static function documentLinksTo(DOMDocument $doc, $target) {
      $xpath = new DOMXPath($doc);
      foreach($xpath->query('//a[@href]') as $anchor) {
        if($anchor->getAttribute('href') === $target) {
          return true;
        }
      }
      return false;
    }

    /**
     * Handle a parse request.
     *
     * Request parameters read:
     *  - url:     required, must be an http or https URL with a host
     *  - timeout: optional, positive number; each HTTP request gets half of it
     *  - target:  optional; when present the fetched page must contain a link
     *             whose href equals it
     *
     * @param Request  $request
     * @param Response $response
     * @return Response 200 with {"data": ...} on success, otherwise 400 with an error object
     */
    public function parse(Request $request, Response $response) {
      // Only apply a timeout that is actually a positive number; dividing a
      // non-numeric string raises a TypeError on PHP 8.
      $timeout = $request->get('timeout');
      if(is_numeric($timeout) && $timeout > 0) {
        // We might make 2 HTTP requests, so each request gets half the desired timeout
        $this->http->timeout = $timeout / 2;
      }

      $url = $request->get('url');
      if(!$url || !is_string($url)) {
        return $this->respondError($response, 'missing_url', 'Provide a URL to fetch');
      }

      // Attempt some basic URL validation
      $scheme = parse_url($url, PHP_URL_SCHEME);
      if(!in_array($scheme, ['http','https'], true)) {
        return $this->respondError($response, 'invalid_url', 'Only http and https URLs are supported');
      }

      $host = parse_url($url, PHP_URL_HOST);
      if(!$host) {
        return $this->respondError($response, 'invalid_url', 'The URL provided was not valid');
      }

      $url = \normalize_url($url);

      // Now fetch the URL and check for any errors reported by the HTTP client
      $result = $this->http->get($url);
      if(!empty($result['error'])) {
        return $this->respondError(
          $response,
          $result['error'],
          isset($result['error_description']) ? $result['error_description'] : null
        );
      }

      $body = isset($result['body']) ? (string)$result['body'] : '';

      // Attempt to parse the page as HTML. The return value of loadHTML() is what
      // signals failure (the DOMDocument object itself is always truthy), and an
      // empty string must not be passed to it: PHP 8 throws a ValueError for that.
      $doc = new DOMDocument();
      if($body === '' || !@$doc->loadHTML(self::toHtmlEntities($body))) {
        return $this->respondError($response, 'invalid_content', 'The document could not be parsed as HTML');
      }

      // If a target parameter was provided, make sure a link to it exists on the page.
      // $url is deliberately left untouched here: it is still needed below.
      $target = $request->get('target');
      if($target && !self::documentLinksTo($doc, $target)) {
        return $this->respondError($response, 'no_link_found', 'The source document does not have a link to the target URL');
      }

      // Now start pulling in the data from the page. Start by looking for microformats2
      $mf2 = mf2\Parse($body, $url);
      if($mf2 && !empty($mf2['items'])) {
        $data = Formats\Mf2::parse($mf2, $url, $this->http);
        if($data) {
          return $this->respond($response, 200, [
            'data' => $data,
          ]);
        }
      }

      // TODO: look for other content like OEmbed or other known services later

      return $this->respondError($response, 'no_content', 'No usable content could be found at the given URL');
    }

  }