You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

185 lines
5.3 KiB

5 years ago
  1. <?php
  2. namespace p3k\XRay;
  /**
   * Retrieves the content of a URL on behalf of the parser.
   *
   * Twitter, GitHub and Hackernews URLs are delegated to their dedicated
   * format handlers; every other URL is requested through the injected HTTP
   * client. All failures are reported as arrays containing an 'error' key
   * rather than as exceptions.
   */
  class Fetcher {

      /** Response formats (as reported by MediaType) that fetch() accepts. */
      private static $supportedFormats = ['html', 'json', 'xml'];

      /** HTTP client used for all outgoing requests. */
      private $http;

      /**
       * @param object $http HTTP client. This class calls set_timeout(),
       *                     set_max_redirects(), set_transport() and get() on it.
       */
      public function __construct($http) {
          $this->http = $http;
      }

      /**
       * Fetch a URL and return its final URL, body and status code.
       *
       * Recognised $opts keys: 'timeout', 'max_redirects', 'token' (sent as a
       * Bearer Authorization header), plus the Twitter/GitHub credential keys
       * consumed by the format-specific fetchers.
       *
       * @param string           $url  Absolute http or https URL.
       * @param array|null|false $opts Options; any falsy value is treated as [].
       * @return array On success: ['url', 'body', 'code']. On failure: an array
       *               with 'error' and 'error_description', plus either
       *               'error_code' (input validation failures) or 'url' and
       *               'code' (failures after the request was made). An empty
       *               410 response returns the HTTP client's result unchanged.
       */
      public function fetch($url, $opts=[]) {
          if($opts == false) $opts = [];

          // NOTE: these settings are applied to the shared client and are not reset afterwards
          if(isset($opts['timeout']))
              $this->http->set_timeout($opts['timeout']);
          if(isset($opts['max_redirects']))
              $this->http->set_max_redirects($opts['max_redirects']);

          // Attempt some basic URL validation
          $scheme = parse_url($url, PHP_URL_SCHEME);
          if(!in_array($scheme, ['http','https'], true)) {
              return $this->_invalid_url('Only http and https URLs are supported');
          }

          if(!parse_url($url, PHP_URL_HOST)) {
              return $this->_invalid_url('The URL provided was not valid');
          }

          $url = normalize_url($url);

          // Check if this is a Twitter URL and use the API
          if(Formats\Twitter::matches_host($url)) {
              return $this->_fetch_tweet($url, $opts);
          }

          // Transform the HTML GitHub URL into a GitHub API request and fetch the API response
          if(Formats\GitHub::matches_host($url)) {
              return $this->_fetch_github($url, $opts);
          }

          // Check if this is a Hackernews URL and use the API
          if(Formats\Hackernews::matches($url)) {
              return Formats\Hackernews::fetch($this->http, $url, $opts);
          }

          // All other URLs are fetched normally

          // Special-case appspot.com URLs to not follow redirects.
          // https://cloud.google.com/appengine/docs/php/urlfetch/
          if(!should_follow_redirects($url)) {
              $this->http->set_max_redirects(0);
              $this->http->set_transport(new \p3k\HTTP\Stream());
          } else {
              $this->http->set_transport(new \p3k\HTTP\Curl());
          }

          $headers = [
              'Accept: application/mf2+json, application/activity+json, text/html, application/json, application/xml, text/xml',
          ];
          if(isset($opts['token']))
              $headers[] = 'Authorization: Bearer ' . $opts['token'];

          $result = $this->http->get($url, $headers);

          // Pass transport-level errors through; !empty() avoids a notice if the key is absent
          if(!empty($result['error'])) {
              return $this->_response_error($result['error'], $result['error_description'], $result);
          }

          // Show an error if the content type returned is not a recognized type
          $format = null;
          $contentType = $this->_content_type($result);
          if($contentType) {
              $type = new MediaType($contentType);
              $format = $type->format;
          }

          if(!$format || !in_array($format, self::$supportedFormats, true)) {
              return [
                  'error' => 'invalid_content',
                  'error_description' => 'The server did not return a recognized content type',
                  'content_type' => isset($result['headers']['Content-Type']) ? $result['headers']['Content-Type'] : null,
                  'url' => $result['url'],
                  'code' => $result['code']
              ];
          }

          if(trim($result['body']) === '') {
              if($result['code'] == 410) {
                  // 410 Gone responses are valid and should not return an error
                  return $result;
              }

              return $this->_response_error(
                  'no_content',
                  'We did not get a response body when fetching the URL',
                  $result
              );
          }

          // Check for HTTP 401/403. This deliberately runs after the content-type
          // and empty-body checks above, so those errors take precedence.
          if($result['code'] == 401) {
              return $this->_response_error('unauthorized', 'The URL returned "HTTP 401 Unauthorized"', $result);
          }
          if($result['code'] == 403) {
              return $this->_response_error('forbidden', 'The URL returned "HTTP 403 Forbidden"', $result);
          }

          // If the original URL had a fragment, include it in the final URL
          return [
              'url' => $this->_restore_fragment($url, $result['url']),
              'body' => $result['body'],
              'code' => $result['code'],
          ];
      }

      /**
       * Fetch a tweet through the Twitter format handler.
       *
       * @param string $url
       * @param array  $opts Must contain all four twitter_* credential keys.
       * @return array The handler's result, or a 'missing_parameters' error.
       */
      private function _fetch_tweet($url, $opts) {
          $fields = ['twitter_api_key','twitter_api_secret','twitter_access_token','twitter_access_token_secret'];
          $creds = $this->_pick_opts($opts, $fields);

          if(count($creds) < count($fields)) {
              return [
                  'error_code' => 400,
                  'error' => 'missing_parameters',
                  'error_description' => 'All 4 Twitter credentials must be included in the request'
              ];
          }

          return Formats\Twitter::fetch($url, $creds);
      }

      /**
       * Fetch a GitHub URL through the GitHub format handler, forwarding the
       * 'github_access_token' option when it is set.
       *
       * @param string $url
       * @param array  $opts
       * @return array
       */
      private function _fetch_github($url, $opts) {
          $creds = $this->_pick_opts($opts, ['github_access_token']);

          return Formats\GitHub::fetch($this->http, $url, $creds);
      }

      /**
       * Copy the listed keys out of $opts, skipping keys that are unset or null.
       * The result is ordered like $fields.
       *
       * @param array    $opts
       * @param string[] $fields
       * @return array
       */
      private function _pick_opts($opts, $fields) {
          $picked = [];
          foreach($fields as $field) {
              if(isset($opts[$field]))
                  $picked[$field] = $opts[$field];
          }
          return $picked;
      }

      /**
       * Build the error returned when the input URL fails validation.
       *
       * @param string $description
       * @return array
       */
      private function _invalid_url($description) {
          return [
              'error_code' => 400,
              'error' => 'invalid_url',
              'error_description' => $description
          ];
      }

      /**
       * Build an error that also carries the URL and status code of the
       * HTTP result it was derived from.
       *
       * @param string $error
       * @param string $description
       * @param array  $result Result array returned by the HTTP client.
       * @return array
       */
      private function _response_error($error, $description, $result) {
          return [
              'error' => $error,
              'error_description' => $description,
              'url' => $result['url'],
              'code' => $result['code'],
          ];
      }

      /**
       * Extract a single Content-Type value from an HTTP result.
       *
       * The header may be a string or a list of strings; for a list the first
       * entry is used. An empty list yields null instead of an undefined-offset
       * notice.
       *
       * @param array $result Result array returned by the HTTP client.
       * @return string|null
       */
      private function _content_type($result) {
          if(!isset($result['headers']['Content-Type']))
              return null;

          $header = $result['headers']['Content-Type'];
          if(is_array($header))
              return isset($header[0]) ? $header[0] : null;
          if(is_string($header))
              return $header;

          return null;
      }

      /**
       * Re-attach the requested URL's fragment to the final URL when the final
       * URL has none of its own.
       *
       * Fragments are compared against the empty string rather than by
       * truthiness, so a fragment of "0" is handled like any other.
       *
       * @param string $requestedUrl
       * @param string $finalUrl
       * @return string
       */
      private function _restore_fragment($requestedUrl, $finalUrl) {
          $fragment = parse_url($requestedUrl, PHP_URL_FRAGMENT);
          if(!is_string($fragment) || $fragment === '')
              return $finalUrl;

          $existing = parse_url($finalUrl, PHP_URL_FRAGMENT);
          if(is_string($existing) && $existing !== '')
              return $finalUrl;

          return $finalUrl . '#' . $fragment;
      }
  }