You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

215 lines
6.2 KiB

5 years ago
  1. <?php
  2. namespace p3k\XRay;
  3. class Fetcher {
  4. private $http;
  5. public function __construct($http) {
  6. $this->http = $http;
  7. }
  8. public function fetch($url, $opts=[]) {
  9. if($opts == false) $opts = [];
  10. if(isset($opts['timeout']))
  11. $this->http->set_timeout($opts['timeout']);
  12. if(isset($opts['max_redirects']))
  13. $this->http->set_max_redirects($opts['max_redirects']);
  14. // Attempt some basic URL validation
  15. $scheme = parse_url($url, PHP_URL_SCHEME);
  16. if(!in_array($scheme, ['http','https'])) {
  17. return [
  18. 'error_code' => 400,
  19. 'error' => 'invalid_url',
  20. 'error_description' => 'Only http and https URLs are supported'
  21. ];
  22. }
  23. $host = parse_url($url, PHP_URL_HOST);
  24. if(!$host) {
  25. return [
  26. 'error_code' => 400,
  27. 'error' => 'invalid_url',
  28. 'error_description' => 'The URL provided was not valid'
  29. ];
  30. }
  31. $url = normalize_url($url);
  32. $host = parse_url($url, PHP_URL_HOST);
  33. // Check if this is a Twitter URL and use the API
  34. if(Formats\Twitter::matches_host($url)) {
  35. return $this->_fetch_tweet($url, $opts);
  36. }
  37. // Check if this is a Facebook URL and use the API
  38. if(Formats\Facebook::matches_host($url)) {
  39. return $this->_fetch_facebook($url, $opts);
  40. }
  41. // Transform the HTML GitHub URL into an GitHub API request and fetch the API response
  42. if(Formats\GitHub::matches_host($url)) {
  43. return $this->_fetch_github($url, $opts);
  44. }
  45. // Check if this is a Hackernews URL and use the API
  46. if(Formats\Hackernews::matches($url)) {
  47. return Formats\Hackernews::fetch($this->http, $url, $opts);
  48. }
  49. // Check if this is an Instagram URL and enable passing a session cookie
  50. if(Formats\Instagram::matches($url)) {
  51. return Formats\Instagram::fetch($this->http, $url, $opts);
  52. }
  53. // All other URLs are fetched normally
  54. // Special-case appspot.com URLs to not follow redirects.
  55. // https://cloud.google.com/appengine/docs/php/urlfetch/
  56. if(!should_follow_redirects($url)) {
  57. $this->http->set_max_redirects(0);
  58. $this->http->set_transport(new \p3k\HTTP\Stream());
  59. } else {
  60. $this->http->set_transport(new \p3k\HTTP\Curl());
  61. }
  62. $headers = [];
  63. $headers[] = 'Accept: application/mf2+json, application/activity+json, text/html, application/json, application/xml, text/xml';
  64. if(isset($opts['token']))
  65. $headers[] = 'Authorization: Bearer ' . $opts['token'];
  66. $result = $this->http->get($url, $headers);
  67. if($result['error']) {
  68. return [
  69. 'error' => $result['error'],
  70. 'error_description' => $result['error_description'],
  71. 'url' => $result['url'],
  72. 'code' => $result['code'],
  73. ];
  74. }
  75. // Show an error if the content type returned is not a recognized type
  76. $format = null;
  77. if(isset($result['headers']['Content-Type'])) {
  78. $contentType = null;
  79. if(is_array($result['headers']['Content-Type'])) {
  80. $contentType = $result['headers']['Content-Type'][0];
  81. } elseif(is_string($result['headers']['Content-Type'])) {
  82. $contentType = $result['headers']['Content-Type'];
  83. }
  84. if($contentType) {
  85. $type = new MediaType($contentType);
  86. $format = $type->format;
  87. }
  88. }
  89. if(!$format ||
  90. !in_array($format, ['html', 'json', 'xml'])) {
  91. return [
  92. 'error' => 'invalid_content',
  93. 'error_description' => 'The server did not return a recognized content type',
  94. 'content_type' => isset($result['headers']['Content-Type']) ? $result['headers']['Content-Type'] : null,
  95. 'url' => $result['url'],
  96. 'code' => $result['code']
  97. ];
  98. }
  99. if(trim($result['body']) == '') {
  100. if($result['code'] == 410) {
  101. // 410 Gone responses are valid and should not return an error
  102. return $result;
  103. }
  104. return [
  105. 'error' => 'no_content',
  106. 'error_description' => 'We did not get a response body when fetching the URL',
  107. 'url' => $result['url'],
  108. 'code' => $result['code']
  109. ];
  110. }
  111. // Check for HTTP 401/403
  112. if($result['code'] == 401) {
  113. return [
  114. 'error' => 'unauthorized',
  115. 'error_description' => 'The URL returned "HTTP 401 Unauthorized"',
  116. 'url' => $result['url'],
  117. 'code' => $result['code']
  118. ];
  119. }
  120. if($result['code'] == 403) {
  121. return [
  122. 'error' => 'forbidden',
  123. 'error_description' => 'The URL returned "HTTP 403 Forbidden"',
  124. 'url' => $result['url'],
  125. 'code' => $result['code']
  126. ];
  127. }
  128. // If the original URL had a fragment, include it in the final URL
  129. if(($fragment=parse_url($url, PHP_URL_FRAGMENT)) && !parse_url($result['url'], PHP_URL_FRAGMENT)) {
  130. $result['url'] .= '#'.$fragment;
  131. }
  132. return [
  133. 'url' => $result['url'],
  134. 'body' => $result['body'],
  135. 'code' => $result['code'],
  136. ];
  137. }
  138. private function _fetch_tweet($url, $opts) {
  139. $fields = ['twitter_api_key','twitter_api_secret','twitter_access_token','twitter_access_token_secret'];
  140. $creds = [];
  141. foreach($fields as $f) {
  142. if(isset($opts[$f]))
  143. $creds[$f] = $opts[$f];
  144. }
  145. if(count($creds) < 4) {
  146. return [
  147. 'error_code' => 400,
  148. 'error' => 'missing_parameters',
  149. 'error_description' => 'All 4 Twitter credentials must be included in the request'
  150. ];
  151. }
  152. return Formats\Twitter::fetch($url, $creds);
  153. }
  154. private function _fetch_facebook($url, $opts) {
  155. $fields = ['facebook_app_id','facebook_app_secret'];
  156. $creds = [];
  157. foreach($fields as $f) {
  158. if(isset($opts[$f]))
  159. $creds[$f] = $opts[$f];
  160. }
  161. if(count($creds) < 2) {
  162. return [
  163. 'error_code' => 400,
  164. 'error' => 'missing_parameters',
  165. 'error_description' => 'Both Facebook credentials must be included in the request'
  166. ];
  167. }
  168. // TODO: Question, should I do this like Twitter or like Github?
  169. return Formats\Facebook::fetch($url, $creds);
  170. }
  171. private function _fetch_github($url, $opts) {
  172. $fields = ['github_access_token'];
  173. $creds = [];
  174. foreach($fields as $f) {
  175. if(isset($opts[$f]))
  176. $creds[$f] = $opts[$f];
  177. }
  178. return Formats\GitHub::fetch($this->http, $url, $creds);
  179. }
  180. }