You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

406 lines
12 KiB

  1. <?php
  2. namespace p3k\XRay\Formats;
  3. use DOMDocument, DOMXPath;
  4. use DateTime, DateTimeZone;
  5. class Instagram extends Format {
  /**
   * Checks whether a URL points at an Instagram host.
   *
   * Host names are case-insensitive, so the host is lowercased before it is
   * compared against the two accepted Instagram host names.
   *
   * @param string $url URL to inspect
   * @return bool true when the host is "instagram.com" or "www.instagram.com"
   */
  public static function matches_host($url) {
    $host = parse_url($url, PHP_URL_HOST);
    // parse_url() gives null/false when the URL has no host or cannot be parsed
    if(!is_string($host))
      return false;
    return in_array(strtolower($host), ['www.instagram.com', 'instagram.com'], true);
  }
  /**
   * Reports whether this format handles the given URL.
   *
   * The decision is delegated entirely to matches_host().
   *
   * @param string $url URL to inspect
   * @return bool whatever matches_host() returns for the URL
   */
  public static function matches($url) {
    $isInstagramHost = self::matches_host($url);
    return $isInstagramHost;
  }
  /**
   * Fetches an Instagram URL through the supplied HTTP client.
   *
   * @param object $http HTTP client; its get($url, $headers) method is called
   * @param string $url  URL to fetch
   * @param array  $opts optional settings; a non-empty 'instagram_session'
   *                     value is sent as the "sessionid" cookie
   * @return array|false false when the URL is not handled by this format,
   *                     otherwise whatever $http->get() returned
   */
  public static function fetch($http, $url, $opts=[]) {
    if(!self::matches($url)) {
      return false;
    }

    // Only attach the session cookie when a non-empty session id was given
    $headers = empty($opts['instagram_session'])
      ? []
      : ['Cookie: sessionid='.$opts['instagram_session']];

    // TODO: check the response for being rate limited or redirected to the
    // login page, and return an error array instead of the raw response, e.g.
    //   'error' => 'rate_limited'  ('Instagram has rate limited this client. Please try again later.')
    //   'error' => 'unauthorized'  ('Instagram redirected to the login page. Either this user is private, or the client has been rate limited.')
    // each together with 'error_description', 'url' and 'code' taken from the response.
    return $http->get($url, $headers);
  }
  /**
   * Parses a fetched Instagram page, dispatching on the shape of its URL.
   *
   * A URL whose path is a single segment followed by a trailing slash is
   * treated as a profile page; it is parsed as a feed when
   * $opts['expect'] is 'feed' and as a profile card otherwise. Every other
   * URL is parsed as a photo page.
   *
   * @param object $http          HTTP client, passed through to the parsers
   * @param array  $http_response response array; 'body' and 'url' are read
   * @param array  $opts          optional settings; 'expect' is read
   * @return array result of parseFeed(), parseProfile() or parsePhoto()
   */
  public static function parse($http, $http_response, $opts=[]) {
    $html = $http_response['body'];
    $url = $http_response['url'];

    // The dot is escaped so it only matches a literal ".", and the match is
    // case-insensitive because host names are not case-sensitive.
    if(!preg_match('#instagram\.com/([^/]+)/$#i', $url))
      return self::parsePhoto($http, $html, $url);

    if(isset($opts['expect']) && $opts['expect'] === 'feed')
      return self::parseFeed($http, $html, $url);

    return self::parseProfile($http, $html, $url);
  }
  /**
   * Turns a profile page into a card result.
   *
   * @param object $http HTTP client (not used here)
   * @param string $html profile page HTML
   * @param string $url  profile URL (not used here)
   * @return array self::_unknown() when no profile data is found in the HTML,
   *               otherwise ['data' => card, 'source-format' => 'instagram']
   */
  private static function parseProfile($http, $html, $url) {
    $user = self::_parseProfileFromHTML($html);

    if($user) {
      return [
        'data' => self::_buildHCardFromInstagramProfile($user),
        'source-format' => 'instagram',
      ];
    }

    return self::_unknown();
  }
  /**
   * Turns a profile page into a feed of the entries listed in its timeline.
   *
   * Note: Not all the photo info is available in the profile page JSON.
   * Things like video mp4 URLs, person tags and locations are missing, so
   * consumers of the feed need to fetch each photo permalink to get them.
   *
   * @param object $http HTTP client, passed through to parsePhotoFromData()
   * @param string $html profile page HTML
   * @param string $url  profile URL (not used here)
   * @return array self::_unknown() when no profile data is found, otherwise
   *               a 'feed' result whose 'items' are the parsed entries
   */
  private static function parseFeed($http, $html, $url) {
    $user = self::_parseProfileFromHTML($html);
    if(!$user) {
      return self::_unknown();
    }

    $entries = [];
    foreach($user['edge_owner_to_timeline_media']['edges'] as $edge) {
      $node = $edge['node'];
      $permalink = 'https://www.instagram.com/p/'.$node['shortcode'].'/';
      // The profile is handed over as the author so no author lookup is needed
      $parsed = self::parsePhotoFromData($http, $node, $permalink, $user);
      $entries[] = $parsed['data'];
    }

    $feed = [
      'type' => 'feed',
      'items' => $entries,
    ];

    return [
      'data' => $feed,
      'source-format' => 'instagram',
    ];
  }
  /**
   * Parses a photo page: extracts the media object embedded in the HTML and
   * hands it to parsePhotoFromData().
   *
   * @param object      $http    HTTP client, passed through
   * @param string      $html    photo page HTML
   * @param string      $url     photo permalink
   * @param array|false $profile optional profile of the author, passed through
   * @return array result of parsePhotoFromData()
   */
  private static function parsePhoto($http, $html, $url, $profile=false) {
    return self::parsePhotoFromData(
      $http,
      self::_extractPhotoDataFromPhotoPage($html),
      $url,
      $profile
    );
  }
  /**
   * Tells whether an accessibility caption is the generic placeholder text
   * rather than a real description.
   *
   * The comparison is strict so that non-string values (for example `true`)
   * are never mistaken for the placeholder.
   *
   * @param mixed $text caption value to check
   * @return bool true only for the exact placeholder string
   */
  private static function altTextIsPlaceholder($text) {
    return $text === 'No photo description available.';
  }
  /**
   * Builds an h-entry style result from a decoded Instagram media object.
   *
   * @param object      $http      HTTP client, used for the tagged-user profile
   *                               lookups and the location lookup
   * @param array|null  $photoData decoded media object
   * @param string      $url       permalink stored as the entry URL
   * @param array|false $profile   profile of the author when already known;
   *                               when falsy the author is taken from
   *                               $photoData['owner'] if present
   * @return array self::_unknown() when $photoData is empty, otherwise an array
   *               with 'data' (the entry), 'original' (JSON of the raw photo,
   *               profiles and locations) and 'source-format'
   */
  private static function parsePhotoFromData($http, $photoData, $url, $profile=false) {
    if(!$photoData)
      return self::_unknown();

    // Start building the h-entry
    $entry = [
      'type' => 'entry',
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    ];

    $profiles = [];

    if($profile) {
      $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
      $profiles[] = $profile;
    } elseif(isset($photoData['owner'])) {
      // Get profile info from the page. The author's profile is deliberately
      // not fetched separately (disabled 2019-10-13) because profile fetches
      // are severely rate limited.
      $entry['author'] = self::_buildHCardFromInstagramProfile($photoData['owner']);
    }

    // Content and hashtags
    $caption = false;
    if(isset($photoData['caption'])) {
      $caption = $photoData['caption'];
    } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
      $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
    }

    if($caption) {
      if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
        $entry['category'] = $matches[1];
      }

      $entry['content'] = [
        'text' => $caption
      ];
    }

    $refs = [];
    $meta = [];

    // Include the photo/video media URLs
    // (Always return arrays, even for single images)
    if(array_key_exists('edge_sidecar_to_children', $photoData)) {
      // Multi-post
      // For now, we will only pull photos from multi-posts, and skip videos.
      // https://github.com/aaronpk/XRay/issues/84
      $entry['photo'] = [];
      foreach($photoData['edge_sidecar_to_children']['edges'] as $edge) {
        $child = $edge['node'];
        $entry['photo'][] = $child['display_url'];
        // Don't need to pull person-tags from here because the main parent object already has them.
        if(!empty($child['accessibility_caption']) && !self::altTextIsPlaceholder($child['accessibility_caption'])) {
          $meta[$child['display_url']] = [
            'alt' => $child['accessibility_caption']
          ];
        }
      }
    } else {
      // Single photo or video
      if(array_key_exists('display_src', $photoData))
        $entry['photo'] = [$photoData['display_src']];
      elseif(array_key_exists('display_url', $photoData))
        $entry['photo'] = [$photoData['display_url']];

      // Alt text is keyed by the photo URL, so it can only be recorded when a photo URL was found
      if(isset($entry['photo'][0])
        && !empty($photoData['accessibility_caption'])
        && !self::altTextIsPlaceholder($photoData['accessibility_caption'])) {
        $meta[$entry['photo'][0]] = [
          'alt' => $photoData['accessibility_caption']
        ];
      }

      if(!empty($photoData['is_video']) && isset($photoData['video_url'])) {
        $entry['video'] = [$photoData['video_url']];
      }
    }

    // Find person tags and fetch user profiles
    if(isset($photoData['edge_media_to_tagged_user']['edges'])) {
      if(!isset($entry['category'])) $entry['category'] = [];

      foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
        if(!isset($edge['node']['user']['username']))
          continue;

        // A separate variable keeps the $profile argument intact
        $taggedProfile = self::_getInstagramProfile($edge['node']['user']['username'], $http);
        if($taggedProfile) {
          $card = self::_buildHCardFromInstagramProfile($taggedProfile);
          $entry['category'][] = $card['url'];
          $refs[$card['url']] = $card;
          $profiles[] = $taggedProfile;
        }
      }
    }

    // Published date. Stays null when the data carries no timestamp or the
    // timestamp cannot be parsed (createFromFormat() returns false then).
    $published = null;
    if(isset($photoData['taken_at_timestamp']))
      $published = DateTime::createFromFormat('U', $photoData['taken_at_timestamp']);
    elseif(isset($photoData['date']))
      $published = DateTime::createFromFormat('U', $photoData['date']);
    if(!$published)
      $published = null;

    // Include venue data
    $locations = [];
    if(isset($photoData['location']['id'])) {
      $location = self::_getInstagramLocation($photoData['location']['id'], $http);
      if($location) {
        $entry['location'] = [$location['url']];
        $refs[$location['url']] = $location;
        $locations[] = $location;

        // Look up timezone. Numeric checks are used so that a coordinate of 0
        // still counts as a valid position.
        if($published
          && isset($location['latitude'], $location['longitude'])
          && is_numeric($location['latitude'])
          && is_numeric($location['longitude'])) {
          $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
          if($tz) {
            $published->setTimeZone(new DateTimeZone($tz));
          }
        }
      }
    }

    // Only emit a published date when one could be determined
    if($published) {
      $entry['published'] = $published->format('c');
    }

    if(count($refs)) {
      $entry['refs'] = $refs;
    }

    if(count($meta)) {
      $entry['meta'] = $meta;
    }

    $entry['post-type'] = \p3k\XRay\PostType::discover($entry);

    return [
      'data' => $entry,
      'original' => json_encode([
        'photo' => $photoData,
        'profiles' => $profiles,
        'locations' => $locations
      ]),
      'source-format' => 'instagram',
    ];
  }
  /**
   * Converts an Instagram profile array into a card array.
   *
   * The profile must carry 'username'. 'full_name', 'profile_pic_url_hd',
   * 'profile_pic_url' and 'biography' are optional: the name falls back to
   * the username, the HD picture is preferred over the regular one, and
   * 'photo' / 'note' are left out when their source keys are absent.
   *
   * @param array|false|null $profile profile data
   * @return array|false false when $profile is empty, otherwise the card
   */
  private static function _buildHCardFromInstagramProfile($profile) {
    if(!$profile) return false;

    $author = [
      'type' => 'card'
    ];

    // Some profile objects carry no 'full_name' at all, so test for existence
    // as well as for a non-empty value before using it.
    if(!empty($profile['full_name']))
      $author['name'] = $profile['full_name'];
    else
      $author['name'] = $profile['username'];

    $author['nickname'] = $profile['username'];
    $author['url'] = 'https://www.instagram.com/' . $profile['username'] . '/';

    if(isset($profile['profile_pic_url_hd']))
      $author['photo'] = $profile['profile_pic_url_hd'];
    elseif(isset($profile['profile_pic_url']))
      $author['photo'] = $profile['profile_pic_url'];

    if(isset($profile['biography']))
      $author['note'] = $profile['biography'];

    return $author;
  }
  /**
   * Fetches a user's profile page and extracts the profile data from it.
   *
   * @param string $username Instagram username
   * @param object $http     HTTP client; its get($url) method is called
   * @return array|null profile data, or null when the response reports an
   *                    error or the page holds no profile data
   */
  private static function _getInstagramProfile($username, $http) {
    $profileURL = 'https://www.instagram.com/'.$username.'/';
    $response = $http->get($profileURL);

    if($response['error']) {
      return null;
    }

    return self::_parseProfileFromHTML($response['body']);
  }
  /**
   * Pulls the user object out of the data embedded in a profile page.
   *
   * @param string $html profile page HTML
   * @return array|null the value at entry_data.ProfilePage[0].graphql.user
   *                    of the embedded data, or null when it is not there
   */
  private static function _parseProfileFromHTML($html) {
    $shared = self::_extractIGData($html);

    // isset() walks the whole path and is false as soon as any level is missing
    if(!isset($shared['entry_data']['ProfilePage'][0]['graphql']['user'])) {
      return null;
    }

    return $shared['entry_data']['ProfilePage'][0]['graphql']['user'];
  }
  /**
   * Fetches an Instagram location page and turns it into a venue card.
   *
   * @param string|int $id   Instagram location id
   * @param object     $http HTTP client; its get($url) method is called
   * @return array|null card with 'type', 'name', 'url', 'latitude' and
   *                    'longitude', or null when the response body is empty
   *                    or contains no venue data
   */
  private static function _getInstagramLocation($id, $http) {
    $igURL = 'https://www.instagram.com/explore/locations/'.$id.'/';
    $response = $http->get($igURL);

    if(!$response['body']) {
      return null;
    }

    $venue = self::_extractVenueDataFromVenuePage($response['body']);
    if(!$venue) {
      return null;
    }

    return [
      'type' => 'card',
      'name' => $venue['name'],
      'url' => $igURL,
      'latitude' => $venue['lat'],
      'longitude' => $venue['lng'],
    ];
  }
  /**
   * Pulls the media object out of the data embedded in a photo page.
   *
   * Three locations under entry_data.PostPage[0] are tried in order:
   * graphql.shortcode_media, graphql.media, then media.
   *
   * @param string $html photo page HTML
   * @return array|null the first media object found, or null
   */
  private static function _extractPhotoDataFromPhotoPage($html) {
    $shared = self::_extractIGData($html);
    if(!$shared || !is_array($shared) || !array_key_exists('entry_data', $shared)) {
      return null;
    }

    $entryData = $shared['entry_data'];
    if(!is_array($entryData) || !array_key_exists('PostPage', $entryData)) {
      return null;
    }

    $post = $entryData['PostPage'];

    if(isset($post[0]['graphql']['shortcode_media'])) {
      return $post[0]['graphql']['shortcode_media'];
    }
    if(isset($post[0]['graphql']['media'])) {
      return $post[0]['graphql']['media'];
    }
    if(isset($post[0]['media'])) {
      return $post[0]['media'];
    }

    return null;
  }
  /**
   * Pulls the location object out of the data embedded in a location page.
   *
   * @param string $html location page HTML
   * @return array|null the value at entry_data.LocationsPage[0].graphql.location
   *                    with its 'media' and 'top_posts' keys removed, or null
   *                    when it is not there
   */
  private static function _extractVenueDataFromVenuePage($html) {
    $shared = self::_extractIGData($html);
    if(!$shared || !isset($shared['entry_data']['LocationsPage'])) {
      return null;
    }

    $pages = $shared['entry_data']['LocationsPage'];
    if(!isset($pages[0]['graphql']['location'])) {
      return null;
    }

    $venue = $pages[0]['graphql']['location'];
    # we don't need these and they're huge, so drop them now
    unset($venue['media'], $venue['top_posts']);

    return $venue;
  }
  /**
   * Extracts the `window._sharedData` JSON object embedded in a page.
   *
   * Every <script> element is scanned; when several of them assign
   * `window._sharedData`, the last one wins.
   *
   * @param string $html page HTML
   * @return array|null the decoded JSON, or null when the HTML is empty,
   *                    cannot be loaded, contains no such script, or the
   *                    JSON does not decode
   */
  private static function _extractIGData($html) {
    // DOMDocument::loadHTML() throws a ValueError on an empty string in
    // PHP 8, which the @ operator does not suppress, so bail out early.
    if(!is_string($html) || $html === '') {
      return null;
    }

    $doc = new DOMDocument();
    // Parser warnings about malformed markup are expected and silenced; the
    // return value says whether loading actually worked.
    if(!@$doc->loadHTML($html)) {
      return null;
    }

    $xpath = new DOMXPath($doc);

    $data = null;
    foreach($xpath->query('//script') as $script) {
      if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
        $data = json_decode($match[1], true);
      }
    }

    return $data;
  }
  329. }