You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

402 lines
12 KiB

7 years ago
  1. <?php
  2. namespace p3k\XRay\Formats;
  3. use DOMDocument, DOMXPath;
  4. use DateTime, DateTimeZone;
  5. class Instagram extends Format {
  /**
   * Check whether a URL points at one of the known Instagram host names.
   *
   * Host names are case-insensitive, so the host is lower-cased before it is
   * compared, and the comparison itself is strict.
   *
   * @param string $url URL to inspect
   * @return bool true when the host is "instagram.com" or "www.instagram.com"
   */
  public static function matches_host($url) {
    $host = parse_url($url, PHP_URL_HOST);
    // parse_url() does not yield a string when the URL has no usable host part
    if(!is_string($host))
      return false;
    return in_array(strtolower($host), ['www.instagram.com','instagram.com'], true);
  }
  /**
   * Report whether this format handles the given URL.
   *
   * The decision is delegated entirely to matches_host().
   *
   * @param string $url URL to inspect
   * @return bool result of matches_host() for the URL
   */
  public static function matches($url) {
    $isInstagramHost = self::matches_host($url);
    return $isInstagramHost;
  }
  /**
   * Fetch an Instagram URL through the supplied HTTP client.
   *
   * When $opts['instagram_session'] is set and non-empty, it is sent as the
   * "sessionid" cookie on the request.
   *
   * @param object $http HTTP client; its get($url, $headers) method is called
   * @param string $url  URL to fetch
   * @param array  $opts optional settings; only 'instagram_session' is read here
   * @return mixed false when the URL does not match this format, otherwise
   *               whatever $http->get() returned
   */
  public static function fetch($http, $url, $opts=[]) {
    if(!self::matches($url)) {
      return false;
    }

    $requestHeaders = !empty($opts['instagram_session'])
      ? ['Cookie: sessionid='.$opts['instagram_session']]
      : [];

    $response = $http->get($url, $requestHeaders);

    // TODO: Check for errors such as getting redirected to the login page or
    // getting rate limited. Neither check is implemented yet. The intended
    // error responses are:
    //
    //   [
    //     'error' => 'rate_limited',
    //     'error_description' => 'Instagram has rate limited this client. Please try again later.',
    //     'url' => $result['url'],
    //     'code' => $result['code'],
    //   ]
    //
    //   [
    //     'error' => 'unauthorized',
    //     'error_description' => 'Instagram redirected to the login page. Either this user is private, or the client has been rate limited.',
    //     'url' => $result['url'],
    //     'code' => $result['code'],
    //   ]

    return $response;
  }
  /**
   * Dispatch an HTTP response to the profile, feed or photo parser.
   *
   * A URL ending in "instagram.com/<segment>/" is treated as a profile page;
   * it is parsed as a feed when $opts['expect'] is 'feed', otherwise as a
   * profile card. Every other URL is handed to the photo parser.
   *
   * @param object $http          HTTP client, passed through to the parsers
   * @param array  $http_response response array; the 'body' and 'url' keys are read
   * @param array  $opts          optional settings; only 'expect' is read here
   * @return array result of the selected parser
   */
  public static function parse($http, $http_response, $opts=[]) {
    $html = $http_response['body'];
    $url = $http_response['url'];

    $isProfileUrl = preg_match('#instagram.com/([^/]+)/$#', $url);
    if(!$isProfileUrl) {
      return self::parsePhoto($http, $html, $url);
    }

    $wantsFeed = isset($opts['expect']) && $opts['expect'] == 'feed';
    if($wantsFeed) {
      return self::parseFeed($http, $html, $url);
    }
    return self::parseProfile($http, $html, $url);
  }
  /**
   * Turn a profile page into an h-card result.
   *
   * @param object $http HTTP client (not used by this method)
   * @param string $html HTML of the profile page
   * @param string $url  URL of the profile page (not used by this method)
   * @return array the result of self::_unknown() when no profile data could be
   *               extracted, otherwise ['data' => card, 'source-format' => 'instagram']
   */
  private static function parseProfile($http, $html, $url) {
    $user = self::_parseProfileFromHTML($html);
    if($user) {
      return [
        'data' => self::_buildHCardFromInstagramProfile($user),
        'source-format' => 'instagram',
      ];
    }
    return self::_unknown();
  }
  /**
   * Turn a profile page into a feed of the entries embedded in it.
   *
   * The profile JSON is expected to list posts under
   * ['edge_owner_to_timeline_media']['edges']; when that list is absent an
   * empty feed is returned instead of iterating over an undefined value.
   * Edges without a node shortcode are skipped because no permalink can be
   * built for them.
   *
   * @param object $http HTTP client, passed through to parsePhotoFromData()
   * @param string $html HTML of the profile page
   * @param string $url  URL of the profile page (not used by this method)
   * @return array the result of self::_unknown() when no profile data could be
   *               extracted, otherwise a 'feed' structure with an 'items' list
   */
  private static function parseFeed($http, $html, $url) {
    $profileData = self::_parseProfileFromHTML($html);
    if(!$profileData)
      return self::_unknown();

    $photos = [];
    if(isset($profileData['edge_owner_to_timeline_media']['edges'])
      && is_array($profileData['edge_owner_to_timeline_media']['edges'])) {
      $photos = $profileData['edge_owner_to_timeline_media']['edges'];
    }

    $items = [];
    foreach($photos as $photoData) {
      if(!isset($photoData['node']['shortcode']))
        continue;

      $item = self::parsePhotoFromData($http, $photoData['node'],
        'https://www.instagram.com/p/'.$photoData['node']['shortcode'].'/', $profileData);
      // Note: Not all the photo info is available in the initial JSON.
      // Things like video mp4 URLs and person tags and locations are missing.
      // Consumers of the feed will need to fetch the photo permalink in order to get all missing information.
      $items[] = $item['data'];
    }

    return [
      'data' => [
        'type' => 'feed',
        'items' => $items,
      ],
      'source-format' => 'instagram',
    ];
  }
  /**
   * Parse a single photo permalink page.
   *
   * Extracts the photo data from the page HTML and hands it to
   * parsePhotoFromData().
   *
   * @param object      $http    HTTP client, passed through
   * @param string      $html    HTML of the photo page
   * @param string      $url     permalink of the photo
   * @param array|false $profile profile data for the author, passed through unchanged
   * @return array result of parsePhotoFromData()
   */
  private static function parsePhoto($http, $html, $url, $profile=false) {
    return self::parsePhotoFromData(
      $http,
      self::_extractPhotoDataFromPhotoPage($html),
      $url,
      $profile
    );
  }
  /**
   * Tell whether an alt text is the fixed "no description" placeholder string.
   *
   * The comparison is strict so that non-string values (for example boolean
   * true, which loosely equals any non-empty string) are not mistaken for the
   * placeholder.
   *
   * @param mixed $text alt text to test
   * @return bool true only when $text is exactly the placeholder string
   */
  private static function altTextIsPlaceholder($text) {
    return $text === 'No photo description available.';
  }
  /**
   * Build an h-entry result from the decoded data of a single post.
   *
   * Besides reading $photoData this method performs HTTP requests through
   * $http: one for the author profile (only when $profile is not supplied),
   * one per tagged user, and one for the location page.
   *
   * The 'published' property is only emitted when the data carries a
   * timestamp ('taken_at_timestamp' or 'date') that can be parsed; a post
   * without one no longer aborts with a call on an undefined value.
   *
   * @param object      $http      HTTP client used for the profile and location lookups
   * @param array|null  $photoData decoded post data
   * @param string      $url       permalink used as the entry URL
   * @param array|false $profile   already-known author profile, or false to look it up
   * @return array the result of self::_unknown() when $photoData is empty,
   *               otherwise ['data' => entry, 'original' => JSON string,
   *               'source-format' => 'instagram']
   */
  private static function parsePhotoFromData($http, $photoData, $url, $profile=false) {
    if(!$photoData)
      return self::_unknown();

    // Start building the h-entry
    $entry = [
      'type' => 'entry',
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    ];

    $profiles = [];

    // Fetch profile info for the author unless the caller already supplied it
    if(!$profile && isset($photoData['owner']['username'])) {
      $profile = self::_getInstagramProfile($photoData['owner']['username'], $http);
    }
    if($profile) {
      $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
      $profiles[] = $profile;
    }

    // Content and hashtags
    $caption = false;
    if(isset($photoData['caption'])) {
      $caption = $photoData['caption'];
    } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
      $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
    }

    if($caption) {
      if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
        $entry['category'] = [];
        foreach($matches[1] as $match) {
          $entry['category'][] = $match;
        }
      }

      $entry['content'] = [
        'text' => $caption
      ];
    }

    $refs = [];
    $meta = [];

    // Include the photo/video media URLs
    // (Always return arrays, even for single images)
    if(array_key_exists('edge_sidecar_to_children', $photoData)) {
      // Multi-post
      // For now, we will only pull photos from multi-posts, and skip videos.
      // https://github.com/aaronpk/XRay/issues/84
      $entry['photo'] = [];
      $children = isset($photoData['edge_sidecar_to_children']['edges'])
        && is_array($photoData['edge_sidecar_to_children']['edges'])
        ? $photoData['edge_sidecar_to_children']['edges']
        : [];
      foreach($children as $edge) {
        if(!isset($edge['node']['display_url']))
          continue;
        $entry['photo'][] = $edge['node']['display_url'];
        // Don't need to pull person-tags from here because the main parent object already has them.
        if(isset($edge['node']['accessibility_caption']) && $edge['node']['accessibility_caption'] && !self::altTextIsPlaceholder($edge['node']['accessibility_caption'])) {
          $meta[$edge['node']['display_url']] = [
            'alt' => $edge['node']['accessibility_caption']
          ];
        }
      }
    } else {
      // Single photo or video
      if(array_key_exists('display_src', $photoData))
        $entry['photo'] = [$photoData['display_src']];
      elseif(array_key_exists('display_url', $photoData))
        $entry['photo'] = [$photoData['display_url']];

      // The alt text is keyed by the photo URL, so it can only be recorded when a photo was found
      if(isset($entry['photo'][0]) && isset($photoData['accessibility_caption']) && $photoData['accessibility_caption'] && !self::altTextIsPlaceholder($photoData['accessibility_caption'])) {
        $meta[$entry['photo'][0]] = [
          'alt' => $photoData['accessibility_caption']
        ];
      }

      if(isset($photoData['is_video']) && $photoData['is_video'] && isset($photoData['video_url'])) {
        $entry['video'] = [$photoData['video_url']];
      }
    }

    // Find person tags and fetch user profiles
    if(isset($photoData['edge_media_to_tagged_user']['edges'])) {
      if(!isset($entry['category'])) $entry['category'] = [];

      foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
        if(!isset($edge['node']['user']['username']))
          continue;
        $taggedProfile = self::_getInstagramProfile($edge['node']['user']['username'], $http);
        if($taggedProfile) {
          $card = self::_buildHCardFromInstagramProfile($taggedProfile);
          $entry['category'][] = $card['url'];
          $refs[$card['url']] = $card;
          $profiles[] = $taggedProfile;
        }
      }
    }

    // Published date. createFromFormat() returns false when the value cannot
    // be parsed, so $published stays false unless a usable timestamp was found.
    $published = false;
    if(isset($photoData['taken_at_timestamp']))
      $published = DateTime::createFromFormat('U', (string)$photoData['taken_at_timestamp']);
    elseif(isset($photoData['date']))
      $published = DateTime::createFromFormat('U', (string)$photoData['date']);

    // Include venue data
    $locations = [];
    if(isset($photoData['location']['id'])) {
      $location = self::_getInstagramLocation($photoData['location']['id'], $http);
      if($location) {
        $entry['location'] = [$location['url']];
        $refs[$location['url']] = $location;
        $locations[] = $location;

        // Look up timezone. Coordinates are checked with is_numeric() so that
        // a latitude of 0 is still treated as a valid coordinate.
        if($published && is_numeric($location['latitude']) && is_numeric($location['longitude'])) {
          $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
          if($tz) {
            $published->setTimeZone(new DateTimeZone($tz));
          }
        }
      }
    }

    if($published) {
      $entry['published'] = $published->format('c');
    }

    if(count($refs)) {
      $entry['refs'] = $refs;
    }

    if(count($meta)) {
      $entry['meta'] = $meta;
    }

    $entry['post-type'] = \p3k\XRay\PostType::discover($entry);

    return [
      'data' => $entry,
      'original' => json_encode([
        'photo' => $photoData,
        'profiles' => $profiles,
        'locations' => $locations
      ]),
      'source-format' => 'instagram',
    ];
  }
  /**
   * Convert decoded Instagram profile data into an h-card array.
   *
   * Every profile key is treated as optional, so a missing 'full_name',
   * 'username' or 'profile_pic_url' no longer triggers an undefined-index
   * warning; the resulting card keeps the same keys as before.
   *
   * - name:  'full_name' when non-empty, otherwise 'username'
   * - url:   'external_url' when non-empty, otherwise the Instagram profile URL
   * - photo: 'profile_pic_url_hd' when set, otherwise 'profile_pic_url' (null if absent)
   * - note:  'biography', only when set
   *
   * @param array|null|false $profile decoded profile data
   * @return array|false the card, or false when $profile is empty
   */
  private static function _buildHCardFromInstagramProfile($profile) {
    if(!$profile) return false;

    $username = isset($profile['username']) ? $profile['username'] : null;

    $author = [
      'type' => 'card'
    ];

    if(!empty($profile['full_name']))
      $author['name'] = $profile['full_name'];
    else
      $author['name'] = $username;

    if(!empty($profile['external_url']))
      $author['url'] = $profile['external_url'];
    else
      $author['url'] = 'https://www.instagram.com/' . $username;

    if(isset($profile['profile_pic_url_hd']))
      $author['photo'] = $profile['profile_pic_url_hd'];
    elseif(isset($profile['profile_pic_url']))
      $author['photo'] = $profile['profile_pic_url'];
    else
      $author['photo'] = null;

    if(isset($profile['biography']))
      $author['note'] = $profile['biography'];

    return $author;
  }
  /**
   * Download a user's profile page and extract the profile data from it.
   *
   * @param string $username Instagram username, inserted into the profile URL as-is
   * @param object $http     HTTP client; its get($url) method is called
   * @return array|null extracted profile data, or null when the response
   *                    reports an error or no profile data is found
   */
  private static function _getInstagramProfile($username, $http) {
    $profileUrl = 'https://www.instagram.com/'.$username.'/';
    $response = $http->get($profileUrl);
    if($response['error']) {
      return null;
    }
    return self::_parseProfileFromHTML($response['body']);
  }
  /**
   * Extract the user object from the embedded data of a profile page.
   *
   * Reads ['entry_data']['ProfilePage'][0]['graphql']['user'] from the data
   * returned by _extractIGData().
   *
   * @param string $html HTML of the profile page
   * @return array|null the user data, or null when the expected structure is absent
   */
  private static function _parseProfileFromHTML($html) {
    $data = self::_extractIGData($html);

    if(!isset($data['entry_data']['ProfilePage'][0])) {
      return null;
    }

    $page = $data['entry_data']['ProfilePage'][0];
    if(!$page || !isset($page['graphql']['user'])) {
      return null;
    }

    return $page['graphql']['user'];
  }
  /**
   * Download a location page and describe the venue as an h-card array.
   *
   * @param mixed  $id   location id, inserted into the location URL as-is
   * @param object $http HTTP client; its get($url) method is called
   * @return array|null card with 'type', 'name', 'url', 'latitude' and
   *                    'longitude', or null when the response body is empty
   *                    or no venue data is found
   */
  private static function _getInstagramLocation($id, $http) {
    $igURL = 'https://www.instagram.com/explore/locations/'.$id.'/';
    $response = $http->get($igURL);

    if(!$response['body']) {
      return null;
    }

    $venue = self::_extractVenueDataFromVenuePage($response['body']);
    if(!$venue) {
      return null;
    }

    return [
      'type' => 'card',
      'name' => $venue['name'],
      'url' => $igURL,
      'latitude' => $venue['lat'],
      'longitude' => $venue['lng'],
    ];
  }
  /**
   * Extract the post data from the embedded data of a photo page.
   *
   * Looks under ['entry_data']['PostPage'][0] and returns the first of
   * ['graphql']['shortcode_media'], ['graphql']['media'] or ['media'] that is set.
   *
   * @param string $html HTML of the photo page
   * @return array|null the post data, or null when none of the known locations is present
   */
  private static function _extractPhotoDataFromPhotoPage($html) {
    $data = self::_extractIGData($html);

    if(!is_array($data) || !isset($data['entry_data']) || !is_array($data['entry_data'])) {
      return null;
    }

    $entryData = $data['entry_data'];
    if(!array_key_exists('PostPage', $entryData)) {
      return null;
    }

    $post = $entryData['PostPage'];

    if(isset($post[0]['graphql']['shortcode_media'])) {
      return $post[0]['graphql']['shortcode_media'];
    }
    if(isset($post[0]['graphql']['media'])) {
      return $post[0]['graphql']['media'];
    }
    if(isset($post[0]['media'])) {
      return $post[0]['media'];
    }

    return null;
  }
  /**
   * Extract the location object from the embedded data of a location page.
   *
   * Reads ['entry_data']['LocationsPage'][0]['graphql']['location'] and drops
   * its 'media' and 'top_posts' entries before returning it.
   *
   * @param string $html HTML of the location page
   * @return array|null the trimmed location data, or null when the expected structure is absent
   */
  private static function _extractVenueDataFromVenuePage($html) {
    $data = self::_extractIGData($html);

    if(!$data || !isset($data['entry_data']['LocationsPage'][0]['graphql']['location'])) {
      return null;
    }

    $location = $data['entry_data']['LocationsPage'][0]['graphql']['location'];

    # we don't need these and they're huge, so drop them now
    unset($location['media'], $location['top_posts']);

    return $location;
  }
  /**
   * Extract and decode the "window._sharedData" JSON embedded in a page.
   *
   * Every <script> element is scanned for an assignment of the form
   * "window._sharedData = {...};". When several scripts match, the last one
   * wins.
   *
   * Empty or non-string input is rejected up front: DOMDocument::loadHTML()
   * raises a ValueError for an empty string on PHP 8, which the "@" operator
   * cannot suppress. The return value of loadHTML() is checked as well,
   * because the DOMDocument object itself is always truthy.
   *
   * @param string $html HTML document to scan
   * @return mixed the decoded JSON (arrays for objects), or null when the
   *               input is empty, cannot be loaded, or contains no match
   */
  private static function _extractIGData($html) {
    if(!is_string($html) || $html === '') {
      return null;
    }

    $doc = new DOMDocument();
    // Markup warnings from real-world HTML are expected and deliberately silenced
    $loaded = @$doc->loadHTML($html);
    if(!$loaded) {
      return null;
    }

    $xpath = new DOMXPath($doc);

    $data = null;
    foreach($xpath->query('//script') as $script) {
      if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
        $data = json_decode($match[1], true);
      }
    }

    return $data;
  }
  326. }