You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

409 lines
12 KiB

  1. <?php
  2. namespace p3k\XRay\Formats;
  // Root of the Instagram website; every URL requested by this format is built from it.
  const BASE_URL = 'https://www.instagram.com/';

  // sprintf() template for the GraphQL media query; %s receives the JSON variables string.
  const QUERY_MEDIA = BASE_URL . 'graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%s';

  // sprintf() template for the JSON variables: user id (%s), page size (%d), pagination cursor (%s).
  const QUERY_MEDIA_VARS = '{"id":"%s","first":%d,"after":"%s"}';
  6. use DOMDocument, DOMXPath;
  7. use DateTime, DateTimeZone;
  8. class Instagram extends Format {
  /**
   * Value of `rhx_gis` taken from the shared data of the last parsed page that
   * contained one; used as the prefix when signing GraphQL requests.
   * Stays null until such a page has been parsed.
   */
  private static $gis = null;

  /**
   * Page size used for the follow-up GraphQL media query when building a feed.
   * Overridden by parse() when the 'length' option is given.
   */
  private static $extra_photos = 20;
  /**
   * Tell whether a URL points at the Instagram website.
   *
   * @param string $url URL to inspect
   * @return bool true when the URL's host is `www.instagram.com` or `instagram.com`
   */
  public static function matches_host($url) {
    $instagramHosts = ['www.instagram.com', 'instagram.com'];
    $urlHost = parse_url($url, PHP_URL_HOST);

    foreach($instagramHosts as $candidate) {
      if($urlHost == $candidate)
        return true;
    }

    return false;
  }
  /**
   * Decide whether this format handles the given URL.
   * The decision is delegated entirely to matches_host().
   *
   * @param string $url URL to inspect
   * @return bool
   */
  public static function matches($url) {
    $isInstagramURL = self::matches_host($url);
    return $isInstagramURL;
  }
  /**
   * Entry point: route an Instagram URL to the profile, feed or photo parser.
   *
   * @param object $http HTTP client, passed through to the parsers
   * @param string $html HTML that was fetched for $url
   * @param string $url  URL the HTML came from
   * @param array  $opts Recognised keys:
   *                     - 'length': requested number of items; 12 is subtracted to get the page
   *                       size of the follow-up GraphQL query (presumably 12 is the number of
   *                       photos already embedded in the profile page -- TODO confirm)
   *                     - 'expect': when equal to 'feed', a profile URL is parsed as a feed
   * @return array whatever parseFeed(), parseProfile() or parsePhoto() returns
   */
  public static function parse($http, $html, $url, $opts=[]) {
    if(isset($opts['length'])) {
      // Clamp at zero: a 'length' below 12 must not turn into a negative page size.
      self::$extra_photos = max(0, intval($opts['length']) - 12);
    }

    // A single path segment followed by a trailing slash is a profile URL.
    // The dot is escaped so that only a literal "instagram.com" matches.
    $isProfileURL = preg_match('#instagram\.com/([^/]+)/$#', $url);
    if(!$isProfileURL)
      return self::parsePhoto($http, $html, $url);

    if(isset($opts['expect']) && $opts['expect'] == 'feed')
      return self::parseFeed($http, $html, $url);

    return self::parseProfile($http, $html, $url);
  }
  /**
   * Build the response for a profile page: an h-card describing the user.
   *
   * @param object $http HTTP client (unused here)
   * @param string $html HTML of the profile page
   * @param string $url  URL of the profile page (unused here)
   * @return array ['data' => h-card, 'source-format' => 'instagram'], or the result of
   *               _unknown() when no profile data could be found in the HTML
   */
  private static function parseProfile($http, $html, $url) {
    $userData = self::_parseProfileFromHTML($html);

    if($userData) {
      return [
        'data' => self::_buildHCardFromInstagramProfile($userData),
        'source-format' => 'instagram',
      ];
    }

    return self::_unknown();
  }
  /**
   * Compute the value sent in the `x-instagram-gis` request header.
   *
   * @param string $params JSON variables string of the GraphQL query
   * @return string md5 hex digest of "<stored gis>:<params>"
   */
  private static function _getIntstagramGIS($params) {
    return md5(self::$gis . ':' . $params);
  }
  /**
   * Fetch the next page of a user's timeline photos from the GraphQL endpoint.
   *
   * @param object $http        HTTP client; get($url, $headers) must return an array with
   *                            'error' and 'body' keys
   * @param string $html        unused
   * @param string $url         unused
   * @param array  $profileData user data as returned by _parseProfileFromHTML()
   * @return array|null list of timeline edges; an empty array when there is nothing to
   *                    request; null when the request failed or the response had an
   *                    unexpected shape
   */
  private static function _getMorePhotos($http,$html,$url,$profileData) {
    // Nothing to ask for when no extra photos are wanted.
    if(self::$extra_photos <= 0)
      return [];

    // Without a cursor there is no following page to request
    // (NOTE(review): assumes an absent/empty end_cursor means "last page" -- confirm).
    if(empty($profileData['edge_owner_to_timeline_media']['page_info']['end_cursor']))
      return [];

    if(!isset($profileData['id']))
      return null;

    $params = sprintf(
      QUERY_MEDIA_VARS,
      $profileData['id'],
      self::$extra_photos,
      $profileData['edge_owner_to_timeline_media']['page_info']['end_cursor']
    );

    // The signature header is derived from the exact variables string sent in the query.
    $headers = [
      'x-instagram-gis: ' . self::_getIntstagramGIS($params),
      'x-requested-with: XMLHttpRequest',
    ];

    $resp = $http->get(sprintf(QUERY_MEDIA, $params), $headers);
    if(!empty($resp['error']) || !isset($resp['body']))
      return null;

    // The body may be an error page rather than JSON, so verify the whole path before using it.
    $data = json_decode($resp['body'], true);
    if(!isset($data['data']['user']['edge_owner_to_timeline_media']['edges']))
      return null;

    $edges = $data['data']['user']['edge_owner_to_timeline_media']['edges'];
    return is_array($edges) ? $edges : null;
  }
  /**
   * Build a feed of entries from a profile page plus one extra page of photos.
   *
   * @param object $http HTTP client
   * @param string $html HTML of the profile page
   * @param string $url  URL of the profile page
   * @return array ['data' => ['type' => 'feed', 'items' => [...]], 'source-format' => 'instagram'],
   *               or the result of _unknown() when the HTML contains no profile data
   */
  private static function parseFeed($http, $html, $url) {
    $profileData = self::_parseProfileFromHTML($html);
    if(!$profileData)
      return self::_unknown();

    $photos = [];
    if(isset($profileData['edge_owner_to_timeline_media']['edges'])
      && is_array($profileData['edge_owner_to_timeline_media']['edges'])) {
      $photos = $profileData['edge_owner_to_timeline_media']['edges'];
    }

    // _getMorePhotos() returns null when the follow-up request fails; merging null
    // would discard (PHP 7) or abort (PHP 8) the photos we already have.
    $morePhotos = self::_getMorePhotos($http,$html,$url,$profileData);
    if(is_array($morePhotos))
      $photos = array_merge($photos,$morePhotos);

    $items = [];
    foreach($photos as $photoData) {
      // An edge without a shortcode cannot be turned into a permalink.
      if(!isset($photoData['node']['shortcode']))
        continue;

      $item = self::parsePhotoFromData($http, $photoData['node'],
        BASE_URL.'p/'.$photoData['node']['shortcode'].'/', $profileData);
      // Note: Not all the photo info is available in the initial JSON.
      // Things like video mp4 URLs and person tags and locations are missing.
      // Consumers of the feed will need to fetch the photo permalink in order to get all missing information.
      // if($photoData['is_video'])
      //   $item['data']['video'] = true;
      $items[] = $item['data'];
    }

    return [
      'data' => [
        'type' => 'feed',
        'items' => $items,
      ],
      'source-format' => 'instagram',
    ];
  }
  /**
   * Parse a photo permalink page into an entry.
   *
   * @param object      $http    HTTP client
   * @param string      $html    HTML of the photo page
   * @param string      $url     URL of the photo page
   * @param array|false $profile profile data of the author if already known
   * @return array result of parsePhotoFromData()
   */
  private static function parsePhoto($http, $html, $url, $profile=false) {
    return self::parsePhotoFromData(
      $http,
      self::_extractPhotoDataFromPhotoPage($html),
      $url,
      $profile
    );
  }
  /**
   * Tell whether an accessibility caption is the stock text that stands in for a missing
   * description.
   *
   * Uses a strict comparison: with `==`, non-string values such as `true` would compare
   * equal to the placeholder string.
   *
   * @param mixed $text caption to test
   * @return bool true only when $text is exactly the placeholder string
   */
  private static function altTextIsPlaceholder($text) {
    return $text === 'No photo description available.';
  }
  /**
   * Convert one Instagram media object into an h-entry.
   *
   * Fetches additional pages over HTTP as needed: the author's profile (when $profile is
   * not supplied), the profile of every tagged user, and the location page.
   *
   * @param object      $http      HTTP client
   * @param array|null  $photoData media object (from a photo page or a timeline edge node)
   * @param string      $url       permalink to use as the entry URL
   * @param array|false $profile   profile data of the author if already known
   * @return array ['data' => entry, 'original' => JSON string of the raw inputs,
   *               'source-format' => 'instagram'], or the result of _unknown() when
   *               $photoData is empty. 'published' is omitted from the entry when the
   *               media object carries no usable timestamp.
   */
  private static function parsePhotoFromData($http, $photoData, $url, $profile=false) {
    if(!$photoData)
      return self::_unknown();

    // Start building the h-entry
    $entry = [
      'type' => 'entry',
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    ];

    $profiles = [];

    // Fetch profile info for this user unless the caller already has it
    if(!$profile && isset($photoData['owner']['username'])) {
      $profile = self::_getInstagramProfile($photoData['owner']['username'], $http);
    }
    if($profile) {
      $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
      $profiles[] = $profile;
    }

    // Content and hashtags
    $caption = false;
    if(isset($photoData['caption'])) {
      $caption = $photoData['caption'];
    } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
      $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
    }

    if($caption) {
      if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
        $entry['category'] = $matches[1];
      }
      $entry['content'] = [
        'text' => $caption
      ];
    }

    $refs = [];
    $meta = [];

    // Include the photo/video media URLs
    // (Always return arrays, even for single images)
    if(array_key_exists('edge_sidecar_to_children', $photoData)) {
      // Multi-post
      // For now, we will only pull photos from multi-posts, and skip videos.
      // https://github.com/aaronpk/XRay/issues/84
      $entry['photo'] = [];
      foreach($photoData['edge_sidecar_to_children']['edges'] as $edge) {
        $child = $edge['node'];
        $entry['photo'][] = $child['display_url'];
        // Don't need to pull person-tags from here because the main parent object already has them.
        if(!empty($child['accessibility_caption']) && !self::altTextIsPlaceholder($child['accessibility_caption'])) {
          $meta[$child['display_url']] = [
            'alt' => $child['accessibility_caption']
          ];
        }
      }
    } else {
      // Single photo or video
      if(array_key_exists('display_src', $photoData))
        $entry['photo'] = [$photoData['display_src']];
      elseif(array_key_exists('display_url', $photoData))
        $entry['photo'] = [$photoData['display_url']];

      // Alt text is keyed by the photo URL, so it can only be recorded when a photo exists.
      if(isset($entry['photo'][0])
        && !empty($photoData['accessibility_caption'])
        && !self::altTextIsPlaceholder($photoData['accessibility_caption'])) {
        $meta[$entry['photo'][0]] = [
          'alt' => $photoData['accessibility_caption']
        ];
      }

      if(!empty($photoData['is_video']) && isset($photoData['video_url'])) {
        $entry['video'] = [$photoData['video_url']];
      }
    }

    // Find person tags and fetch user profiles
    if(isset($photoData['edge_media_to_tagged_user']['edges'])) {
      if(!isset($entry['category'])) $entry['category'] = [];
      foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
        if(!isset($edge['node']['user']['username']))
          continue;
        // Separate variable so the author's $profile is not overwritten.
        $taggedProfile = self::_getInstagramProfile($edge['node']['user']['username'], $http);
        if($taggedProfile) {
          $card = self::_buildHCardFromInstagramProfile($taggedProfile);
          $entry['category'][] = $card['url'];
          $refs[$card['url']] = $card;
          $profiles[] = $taggedProfile;
        }
      }
    }

    // Published date: stays null when no timestamp is present or it cannot be parsed,
    // since createFromFormat() returns false on failure.
    $published = null;
    $timestamp = null;
    if(isset($photoData['taken_at_timestamp']))
      $timestamp = $photoData['taken_at_timestamp'];
    elseif(isset($photoData['date']))
      $timestamp = $photoData['date'];
    if($timestamp !== null) {
      $parsed = DateTime::createFromFormat('U', $timestamp);
      if($parsed)
        $published = $parsed;
    }

    // Include venue data
    $locations = [];
    if(isset($photoData['location']['id'])) {
      $location = self::_getInstagramLocation($photoData['location']['id'], $http);
      if($location) {
        $entry['location'] = [$location['url']];
        $refs[$location['url']] = $location;
        $locations[] = $location;

        // Look up timezone so the published date is expressed in the venue's local time
        if($published && $location['latitude']) {
          $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
          if($tz) {
            $published->setTimeZone(new DateTimeZone($tz));
          }
        }
      }
    }

    if($published) {
      $entry['published'] = $published->format('c');
    }

    if(count($refs)) {
      $entry['refs'] = $refs;
    }

    if(count($meta)) {
      $entry['meta'] = $meta;
    }

    $entry['post-type'] = \p3k\XRay\PostType::discover($entry);

    return [
      'data' => $entry,
      'original' => json_encode([
        'photo' => $photoData,
        'profiles' => $profiles,
        'locations' => $locations
      ]),
      'source-format' => 'instagram',
    ];
  }
  /**
   * Turn Instagram user data into an h-card array.
   *
   * Missing optional fields no longer raise undefined-index notices; the resulting card
   * has the same keys as before.
   *
   * @param array|null|false $profile user data as returned by _parseProfileFromHTML()
   * @return array|false card with 'type', 'name', 'url', 'photo' and, when a biography is
   *                     present, 'note'; false when $profile is empty
   */
  private static function _buildHCardFromInstagramProfile($profile) {
    if(!$profile) return false;

    $username = isset($profile['username']) ? $profile['username'] : null;

    $author = [
      'type' => 'card'
    ];

    // Prefer the display name, fall back to the account name
    $author['name'] = !empty($profile['full_name']) ? $profile['full_name'] : $username;

    // Prefer the website the user links to, fall back to the Instagram profile URL
    $author['url'] = !empty($profile['external_url'])
      ? $profile['external_url']
      : BASE_URL . $username;

    // Prefer the high-resolution picture; keep the key present even when neither exists
    if(isset($profile['profile_pic_url_hd']))
      $author['photo'] = $profile['profile_pic_url_hd'];
    elseif(isset($profile['profile_pic_url']))
      $author['photo'] = $profile['profile_pic_url'];
    else
      $author['photo'] = null;

    if(isset($profile['biography']))
      $author['note'] = $profile['biography'];

    return $author;
  }
  /**
   * Download a user's profile page and extract the user data from it.
   *
   * @param string $username Instagram account name
   * @param object $http     HTTP client; get($url) must return an array with 'error' and 'body'
   * @return array|null user data, or null when the request reported an error or the page
   *                    held no profile data
   */
  private static function _getInstagramProfile($username, $http) {
    $profilePage = $http->get(BASE_URL.$username.'/');

    if($profilePage['error'])
      return null;

    return self::_parseProfileFromHTML($profilePage['body']);
  }
  /**
   * Extract the user object from a profile page's shared data.
   *
   * Side effect: when the shared data contains `rhx_gis`, it is stored in self::$gis.
   *
   * @param string $html HTML of a profile page
   * @return array|null the `graphql.user` object of the first ProfilePage entry, or null
   */
  private static function _parseProfileFromHTML($html) {
    $sharedData = self::_extractIGData($html);

    if(isset($sharedData['rhx_gis']))
      self::$gis = $sharedData['rhx_gis'];

    if(!isset($sharedData['entry_data']['ProfilePage'][0]))
      return null;

    $page = $sharedData['entry_data']['ProfilePage'][0];
    if(!$page || !isset($page['graphql']['user']))
      return null;

    return $page['graphql']['user'];
  }
  /**
   * Download a location page and describe the venue as an h-card.
   *
   * @param string|int $id   Instagram location id
   * @param object     $http HTTP client; get($url) must return an array with a 'body' key
   * @return array|null card with 'type', 'name', 'url', 'latitude' and 'longitude', or null
   *                    when the response has no body or no venue data could be extracted
   */
  private static function _getInstagramLocation($id, $http) {
    $igURL = BASE_URL.'explore/locations/'.$id.'/';
    $response = $http->get($igURL);

    if(!$response['body'])
      return null;

    $venue = self::_extractVenueDataFromVenuePage($response['body']);
    if(!$venue)
      return null;

    $card = ['type' => 'card'];
    $card['name'] = $venue['name'];
    $card['url'] = $igURL;
    $card['latitude'] = $venue['lat'];
    $card['longitude'] = $venue['lng'];
    return $card;
  }
  /**
   * Extract the media object from a photo page's shared data.
   *
   * Three locations are tried in order inside the first PostPage entry:
   * `graphql.shortcode_media`, `graphql.media`, then `media`.
   *
   * @param string $html HTML of a photo page
   * @return array|null the media object, or null when none of the locations exists
   */
  private static function _extractPhotoDataFromPhotoPage($html) {
    $sharedData = self::_extractIGData($html);

    if(!$sharedData || !is_array($sharedData) || !array_key_exists('entry_data', $sharedData))
      return null;

    $entryData = $sharedData['entry_data'];
    if(!is_array($entryData) || !array_key_exists('PostPage', $entryData))
      return null;

    $post = $entryData['PostPage'];

    foreach(['shortcode_media', 'media'] as $key) {
      if(isset($post[0]['graphql'][$key]))
        return $post[0]['graphql'][$key];
    }

    if(isset($post[0]['media']))
      return $post[0]['media'];

    return null;
  }
  /**
   * Extract the location object from a location page's shared data.
   *
   * @param string $html HTML of a location page
   * @return array|null the `graphql.location` object of the first LocationsPage entry with
   *                    its 'media' and 'top_posts' keys removed, or null when absent
   */
  private static function _extractVenueDataFromVenuePage($html) {
    $sharedData = self::_extractIGData($html);

    if(!$sharedData || !isset($sharedData['entry_data']['LocationsPage']))
      return null;

    $pages = $sharedData['entry_data']['LocationsPage'];
    if(!isset($pages[0]['graphql']['location']))
      return null;

    $venue = $pages[0]['graphql']['location'];
    # we don't need these and they're huge, so drop them now
    unset($venue['media'], $venue['top_posts']);
    return $venue;
  }
  /**
   * Pull the `window._sharedData` JSON object out of an Instagram HTML page.
   *
   * @param string $html HTML document
   * @return mixed decoded JSON (associative arrays) of the last <script> whose text
   *               contains a `window._sharedData = {...};` assignment; null when the HTML
   *               is empty, cannot be loaded, contains no such script, or the JSON is invalid
   */
  private static function _extractIGData($html) {
    // An empty source makes loadHTML() throw on PHP 8 (and warn on older versions),
    // and there is nothing to find in it anyway.
    if(!is_string($html) || $html === '')
      return null;

    $doc = new DOMDocument();
    // Real-world markup triggers libxml warnings, hence the suppression; the return
    // value (not the always-truthy object) says whether loading worked.
    if(!@$doc->loadHTML($html))
      return null;

    $xpath = new DOMXPath($doc);

    $data = null;
    foreach($xpath->query('//script') as $script) {
      if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
        $data = json_decode($match[1], true);
      }
    }

    return $data;
  }
  331. }