You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

372 lines
11 KiB

  1. <?php
  2. namespace p3k\XRay\Formats;
  // Root of the Instagram site. Profile, photo permalink and location URLs
  // in the Instagram class are built by appending a path to this value.
  3. const BASE_URL = 'https://www.instagram.com/';
  // sprintf() template for the GraphQL media query endpoint. The single %s
  // receives the query variables string produced from QUERY_MEDIA_VARS.
  4. const QUERY_MEDIA = BASE_URL.'graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%s';
  // sprintf() template for the query variables JSON: the first %s is the
  // profile id, the second %s is the pagination cursor ("after"); the page
  // size ("first") is fixed at 20.
  5. const QUERY_MEDIA_VARS = '{"id":"%s","first":20,"after":"%s"}';
  6. use DOMDocument, DOMXPath;
  7. use DateTime, DateTimeZone;
  8. class Instagram extends Format {
  9. private static $gis;
  10. public static function matches_host($url) {
  11. $host = parse_url($url, PHP_URL_HOST);
  12. return in_array($host, ['www.instagram.com','instagram.com']);
  13. }
  14. public static function matches($url) {
  15. return self::matches_host($url);
  16. }
  17. public static function parse($http, $html, $url, $opts=[]) {
  18. if(preg_match('#instagram.com/([^/]+)/$#', $url)) {
  19. if(isset($opts['expect']) && $opts['expect'] == 'feed')
  20. return self::parseFeed($http, $html, $url);
  21. else
  22. return self::parseProfile($http, $html, $url);
  23. } else {
  24. return self::parsePhoto($http, $html, $url);
  25. }
  26. }
  27. private static function parseProfile($http, $html, $url) {
  28. $profileData = self::_parseProfileFromHTML($html);
  29. if(!$profileData)
  30. return self::_unknown();
  31. $card = self::_buildHCardFromInstagramProfile($profileData);
  32. return [
  33. 'data' => $card
  34. ];
  35. }
  36. private static function _getIntstagramGIS($params) {
  37. $data = self::$gis.":".$params;
  38. return md5($data);
  39. }
  40. private static function _getMorePhotos($http,$html,$url,$profileData) {
  41. $params = sprintf(QUERY_MEDIA_VARS, $profileData['id'], $profileData['edge_owner_to_timeline_media']['page_info']['end_cursor']);
  42. $url = sprintf(QUERY_MEDIA,$params);
  43. $headers = [];
  44. $headers[] = 'x-instagram-gis: ' . self::_getIntstagramGIS($params);
  45. $headers[] = 'x-requested-with: XMLHttpRequest';
  46. $resp = $http->get($url,$headers);
  47. if(!$resp['error'])
  48. $data = json_decode($resp['body'],true);
  49. $photos = $data['data']['user']['edge_owner_to_timeline_media']['edges'];
  50. return $photos;
  51. return null;
  52. }
  53. private static function parseFeed($http, $html, $url) {
  54. $profileData = self::_parseProfileFromHTML($html);
  55. if(!$profileData)
  56. return self::_unknown();
  57. $photos = $profileData['edge_owner_to_timeline_media']['edges'];
  58. $items = [];
  59. $morePhotos = self::_getMorePhotos($http,$html,$url,$profileData);
  60. $photos = array_merge($photos,$morePhotos);
  61. foreach($photos as $photoData) {
  62. $item = self::parsePhotoFromData($http, $photoData['node'],
  63. BASE_URL.'p/'.$photoData['node']['shortcode'].'/', $profileData);
  64. // Note: Not all the photo info is available in the initial JSON.
  65. // Things like video mp4 URLs and person tags and locations are missing.
  66. // Consumers of the feed will need to fetch the photo permalink in order to get all missing information.
  67. // if($photoData['is_video'])
  68. // $item['data']['video'] = true;
  69. $items[] = $item['data'];
  70. }
  71. return [
  72. 'data' => [
  73. 'type' => 'feed',
  74. 'items' => $items,
  75. ]
  76. ];
  77. }
  78. private static function parsePhoto($http, $html, $url, $profile=false) {
  79. $photoData = self::_extractPhotoDataFromPhotoPage($html);
  80. return self::parsePhotoFromData($http, $photoData, $url, $profile);
  81. }
  82. private static function parsePhotoFromData($http, $photoData, $url, $profile=false) {
  83. if(!$photoData)
  84. return self::_unknown();
  85. // Start building the h-entry
  86. $entry = array(
  87. 'type' => 'entry',
  88. 'url' => $url,
  89. 'author' => [
  90. 'type' => 'card',
  91. 'name' => null,
  92. 'photo' => null,
  93. 'url' => null
  94. ]
  95. );
  96. $profiles = [];
  97. if(!$profile) {
  98. // Fetch profile info for this user
  99. $username = $photoData['owner']['username'];
  100. $profile = self::_getInstagramProfile($username, $http);
  101. if($profile) {
  102. $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
  103. $profiles[] = $profile;
  104. }
  105. } else {
  106. $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
  107. $profiles[] = $profile;
  108. }
  109. // Content and hashtags
  110. $caption = false;
  111. if(isset($photoData['caption'])) {
  112. $caption = $photoData['caption'];
  113. } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
  114. $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
  115. }
  116. if($caption) {
  117. if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
  118. $entry['category'] = [];
  119. foreach($matches[1] as $match) {
  120. $entry['category'][] = $match;
  121. }
  122. }
  123. $entry['content'] = [
  124. 'text' => $caption
  125. ];
  126. }
  127. $refs = [];
  128. // Include the photo/video media URLs
  129. // (Always return arrays, even for single images)
  130. if(array_key_exists('edge_sidecar_to_children', $photoData)) {
  131. // Multi-post
  132. // For now, we will only pull photos from multi-posts, and skip videos.
  133. $entry['photo'] = [];
  134. foreach($photoData['edge_sidecar_to_children']['edges'] as $edge) {
  135. $entry['photo'][] = $edge['node']['display_url'];
  136. // Don't need to pull person-tags from here because the main parent object already has them.
  137. }
  138. } else {
  139. // Single photo or video
  140. if(array_key_exists('display_src', $photoData))
  141. $entry['photo'] = [$photoData['display_src']];
  142. elseif(array_key_exists('display_url', $photoData))
  143. $entry['photo'] = [$photoData['display_url']];
  144. if(isset($photoData['is_video']) && $photoData['is_video'] && isset($photoData['video_url'])) {
  145. $entry['video'] = [$photoData['video_url']];
  146. }
  147. }
  148. // Find person tags and fetch user profiles
  149. if(isset($photoData['edge_media_to_tagged_user']['edges'])) {
  150. if(!isset($entry['category'])) $entry['category'] = [];
  151. foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
  152. $profile = self::_getInstagramProfile($edge['node']['user']['username'], $http);
  153. if($profile) {
  154. $card = self::_buildHCardFromInstagramProfile($profile);
  155. $entry['category'][] = $card['url'];
  156. $refs[$card['url']] = $card;
  157. $profiles[] = $profile;
  158. }
  159. }
  160. }
  161. // Published date
  162. if(isset($photoData['taken_at_timestamp']))
  163. $published = DateTime::createFromFormat('U', $photoData['taken_at_timestamp']);
  164. elseif(isset($photoData['date']))
  165. $published = DateTime::createFromFormat('U', $photoData['date']);
  166. // Include venue data
  167. $locations = [];
  168. if(isset($photoData['location'])) {
  169. $location = self::_getInstagramLocation($photoData['location']['id'], $http);
  170. if($location) {
  171. $entry['location'] = [$location['url']];
  172. $refs[$location['url']] = $location;
  173. $locations[] = $location;
  174. // Look up timezone
  175. if($location['latitude']) {
  176. $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
  177. if($tz) {
  178. $published->setTimeZone(new DateTimeZone($tz));
  179. }
  180. }
  181. }
  182. }
  183. $entry['published'] = $published->format('c');
  184. if(count($refs)) {
  185. $entry['refs'] = $refs;
  186. }
  187. return [
  188. 'data' => $entry,
  189. 'original' => json_encode([
  190. 'photo' => $photoData,
  191. 'profiles' => $profiles,
  192. 'locations' => $locations
  193. ])
  194. ];
  195. }
  196. private static function _buildHCardFromInstagramProfile($profile) {
  197. if(!$profile) return false;
  198. $author = [
  199. 'type' => 'card'
  200. ];
  201. if($profile['full_name'])
  202. $author['name'] = $profile['full_name'];
  203. else
  204. $author['name'] = $profile['username'];
  205. if(isset($profile['external_url']) && $profile['external_url'])
  206. $author['url'] = $profile['external_url'];
  207. else
  208. $author['url'] = BASE_URL . $profile['username'];
  209. if(isset($profile['profile_pic_url_hd']))
  210. $author['photo'] = $profile['profile_pic_url_hd'];
  211. else
  212. $author['photo'] = $profile['profile_pic_url'];
  213. if(isset($profile['biography']))
  214. $author['note'] = $profile['biography'];
  215. return $author;
  216. }
  217. private static function _getInstagramProfile($username, $http) {
  218. $response = $http->get(BASE_URL.$username.'/');
  219. if(!$response['error'])
  220. return self::_parseProfileFromHTML($response['body']);
  221. return null;
  222. }
  223. private static function _parseProfileFromHTML($html) {
  224. $data = self::_extractIGData($html);
  225. if(isset($data['rhx_gis'])) {
  226. self::$gis = $data['rhx_gis'];
  227. }
  228. if(isset($data['entry_data']['ProfilePage'][0])) {
  229. $profile = $data['entry_data']['ProfilePage'][0];
  230. if($profile && isset($profile['graphql']['user'])) {
  231. $user = $profile['graphql']['user'];
  232. return $user;
  233. }
  234. }
  235. return null;
  236. }
  237. private static function _getInstagramLocation($id, $http) {
  238. $igURL = BASE_URL.'explore/locations/'.$id.'/';
  239. $response = $http->get($igURL);
  240. if($response['body']) {
  241. $data = self::_extractVenueDataFromVenuePage($response['body']);
  242. if($data) {
  243. return [
  244. 'type' => 'card',
  245. 'name' => $data['name'],
  246. 'url' => $igURL,
  247. 'latitude' => $data['lat'],
  248. 'longitude' => $data['lng'],
  249. ];
  250. }
  251. }
  252. return null;
  253. }
  254. private static function _extractPhotoDataFromPhotoPage($html) {
  255. $data = self::_extractIGData($html);
  256. if($data && is_array($data) && array_key_exists('entry_data', $data)) {
  257. if(is_array($data['entry_data']) && array_key_exists('PostPage', $data['entry_data'])) {
  258. $post = $data['entry_data']['PostPage'];
  259. if(isset($post[0]['graphql']['shortcode_media'])) {
  260. return $post[0]['graphql']['shortcode_media'];
  261. } elseif(isset($post[0]['graphql']['media'])) {
  262. return $post[0]['graphql']['media'];
  263. } elseif(isset($post[0]['media'])) {
  264. return $post[0]['media'];
  265. }
  266. }
  267. }
  268. return null;
  269. }
  270. private static function _extractVenueDataFromVenuePage($html) {
  271. $data = self::_extractIGData($html);
  272. if($data && isset($data['entry_data']['LocationsPage'])) {
  273. $data = $data['entry_data']['LocationsPage'];
  274. if(isset($data[0]['graphql']['location'])) {
  275. $location = $data[0]['graphql']['location'];
  276. # we don't need these and they're huge, so drop them now
  277. unset($location['media']);
  278. unset($location['top_posts']);
  279. return $location;
  280. }
  281. }
  282. return null;
  283. }
  284. private static function _extractIGData($html) {
  285. $doc = new DOMDocument();
  286. @$doc->loadHTML($html);
  287. if(!$doc) {
  288. return null;
  289. }
  290. $xpath = new DOMXPath($doc);
  291. $data = null;
  292. foreach($xpath->query('//script') as $script) {
  293. if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
  294. $data = json_decode($match[1], true);
  295. }
  296. }
  297. return $data;
  298. }
  299. }