You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

366 lines
11 KiB

7 years ago
  1. <?php
  2. namespace p3k\XRay\Formats;
  3. use DOMDocument, DOMXPath;
  4. use DateTime, DateTimeZone;
  5. class Instagram extends Format {
  /**
   * Check whether a URL points at an Instagram host.
   *
   * Host names are case-insensitive, so the host is lower-cased before it is
   * compared against the known Instagram host names.
   *
   * @param string $url URL to inspect
   * @return bool true when the host is instagram.com or www.instagram.com
   */
  public static function matches_host($url) {
    $host = parse_url($url, PHP_URL_HOST);

    // parse_url() does not return a string when the URL has no host or cannot be parsed
    if(!is_string($host))
      return false;

    return in_array(strtolower($host), ['www.instagram.com', 'instagram.com'], true);
  }
  /**
   * Decide whether this format handles the given URL.
   *
   * The decision is delegated entirely to matches_host().
   *
   * @param string $url URL to inspect
   * @return bool whatever matches_host() reports for the URL
   */
  public static function matches($url) {
    $isInstagramHost = self::matches_host($url);
    return $isInstagramHost;
  }
  /**
   * Entry point: dispatch an already-fetched Instagram page to the right parser.
   *
   * A URL with exactly one path segment followed by a slash is treated as a
   * profile page; it is parsed as a feed when $opts['expect'] is 'feed' and as
   * a profile otherwise. Every other URL is handed to the photo parser.
   *
   * @param object $http          HTTP client, passed through to the parsers
   * @param array  $http_response response array; its 'body' and 'url' keys are read
   * @param array  $opts          options; only 'expect' is consulted here
   * @return array result produced by parseFeed(), parseProfile() or parsePhoto()
   */
  public static function parse($http, $http_response, $opts=[]) {
    $html = $http_response['body'];
    $url = $http_response['url'];

    // Anchored to the start of the URL, with the dot escaped, and tolerant of a
    // trailing query string or fragment (e.g. ".../username/?hl=en").
    $isProfileURL = preg_match('~^https?://[^/?#]*instagram\.com/[^/?#]+/(?:[?#].*)?$~i', $url) === 1;

    if(!$isProfileURL)
      return self::parsePhoto($http, $html, $url);

    if(isset($opts['expect']) && $opts['expect'] === 'feed')
      return self::parseFeed($http, $html, $url);

    return self::parseProfile($http, $html, $url);
  }
  /**
   * Turn the HTML of an Instagram profile page into an h-card result.
   *
   * @param object $http HTTP client (not used by this method)
   * @param string $html HTML of the profile page
   * @param string $url  URL of the profile page (not used by this method)
   * @return array ['data' => h-card, 'source-format' => 'instagram'], or the
   *               result of _unknown() when no profile data is found in the HTML
   */
  private static function parseProfile($http, $html, $url) {
    $user = self::_parseProfileFromHTML($html);

    if($user) {
      return [
        'data' => self::_buildHCardFromInstagramProfile($user),
        'source-format' => 'instagram',
      ];
    }

    return self::_unknown();
  }
  /**
   * Turn the HTML of an Instagram profile page into a feed of h-entries.
   *
   * Each timeline edge embedded in the profile JSON is converted with
   * parsePhotoFromData(), reusing the profile data as the author so the
   * author's profile is not fetched again for every item.
   *
   * @param object $http HTTP client, passed through to parsePhotoFromData()
   * @param string $html HTML of the profile page
   * @param string $url  URL of the profile page (not used by this method)
   * @return array ['data' => ['type' => 'feed', 'items' => [...]], 'source-format' => 'instagram'],
   *               or the result of _unknown() when no profile data is found
   */
  private static function parseFeed($http, $html, $url) {
    $profileData = self::_parseProfileFromHTML($html);
    if(!$profileData)
      return self::_unknown();

    // A profile without a timeline (or with an unexpected shape) yields an empty feed
    $edges = [];
    if(isset($profileData['edge_owner_to_timeline_media']['edges'])
      && is_array($profileData['edge_owner_to_timeline_media']['edges'])) {
      $edges = $profileData['edge_owner_to_timeline_media']['edges'];
    }

    $items = [];
    foreach($edges as $edge) {
      // The permalink is built from the shortcode, so an edge without one cannot be used
      if(!isset($edge['node']['shortcode']))
        continue;

      $node = $edge['node'];
      $item = self::parsePhotoFromData($http, $node,
        'https://www.instagram.com/p/'.$node['shortcode'].'/', $profileData);

      // Note: Not all the photo info is available in the initial JSON.
      // Things like video mp4 URLs and person tags and locations are missing.
      // Consumers of the feed will need to fetch the photo permalink in order to get all missing information.
      $items[] = $item['data'];
    }

    return [
      'data' => [
        'type' => 'feed',
        'items' => $items,
      ],
      'source-format' => 'instagram',
    ];
  }
  /**
   * Parse the HTML of a single post page into an h-entry result.
   *
   * The post JSON is pulled out of the page and handed to parsePhotoFromData().
   *
   * @param object      $http    HTTP client, passed through to parsePhotoFromData()
   * @param string      $html    HTML of the post page
   * @param string      $url     permalink of the post
   * @param array|false $profile author profile data, or false when none is supplied
   * @return array whatever parsePhotoFromData() returns
   */
  private static function parsePhoto($http, $html, $url, $profile=false) {
    return self::parsePhotoFromData(
      $http,
      self::_extractPhotoDataFromPhotoPage($html),
      $url,
      $profile
    );
  }
  /**
   * Tell whether an alt text is the fixed placeholder string rather than a real description.
   *
   * The comparison is strict so that non-string values (for example boolean
   * true) are never mistaken for the placeholder.
   *
   * @param mixed $text alt text candidate
   * @return bool true only when $text is exactly the placeholder string
   */
  private static function altTextIsPlaceholder($text) {
    return $text === 'No photo description available.';
  }
  /**
   * Build an h-entry result from the JSON description of a single Instagram post.
   *
   * The entry carries the author card, caption text and hashtags, photo/video
   * URLs with their alt text, tagged people, the venue and the published date.
   * Optional parts of the post JSON are simply left out of the entry when they
   * are absent; in particular 'published' is only set when the post carries a
   * usable timestamp.
   *
   * @param object      $http      HTTP client; used to look up the author, tagged users and the venue
   * @param array|null  $photoData decoded post JSON
   * @param string      $url       permalink of the post
   * @param array|false $profile   author profile data if already known, false to look it up
   * @return array ['data' => h-entry, 'original' => JSON string, 'source-format' => 'instagram'],
   *               or the result of _unknown() when $photoData is empty
   */
  private static function parsePhotoFromData($http, $photoData, $url, $profile=false) {
    if(!$photoData)
      return self::_unknown();

    // Start building the h-entry
    $entry = [
      'type' => 'entry',
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    ];

    $profiles = [];

    // Fetch profile info for the author unless the caller already supplied it
    if(!$profile && isset($photoData['owner']['username'])) {
      $profile = self::_getInstagramProfile($photoData['owner']['username'], $http);
    }
    if($profile) {
      $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
      $profiles[] = $profile;
    }

    // Content and hashtags
    $caption = false;
    if(isset($photoData['caption'])) {
      $caption = $photoData['caption'];
    } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
      $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
    }

    if($caption) {
      if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
        $entry['category'] = $matches[1];
      }

      $entry['content'] = [
        'text' => $caption
      ];
    }

    $refs = [];
    $meta = [];

    // Include the photo/video media URLs
    // (Always return arrays, even for single images)
    if(isset($photoData['edge_sidecar_to_children']['edges'])
      && is_array($photoData['edge_sidecar_to_children']['edges'])) {
      // Multi-post
      // For now, we will only pull photos from multi-posts, and skip videos.
      // https://github.com/aaronpk/XRay/issues/84
      $entry['photo'] = [];
      foreach($photoData['edge_sidecar_to_children']['edges'] as $edge) {
        if(!isset($edge['node']['display_url']))
          continue;

        $photoURL = $edge['node']['display_url'];
        $entry['photo'][] = $photoURL;

        // Don't need to pull person-tags from here because the main parent object already has them.
        $alt = self::_altTextFromNode($edge['node']);
        if($alt !== null) {
          $meta[$photoURL] = [
            'alt' => $alt
          ];
        }
      }
    } else {
      // Single photo or video
      if(isset($photoData['display_src']))
        $entry['photo'] = [$photoData['display_src']];
      elseif(isset($photoData['display_url']))
        $entry['photo'] = [$photoData['display_url']];

      // Alt text is keyed by the photo URL, so it can only be recorded when a photo was found
      $alt = self::_altTextFromNode($photoData);
      if($alt !== null && isset($entry['photo'][0])) {
        $meta[$entry['photo'][0]] = [
          'alt' => $alt
        ];
      }

      if(!empty($photoData['is_video']) && isset($photoData['video_url'])) {
        $entry['video'] = [$photoData['video_url']];
      }
    }

    // Find person tags and fetch user profiles
    if(isset($photoData['edge_media_to_tagged_user']['edges'])
      && is_array($photoData['edge_media_to_tagged_user']['edges'])) {
      if(!isset($entry['category'])) $entry['category'] = [];

      foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
        if(!isset($edge['node']['user']['username']))
          continue;

        // Kept in its own variable so the author's $profile is not overwritten
        $taggedProfile = self::_getInstagramProfile($edge['node']['user']['username'], $http);
        if($taggedProfile) {
          $card = self::_buildHCardFromInstagramProfile($taggedProfile);
          $entry['category'][] = $card['url'];
          $refs[$card['url']] = $card;
          $profiles[] = $taggedProfile;
        }
      }
    }

    // Published date
    $timestamp = null;
    if(isset($photoData['taken_at_timestamp']))
      $timestamp = $photoData['taken_at_timestamp'];
    elseif(isset($photoData['date']))
      $timestamp = $photoData['date'];

    // createFromFormat() returns false on failure, so $published is either a DateTime or false
    $published = is_numeric($timestamp)
      ? DateTime::createFromFormat('U', (string)(int)$timestamp)
      : false;

    // Include venue data
    $locations = [];
    if(isset($photoData['location']['id'])) {
      $location = self::_getInstagramLocation($photoData['location']['id'], $http);
      if($location) {
        $entry['location'] = [$location['url']];
        $refs[$location['url']] = $location;
        $locations[] = $location;

        // Look up timezone. Both coordinates are required, and 0 is a valid coordinate.
        $hasCoordinates = isset($location['latitude'], $location['longitude'])
          && is_numeric($location['latitude'])
          && is_numeric($location['longitude']);
        if($published && $hasCoordinates) {
          $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
          if($tz) {
            $published->setTimezone(new DateTimeZone($tz));
          }
        }
      }
    }

    if($published) {
      $entry['published'] = $published->format('c');
    }

    if(count($refs)) {
      $entry['refs'] = $refs;
    }

    if(count($meta)) {
      $entry['meta'] = $meta;
    }

    $entry['post-type'] = \p3k\XRay\PostType::discover($entry);

    return [
      'data' => $entry,
      'original' => json_encode([
        'photo' => $photoData,
        'profiles' => $profiles,
        'locations' => $locations
      ]),
      'source-format' => 'instagram',
    ];
  }

  /**
   * Return the usable alt text of a media node, or null when there is none.
   *
   * @param array $node media node that may carry an 'accessibility_caption'
   * @return string|null the caption, or null when it is missing, empty or the placeholder text
   */
  private static function _altTextFromNode($node) {
    if(empty($node['accessibility_caption']))
      return null;

    $alt = $node['accessibility_caption'];
    return self::altTextIsPlaceholder($alt) ? null : $alt;
  }
  /**
   * Convert Instagram profile data into an h-card array.
   *
   * Every field is read defensively so a partial profile does not raise
   * undefined-index notices:
   *  - name:  full_name, falling back to username
   *  - url:   external_url, falling back to the Instagram profile URL
   *  - photo: profile_pic_url_hd, falling back to profile_pic_url, else null
   *  - note:  biography, only when present
   *
   * @param array|null|false $profile profile data as returned by _parseProfileFromHTML()
   * @return array|false the h-card, or false when $profile is empty
   */
  private static function _buildHCardFromInstagramProfile($profile) {
    if(!$profile) return false;

    $username = isset($profile['username']) ? $profile['username'] : null;

    $author = [
      'type' => 'card'
    ];

    $author['name'] = !empty($profile['full_name']) ? $profile['full_name'] : $username;

    if(!empty($profile['external_url']))
      $author['url'] = $profile['external_url'];
    else
      $author['url'] = 'https://www.instagram.com/' . $username;

    if(!empty($profile['profile_pic_url_hd']))
      $author['photo'] = $profile['profile_pic_url_hd'];
    elseif(isset($profile['profile_pic_url']))
      $author['photo'] = $profile['profile_pic_url'];
    else
      $author['photo'] = null;

    if(isset($profile['biography']))
      $author['note'] = $profile['biography'];

    return $author;
  }
  /**
   * Fetch an Instagram profile page and extract the user data from it.
   *
   * @param string $username Instagram username
   * @param object $http     HTTP client; its get() method is called with the profile URL
   * @return array|null the user data, or null when the request reports an error,
   *                    returns no body, or the page holds no profile data
   */
  private static function _getInstagramProfile($username, $http) {
    // The username comes from scraped data, so encode it before putting it in the URL path
    $response = $http->get('https://www.instagram.com/'.rawurlencode($username).'/');

    // Nothing to parse when the request failed or came back without a body
    if(!empty($response['error']) || empty($response['body']))
      return null;

    return self::_parseProfileFromHTML($response['body']);
  }
  /**
   * Pull the user object out of the JSON embedded in a profile page.
   *
   * @param string $html HTML of an Instagram profile page
   * @return array|null the value at entry_data.ProfilePage[0].graphql.user,
   *                    or null when that path is not present
   */
  private static function _parseProfileFromHTML($html) {
    $igData = self::_extractIGData($html);

    if(!isset($igData['entry_data']['ProfilePage'][0]))
      return null;

    $page = $igData['entry_data']['ProfilePage'][0];
    if(!$page || !isset($page['graphql']['user']))
      return null;

    return $page['graphql']['user'];
  }
  /**
   * Fetch an Instagram location page and describe the venue as an h-card.
   *
   * @param string|int $id   Instagram location id
   * @param object     $http HTTP client; its get() method is called with the location URL
   * @return array|null card with 'type', 'name', 'url', 'latitude' and 'longitude'
   *                    (name and coordinates are null when the page does not provide them),
   *                    or null when the page has no body or no venue data
   */
  private static function _getInstagramLocation($id, $http) {
    // The id comes from scraped data, so encode it before putting it in the URL path
    $igURL = 'https://www.instagram.com/explore/locations/'.rawurlencode($id).'/';
    $response = $http->get($igURL);

    if(empty($response['body']))
      return null;

    $data = self::_extractVenueDataFromVenuePage($response['body']);
    if(!$data)
      return null;

    return [
      'type' => 'card',
      'name' => isset($data['name']) ? $data['name'] : null,
      'url' => $igURL,
      'latitude' => isset($data['lat']) ? $data['lat'] : null,
      'longitude' => isset($data['lng']) ? $data['lng'] : null,
    ];
  }
  /**
   * Pull the post object out of the JSON embedded in a post page.
   *
   * Three locations under entry_data.PostPage[0] are tried in order:
   * graphql.shortcode_media, graphql.media, then media.
   *
   * @param string $html HTML of an Instagram post page
   * @return array|null the first post object found, or null when none is present
   */
  private static function _extractPhotoDataFromPhotoPage($html) {
    $igData = self::_extractIGData($html);

    if(!isset($igData['entry_data']['PostPage'][0]))
      return null;

    $page = $igData['entry_data']['PostPage'][0];

    if(isset($page['graphql']['shortcode_media']))
      return $page['graphql']['shortcode_media'];

    if(isset($page['graphql']['media']))
      return $page['graphql']['media'];

    if(isset($page['media']))
      return $page['media'];

    return null;
  }
  /**
   * Pull the location object out of the JSON embedded in a location page.
   *
   * @param string $html HTML of an Instagram location page
   * @return array|null the value at entry_data.LocationsPage[0].graphql.location with its
   *                    'media' and 'top_posts' keys removed, or null when that path is not present
   */
  private static function _extractVenueDataFromVenuePage($html) {
    $igData = self::_extractIGData($html);

    if(!$igData || !isset($igData['entry_data']['LocationsPage'][0]['graphql']['location']))
      return null;

    $venue = $igData['entry_data']['LocationsPage'][0]['graphql']['location'];

    # we don't need these and they're huge, so drop them now
    unset($venue['media'], $venue['top_posts']);

    return $venue;
  }
  /**
   * Extract the `window._sharedData` JSON object embedded in an Instagram page.
   *
   * Every <script> element is searched for the assignment; when more than one
   * script decodes successfully, the last one wins.
   *
   * @param string $html HTML of an Instagram page
   * @return array|null the decoded JSON, or null when the HTML is empty, cannot be
   *                    loaded, or contains no decodable `window._sharedData` assignment
   */
  private static function _extractIGData($html) {
    // loadHTML() rejects an empty source, so there is nothing to do without markup
    if(!is_string($html) || trim($html) === '')
      return null;

    $doc = new DOMDocument();

    // Collect markup warnings internally instead of silencing them with @,
    // then restore whatever error handling mode was active before.
    $previous = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if(!$loaded)
      return null;

    $xpath = new DOMXPath($doc);

    $data = null;
    foreach($xpath->query('//script') as $script) {
      if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
        $decoded = json_decode($match[1], true);
        // Keep earlier data if a later script matches but does not decode
        if(is_array($decoded))
          $data = $decoded;
      }
    }

    return $data;
  }
  297. }