You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

359 lines
10 KiB

7 years ago
  1. <?php
  2. namespace p3k\XRay\Formats;
  3. use DOMDocument, DOMXPath;
  4. use DateTime, DateTimeZone;
  5. class Instagram extends Format {
  /**
   * Reports whether the host of the given URL is one of the Instagram web hosts.
   *
   * Host names are case-insensitive, so the host is lowercased before it is
   * compared. A URL with no parseable host never matches.
   *
   * @param string $url URL to test
   * @return bool true for "instagram.com" / "www.instagram.com", false otherwise
   */
  public static function matches_host($url) {
    $host = parse_url($url, PHP_URL_HOST);

    // parse_url() yields null (no host) or false (malformed URL) here
    if(!is_string($host))
      return false;

    return in_array(strtolower($host), ['www.instagram.com','instagram.com'], true);
  }
  /**
   * Reports whether this format handles the given URL.
   *
   * The decision is delegated entirely to matches_host().
   *
   * @param string $url URL to test
   * @return bool result of the host check
   */
  public static function matches($url) {
    $isInstagramHost = self::matches_host($url);
    return $isInstagramHost;
  }
  /**
   * Entry point: dispatches to the profile, feed or photo parser based on the URL.
   *
   * A URL with exactly one path segment after "instagram.com/" is treated as a
   * profile URL. The trailing slash is optional and a query string or fragment
   * is tolerated, so "https://www.instagram.com/user" and
   * "https://www.instagram.com/user/?hl=en" are both recognized as profiles.
   * Every other URL is handed to the photo parser.
   *
   * @param mixed  $http HTTP client, passed through to the parsers
   * @param string $html body of the page at $url
   * @param string $url  URL the HTML was fetched from
   * @param array  $opts options; 'expect' => 'feed' selects the feed parser for profile URLs
   * @return array result of the selected parser
   */
  public static function parse($http, $html, $url, $opts=[]) {
    $isProfileUrl = preg_match('~instagram\.com/[^/?#]+/?(?:[?#].*)?$~', $url) === 1;

    if(!$isProfileUrl)
      return self::parsePhoto($http, $html, $url);

    if(isset($opts['expect']) && $opts['expect'] === 'feed')
      return self::parseFeed($http, $html, $url);

    return self::parseProfile($http, $html, $url);
  }
  /**
   * Builds an h-card result from the HTML of a profile page.
   *
   * @param mixed  $http unused here
   * @param string $html profile page HTML
   * @param string $url  unused here
   * @return array ['data' => card, 'source-format' => 'instagram'], or the
   *               result of self::_unknown() when no profile data is found
   */
  private static function parseProfile($http, $html, $url) {
    $user = self::_parseProfileFromHTML($html);

    if($user) {
      return [
        'data' => self::_buildHCardFromInstagramProfile($user),
        'source-format' => 'instagram',
      ];
    }

    return self::_unknown();
  }
  /**
   * Builds a feed of entries from the timeline embedded in a profile page.
   *
   * Each timeline edge is converted with parsePhotoFromData(), reusing the
   * already-parsed profile as the author. Edges without a shortcode are skipped
   * because no permalink can be built for them, and a profile without timeline
   * data produces an empty feed.
   *
   * @param mixed  $http HTTP client, passed through to parsePhotoFromData()
   * @param string $html profile page HTML
   * @param string $url  unused here
   * @return array ['data' => ['type' => 'feed', 'items' => [...]], 'source-format' => 'instagram'],
   *               or the result of self::_unknown() when no profile data is found
   */
  private static function parseFeed($http, $html, $url) {
    $profileData = self::_parseProfileFromHTML($html);
    if(!$profileData)
      return self::_unknown();

    $edges = [];
    if(isset($profileData['edge_owner_to_timeline_media']['edges'])
      && is_array($profileData['edge_owner_to_timeline_media']['edges'])) {
      $edges = $profileData['edge_owner_to_timeline_media']['edges'];
    }

    $items = [];
    foreach($edges as $edge) {
      if(!isset($edge['node']['shortcode']))
        continue;

      $node = $edge['node'];
      $item = self::parsePhotoFromData($http, $node,
        'https://www.instagram.com/p/'.$node['shortcode'].'/', $profileData);

      // Note: Not all the photo info is available in the initial JSON.
      // Things like video mp4 URLs and person tags and locations are missing.
      // Consumers of the feed will need to fetch the photo permalink in order to get all missing information.
      if(isset($item['data']))
        $items[] = $item['data'];
    }

    return [
      'data' => [
        'type' => 'feed',
        'items' => $items,
      ],
      'source-format' => 'instagram',
    ];
  }
  /**
   * Parses a photo permalink page into an entry.
   *
   * Extracts the photo data embedded in the HTML and hands it to
   * parsePhotoFromData(), which also handles the "no data found" case.
   *
   * @param mixed       $http    HTTP client, passed through
   * @param string      $html    photo page HTML
   * @param string      $url     permalink of the photo
   * @param array|false $profile already-known author profile, or false
   * @return array result of parsePhotoFromData()
   */
  private static function parsePhoto($http, $html, $url, $profile=false) {
    return self::parsePhotoFromData(
      $http,
      self::_extractPhotoDataFromPhotoPage($html),
      $url,
      $profile
    );
  }
  /**
   * Converts Instagram photo data into an h-entry style result.
   *
   * Collects author, caption/hashtags, photo and video URLs, alt text,
   * person tags, venue and published date. Author, tagged-user and venue
   * lookups go through $http.
   *
   * Missing pieces are skipped instead of raising errors: an entry without a
   * usable timestamp has no 'published' property, and alt text is only
   * attached when a photo URL was actually found.
   *
   * @param mixed       $http      HTTP client used for profile and location lookups
   * @param array|null  $photoData decoded photo data; a falsy value yields self::_unknown()
   * @param string      $url       permalink used as the entry URL
   * @param array|false $profile   already-known author profile, or false to look it up
   * @return array ['data' => entry, 'original' => JSON string, 'source-format' => 'instagram']
   */
  private static function parsePhotoFromData($http, $photoData, $url, $profile=false) {
    if(!$photoData)
      return self::_unknown();

    // Start building the h-entry
    $entry = [
      'type' => 'entry',
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    ];

    $profiles = [];

    // Fetch profile info for this user unless the caller already supplied it
    if(!$profile && isset($photoData['owner']['username'])) {
      $profile = self::_getInstagramProfile($photoData['owner']['username'], $http);
    }
    if($profile) {
      $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
      $profiles[] = $profile;
    }

    // Content and hashtags
    $caption = false;
    if(isset($photoData['caption'])) {
      $caption = $photoData['caption'];
    } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
      $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
    }

    if($caption) {
      if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
        $entry['category'] = [];
        foreach($matches[1] as $match) {
          $entry['category'][] = $match;
        }
      }

      $entry['content'] = [
        'text' => $caption
      ];
    }

    $refs = [];
    $meta = [];

    // Include the photo/video media URLs
    // (Always return arrays, even for single images)
    if(array_key_exists('edge_sidecar_to_children', $photoData)) {
      // Multi-post
      // For now, we will only pull photos from multi-posts, and skip videos.
      // https://github.com/aaronpk/XRay/issues/84
      $entry['photo'] = [];

      $children = [];
      if(isset($photoData['edge_sidecar_to_children']['edges'])
        && is_array($photoData['edge_sidecar_to_children']['edges'])) {
        $children = $photoData['edge_sidecar_to_children']['edges'];
      }

      foreach($children as $edge) {
        if(!isset($edge['node']['display_url']))
          continue;

        $entry['photo'][] = $edge['node']['display_url'];
        // Don't need to pull person-tags from here because the main parent object already has them.
        if(isset($edge['node']['accessibility_caption'])) {
          $meta[$edge['node']['display_url']] = [
            'alt' => $edge['node']['accessibility_caption']
          ];
        }
      }
    } else {
      // Single photo or video
      if(array_key_exists('display_src', $photoData))
        $entry['photo'] = [$photoData['display_src']];
      elseif(array_key_exists('display_url', $photoData))
        $entry['photo'] = [$photoData['display_url']];

      // Alt text is keyed by photo URL, so it needs a photo to attach to
      if(isset($entry['photo'][0])
        && isset($photoData['accessibility_caption']) && $photoData['accessibility_caption']) {
        $meta[$entry['photo'][0]] = [
          'alt' => $photoData['accessibility_caption']
        ];
      }

      if(isset($photoData['is_video']) && $photoData['is_video'] && isset($photoData['video_url'])) {
        $entry['video'] = [$photoData['video_url']];
      }
    }

    // Find person tags and fetch user profiles
    if(isset($photoData['edge_media_to_tagged_user']['edges'])) {
      if(!isset($entry['category'])) $entry['category'] = [];

      foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
        if(!isset($edge['node']['user']['username']))
          continue;

        // Separate variable so the author's $profile is not overwritten
        $taggedProfile = self::_getInstagramProfile($edge['node']['user']['username'], $http);
        if($taggedProfile) {
          $card = self::_buildHCardFromInstagramProfile($taggedProfile);
          $entry['category'][] = $card['url'];
          $refs[$card['url']] = $card;
          $profiles[] = $taggedProfile;
        }
      }
    }

    // Published date (createFromFormat returns false for an unparseable value)
    $published = false;
    if(isset($photoData['taken_at_timestamp']))
      $published = DateTime::createFromFormat('U', $photoData['taken_at_timestamp']);
    elseif(isset($photoData['date']))
      $published = DateTime::createFromFormat('U', $photoData['date']);

    // Include venue data
    $locations = [];
    if(isset($photoData['location']['id'])) {
      $location = self::_getInstagramLocation($photoData['location']['id'], $http);
      if($location) {
        $entry['location'] = [$location['url']];
        $refs[$location['url']] = $location;
        $locations[] = $location;

        // Look up timezone. is_numeric() rather than truthiness so that a
        // coordinate of 0 still counts as a valid position.
        if($published && is_numeric($location['latitude']) && is_numeric($location['longitude'])) {
          $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
          if($tz) {
            $published->setTimezone(new DateTimeZone($tz));
          }
        }
      }
    }

    if($published) {
      $entry['published'] = $published->format('c');
    }

    if(count($refs)) {
      $entry['refs'] = $refs;
    }

    if(count($meta)) {
      $entry['meta'] = $meta;
    }

    $entry['post-type'] = \p3k\XRay\PostType::discover($entry);

    return [
      'data' => $entry,
      'original' => json_encode([
        'photo' => $photoData,
        'profiles' => $profiles,
        'locations' => $locations
      ]),
      'source-format' => 'instagram',
    ];
  }
  /**
   * Builds an h-card array from an Instagram profile array.
   *
   * - name:  full_name, falling back to username when full_name is empty or missing
   * - url:   external_url, falling back to the Instagram profile URL
   * - photo: profile_pic_url_hd, falling back to profile_pic_url (null if neither is set)
   * - note:  biography, only when set
   *
   * Optional keys are read defensively so an incomplete profile does not
   * trigger undefined-index notices.
   *
   * @param array|null|false $profile profile data
   * @return array|false the card, or false when $profile is falsy
   */
  private static function _buildHCardFromInstagramProfile($profile) {
    if(!$profile) return false;

    $username = isset($profile['username']) ? $profile['username'] : null;

    $author = [
      'type' => 'card'
    ];

    $author['name'] = !empty($profile['full_name'])
      ? $profile['full_name']
      : $username;

    $author['url'] = !empty($profile['external_url'])
      ? $profile['external_url']
      : 'https://www.instagram.com/' . $username;

    if(isset($profile['profile_pic_url_hd']))
      $author['photo'] = $profile['profile_pic_url_hd'];
    elseif(isset($profile['profile_pic_url']))
      $author['photo'] = $profile['profile_pic_url'];
    else
      $author['photo'] = null;

    if(isset($profile['biography']))
      $author['note'] = $profile['biography'];

    return $author;
  }
  /**
   * Fetches a user's profile page and extracts the profile data from it.
   *
   * @param string $username Instagram username
   * @param mixed  $http     HTTP client; get() must return an array with 'error' and 'body'
   * @return array|null profile data, or null when the request reported an
   *                    error or no profile data was found in the page
   */
  private static function _getInstagramProfile($username, $http) {
    $profileURL = 'https://www.instagram.com/'.$username.'/';
    $response = $http->get($profileURL);

    if($response['error'])
      return null;

    return self::_parseProfileFromHTML($response['body']);
  }
  /**
   * Pulls the user object out of the shared data embedded in a profile page.
   *
   * @param string $html profile page HTML
   * @return array|null the value at entry_data.ProfilePage[0].graphql.user,
   *                    or null when that path is absent
   */
  private static function _parseProfileFromHTML($html) {
    $shared = self::_extractIGData($html);

    if(!isset($shared['entry_data']['ProfilePage'][0]['graphql']['user']))
      return null;

    return $shared['entry_data']['ProfilePage'][0]['graphql']['user'];
  }
  /**
   * Fetches an Instagram location page and turns it into a venue h-card.
   *
   * @param string|int $id   Instagram location id
   * @param mixed      $http HTTP client; get() must return an array with 'body'
   * @return array|null card with type, name, url, latitude and longitude, or
   *                    null when the response body is empty or holds no venue data
   */
  private static function _getInstagramLocation($id, $http) {
    $igURL = 'https://www.instagram.com/explore/locations/'.$id.'/';
    $response = $http->get($igURL);

    if(!$response['body'])
      return null;

    $venue = self::_extractVenueDataFromVenuePage($response['body']);
    if(!$venue)
      return null;

    return [
      'type' => 'card',
      'name' => $venue['name'],
      'url' => $igURL,
      'latitude' => $venue['lat'],
      'longitude' => $venue['lng'],
    ];
  }
  /**
   * Pulls the media object out of the shared data embedded in a photo page.
   *
   * Three locations under entry_data.PostPage[0] are tried in order:
   * graphql.shortcode_media, graphql.media, then media.
   *
   * @param string $html photo page HTML
   * @return array|null the first media object found, or null
   */
  private static function _extractPhotoDataFromPhotoPage($html) {
    $shared = self::_extractIGData($html);

    if(!isset($shared['entry_data']['PostPage'][0]))
      return null;

    $page = $shared['entry_data']['PostPage'][0];

    if(isset($page['graphql']['shortcode_media']))
      return $page['graphql']['shortcode_media'];

    if(isset($page['graphql']['media']))
      return $page['graphql']['media'];

    if(isset($page['media']))
      return $page['media'];

    return null;
  }
  /**
   * Pulls the location object out of the shared data embedded in a venue page.
   *
   * @param string $html location page HTML
   * @return array|null the value at entry_data.LocationsPage[0].graphql.location
   *                    without its 'media' and 'top_posts' keys, or null when absent
   */
  private static function _extractVenueDataFromVenuePage($html) {
    $shared = self::_extractIGData($html);

    if(!isset($shared['entry_data']['LocationsPage'][0]['graphql']['location']))
      return null;

    $venue = $shared['entry_data']['LocationsPage'][0]['graphql']['location'];

    # we don't need these and they're huge, so drop them now
    unset($venue['media'], $venue['top_posts']);

    return $venue;
  }
  /**
   * Extracts the `window._sharedData` JSON object embedded in an Instagram page.
   *
   * Every <script> element is scanned for the assignment; if more than one
   * script contains it, the last one wins.
   *
   * @param string $html page HTML
   * @return array|null decoded shared data, or null when $html is empty or not
   *                    a string, cannot be parsed, contains no shared data, or
   *                    the JSON is invalid
   */
  private static function _extractIGData($html) {
    // loadHTML() rejects an empty source (ValueError on PHP 8), and "@" does
    // not suppress exceptions, so bail out before calling it.
    if(!is_string($html) || $html === '')
      return null;

    $doc = new DOMDocument();
    // "@" silences the warnings libxml raises for real-world, non-strict markup
    $loaded = @$doc->loadHTML($html);
    if(!$loaded) {
      return null;
    }

    $xpath = new DOMXPath($doc);

    $data = null;
    foreach($xpath->query('//script') as $script) {
      if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
        $data = json_decode($match[1], true);
      }
    }

    return $data;
  }
  292. }