You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

287 lines
8.1 KiB

7 years ago
  1. <?php
  2. namespace p3k\XRay\Formats;
  3. use DOMDocument, DOMXPath;
  4. use DateTime, DateTimeZone;
  5. class Instagram extends Format {
  /**
   * Checks whether a URL points at an Instagram host.
   *
   * Host names are case-insensitive, so the host is lowercased before it is compared.
   *
   * @param string $url URL to test
   * @return bool true when the host is instagram.com or www.instagram.com
   */
  public static function matches_host($url) {
    $host = parse_url($url, PHP_URL_HOST);

    // parse_url() gives null when the URL has no host and false when it cannot be parsed
    if(!is_string($host))
      return false;

    return in_array(strtolower($host), ['www.instagram.com', 'instagram.com'], true);
  }
  /**
   * Reports whether this format handles the given URL.
   *
   * The decision is made entirely by matches_host().
   *
   * @param string $url URL to test
   * @return bool result of matches_host() for the URL
   */
  public static function matches($url) {
    $isInstagramHost = self::matches_host($url);
    return $isInstagramHost;
  }
  /**
   * Builds an h-entry from the HTML of an Instagram photo page.
   *
   * The post data is read from the JSON embedded in the page. The profile pages of
   * the owner and of every tagged user, and the location page when the post has a
   * location, are requested through $http.
   *
   * @param object $http HTTP client; only its get($url) method is used
   * @param string $html HTML of the photo page
   * @param string $url  URL of the photo page, copied into the entry
   * @return array ['data' => the h-entry, 'original' => JSON string holding the raw
   *               photo, profile and location data], or the result of
   *               self::_unknown() when no post data is found in the page
   */
  public static function parse($http, $html, $url) {
    $photoData = self::_extractPhotoDataFromPhotoPage($html);

    if(!$photoData || !is_array($photoData))
      return self::_unknown();

    // Start building the h-entry
    $entry = [
      'type' => 'entry',
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    ];

    $profiles = [];
    $refs = [];
    $locations = [];

    // Fetch profile info for the owner of the post.
    // Without a username there is nothing to look up, so the empty author card is kept.
    if(isset($photoData['owner']['username'])) {
      $profile = self::_getInstagramProfile($photoData['owner']['username'], $http);
      if($profile) {
        $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
        $profiles[] = $profile;
      }
    }

    // Content and hashtags
    $caption = null;
    if(isset($photoData['caption'])) {
      $caption = $photoData['caption'];
    } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
      $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
    }

    // Compare against '' rather than testing truthiness so a caption of "0" is kept
    if(is_string($caption) && $caption !== '') {
      if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
        $entry['category'] = $matches[1];
      }

      $entry['content'] = [
        'text' => $caption
      ];
    }

    // Include the photo/video media URLs
    // (Always return arrays, even for single images)
    if(array_key_exists('edge_sidecar_to_children', $photoData)) {
      // Multi-post
      // For now, we will only pull photos from multi-posts, and skip videos.
      $entry['photo'] = [];
      $children = [];
      if(isset($photoData['edge_sidecar_to_children']['edges']) && is_array($photoData['edge_sidecar_to_children']['edges']))
        $children = $photoData['edge_sidecar_to_children']['edges'];

      foreach($children as $edge) {
        if(isset($edge['node']['display_url']))
          $entry['photo'][] = $edge['node']['display_url'];
        // Don't need to pull person-tags from here because the main parent object already has them.
      }
    } else {
      // Single photo or video
      if(isset($photoData['display_src']))
        $entry['photo'] = [$photoData['display_src']];
      elseif(isset($photoData['display_url']))
        $entry['photo'] = [$photoData['display_url']];

      if(!empty($photoData['is_video']) && isset($photoData['video_url'])) {
        $entry['video'] = [$photoData['video_url']];
      }
    }

    // Find person tags. Both JSON layouts are collected into one list of usernames
    // so the profile lookup below is written only once.
    $taggedUsernames = [];
    $hasTagData = false;

    // old instagram json
    if(isset($photoData['usertags']['nodes']) && is_array($photoData['usertags']['nodes'])) {
      $hasTagData = true;
      foreach($photoData['usertags']['nodes'] as $tag) {
        if(isset($tag['user']['username']))
          $taggedUsernames[] = $tag['user']['username'];
      }
    }

    // new instagram json as of approximately 2017-04-19
    if(isset($photoData['edge_media_to_tagged_user']['edges']) && is_array($photoData['edge_media_to_tagged_user']['edges'])) {
      $hasTagData = true;
      foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
        if(isset($edge['node']['user']['username']))
          $taggedUsernames[] = $edge['node']['user']['username'];
      }
    }

    // A tag list in the JSON yields a category array even when no profile can be fetched
    if($hasTagData && !isset($entry['category']))
      $entry['category'] = [];

    // Fetch the profile of each tagged user
    foreach($taggedUsernames as $taggedUsername) {
      $profile = self::_getInstagramProfile($taggedUsername, $http);
      if($profile) {
        $card = self::_buildHCardFromInstagramProfile($profile);
        $entry['category'][] = $card['url'];
        $refs[$card['url']] = $card;
        $profiles[] = $profile;
      }
    }

    // Published date
    $timestamp = null;
    if(isset($photoData['taken_at_timestamp']))
      $timestamp = $photoData['taken_at_timestamp'];
    elseif(isset($photoData['date']))
      $timestamp = $photoData['date'];

    $published = null;
    if(is_scalar($timestamp)) {
      // createFromFormat() returns false when the value is not a valid Unix timestamp
      $parsed = DateTime::createFromFormat('U', (string)$timestamp);
      if($parsed)
        $published = $parsed;
    }

    // Include venue data
    if(!empty($photoData['location']) && isset($photoData['location']['id'])) {
      $location = self::_getInstagramLocation($photoData['location']['id'], $http);
      if($location) {
        $entry['location'] = [$location['url']];
        $refs[$location['url']] = $location;
        $locations[] = $location;

        // Look up timezone
        if($published && !empty($location['latitude'])) {
          $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
          if($tz) {
            $published->setTimeZone(new DateTimeZone($tz));
          }
        }
      }
    }

    // Leave the property out when the post carries no usable timestamp
    if($published)
      $entry['published'] = $published->format('c');

    if(count($refs)) {
      $entry['refs'] = $refs;
    }

    return [
      'data' => $entry,
      'original' => json_encode([
        'photo' => $photoData,
        'profiles' => $profiles,
        'locations' => $locations
      ])
    ];
  }
  /**
   * Converts an Instagram profile array into an h-card.
   *
   * Every profile field is optional: a missing field produces a fallback or null
   * rather than an undefined-index warning.
   *
   * @param array|null $profile profile data as returned by _getInstagramProfile()
   * @return array|false card with the keys type, name, url and photo, or false when
   *                     $profile is empty
   */
  private static function _buildHCardFromInstagramProfile($profile) {
    if(!$profile) return false;

    $username = isset($profile['username']) ? $profile['username'] : null;

    $author = [
      'type' => 'card'
    ];

    // Prefer the display name and fall back to the username
    if(!empty($profile['full_name']))
      $author['name'] = $profile['full_name'];
    else
      $author['name'] = $username;

    // Prefer the website the user lists on their profile over the Instagram profile URL
    if(!empty($profile['external_url']))
      $author['url'] = $profile['external_url'];
    else
      $author['url'] = 'https://www.instagram.com/' . $username;

    // Prefer the high-resolution picture when there is one
    if(isset($profile['profile_pic_url_hd']))
      $author['photo'] = $profile['profile_pic_url_hd'];
    elseif(isset($profile['profile_pic_url']))
      $author['photo'] = $profile['profile_pic_url'];
    else
      $author['photo'] = null;

    return $author;
  }
  /**
   * Requests a user's profile JSON from Instagram.
   *
   * @param string $username Instagram username, placed into the request URL as-is
   * @param object $http     HTTP client; only its get($url) method is used
   * @return mixed|null value of the "user" key of the decoded response, or null when
   *                    the response reports an error, does not decode to a non-empty
   *                    value, or has no "user" key
   */
  private static function _getInstagramProfile($username, $http) {
    $result = $http->get('https://www.instagram.com/'.$username.'/?__a=1');

    if($result['error'])
      return null;

    $decoded = @json_decode($result['body'], true);

    if(!$decoded || !array_key_exists('user', $decoded))
      return null;

    return $decoded['user'];
  }
  /**
   * Requests an Instagram location page and converts it into an h-card.
   *
   * @param string|int $id   Instagram location ID, placed into the request URL as-is
   * @param object     $http HTTP client; only its get($url) method is used
   * @return array|null card with the keys type, name, url, latitude and longitude
   *                    (name and coordinates are null when the page does not provide
   *                    them), or null when the response has no body or no venue data
   *                    can be extracted from it
   */
  private static function _getInstagramLocation($id, $http) {
    $igURL = 'https://www.instagram.com/explore/locations/'.$id.'/';
    $response = $http->get($igURL);

    if(empty($response['body']))
      return null;

    $data = self::_extractVenueDataFromVenuePage($response['body']);
    if(!$data || !is_array($data))
      return null;

    return [
      'type' => 'card',
      'name' => isset($data['name']) ? $data['name'] : null,
      'url' => $igURL,
      'latitude' => isset($data['lat']) ? $data['lat'] : null,
      'longitude' => isset($data['lng']) ? $data['lng'] : null,
    ];
  }
  /**
   * Pulls the post's media object out of the JSON embedded in a photo page.
   *
   * Three layouts are recognized below entry_data.PostPage[0], checked in this
   * order: graphql.shortcode_media, graphql.media and media.
   *
   * @param string $html HTML of the photo page
   * @return mixed|null the media object, or null when none of the layouts is present
   */
  private static function _extractPhotoDataFromPhotoPage($html) {
    $shared = self::_extractIGData($html);

    if(!$shared || !is_array($shared) || !array_key_exists('entry_data', $shared))
      return null;

    $entryData = $shared['entry_data'];
    if(!is_array($entryData) || !array_key_exists('PostPage', $entryData))
      return null;

    $pages = $entryData['PostPage'];

    if(isset($pages[0]['graphql']['shortcode_media']))
      return $pages[0]['graphql']['shortcode_media'];

    if(isset($pages[0]['graphql']['media']))
      return $pages[0]['graphql']['media'];

    if(isset($pages[0]['media']))
      return $pages[0]['media'];

    return null;
  }
  /**
   * Pulls the location object out of the JSON embedded in a location page.
   *
   * The object is read from entry_data.LocationsPage[0].location and returned
   * without its "media" and "top_posts" entries.
   *
   * @param string $html HTML of the location page
   * @return mixed|null the trimmed location object, or null when the page has none
   */
  private static function _extractVenueDataFromVenuePage($html) {
    $shared = self::_extractIGData($html);

    if(!$shared || !is_array($shared) || !array_key_exists('entry_data', $shared))
      return null;

    if(!isset($shared['entry_data']['LocationsPage']))
      return null;

    $pages = $shared['entry_data']['LocationsPage'];
    if(!isset($pages[0]['location']))
      return null;

    $venue = $pages[0]['location'];

    # we don't need these and they're huge, so drop them now
    unset($venue['media'], $venue['top_posts']);

    return $venue;
  }
  /**
   * Extracts the `window._sharedData` JSON object that Instagram embeds in its pages.
   *
   * Every <script> element is scanned; when several of them contain a
   * `window._sharedData = {...};` assignment, the last one wins.
   *
   * @param string $html HTML of an Instagram page
   * @return mixed|null the decoded JSON (objects become associative arrays), or null
   *                    when the input is empty, cannot be parsed, contains no such
   *                    assignment, or the JSON cannot be decoded
   */
  private static function _extractIGData($html) {
    // DOMDocument::loadHTML() throws a ValueError for an empty string on PHP 8,
    // which the @ operator below cannot suppress
    if(!is_string($html) || $html === '')
      return null;

    $doc = new DOMDocument();

    // @ hides the warnings libxml emits for markup it considers invalid
    if(!@$doc->loadHTML($html))
      return null;

    $xpath = new DOMXPath($doc);

    $data = null;
    foreach($xpath->query('//script') as $script) {
      if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
        $data = json_decode($match[1], true);
      }
    }

    return $data;
  }
  231. }