You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

313 lines
8.7 KiB

7 years ago
  1. <?php
  2. namespace p3k\XRay\Formats;
  3. use DOMDocument, DOMXPath;
  4. use DateTime, DateTimeZone;
  5. class Instagram extends Format {
  /**
   * Tell whether the host component of a URL is one of Instagram's hostnames.
   *
   * @param string $url URL to inspect
   * @return bool true when the host is "www.instagram.com" or "instagram.com"
   */
  public static function matches_host($url) {
    $instagramHosts = ['www.instagram.com','instagram.com'];
    $candidate = parse_url($url, PHP_URL_HOST);
    return in_array($candidate, $instagramHosts);
  }
  /**
   * Decide whether this format handles the given URL.
   *
   * The decision is delegated entirely to matches_host().
   *
   * @param string $url URL to inspect
   * @return bool result of the host check
   */
  public static function matches($url) {
    $isInstagramHost = self::matches_host($url);
    return $isInstagramHost;
  }
  /**
   * Parse an Instagram page, dispatching to the profile or photo parser.
   *
   * A URL whose path after "instagram.com/" is a single segment is treated as
   * a profile page; everything else is handed to the photo parser. The
   * trailing slash is optional and a query string or fragment is ignored, so
   * "https://www.instagram.com/user", ".../user/" and ".../user/?hl=en" are
   * all routed to the profile parser.
   *
   * @param object $http HTTP client passed through to the parsers
   * @param string $html body of the page at $url
   * @param string $url  URL the HTML was fetched from
   * @return array result of parseProfile() or parsePhoto()
   */
  public static function parse($http, $html, $url) {
    // "~" is used as delimiter so that "#" can appear unescaped in the
    // character classes. The leading [^?#]* keeps the match out of the
    // query string and fragment.
    $profilePattern = '~^[^?#]*instagram\.com/[^/?#]+/?(?:[?#].*)?$~';
    if(preg_match($profilePattern, $url)) {
      return self::parseProfile($http, $html, $url);
    }
    return self::parsePhoto($http, $html, $url);
  }
  /**
   * Build the result for a profile page.
   *
   * @param object $http HTTP client (not used by this method)
   * @param string $html body of the profile page
   * @param string $url  URL of the profile page (not used by this method)
   * @return array ['data' => h-card array], or the value of self::_unknown()
   *               when no profile data could be found in the HTML
   */
  private static function parseProfile($http, $html, $url) {
    $user = self::_parseProfileFromHTML($html);
    if($user) {
      $hcard = self::_buildHCardFromInstagramProfile($user);
      return ['data' => $hcard];
    }
    return self::_unknown();
  }
  /**
   * Build an h-entry from an Instagram photo/video page.
   *
   * The entry contains the author card, caption text, hashtags and tagged
   * people as categories, photo/video URLs, the venue and the published date.
   * Profiles of the owner and of tagged users, and the venue page, are fetched
   * through $http. Pieces of data that are missing from the page JSON are
   * simply left out of the entry.
   *
   * @param object $http HTTP client used to fetch profile and location pages
   * @param string $html body of the photo page
   * @param string $url  URL of the photo page
   * @return array ['data' => entry array, 'original' => JSON string of the raw
   *               photo, profile and location data], or the value of
   *               self::_unknown() when no photo data is found in the HTML
   */
  private static function parsePhoto($http, $html, $url) {
    $photoData = self::_extractPhotoDataFromPhotoPage($html);
    if(!$photoData)
      return self::_unknown();

    // Start building the h-entry. The author is a placeholder card that is
    // replaced below if the owner's profile can be fetched.
    $entry = [
      'type' => 'entry',
      'url' => $url,
      'author' => [
        'type' => 'card',
        'name' => null,
        'photo' => null,
        'url' => null
      ]
    ];

    $profiles = [];

    // Fetch profile info for this user (only when the page names an owner)
    if(isset($photoData['owner']['username'])) {
      $profile = self::_getInstagramProfile($photoData['owner']['username'], $http);
      if($profile) {
        $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
        $profiles[] = $profile;
      }
    }

    // Content and hashtags
    $caption = false;
    if(isset($photoData['caption'])) {
      $caption = $photoData['caption'];
    } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
      $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
    }

    if($caption) {
      if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
        $entry['category'] = [];
        foreach($matches[1] as $match) {
          $entry['category'][] = $match;
        }
      }

      $entry['content'] = [
        'text' => $caption
      ];
    }

    $refs = [];

    // Include the photo/video media URLs
    // (Always return arrays, even for single images)
    if(array_key_exists('edge_sidecar_to_children', $photoData)) {
      // Multi-post
      // For now, we will only pull photos from multi-posts, and skip videos.
      $entry['photo'] = [];
      $children = [];
      if(isset($photoData['edge_sidecar_to_children']['edges'])
        && is_array($photoData['edge_sidecar_to_children']['edges'])) {
        $children = $photoData['edge_sidecar_to_children']['edges'];
      }
      foreach($children as $edge) {
        // Person-tags are not read here; the parent object is used for them below.
        if(isset($edge['node']['display_url']))
          $entry['photo'][] = $edge['node']['display_url'];
      }
    } else {
      // Single photo or video
      if(array_key_exists('display_src', $photoData))
        $entry['photo'] = [$photoData['display_src']];
      elseif(array_key_exists('display_url', $photoData))
        $entry['photo'] = [$photoData['display_url']];

      if(!empty($photoData['is_video']) && isset($photoData['video_url'])) {
        $entry['video'] = [$photoData['video_url']];
      }
    }

    // Find person tags. Two JSON layouts are supported; usernames from the
    // old layout are processed first, then those from the new one.
    $taggedUsernames = [];
    $hasTagData = false;

    // old instagram json
    if(isset($photoData['usertags']['nodes'])) {
      $hasTagData = true;
      if(is_array($photoData['usertags']['nodes'])) {
        foreach($photoData['usertags']['nodes'] as $tag) {
          if(isset($tag['user']['username']))
            $taggedUsernames[] = $tag['user']['username'];
        }
      }
    }

    // new instagram json as of approximately 2017-04-19
    if(isset($photoData['edge_media_to_tagged_user']['edges'])) {
      $hasTagData = true;
      if(is_array($photoData['edge_media_to_tagged_user']['edges'])) {
        foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
          if(isset($edge['node']['user']['username']))
            $taggedUsernames[] = $edge['node']['user']['username'];
        }
      }
    }

    // The category list exists whenever the page carries tag data, even if
    // no tagged profile can be resolved.
    if($hasTagData && !isset($entry['category']))
      $entry['category'] = [];

    // Fetch the profile of every tagged user and reference it from the entry
    foreach($taggedUsernames as $taggedUsername) {
      $profile = self::_getInstagramProfile($taggedUsername, $http);
      if($profile) {
        $card = self::_buildHCardFromInstagramProfile($profile);
        $entry['category'][] = $card['url'];
        $refs[$card['url']] = $card;
        $profiles[] = $profile;
      }
    }

    // Published date. Stays false when the page has no timestamp or the
    // timestamp cannot be parsed, in which case no 'published' is emitted.
    $published = false;
    if(array_key_exists('taken_at_timestamp', $photoData))
      $published = DateTime::createFromFormat('U', $photoData['taken_at_timestamp']);
    elseif(array_key_exists('date', $photoData))
      $published = DateTime::createFromFormat('U', $photoData['date']);

    // Include venue data
    $locations = [];
    if(isset($photoData['location']['id'])) {
      $location = self::_getInstagramLocation($photoData['location']['id'], $http);
      if($location) {
        $entry['location'] = [$location['url']];
        $refs[$location['url']] = $location;
        $locations[] = $location;

        // Look up timezone so the published date carries the venue's offset
        if($published && $location['latitude']) {
          $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
          if($tz) {
            $published->setTimeZone(new DateTimeZone($tz));
          }
        }
      }
    }

    if($published)
      $entry['published'] = $published->format('c');

    if(count($refs)) {
      $entry['refs'] = $refs;
    }

    return [
      'data' => $entry,
      'original' => json_encode([
        'photo' => $photoData,
        'profiles' => $profiles,
        'locations' => $locations
      ])
    ];
  }
  /**
   * Convert an Instagram profile array into an h-card array.
   *
   * - name:  'full_name' when present and non-empty, otherwise 'username'
   * - url:   'external_url' when present and non-empty, otherwise the
   *          Instagram profile URL built from 'username'
   * - photo: 'profile_pic_url_hd' when set, otherwise 'profile_pic_url',
   *          or null when neither is available
   *
   * Optional keys may be absent from $profile; 'username' is expected to be
   * present.
   *
   * @param array|null $profile profile data as returned by _parseProfileFromHTML()
   * @return array|false the card, or false when $profile is empty
   */
  private static function _buildHCardFromInstagramProfile($profile) {
    if(!$profile) return false;

    $author = [
      'type' => 'card'
    ];

    if(!empty($profile['full_name']))
      $author['name'] = $profile['full_name'];
    else
      $author['name'] = $profile['username'];

    if(!empty($profile['external_url']))
      $author['url'] = $profile['external_url'];
    else
      $author['url'] = 'https://www.instagram.com/' . $profile['username'];

    if(isset($profile['profile_pic_url_hd']))
      $author['photo'] = $profile['profile_pic_url_hd'];
    elseif(isset($profile['profile_pic_url']))
      $author['photo'] = $profile['profile_pic_url'];
    else
      $author['photo'] = null;

    return $author;
  }
  /**
   * Fetch a user's profile page and extract the profile data from it.
   *
   * @param string $username Instagram username
   * @param object $http     HTTP client; get() is read as an array with
   *                         'error' and 'body' keys
   * @return array|null profile data, or null when the request reported an
   *                    error or the page held no profile data
   */
  private static function _getInstagramProfile($username, $http) {
    $profileURL = 'https://www.instagram.com/'.$username.'/';
    $response = $http->get($profileURL);
    if($response['error']) {
      return null;
    }
    return self::_parseProfileFromHTML($response['body']);
  }
  /**
   * Pull the user object out of a profile page's embedded shared data.
   *
   * @param string $html body of a profile page
   * @return array|null the entry_data.ProfilePage[0].graphql.user structure,
   *                    or null when it is not present
   */
  private static function _parseProfileFromHTML($html) {
    $shared = self::_extractIGData($html);
    if(!isset($shared['entry_data']['ProfilePage'][0]))
      return null;

    $page = $shared['entry_data']['ProfilePage'][0];
    if(!$page || !isset($page['graphql']['user']))
      return null;

    return $page['graphql']['user'];
  }
  /**
   * Fetch an Instagram location page and turn it into a venue h-card.
   *
   * @param string|int $id   Instagram location id
   * @param object     $http HTTP client; the 'body' key of get() is read
   * @return array|null card with 'type', 'name', 'url', 'latitude' and
   *                    'longitude', or null when the response body is empty
   *                    or holds no venue data
   */
  private static function _getInstagramLocation($id, $http) {
    $igURL = 'https://www.instagram.com/explore/locations/'.$id.'/';
    $response = $http->get($igURL);

    if(!$response['body'])
      return null;

    $venue = self::_extractVenueDataFromVenuePage($response['body']);
    if(!$venue)
      return null;

    return [
      'type' => 'card',
      'name' => $venue['name'],
      'url' => $igURL,
      'latitude' => $venue['lat'],
      'longitude' => $venue['lng'],
    ];
  }
  /**
   * Pull the media object out of a photo page's embedded shared data.
   *
   * Three locations under entry_data.PostPage[0] are tried in order:
   * graphql.shortcode_media, graphql.media, then media.
   *
   * @param string $html body of a photo page
   * @return array|null the media structure, or null when none is found
   */
  private static function _extractPhotoDataFromPhotoPage($html) {
    $shared = self::_extractIGData($html);

    if(!$shared || !is_array($shared) || !array_key_exists('entry_data', $shared))
      return null;

    $entryData = $shared['entry_data'];
    if(!is_array($entryData) || !array_key_exists('PostPage', $entryData))
      return null;

    $pages = $entryData['PostPage'];

    if(isset($pages[0]['graphql']['shortcode_media']))
      return $pages[0]['graphql']['shortcode_media'];

    if(isset($pages[0]['graphql']['media']))
      return $pages[0]['graphql']['media'];

    if(isset($pages[0]['media']))
      return $pages[0]['media'];

    return null;
  }
  /**
   * Pull the location object out of a venue page's embedded shared data.
   *
   * @param string $html body of a location page
   * @return array|null entry_data.LocationsPage[0].graphql.location with the
   *                    'media' and 'top_posts' keys removed, or null when the
   *                    structure is not present
   */
  private static function _extractVenueDataFromVenuePage($html) {
    $shared = self::_extractIGData($html);
    if(!$shared || !isset($shared['entry_data']['LocationsPage']))
      return null;

    $pages = $shared['entry_data']['LocationsPage'];
    if(!isset($pages[0]['graphql']['location']))
      return null;

    $venue = $pages[0]['graphql']['location'];

    # we don't need these and they're huge, so drop them now
    unset($venue['media'], $venue['top_posts']);

    return $venue;
  }
  /**
   * Extract the JSON assigned to window._sharedData from a page's scripts.
   *
   * Every <script> element is scanned. When several scripts contain a
   * matching assignment, the last one whose JSON decodes to an array is
   * returned.
   *
   * @param string $html page body
   * @return array|null the decoded data, or null when the input is empty or
   *                    not a string, the HTML cannot be loaded, or no script
   *                    contains decodable shared data
   */
  private static function _extractIGData($html) {
    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8),
    // so bail out before calling it.
    if(!is_string($html) || trim($html) === '')
      return null;

    // Collect libxml parse errors internally instead of emitting warnings,
    // then restore whatever setting the caller had.
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if(!$loaded) {
      return null;
    }

    $xpath = new DOMXPath($doc);

    $data = null;
    foreach($xpath->query('//script') as $script) {
      if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
        $decoded = json_decode($match[1], true);
        // Ignore payloads that fail to decode so they cannot wipe out an
        // earlier valid result.
        if(is_array($decoded)) {
          $data = $decoded;
        }
      }
    }

    return $data;
  }
  251. }