You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

275 lines
7.5 KiB

7 years ago
  1. <?php
  2. namespace p3k\XRay\Formats;
  3. use DOMDocument, DOMXPath;
  4. use DateTime, DateTimeZone;
  5. use Parse;
  /**
   * Converts an Instagram photo page into an h-entry style array.
   *
   * The page HTML is expected to embed a `window._sharedData = {...};` script.
   * Author, tagged-user and venue details are fetched through the supplied
   * HTTP client, which must expose `get($url)` returning an array with
   * `error` and `body` keys.
   */
  class Instagram {

    /**
     * Tells whether the URL's host is an Instagram host.
     *
     * Host names are case-insensitive, so the host is lowercased before the
     * (strict) comparison.
     *
     * @param string $url
     * @return bool
     */
    public static function matches_host($url) {
      $host = parse_url($url, PHP_URL_HOST);
      if(!is_string($host))
        return false;

      return in_array(strtolower($host), ['www.instagram.com', 'instagram.com'], true);
    }

    /**
     * Tells whether this format handles the given URL.
     *
     * @param string $url
     * @return bool
     */
    public static function matches($url) {
      return self::matches_host($url);
    }

    /**
     * Builds the h-entry for a photo page.
     *
     * @param string $html HTML of the photo page
     * @param string $url  URL of the photo page, copied into the entry
     * @param object $http client exposing get($url) => ['error' => ..., 'body' => ...]
     * @return array|false false when no photo data is found in $html, otherwise
     *   [ ['data' => entry, 'refs' => cards keyed by URL (only when non-empty)],
     *     ['photo' => raw photo data, 'profiles' => raw profiles, 'locations' => venue cards] ]
     */
    public static function parse($html, $url, $http) {
      $photoData = self::_extractPhotoDataFromPhotoPage($html);
      if(!$photoData || !is_array($photoData))
        return false;

      // Start building the h-entry
      $entry = [
        'type' => 'entry',
        'url' => $url,
        'author' => [
          'type' => 'card',
          'name' => null,
          'photo' => null,
          'url' => null
        ]
      ];

      $profiles = [];
      $refs = [];

      // Fetch profile info for the owner; the placeholder author card above
      // is kept when the owner is unknown or the profile cannot be fetched.
      if(isset($photoData['owner']['username'])) {
        $profile = self::_getInstagramProfile($photoData['owner']['username'], $http);
        if($profile) {
          $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
          $profiles[] = $profile;
        }
      }

      // Content and hashtags
      $caption = false;
      if(isset($photoData['caption'])) {
        $caption = $photoData['caption'];
      } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
        $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
      }

      if(is_string($caption) && $caption !== '') {
        if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
          $entry['category'] = $matches[1];
        }
        $entry['content'] = [
          'text' => $caption
        ];
      }

      // Include the photo/video media URLs
      // (Always return arrays)
      if(isset($photoData['display_src']))
        $entry['photo'] = [$photoData['display_src']];
      elseif(isset($photoData['display_url']))
        $entry['photo'] = [$photoData['display_url']];

      if(!empty($photoData['is_video']) && isset($photoData['video_url'])) {
        $entry['video'] = [$photoData['video_url']];
      }

      // Find person tags and fetch user profiles
      $taggedUsernames = self::_collectTaggedUsernames($photoData);
      if($taggedUsernames !== null) {
        if(!isset($entry['category'])) $entry['category'] = [];

        foreach($taggedUsernames as $taggedUsername) {
          $profile = self::_getInstagramProfile($taggedUsername, $http);
          if($profile) {
            $card = self::_buildHCardFromInstagramProfile($profile);
            $entry['category'][] = $card['url'];
            $refs[$card['url']] = $card;
            $profiles[] = $profile;
          }
        }
      }

      // Published date. Stays false when the page carries no usable timestamp,
      // in which case the entry is returned without a 'published' property.
      $published = false;
      if(isset($photoData['taken_at_timestamp']))
        $published = DateTime::createFromFormat('U', (string)$photoData['taken_at_timestamp']);
      elseif(isset($photoData['date']))
        $published = DateTime::createFromFormat('U', (string)$photoData['date']);

      // Include venue data
      $locations = [];
      if(isset($photoData['location']['id'])) {
        $location = self::_getInstagramLocation($photoData['location']['id'], $http);
        if($location) {
          $entry['location'] = [$location['url']];
          $refs[$location['url']] = $location;
          $locations[] = $location;

          // Look up the timezone of the venue so the published date is
          // expressed in local time. is_numeric() keeps 0 (a valid coordinate)
          // while rejecting missing values.
          if($published && is_numeric($location['latitude']) && is_numeric($location['longitude'])) {
            $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
            if($tz) {
              $published->setTimezone(new DateTimeZone($tz));
            }
          }
        }
      }

      if($published) {
        $entry['published'] = $published->format('c');
      }

      $response = [
        'data' => $entry
      ];

      if(count($refs)) {
        $response['refs'] = $refs;
      }

      return [$response, [
        'photo' => $photoData,
        'profiles' => $profiles,
        'locations' => $locations
      ]];
    }

    /**
     * Collects the usernames of people tagged in the photo.
     *
     * Both JSON layouts are read: the old `usertags.nodes` list and the
     * `edge_media_to_tagged_user.edges` list (new json as of approximately
     * 2017-04-19). Old-layout names come first.
     *
     * @param array $photoData
     * @return array|null list of usernames, or null when neither layout is present
     */
    private static function _collectTaggedUsernames($photoData) {
      $found = false;
      $usernames = [];

      // old json
      if(isset($photoData['usertags']['nodes']) && is_array($photoData['usertags']['nodes'])) {
        $found = true;
        foreach($photoData['usertags']['nodes'] as $tag) {
          if(isset($tag['user']['username']))
            $usernames[] = $tag['user']['username'];
        }
      }

      // new json
      if(isset($photoData['edge_media_to_tagged_user']['edges']) && is_array($photoData['edge_media_to_tagged_user']['edges'])) {
        $found = true;
        foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
          if(isset($edge['node']['user']['username']))
            $usernames[] = $edge['node']['user']['username'];
        }
      }

      return $found ? $usernames : null;
    }

    /**
     * Turns a raw Instagram profile array into an h-card.
     *
     * Name falls back from `full_name` to `username`; URL falls back from
     * `external_url` to the Instagram profile URL; photo prefers the HD
     * picture. Missing keys yield null values rather than PHP warnings.
     *
     * @param array $profile
     * @return array|false false when $profile is empty
     */
    private static function _buildHCardFromInstagramProfile($profile) {
      if(!$profile) return false;

      $username = isset($profile['username']) ? $profile['username'] : null;

      $author = [
        'type' => 'card'
      ];

      $author['name'] = !empty($profile['full_name']) ? $profile['full_name'] : $username;

      $author['url'] = !empty($profile['external_url'])
        ? $profile['external_url']
        : 'https://www.instagram.com/' . $username;

      if(isset($profile['profile_pic_url_hd']))
        $author['photo'] = $profile['profile_pic_url_hd'];
      elseif(isset($profile['profile_pic_url']))
        $author['photo'] = $profile['profile_pic_url'];
      else
        $author['photo'] = null;

      return $author;
    }

    /**
     * Fetches the JSON profile of a user.
     *
     * @param string $username
     * @param object $http client exposing get($url)
     * @return array|null the `user` object of the response, or null when the
     *   request reports an error or the body is not the expected JSON
     */
    private static function _getInstagramProfile($username, $http) {
      // The username comes from remote page data, so encode it before it
      // becomes a URL path segment.
      $response = $http->get('https://www.instagram.com/'.rawurlencode($username).'/?__a=1');

      if(!empty($response['error']) || empty($response['body']))
        return null;

      $profile = json_decode($response['body'], true);
      if(is_array($profile) && isset($profile['user']) && is_array($profile['user'])) {
        return $profile['user'];
      }

      return null;
    }

    /**
     * Fetches a venue page and returns the venue as an h-card.
     *
     * @param string|int $id Instagram location id
     * @param object $http client exposing get($url)
     * @return array|null card with type, name, url, latitude and longitude
     *   (null for values the page does not provide), or null when no venue
     *   data could be extracted
     */
    private static function _getInstagramLocation($id, $http) {
      $igURL = 'https://www.instagram.com/explore/locations/'.rawurlencode((string)$id).'/';
      $response = $http->get($igURL);

      if(empty($response['body']))
        return null;

      $data = self::_extractVenueDataFromVenuePage($response['body']);
      if(!$data)
        return null;

      return [
        'type' => 'card',
        'name' => isset($data['name']) ? $data['name'] : null,
        'url' => $igURL,
        'latitude' => isset($data['lat']) ? $data['lat'] : null,
        'longitude' => isset($data['lng']) ? $data['lng'] : null,
      ];
    }

    /**
     * Pulls the media object out of a photo page's shared data.
     *
     * Three layouts are tried in order: `graphql.shortcode_media`,
     * `graphql.media`, then `media`, all under `entry_data.PostPage[0]`.
     *
     * @param string $html
     * @return array|null
     */
    private static function _extractPhotoDataFromPhotoPage($html) {
      $data = self::_extractIGData($html);
      if(!isset($data['entry_data']['PostPage'][0]))
        return null;

      $post = $data['entry_data']['PostPage'][0];

      if(isset($post['graphql']['shortcode_media']))
        return $post['graphql']['shortcode_media'];

      if(isset($post['graphql']['media']))
        return $post['graphql']['media'];

      if(isset($post['media']))
        return $post['media'];

      return null;
    }

    /**
     * Pulls the location object out of a venue page's shared data
     * (`entry_data.LocationsPage[0].location`).
     *
     * @param string $html
     * @return array|null the location without its `media` and `top_posts` keys
     */
    private static function _extractVenueDataFromVenuePage($html) {
      $data = self::_extractIGData($html);
      if(!isset($data['entry_data']['LocationsPage'][0]['location']))
        return null;

      $location = $data['entry_data']['LocationsPage'][0]['location'];
      // unset() on a non-array (e.g. a string) is a fatal error
      if(!is_array($location))
        return null;

      # we don't need these and they're huge, so drop them now
      unset($location['media']);
      unset($location['top_posts']);

      return $location;
    }

    /**
     * Decodes the `window._sharedData = {...};` JSON embedded in a page.
     *
     * When several scripts match, the last one that decodes to an array wins;
     * a match that fails to decode never discards an earlier result.
     *
     * @param string $html
     * @return array|null
     */
    private static function _extractIGData($html) {
      // loadHTML() rejects an empty document (ValueError on PHP 8)
      if(!is_string($html) || trim($html) === '')
        return null;

      $doc = new DOMDocument();

      // Collect markup errors internally instead of emitting warnings, then
      // restore whatever setting the caller had.
      $previous = libxml_use_internal_errors(true);
      $loaded = $doc->loadHTML($html);
      libxml_clear_errors();
      libxml_use_internal_errors($previous);

      if(!$loaded)
        return null;

      $xpath = new DOMXPath($doc);

      $data = null;
      foreach($xpath->query('//script') as $script) {
        if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
          $decoded = json_decode($match[1], true);
          if(is_array($decoded)) {
            $data = $decoded;
          }
        }
      }

      return $data;
    }

  }