You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

266 lines
7.3 KiB

4 years ago
  1. <?php
  2. namespace XRay\Formats;
  3. use DOMDocument, DOMXPath;
  4. use DateTime, DateTimeZone;
  5. use Parse;
  6. class Instagram {
  7. public static function parse($html, $url, $http) {
  8. $photoData = self::_extractPhotoDataFromPhotoPage($html);
  9. if(!$photoData)
  10. return false;
  11. // Start building the h-entry
  12. $entry = array(
  13. 'type' => 'entry',
  14. 'url' => $url,
  15. 'author' => [
  16. 'type' => 'card',
  17. 'name' => null,
  18. 'photo' => null,
  19. 'url' => null
  20. ]
  21. );
  22. $profiles = [];
  23. // Fetch profile info for this user
  24. $username = $photoData['owner']['username'];
  25. $profile = self::_getInstagramProfile($username, $http);
  26. if($profile) {
  27. $entry['author'] = self::_buildHCardFromInstagramProfile($profile);
  28. $profiles[] = $profile;
  29. }
  30. // Content and hashtags
  31. $caption = false;
  32. if(isset($photoData['caption'])) {
  33. $caption = $photoData['caption'];
  34. } elseif(isset($photoData['edge_media_to_caption']['edges'][0]['node']['text'])) {
  35. $caption = $photoData['edge_media_to_caption']['edges'][0]['node']['text'];
  36. }
  37. if($caption) {
  38. if(preg_match_all('/#([a-z0-9_-]+)/i', $caption, $matches)) {
  39. $entry['category'] = [];
  40. foreach($matches[1] as $match) {
  41. $entry['category'][] = $match;
  42. }
  43. }
  44. $entry['content'] = [
  45. 'text' => $caption
  46. ];
  47. }
  48. // Include the photo/video media URLs
  49. // (Always return arrays)
  50. if(array_key_exists('display_src', $photoData))
  51. $entry['photo'] = [$photoData['display_src']];
  52. elseif(array_key_exists('display_url', $photoData))
  53. $entry['photo'] = [$photoData['display_url']];
  54. if(array_key_exists('is_video', $photoData) && $photoData['is_video']) {
  55. $entry['video'] = [$photoData['video_url']];
  56. }
  57. $refs = [];
  58. // Find person tags and fetch user profiles
  59. // old json
  60. if(isset($photoData['usertags']['nodes'])) {
  61. if(!isset($entry['category'])) $entry['category'] = [];
  62. foreach($photoData['usertags']['nodes'] as $tag) {
  63. $profile = self::_getInstagramProfile($tag['user']['username'], $http);
  64. if($profile) {
  65. $card = self::_buildHCardFromInstagramProfile($profile);
  66. $entry['category'][] = $card['url'];
  67. $refs[$card['url']] = $card;
  68. $profiles[] = $profile;
  69. }
  70. }
  71. }
  72. // new json as of approximately 2017-04-19
  73. if(isset($photoData['edge_media_to_tagged_user']['edges'])) {
  74. if(!isset($entry['category'])) $entry['category'] = [];
  75. foreach($photoData['edge_media_to_tagged_user']['edges'] as $edge) {
  76. $profile = self::_getInstagramProfile($edge['node']['user']['username'], $http);
  77. if($profile) {
  78. $card = self::_buildHCardFromInstagramProfile($profile);
  79. $entry['category'][] = $card['url'];
  80. $refs[$card['url']] = $card;
  81. $profiles[] = $profile;
  82. }
  83. }
  84. }
  85. // Published date
  86. if(array_key_exists('taken_at_timestamp', $photoData))
  87. $published = DateTime::createFromFormat('U', $photoData['taken_at_timestamp']);
  88. elseif(array_key_exists('date', $photoData))
  89. $published = DateTime::createFromFormat('U', $photoData['date']);
  90. // Include venue data
  91. $locations = [];
  92. if($photoData['location']) {
  93. $location = self::_getInstagramLocation($photoData['location']['id'], $http);
  94. if($location) {
  95. $entry['location'] = [$location['url']];
  96. $refs[$location['url']] = $location;
  97. $locations[] = $location;
  98. // Look up timezone
  99. if($location['latitude']) {
  100. $tz = \p3k\Timezone::timezone_for_location($location['latitude'], $location['longitude']);
  101. if($tz) {
  102. $published->setTimeZone(new DateTimeZone($tz));
  103. }
  104. }
  105. }
  106. }
  107. $entry['published'] = $published->format('c');
  108. $response = [
  109. 'data' => $entry
  110. ];
  111. if(count($refs)) {
  112. $response['refs'] = $refs;
  113. }
  114. return [$response, [
  115. 'photo' => $photoData,
  116. 'profiles' => $profiles,
  117. 'locations' => $locations
  118. ]];
  119. }
  120. private static function _buildHCardFromInstagramProfile($profile) {
  121. if(!$profile) return false;
  122. $author = [
  123. 'type' => 'card'
  124. ];
  125. if($profile['full_name'])
  126. $author['name'] = $profile['full_name'];
  127. else
  128. $author['name'] = $profile['username'];
  129. if(isset($profile['external_url']) && $profile['external_url'])
  130. $author['url'] = $profile['external_url'];
  131. else
  132. $author['url'] = 'https://www.instagram.com/' . $profile['username'];
  133. if(isset($profile['profile_pic_url_hd']))
  134. $author['photo'] = $profile['profile_pic_url_hd'];
  135. else
  136. $author['photo'] = $profile['profile_pic_url'];
  137. return $author;
  138. }
  139. private static function _getInstagramProfile($username, $http) {
  140. $response = $http->get('https://www.instagram.com/'.$username.'/?__a=1');
  141. if(!$response['error']) {
  142. $profile = @json_decode($response['body'], true);
  143. if($profile && array_key_exists('user', $profile)) {
  144. $user = $profile['user'];
  145. return $user;
  146. }
  147. }
  148. return null;
  149. }
  150. private static function _getInstagramLocation($id, $http) {
  151. $igURL = 'https://www.instagram.com/explore/locations/'.$id.'/';
  152. $response = $http->get($igURL);
  153. if($response['body']) {
  154. $data = self::_extractVenueDataFromVenuePage($response['body']);
  155. if($data) {
  156. return [
  157. 'type' => 'card',
  158. 'name' => $data['name'],
  159. 'url' => $igURL,
  160. 'latitude' => $data['lat'],
  161. 'longitude' => $data['lng'],
  162. ];
  163. }
  164. }
  165. return null;
  166. }
  167. private static function _extractPhotoDataFromPhotoPage($html) {
  168. $data = self::_extractIGData($html);
  169. if($data && is_array($data) && array_key_exists('entry_data', $data)) {
  170. if(is_array($data['entry_data']) && array_key_exists('PostPage', $data['entry_data'])) {
  171. $post = $data['entry_data']['PostPage'];
  172. if(isset($post[0]['graphql']['shortcode_media'])) {
  173. return $post[0]['graphql']['shortcode_media'];
  174. } elseif(isset($post[0]['graphql']['media'])) {
  175. return $post[0]['graphql']['media'];
  176. } elseif(isset($post[0]['media'])) {
  177. return $post[0]['media'];
  178. }
  179. }
  180. }
  181. return null;
  182. }
  183. private static function _extractVenueDataFromVenuePage($html) {
  184. $data = self::_extractIGData($html);
  185. if($data && is_array($data) && array_key_exists('entry_data', $data)) {
  186. if(isset($data['entry_data']['LocationsPage'])) {
  187. $data = $data['entry_data']['LocationsPage'];
  188. if(isset($data[0]['location'])) {
  189. $location = $data[0]['location'];
  190. # we don't need these and they're huge, so drop them now
  191. unset($location['media']);
  192. unset($location['top_posts']);
  193. return $location;
  194. }
  195. }
  196. }
  197. return null;
  198. }
  199. private static function _extractIGData($html) {
  200. $doc = new DOMDocument();
  201. @$doc->loadHTML($html);
  202. if(!$doc) {
  203. return null;
  204. }
  205. $xpath = new DOMXPath($doc);
  206. $data = null;
  207. foreach($xpath->query('//script') as $script) {
  208. if(preg_match('/window\._sharedData = ({.+});/', $script->textContent, $match)) {
  209. $data = json_decode($match[1], true);
  210. }
  211. }
  212. return $data;
  213. }
  214. }