From c5d417e87e04ab3fc64d44e63bf7f9867ae9a39e Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Thu, 10 Jan 2019 11:11:57 -0800 Subject: [PATCH] leave out placeholder alt text from instagram --- lib/XRay/Formats/Instagram.php | 8 +- tests/InstagramTest.php | 17 +- tests/data/www.instagram.com/p_Bq8U12UAcdq_ | 312 ++++++++++++++++++++ tests/download-instagram-data.sh | 1 + 4 files changed, 335 insertions(+), 3 deletions(-) create mode 100644 tests/data/www.instagram.com/p_Bq8U12UAcdq_ diff --git a/lib/XRay/Formats/Instagram.php b/lib/XRay/Formats/Instagram.php index 1bfb843..b799bdf 100644 --- a/lib/XRay/Formats/Instagram.php +++ b/lib/XRay/Formats/Instagram.php @@ -72,6 +72,10 @@ class Instagram extends Format { return self::parsePhotoFromData($http, $photoData, $url, $profile); } + private static function altTextIsPlaceholder($text) { + return $text == 'No photo description available.'; + } + private static function parsePhotoFromData($http, $photoData, $url, $profile=false) { if(!$photoData) @@ -141,7 +145,7 @@ class Instagram extends Format { foreach($photoData['edge_sidecar_to_children']['edges'] as $edge) { $entry['photo'][] = $edge['node']['display_url']; // Don't need to pull person-tags from here because the main parent object already has them. - if(isset($edge['node']['accessibility_caption'])) { + if(isset($edge['node']['accessibility_caption']) && $edge['node']['accessibility_caption'] && !self::altTextIsPlaceholder($edge['node']['accessibility_caption'])) { $meta[$edge['node']['display_url']] = [ 'alt' => $edge['node']['accessibility_caption'] ]; @@ -156,7 +160,7 @@ class Instagram extends Format { elseif(array_key_exists('display_url', $photoData)) $entry['photo'] = [$photoData['display_url']]; - if(isset($photoData['accessibility_caption']) && $photoData['accessibility_caption']) { + if(isset($photoData['accessibility_caption']) && $photoData['accessibility_caption'] && !self::altTextIsPlaceholder($photoData['accessibility_caption'])) { $meta[$entry['photo'][0]] = [ 'alt' => $photoData['accessibility_caption'] ]; diff --git a/tests/InstagramTest.php b/tests/InstagramTest.php index 06779d3..1046713 100644 --- a/tests/InstagramTest.php +++ b/tests/InstagramTest.php @@ -40,6 +40,7 @@ class InstagramTest extends PHPUnit_Framework_TestCase { $this->assertEquals('https://aaronparecki.com/', $data['data']['author']['url']); $this->assertEquals('Aaron Parecki', $data['data']['author']['name']); $this->assertEquals('https://instagram.fsjc1-3.fna.fbcdn.net/vp/45aee453740a714bf408f8947f89da8e/5CCB4B8E/t51.2885-19/s320x320/14240576_268350536897085_1129715662_a.jpg?_nc_ht=instagram.fsjc1-3.fna.fbcdn.net', $data['data']['author']['photo']); + $this->assertArrayNotHasKey('meta', $data['data']); // make sure this does not include alt text (autogenerated placeholder from instagram) } public function testBGDpqNoiMJ0() { @@ -167,6 +168,7 @@ class InstagramTest extends PHPUnit_Framework_TestCase { $this->assertEquals('https://instagram.fsjc1-3.fna.fbcdn.net/vp/8b1b2e6efa86a4856ec37a60f0fa77f5/5CC2D34D/t51.2885-15/e35/21909774_347707439021016_5237540582556958720_n.jpg?_nc_ht=instagram.fsjc1-3.fna.fbcdn.net', $data['data']['photo'][1]); $this->assertArrayNotHasKey('video', $data['data']); $this->assertEquals(2, count($data['data']['category'])); + $this->assertArrayNotHasKey('meta', $data['data']); } public function testMixPhotosAndVideos() { @@ -264,7 +266,6 @@ class InstagramTest extends PHPUnit_Framework_TestCase { public function testInstagramMultiPhotoWithAltText() { $url = 'https://www.instagram.com/p/BsdlOmLh_IX/'; - $response = $this->parse(['url' => $url]); $body = $response->getContent(); @@ -278,4 +279,18 @@ class InstagramTest extends PHPUnit_Framework_TestCase { $this->assertEquals('A large green "2" in a circle with a small pink "1" behind it', $data['data']['meta']['https://instagram.fsjc1-3.fna.fbcdn.net/vp/a6c93d8fcd5ad0e3b60f2ac0695eb34e/5CC3898E/t51.2885-15/e35/49663055_349750985612151_2949260446582336214_n.jpg?_nc_ht=instagram.fsjc1-3.fna.fbcdn.net']['alt']); } + public function testInstagramPhotoAutogeneratedAltText() { + $url = 'https://www.instagram.com/p/Bq8U12UAcdq/'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body, true); + + $this->assertEquals(200, $data['code']); + $this->assertEquals('instagram', $data['source-format']); + + $this->assertEquals('Image may contain: one or more people and hat', $data['data']['meta']['https://instagram.fsjc1-3.fna.fbcdn.net/vp/7f8954f33de897c0c57656b798637f4c/5CC3DF9F/t51.2885-15/e35/45605085_1989380037822519_4707213851165118070_n.jpg?_nc_ht=instagram.fsjc1-3.fna.fbcdn.net']['alt']); + } + } diff --git a/tests/data/www.instagram.com/p_Bq8U12UAcdq_ b/tests/data/www.instagram.com/p_Bq8U12UAcdq_ new file mode 100644 index 0000000..315318b --- /dev/null +++ b/tests/data/www.instagram.com/p_Bq8U12UAcdq_ @@ -0,0 +1,312 @@ +HTTP/1.1 200 OK +Content-Type: text/html; charset=utf-8 +Vary: Accept-Language, Cookie, Accept-Encoding +Content-Language: en +Date: Thu, 10 Jan 2019 18:59:50 GMT +Strict-Transport-Security: max-age=3600 +Cache-Control: private, no-cache, no-store, must-revalidate +Pragma: no-cache +Expires: Sat, 01 Jan 2000 00:00:00 GMT +X-Frame-Options: SAMEORIGIN +content-security-policy: report-uri https://www.instagram.com/security/csp_report/; default-src 'self' https://www.instagram.com; img-src https: data: blob:; font-src https: data:; media-src 'self' blob: https://www.instagram.com https://*.cdninstagram.com https://*.fbcdn.net; manifest-src 'self' https://www.instagram.com; script-src 'self' https://instagram.com https://www.instagram.com https://*.www.instagram.com https://*.cdninstagram.com wss://www.instagram.com https://*.facebook.com https://*.fbcdn.net https://*.facebook.net 'unsafe-inline' 'unsafe-eval' blob:; style-src 'self' https://*.www.instagram.com https://www.instagram.com 'unsafe-inline'; connect-src 'self' https://instagram.com https://www.instagram.com https://*.www.instagram.com https://graph.instagram.com https://*.graph.instagram.com https://*.cdninstagram.com https://api.instagram.com wss://www.instagram.com wss://edge-chat.instagram.com https://*.facebook.com https://*.fbcdn.net https://*.facebook.net chrome-extension://boadgeojelhgndaghljhdicfkmllpafd; worker-src 'self' https://www.instagram.com; frame-src 'self' https://instagram.com https://www.instagram.com https://staticxx.facebook.com https://www.facebook.com https://web.facebook.com https://connect.facebook.net https://m.facebook.com; object-src 'none'; upgrade-insecure-requests +X-Content-Type-Options: nosniff +X-XSS-Protection: 0 +Set-Cookie: urlgen="{\"108.161.19.190\": 54154}:1ghfYU:eKaA3HGZXpJpYX969QrXgbecnUg"; Domain=.instagram.com; HttpOnly; Path=/; Secure +Set-Cookie: mid=XDeWJgAEAAHA4fKyESvVBDv4JZ3i; Domain=.instagram.com; expires=Sun, 07-Jan-2029 18:59:50 GMT; Max-Age=315360000; Path=/; Secure +Set-Cookie: rur=PRN; Domain=.instagram.com; HttpOnly; Path=/; Secure +Set-Cookie: mcd=3; Domain=.instagram.com; expires=Sun, 07-Jan-2029 18:59:50 GMT; Max-Age=315360000; Path=/; Secure +Set-Cookie: csrftoken=9pFPcSYEu3Fc2hSMz4qJzhR6jImNc3ol; Domain=.instagram.com; expires=Thu, 09-Jan-2020 18:59:50 GMT; Max-Age=31449600; Path=/; Secure +Connection: keep-alive +Content-Length: 32756 + + + + + + + + +Ryan B on Instagram: “🤔” + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/download-instagram-data.sh b/tests/download-instagram-data.sh index 301752b..ae3b9ed 100755 --- a/tests/download-instagram-data.sh +++ b/tests/download-instagram-data.sh @@ -17,6 +17,7 @@ urls=( 'https://www.instagram.com/explore/locations/359000003/' 'https://www.instagram.com/p/BsdjKytBZyx/' 'https://www.instagram.com/p/BsdlOmLh_IX/' + 'https://www.instagram.com/p/Bq8U12UAcdq/' ) for url in ${urls[@]}; do