Browse Source

refactor Twitter parser

pull/38/head
Aaron Parecki 4 years ago
parent
commit
01b53edc95
No known key found for this signature in database GPG Key ID: 276C2817346D6056
6 changed files with 65 additions and 130 deletions
  1. +20
    -0
      README.md
  2. +2
    -78
      controllers/Parse.php
  3. +2
    -1
      lib/XRay/Fetcher.php
  4. +19
    -33
      lib/XRay/Formats/Twitter.php
  5. +4
    -0
      lib/XRay/Parser.php
  6. +18
    -18
      tests/TwitterTest.php

+ 20
- 0
README.md View File

@ -40,6 +40,26 @@ In both cases, the response will be a JSON object containing a key of "type". If
You can also make a POST request with the same parameter names.
If you already have an HTML or JSON document you want to parse, you can include it in the `body` parameter. The POST request looks like this:
```
POST /parse
Content-type: application/x-www-form-urlencoded
url=https://aaronparecki.com/2016/01/16/11/
&body=<html>....</html>
```
Or, for Twitter or GitHub URLs, where you might already have the JSON:
```
POST /parse
Content-type: application/x-www-form-urlencoded
url=https://github.com/aaronpk/XRay
&body={"repo":......}
```
### Authentication
If the URL you are fetching requires authentication, include the access token in the parameter "token", and it will be included in an "Authorization" header when fetching the URL. (It is recommended to use a POST request in this case, to avoid the access token potentially being logged as part of the query string.) This is useful for [Private Webmention](https://indieweb.org/Private-Webmention) verification.

+ 2
- 78
controllers/Parse.php View File

@ -62,12 +62,12 @@ class Parse {
}
$url = $request->get('url');
$html = $request->get('html');
$html = $request->get('html') ?: $request->get('body');
if(!$url && !$html) {
return $this->respond($response, 400, [
'error' => 'missing_url',
'error_description' => 'Provide a URL or HTML to fetch'
'error_description' => 'Provide a URL or HTML to fetch',
]);
}
@ -236,81 +236,5 @@ class Parse {
return $element;
}
/**
 * Handle a request for a Twitter URL.
 *
 * Reads the four Twitter credential parameters from the request. When all
 * four are present the tweet is parsed via Formats\Twitter::parse() using
 * those credentials; when only some are present a 400 "missing_parameters"
 * response is returned; when none are present, tweet JSON from the "json"
 * request parameter is parsed instead, if it was provided.
 *
 * @param object $request  Request object; only ->get($name) is used here.
 * @param object $response Response object, passed through to $this->respond().
 * @param string $url      The Twitter URL being parsed.
 * @param array  $match    Match array for the URL; $match[1] is passed to the
 *                         parser as the tweet ID.
 * @return mixed Whatever $this->respond() returns.
 */
private function parseTwitterURL(&$request, &$response, $url, $match) {
  // Collect whichever of the four credential parameters were supplied
  $credentials = [];
  foreach(['twitter_api_key','twitter_api_secret','twitter_access_token','twitter_access_token_secret'] as $name) {
    $value = $request->get($name);
    if($value) {
      $credentials[$name] = $value;
    }
  }
  $supplied = count($credentials);

  // A partial set of Twitter credentials is an error
  if($supplied > 0 && $supplied != 4) {
    return $this->respond($response, 400, [
      'error' => 'missing_parameters',
      'error_description' => 'All 4 Twitter credentials must be included in the request'
    ]);
  }

  $result = false;
  $tweet = null;
  if($supplied == 4) {
    list($result, $tweet) = Formats\Twitter::parse($url, $match[1], $credentials);
  } else {
    // No credentials: parse tweet JSON if the caller provided it,
    // otherwise leave $result false so the "unknown" response is sent
    $tweetJSON = $request->get('json');
    if($tweetJSON) {
      list($result, $tweet) = Formats\Twitter::parse($url, $match[1], null, $tweetJSON);
    }
  }

  // Nothing parsed: report an unknown document
  if(!$result) {
    return $this->respond($response, 200, [
      'data' => [
        'type' => 'unknown'
      ],
      'url' => $url,
      'code' => 0
    ]);
  }

  if($request->get('include_original')) {
    $result['original'] = $tweet;
  }
  $result['url'] = $url;
  $result['code'] = 200;

  return $this->respond($response, 200, $result);
}
/**
 * Handle a request for a GitHub URL.
 *
 * If the request carries a "json" parameter, that JSON is handed to
 * Formats\GitHub::parse(); otherwise the parser is called with the optional
 * "github_access_token" credential so it can retrieve the document itself.
 *
 * @param object $request  Request object; only ->get($name) is used here.
 * @param object $response Response object, passed through to $this->respond().
 * @param string $url      The GitHub URL being parsed.
 * @return mixed Whatever $this->respond() returns.
 */
private function parseGitHubURL(&$request, &$response, $url) {
  // The access token is optional; only include it when it was supplied
  $token = $request->get('github_access_token');
  $credentials = $token ? ['github_access_token' => $token] : [];

  $suppliedJSON = $request->get('json');
  if($suppliedJSON) {
    // Parse the GitHub JSON the caller already has
    $parsed = Formats\GitHub::parse($this->http, $url, null, $suppliedJSON);
  } else {
    // Let the parser fetch the post, unauthenticated or with the access token
    $parsed = Formats\GitHub::parse($this->http, $url, $credentials);
  }
  list($result, $original, $code) = $parsed;

  // Nothing parsed: report an unknown document along with the parser's code
  if(!$result) {
    return $this->respond($response, 200, [
      'data' => [
        'type' => 'unknown'
      ],
      'url' => $url,
      'code' => $code
    ]);
  }

  if($request->get('include_original')) {
    $result['original'] = $original;
  }
  $result['url'] = $url;
  $result['code'] = $code;

  return $this->respond($response, 200, $result);
}
}

+ 2
- 1
lib/XRay/Fetcher.php View File

@ -10,7 +10,7 @@ class Fetcher {
public function fetch($url, $opts=[]) {
if($opts == false) $opts = [];
if(isset($opts['timeout']))
$this->http->set_timeout($opts['timeout']);
if(isset($opts['max_redirects']))
@ -127,6 +127,7 @@ class Fetcher {
}
if(count($creds) < 4) {
print_r(debug_backtrace()[1]);
return [
'error_code' => 400,
'error' => 'missing_parameters',

+ 19
- 33
lib/XRay/Formats/Twitter.php View File

@ -39,30 +39,17 @@ class Twitter extends Format {
return $tweet;
}
public static function parse($url, $tweet_id, $creds, $json=null) {
public static function parse($json, $url) {
$host = parse_url($url, PHP_URL_HOST);
if($host == 'twtr.io') {
$tweet_id = self::b60to10($tweet_id);
}
if(is_string($json))
$tweet = json_decode($json);
else
$tweet = $json;
if($json) {
if(is_string($json))
$tweet = json_decode($json);
else
$tweet = $json;
} else {
$twitter = new \Twitter($creds['twitter_api_key'], $creds['twitter_api_secret'], $creds['twitter_access_token'], $creds['twitter_access_token_secret']);
try {
$tweet = $twitter->request('statuses/show/'.$tweet_id, 'GET', ['tweet_mode'=>'extended']);
} catch(\TwitterException $e) {
return [false, false];
}
if(!$tweet) {
return self::_unknown();
}
if(!$tweet)
return [false, false];
$entry = array(
'type' => 'entry',
'url' => $url,
@ -89,9 +76,9 @@ class Twitter extends Format {
$repostOf = 'https://twitter.com/' . $reposted->user->screen_name . '/status/' . $reposted->id_str;
$entry['repost-of'] = $repostOf;
list($repostedEntry) = self::parse($repostOf, $reposted->id_str, null, $reposted);
if(isset($repostedEntry['refs'])) {
foreach($repostedEntry['refs'] as $k=>$v) {
$repostedEntry = self::parse($reposted, $repostOf);
if(isset($repostedEntry['data']['refs'])) {
foreach($repostedEntry['data']['refs'] as $k=>$v) {
$refs[$k] = $v;
}
}
@ -174,28 +161,27 @@ class Twitter extends Format {
// Quoted Status
if(property_exists($tweet, 'quoted_status')) {
$quoteOf = 'https://twitter.com/' . $tweet->quoted_status->user->screen_name . '/status/' . $tweet->quoted_status_id_str;
list($quoted) = self::parse($quoteOf, $tweet->quoted_status_id_str, null, $tweet->quoted_status);
if(isset($quoted['refs'])) {
foreach($quoted['refs'] as $k=>$v) {
$quotedEntry = self::parse($tweet->quoted_status, $quoteOf);
if(isset($quotedEntry['data']['refs'])) {
foreach($quotedEntry['data']['refs'] as $k=>$v) {
$refs[$k] = $v;
}
}
$refs[$quoteOf] = $quoted['data'];
$refs[$quoteOf] = $quotedEntry['data'];
}
if($author = self::_buildHCardFromTwitterProfile($tweet->user)) {
$entry['author'] = $author;
}
$response = [
'data' => $entry
];
if(count($refs)) {
$response['refs'] = $refs;
$entry['refs'] = $refs;
}
return [$response, $tweet];
return [
'data' => $entry,
'original' => $tweet,
];
}
private static function _buildHCardFromTwitterProfile($profile) {

+ 4
- 0
lib/XRay/Parser.php View File

@ -26,6 +26,10 @@ class Parser {
return Formats\GitHub::parse($body, $url);
}
if(Formats\Twitter::matches($url)) {
return Formats\Twitter::parse($body, $url);
}
if(Formats\XKCD::matches($url)) {
return Formats\XKCD::parse($body, $url);
}

+ 18
- 18
tests/TwitterTest.php View File

@ -29,7 +29,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
public function testBasicProfileInfo() {
list($url, $json) = $this->loadTweet('818912506496229376');
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('aaronpk dev', $data['data']['author']['name']);
@ -43,7 +43,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
/**
 * Parses tweet 791704641046052864 and checks that the author URL in the
 * result is http://agiletortoise.com.
 */
public function testProfileWithNonExpandedURL() {
list($url, $json) = $this->loadTweet('791704641046052864');
// NOTE(review): $data is assigned twice in a row, so only the 'body' call below
// is asserted on. This looks like the before/after pair of a diff - confirm.
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('http://agiletortoise.com', $data['data']['author']['url']);
}
@ -51,9 +51,9 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
public function testBasicTestStuff() {
list($url, $json) = $this->loadTweet('818913630569664512');
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals(200, $data['code']);
$this->assertEquals(null, $data['code']); // no code is expected if we pass in the body
$this->assertEquals('https://twitter.com/pkdev/status/818913630569664512', $data['url']);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('A tweet with a URL https://indieweb.org/ #and #some #hashtags', $data['data']['content']['text']);
@ -67,14 +67,14 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
/**
 * Parses tweet 719914707566649344 and checks that the published date is
 * reported with a +01:00 UTC offset.
 */
public function testPositiveTimezone() {
list($url, $json) = $this->loadTweet('719914707566649344');
// NOTE(review): $data is assigned twice in a row, so only the 'body' call below
// is asserted on. This looks like the before/after pair of a diff - confirm.
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals("2016-04-12T16:46:56+01:00", $data['data']['published']);
}
public function testTweetWithEmoji() {
list($url, $json) = $this->loadTweet('818943244553699328');
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('Here 🎉 have an emoji', $data['data']['content']['text']);
@ -83,7 +83,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
public function testHTMLEscaping() {
list($url, $json) = $this->loadTweet('818928092383166465');
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('Double escaping &amp; & amp', $data['data']['content']['text']);
@ -92,7 +92,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
public function testTweetWithPhoto() {
list($url, $json) = $this->loadTweet('818912506496229376');
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('Tweet with a photo and a location', $data['data']['content']['text']);
@ -102,7 +102,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
public function testTweetWithTwoPhotos() {
list($url, $json) = $this->loadTweet('818935308813103104');
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('Two photos', $data['data']['content']['text']);
@ -113,7 +113,7 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
public function testTweetWithVideo() {
list($url, $json) = $this->loadTweet('818913178260160512');
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('Tweet with a video', $data['data']['content']['text']);
@ -123,12 +123,12 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
public function testTweetWithLocation() {
list($url, $json) = $this->loadTweet('818912506496229376');
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('Tweet with a photo and a location', $data['data']['content']['text']);
$this->assertEquals('https://api.twitter.com/1.1/geo/id/ac88a4f17a51c7fc.json', $data['data']['location']);
$location = $data['refs']['https://api.twitter.com/1.1/geo/id/ac88a4f17a51c7fc.json'];
$location = $data['data']['refs']['https://api.twitter.com/1.1/geo/id/ac88a4f17a51c7fc.json'];
$this->assertEquals('adr', $location['type']);
$this->assertEquals('Portland', $location['locality']);
$this->assertEquals('United States', $location['country-name']);
@ -138,38 +138,38 @@ class TwitterTest extends PHPUnit_Framework_TestCase {
/**
 * Parses retweet 818913351623245824 and checks that the entry has no content
 * of its own, that 'repost-of' points at the original tweet's URL, and that
 * the original tweet is available under that URL in the refs.
 */
public function testRetweet() {
list($url, $json) = $this->loadTweet('818913351623245824');
// NOTE(review): $data is assigned twice in a row, so only the 'body' call below
// is asserted on. This looks like the before/after pair of a diff - confirm.
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
// A retweet carries no content itself
$this->assertArrayNotHasKey('content', $data['data']);
$repostOf = 'https://twitter.com/aaronpk/status/817414679131660288';
$this->assertEquals($repostOf, $data['data']['repost-of']);
// NOTE(review): $tweet is also assigned twice; the second lookup reads refs
// from inside 'data' and is the one the assertion below uses.
$tweet = $data['refs'][$repostOf];
$tweet = $data['data']['refs'][$repostOf];
$this->assertEquals('Yeah that\'s me http://xkcd.com/1782/', $tweet['content']['text']);
}
/**
 * Parses retweet 820039442773798912 and checks that neither content nor
 * photo appear on the retweet entry itself, that 'repost-of' points at the
 * original tweet's URL, and that the original tweet's text is in the refs.
 */
public function testRetweetWithPhoto() {
list($url, $json) = $this->loadTweet('820039442773798912');
// NOTE(review): $data is assigned twice in a row, so only the 'body' call below
// is asserted on. This looks like the before/after pair of a diff - confirm.
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
// Neither the text nor the photo should be present on the retweet entry
$this->assertArrayNotHasKey('content', $data['data']);
$this->assertArrayNotHasKey('photo', $data['data']);
$repostOf = 'https://twitter.com/phlaimeaux/status/819943954724556800';
$this->assertEquals($repostOf, $data['data']['repost-of']);
// NOTE(review): $tweet is also assigned twice; the second lookup reads refs
// from inside 'data' and is the one the assertion below uses.
$tweet = $data['refs'][$repostOf];
$tweet = $data['data']['refs'][$repostOf];
$this->assertEquals('this headline is such a rollercoaster', $tweet['content']['text']);
}
/**
 * Parses quote tweet 818913488609251331 and checks the quoting tweet's text
 * as well as the text of the quoted tweet found in the refs under its URL.
 */
public function testQuotedTweet() {
list($url, $json) = $this->loadTweet('818913488609251331');
// NOTE(review): $data is assigned twice in a row, so only the 'body' call below
// is asserted on. This looks like the before/after pair of a diff - confirm.
$data = $this->parse(['url' => $url, 'json' => $json]);
$data = $this->parse(['url' => $url, 'body' => $json]);
$this->assertEquals('entry', $data['data']['type']);
$this->assertEquals('Quoted tweet with a #hashtag https://twitter.com/aaronpk/status/817414679131660288', $data['data']['content']['text']);
// NOTE(review): $tweet is also assigned twice; the second lookup reads refs
// from inside 'data' and is the one the assertion below uses.
$tweet = $data['refs']['https://twitter.com/aaronpk/status/817414679131660288'];
$tweet = $data['data']['refs']['https://twitter.com/aaronpk/status/817414679131660288'];
$this->assertEquals('Yeah that\'s me http://xkcd.com/1782/', $tweet['content']['text']);
}

Loading…
Cancel
Save