From 206e27ea2591746519e28a177aeb71d5d55fd1f6 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Sat, 11 Nov 2017 13:04:20 -0800 Subject: [PATCH] add feed discovery API --- README.md | 53 +++++- composer.json | 1 + controllers/Feeds.php | 58 +++++++ lib/XRay.php | 5 + lib/XRay/Feeds.php | 113 +++++++++++++ public/index.php | 3 + tests/FindFeedsTest.php | 156 ++++++++++++++++++ .../h-feed-with-atom-alternate | 36 ++++ .../h-feed-with-rss-alternate | 36 ++++ .../feed.example.com/html-with-atom-alternate | 36 ++++ .../feed.example.com/html-with-json-and-atom | 37 +++++ tests/data/feed.example.com/redirect-to-atom | 15 ++ 12 files changed, 545 insertions(+), 4 deletions(-) create mode 100644 controllers/Feeds.php create mode 100644 lib/XRay/Feeds.php create mode 100644 tests/FindFeedsTest.php create mode 100644 tests/data/feed.example.com/h-feed-with-atom-alternate create mode 100644 tests/data/feed.example.com/h-feed-with-rss-alternate create mode 100644 tests/data/feed.example.com/html-with-atom-alternate create mode 100644 tests/data/feed.example.com/html-with-json-and-atom create mode 100644 tests/data/feed.example.com/redirect-to-atom diff --git a/README.md b/README.md index 67dde34..70b698e 100644 --- a/README.md +++ b/README.md @@ -99,14 +99,13 @@ You can also use XRay to fetch all the rel values on a page, merging the list of ```php $xray = new p3k\XRay(); -$xray->http = $this->http; $rels = $xray->rels('https://aaronparecki.com/'); ``` This will return a similar response to the parser, but instead of a `data` key containing the parsed page, there will be `rels`, an associative array. Each key will contain an array of all the values that match that rel value. ``` -$rels = Array +Array ( [url] => https://aaronparecki.com/ [code] => 200 @@ -125,6 +124,41 @@ $rels = Array ``` +### Feed Discovery + +You can use XRay to discover the types of feeds available at a URL. + +```php +$xray = new p3k\XRay(); +$feeds = $xray->feeds('http://percolator.today'); +``` + +This will fetch the URL, check for a Microformats feed, as well as check for rel=alternates pointing to Atom, RSS or JSONFeed URLs. The response will look like the below. + +``` +Array +( + [url] => https://percolator.today/ + [code] => 200 + [feeds] => Array + ( + [0] => Array + ( + [url] => https://percolator.today/ + [type] => microformats + ) + + [1] => Array + ( + [url] => https://percolator.today/podcast.xml + [type] => rss + ) + + ) + +) +``` + ### Customizing the User Agent To set a unique user agent, (some websites will require a user agent be set), you can set the `http` property of the object to a `p3k\HTTP` object. @@ -336,7 +370,8 @@ If the page being parsed represents a feed, then the response will look like the "data": { "type": "feed", "items": [ - + {...}, + {...} ] } } @@ -346,7 +381,7 @@ Each object in the `items` array will contain a parsed version of the item, in t Atom, RSS and JSONFeed will all be normalized to XRay's vocabulary, and only recognized properties will be returned. -## Rels +## Rels API There is also an API method to parse and return all rel values on the page, including HTTP `Link` headers and HTML rel values. @@ -354,6 +389,16 @@ There is also an API method to parse and return all rel values on the page, incl GET /rels?url=https://aaronparecki.com/ ``` +See [above](#rels) for the response format. + +## Feed Discovery API + +``` +GET /feeds?url=https://aaronparecki.com/ +``` + +See [above](#feed-discovery) for the response format. + ## Token API diff --git a/composer.json b/composer.json index 8f9cf85..1b981ac 100644 --- a/composer.json +++ b/composer.json @@ -35,6 +35,7 @@ "controllers/Parse.php", "controllers/Token.php", "controllers/Rels.php", + "controllers/Feeds.php", "controllers/Certbot.php" ] } diff --git a/controllers/Feeds.php b/controllers/Feeds.php new file mode 100644 index 0000000..d34679b --- /dev/null +++ b/controllers/Feeds.php @@ -0,0 +1,58 @@ +http = new p3k\HTTP(); + } + + private function respond(Response $response, $code, $params, $headers=[]) { + $response->setStatusCode($code); + foreach($headers as $k=>$v) { + $response->headers->set($k, $v); + } + $response->headers->set('Content-Type', 'application/json'); + $opts = JSON_UNESCAPED_SLASHES; + if($this->_pretty) $opts += JSON_PRETTY_PRINT; + $response->setContent(json_encode($params, $opts)."\n"); + return $response; + } + + public function find(Request $request, Response $response) { + $opts = []; + + if($request->get('timeout')) { + // We might make 2 HTTP requests, so each request gets half the desired timeout + $opts['timeout'] = $request->get('timeout') / 2; + } + + if($request->get('max_redirects')) { + $opts['max_redirects'] = (int)$request->get('max_redirects'); + } + + if($request->get('pretty')) { + $this->_pretty = true; + } + + $url = $request->get('url'); + + if(!$url) { + return $this->respond($response, 400, [ + 'error' => 'missing_url', + 'error_description' => 'Provide a URL to fetch' + ]); + } + + $xray = new p3k\XRay(); + $xray->http = $this->http; + $res = $xray->feeds($url, $opts); + + return $this->respond($response, !empty($res['error']) ? 400 : 200, $res); + } + +} diff --git a/lib/XRay.php b/lib/XRay.php index 7fe7192..f656b3c 100644 --- a/lib/XRay.php +++ b/lib/XRay.php @@ -13,6 +13,11 @@ class XRay { return $rels->parse($url, $opts); } + public function feeds($url, $opts=[]) { + $feeds = new XRay\Feeds($this->http); + return $feeds->find($url, $opts); + } + public function parse($url, $opts_or_body=false, $opts_for_body=[]) { if(!$opts_or_body || is_array($opts_or_body)) { $fetch = new XRay\Fetcher($this->http); diff --git a/lib/XRay/Feeds.php b/lib/XRay/Feeds.php new file mode 100644 index 0000000..590d9a0 --- /dev/null +++ b/lib/XRay/Feeds.php @@ -0,0 +1,113 @@ +http = $http; + } + + public function find($url, $opts=[]) { + if(isset($opts['timeout'])) + $this->http->set_timeout($opts['timeout']); + if(isset($opts['max_redirects'])) + $this->http->set_max_redirects($opts['max_redirects']); + + $scheme = parse_url($url, PHP_URL_SCHEME); + if(!in_array($scheme, ['http','https'])) { + return [ + 'error' => 'invalid_url', + 'error_description' => 'Only http and https URLs are supported' + ]; + } + + $host = parse_url($url, PHP_URL_HOST); + if(!$host) { + return [ + 'error' => 'invalid_url', + 'error_description' => 'The URL provided was not valid' + ]; + } + + $url = normalize_url($url); + + $result = $this->http->get($url); + $body = $result['body']; + + $feeds = []; + + // First check the content type of the response + $contentType = isset($result['headers']['Content-Type']) ? $result['headers']['Content-Type'] : ''; + + if(is_array($contentType)) + $contentType = $contentType[count($contentType)-1]; + + if(strpos($contentType, 'application/atom+xml') !== false) { + $feeds[] = [ + 'url' => $result['url'], + 'type' => 'atom' + ]; + } elseif(strpos($contentType, 'application/rss+xml') !== false) { + $feeds[] = [ + 'url' => $result['url'], + 'type' => 'rss' + ]; + } elseif(strpos($contentType, 'application/json') !== false + && substr($body, 0, 1) == '{' && strpos(substr($body, 0, 100), 'https://jsonfeed.org/version/1')) { + $feeds[] = [ + 'url' => $result['url'], + 'type' => 'jsonfeed' + ]; + } else { + // Some other document was returned, parse the HTML and look for rel alternates and Microformats + + $mf2 = \mf2\Parse($body, $result['url']); + if(isset($mf2['alternates'])) { + foreach($mf2['alternates'] as $alt) { + if(strpos($alt['type'], 'application/json') !== false) { + $feeds[] = [ + 'url' => $alt['url'], + 'type' => 'jsonfeed' + ]; + } + if(strpos($alt['type'], 'application/atom+xml') !== false) { + $feeds[] = [ + 'url' => $alt['url'], + 'type' => 'atom' + ]; + } + if(strpos($alt['type'], 'application/rss+xml') !== false) { + $feeds[] = [ + 'url' => $alt['url'], + 'type' => 'rss' + ]; + } + } + } + + $parsed = Formats\HTML::parse($this->http, $body, $result['url'], array_merge($opts, ['expect'=>'feed'])); + if($parsed && isset($parsed['data']['type']) && $parsed['data']['type'] == 'feed') { + $feeds[] = [ + 'url' => $result['url'], + 'type' => 'microformats' + ]; + } + } + + // Sort feeds by priority + $rank = ['microformats'=>0,'jsonfeed'=>1,'atom'=>2,'rss'=>3]; + usort($feeds, function($a, $b) use($rank) { + return $rank[$a['type']] > $rank[$b['type']]; + }); + + return [ + 'url' => $result['url'], + 'code' => $result['code'], + 'feeds' => $feeds + ]; + } + +} diff --git a/public/index.php b/public/index.php index d6e4b5d..61da86c 100644 --- a/public/index.php +++ b/public/index.php @@ -24,6 +24,9 @@ $router->addRoute('GET', '/parse', 'Parse::parse'); $router->addRoute('POST', '/parse', 'Parse::parse'); $router->addRoute('POST', '/token', 'Token::token'); +$router->addRoute('GET', '/feeds', 'Feeds::find'); +$router->addRoute('POST', '/feeds', 'Feeds::find'); + $router->addRoute('GET', '/rels', 'Rels::fetch'); $router->addRoute('GET', '/cert', 'Certbot::index'); diff --git a/tests/FindFeedsTest.php b/tests/FindFeedsTest.php new file mode 100644 index 0000000..1c9ef49 --- /dev/null +++ b/tests/FindFeedsTest.php @@ -0,0 +1,156 @@ +client = new Feeds(); + $this->client->http = new p3k\HTTP\Test(dirname(__FILE__).'/data/'); + $this->client->mc = null; + } + + private function parse($params) { + $request = new Request($params); + $response = new Response(); + return $this->client->find($request, $response); + } + + // h-feed with no alternates + public function testMf2HFeed() { + $url = 'http://feed.example.com/h-feed-with-child-author'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(1, count($feeds)); + $this->assertEquals('http://feed.example.com/h-feed-with-child-author', $feeds[0]->url); + $this->assertEquals('microformats', $feeds[0]->type); + } + + // h-feed that links to Atom alternate + public function testMf2WithAtomAlternate() { + $url = 'http://feed.example.com/h-feed-with-atom-alternate'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(2, count($feeds)); + // Should rank JSONFeed above Atom + $this->assertEquals('http://feed.example.com/h-feed-with-atom-alternate', $feeds[0]->url); + $this->assertEquals('microformats', $feeds[0]->type); + $this->assertEquals('http://feed.example.com/atom', $feeds[1]->url); + $this->assertEquals('atom', $feeds[1]->type); + } + + // h-feed that links to RSS alternate + public function testMf2WithRSSAlternate() { + $url = 'http://feed.example.com/h-feed-with-rss-alternate'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(2, count($feeds)); + // Should rank JSONFeed above Atom + $this->assertEquals('http://feed.example.com/h-feed-with-rss-alternate', $feeds[0]->url); + $this->assertEquals('microformats', $feeds[0]->type); + $this->assertEquals('http://feed.example.com/podcast.xml', $feeds[1]->url); + $this->assertEquals('rss', $feeds[1]->type); + } + + // No mf2 but links to Atom alternate + public function testNoMf2() { + $url = 'http://feed.example.com/html-with-atom-alternate'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(1, count($feeds)); + $this->assertEquals('http://feed.example.com/atom', $feeds[0]->url); + $this->assertEquals('atom', $feeds[0]->type); + } + + public function testNoMf2WithJSONAndAtom() { + $url = 'http://feed.example.com/html-with-json-and-atom'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(2, count($feeds)); + // Should rank JSONFeed above Atom + $this->assertEquals('http://feed.example.com/jsonfeed', $feeds[0]->url); + $this->assertEquals('jsonfeed', $feeds[0]->type); + $this->assertEquals('http://feed.example.com/atom', $feeds[1]->url); + $this->assertEquals('atom', $feeds[1]->type); + } + + // input URL is an Atom feed + public function testInputIsAtom() { + $url = 'http://feed.example.com/atom'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(1, count($feeds)); + $this->assertEquals('http://feed.example.com/atom', $feeds[0]->url); + $this->assertEquals('atom', $feeds[0]->type); + } + + // input URL redirects to an Atom feed + public function testInputIsRedirectToAtom() { + $url = 'http://feed.example.com/redirect-to-atom'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(1, count($feeds)); + $this->assertEquals('http://feed.example.com/atom', $feeds[0]->url); + $this->assertEquals('atom', $feeds[0]->type); + } + + // input URL is an RSS feed + public function testInputIsRSS() { + $url = 'http://feed.example.com/rss'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(1, count($feeds)); + $this->assertEquals('http://feed.example.com/rss', $feeds[0]->url); + $this->assertEquals('rss', $feeds[0]->type); + } + + // input URL is a JSON feed + public function testInputIsJSONFeed() { + $url = 'http://feed.example.com/jsonfeed'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $feeds = json_decode($body)->feeds; + + $this->assertEquals(1, count($feeds)); + $this->assertEquals('http://feed.example.com/jsonfeed', $feeds[0]->url); + $this->assertEquals('jsonfeed', $feeds[0]->type); + } + + +} \ No newline at end of file diff --git a/tests/data/feed.example.com/h-feed-with-atom-alternate b/tests/data/feed.example.com/h-feed-with-atom-alternate new file mode 100644 index 0000000..222e84d --- /dev/null +++ b/tests/data/feed.example.com/h-feed-with-atom-alternate @@ -0,0 +1,36 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + + + + Author Name + + + + + diff --git a/tests/data/feed.example.com/h-feed-with-rss-alternate b/tests/data/feed.example.com/h-feed-with-rss-alternate new file mode 100644 index 0000000..e69f167 --- /dev/null +++ b/tests/data/feed.example.com/h-feed-with-rss-alternate @@ -0,0 +1,36 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + + + + Author Name + + + + + diff --git a/tests/data/feed.example.com/html-with-atom-alternate b/tests/data/feed.example.com/html-with-atom-alternate new file mode 100644 index 0000000..b45fa6e --- /dev/null +++ b/tests/data/feed.example.com/html-with-atom-alternate @@ -0,0 +1,36 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + + + +

Author Name

+ + + + + diff --git a/tests/data/feed.example.com/html-with-json-and-atom b/tests/data/feed.example.com/html-with-json-and-atom new file mode 100644 index 0000000..e2e2b83 --- /dev/null +++ b/tests/data/feed.example.com/html-with-json-and-atom @@ -0,0 +1,37 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + + + + +

Author Name

+ + + + + diff --git a/tests/data/feed.example.com/redirect-to-atom b/tests/data/feed.example.com/redirect-to-atom new file mode 100644 index 0000000..33c125c --- /dev/null +++ b/tests/data/feed.example.com/redirect-to-atom @@ -0,0 +1,15 @@ +HTTP/1.1 301 Moved Permanently +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive +Location: http://feed.example.com/atom + + + + Moved + + + This page has moved + +