Browse Source

add feed discovery API

pull/49/head v1.4.0
Aaron Parecki 6 years ago
parent
commit
206e27ea25
No known key found for this signature in database GPG Key ID: 276C2817346D6056
12 changed files with 545 additions and 4 deletions
  1. +49
    -4
      README.md
  2. +1
    -0
      composer.json
  3. +58
    -0
      controllers/Feeds.php
  4. +5
    -0
      lib/XRay.php
  5. +113
    -0
      lib/XRay/Feeds.php
  6. +3
    -0
      public/index.php
  7. +156
    -0
      tests/FindFeedsTest.php
  8. +36
    -0
      tests/data/feed.example.com/h-feed-with-atom-alternate
  9. +36
    -0
      tests/data/feed.example.com/h-feed-with-rss-alternate
  10. +36
    -0
      tests/data/feed.example.com/html-with-atom-alternate
  11. +37
    -0
      tests/data/feed.example.com/html-with-json-and-atom
  12. +15
    -0
      tests/data/feed.example.com/redirect-to-atom

+ 49
- 4
README.md View File

@ -99,14 +99,13 @@ You can also use XRay to fetch all the rel values on a page, merging the list of
```php ```php
$xray = new p3k\XRay(); $xray = new p3k\XRay();
$xray->http = $this->http;
$rels = $xray->rels('https://aaronparecki.com/'); $rels = $xray->rels('https://aaronparecki.com/');
``` ```
This will return a similar response to the parser, but instead of a `data` key containing the parsed page, there will be `rels`, an associative array. Each key will contain an array of all the values that match that rel value. This will return a similar response to the parser, but instead of a `data` key containing the parsed page, there will be `rels`, an associative array. Each key will contain an array of all the values that match that rel value.
``` ```
$rels = Array
Array
( (
[url] => https://aaronparecki.com/ [url] => https://aaronparecki.com/
[code] => 200 [code] => 200
@ -125,6 +124,41 @@ $rels = Array
``` ```
### Feed Discovery
You can use XRay to discover the types of feeds available at a URL.
```php
$xray = new p3k\XRay();
$feeds = $xray->feeds('http://percolator.today');
```
This will fetch the URL and check it for a Microformats feed, as well as for `rel=alternate` links pointing to Atom, RSS or JSONFeed URLs. The response will look like the example below.
```
Array
(
[url] => https://percolator.today/
[code] => 200
[feeds] => Array
(
[0] => Array
(
[url] => https://percolator.today/
[type] => microformats
)
[1] => Array
(
[url] => https://percolator.today/podcast.xml
[type] => rss
)
)
)
```
### Customizing the User Agent ### Customizing the User Agent
To set a unique user agent, (some websites will require a user agent be set), you can set the `http` property of the object to a `p3k\HTTP` object. To set a unique user agent, (some websites will require a user agent be set), you can set the `http` property of the object to a `p3k\HTTP` object.
@ -336,7 +370,8 @@ If the page being parsed represents a feed, then the response will look like the
"data": { "data": {
"type": "feed", "type": "feed",
"items": [ "items": [
{...},
{...}
] ]
} }
} }
@ -346,7 +381,7 @@ Each object in the `items` array will contain a parsed version of the item, in t
Atom, RSS and JSONFeed will all be normalized to XRay's vocabulary, and only recognized properties will be returned. Atom, RSS and JSONFeed will all be normalized to XRay's vocabulary, and only recognized properties will be returned.
## Rels
## Rels API
There is also an API method to parse and return all rel values on the page, including HTTP `Link` headers and HTML rel values. There is also an API method to parse and return all rel values on the page, including HTTP `Link` headers and HTML rel values.
@ -354,6 +389,16 @@ There is also an API method to parse and return all rel values on the page, incl
GET /rels?url=https://aaronparecki.com/ GET /rels?url=https://aaronparecki.com/
``` ```
See [above](#rels) for the response format.
## Feed Discovery API
```
GET /feeds?url=https://aaronparecki.com/
```
See [above](#feed-discovery) for the response format.
## Token API ## Token API

+ 1
- 0
composer.json View File

@ -35,6 +35,7 @@
"controllers/Parse.php", "controllers/Parse.php",
"controllers/Token.php", "controllers/Token.php",
"controllers/Rels.php", "controllers/Rels.php",
"controllers/Feeds.php",
"controllers/Certbot.php" "controllers/Certbot.php"
] ]
} }

+ 58
- 0
controllers/Feeds.php View File

@ -0,0 +1,58 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * HTTP controller for the feed discovery endpoint.
 *
 * Reads the request parameters, delegates the discovery itself to
 * p3k\XRay::feeds() and writes the result back as a JSON response.
 */
class Feeds {

  // HTTP client handed to XRay. Public so callers can replace it after construction.
  public $http;

  // When true, respond() pretty-prints the JSON body.
  private $_pretty = false;

  public function __construct() {
    $this->http = new p3k\HTTP();
  }

  /**
   * Populate the response with a status code, headers and a JSON body.
   *
   * @param Response $response response object to fill in
   * @param int      $code     HTTP status code
   * @param array    $params   data to JSON-encode as the body
   * @param array    $headers  extra headers as name => value
   * @return Response the same response object
   */
  private function respond(Response $response, $code, $params, $headers=[]) {
    $response->setStatusCode($code);
    foreach($headers as $name=>$value) {
      $response->headers->set($name, $value);
    }
    // Set last so the body is always labelled as JSON, whatever $headers contained
    $response->headers->set('Content-Type', 'application/json');

    // json_encode options are a bitmask, so combine them with OR rather than addition
    $flags = JSON_UNESCAPED_SLASHES;
    if($this->_pretty) $flags |= JSON_PRETTY_PRINT;

    $response->setContent(json_encode($params, $flags)."\n");
    return $response;
  }

  /**
   * Handle a feed discovery request.
   *
   * Recognized request parameters: `url` (required), `timeout`,
   * `max_redirects` and `pretty`.
   *
   * @param Request  $request
   * @param Response $response
   * @return Response 400 with an error body when the URL is missing or the
   *                  lookup reports an error, otherwise 200 with the lookup result
   */
  public function find(Request $request, Response $response) {
    $opts = [];

    $timeout = $request->get('timeout');
    // The value comes straight from the client: dividing a non-numeric string
    // or an array raises a TypeError on PHP 8, so anything non-numeric is ignored.
    if($timeout && is_numeric($timeout)) {
      // We might make 2 HTTP requests, so each request gets half the desired timeout
      $opts['timeout'] = $timeout / 2;
    }

    if($request->get('max_redirects')) {
      $opts['max_redirects'] = (int)$request->get('max_redirects');
    }

    if($request->get('pretty')) {
      $this->_pretty = true;
    }

    $url = $request->get('url');

    // A non-string value (e.g. url[]=...) cannot be fetched, so treat it like a missing URL
    if(!$url || !is_string($url)) {
      return $this->respond($response, 400, [
        'error' => 'missing_url',
        'error_description' => 'Provide a URL to fetch'
      ]);
    }

    $xray = new p3k\XRay();
    $xray->http = $this->http;
    $res = $xray->feeds($url, $opts);

    return $this->respond($response, !empty($res['error']) ? 400 : 200, $res);
  }

}

+ 5
- 0
lib/XRay.php View File

@ -13,6 +13,11 @@ class XRay {
return $rels->parse($url, $opts); return $rels->parse($url, $opts);
} }
/**
 * Discover the feeds available at a URL.
 *
 * Thin wrapper: builds an XRay\Feeds finder around this instance's HTTP
 * client and returns whatever its find() method returns.
 *
 * @param string $url  URL to inspect
 * @param array  $opts options forwarded unchanged to XRay\Feeds::find()
 */
public function feeds($url, $opts=[]) {
  return (new XRay\Feeds($this->http))->find($url, $opts);
}
public function parse($url, $opts_or_body=false, $opts_for_body=[]) { public function parse($url, $opts_or_body=false, $opts_for_body=[]) {
if(!$opts_or_body || is_array($opts_or_body)) { if(!$opts_or_body || is_array($opts_or_body)) {
$fetch = new XRay\Fetcher($this->http); $fetch = new XRay\Fetcher($this->http);

+ 113
- 0
lib/XRay/Feeds.php View File

@ -0,0 +1,113 @@
<?php
namespace p3k\XRay;
use p3k\XRay\Formats;
/**
 * Finds the feeds available at a URL.
 *
 * The URL is fetched once. If the response is itself an Atom, RSS or JSONFeed
 * document (judged by its Content-Type header), that single feed is reported.
 * Otherwise the body is parsed as HTML, and both its rel=alternate links and
 * a Microformats feed found on the page are reported.
 */
class Feeds {

  // HTTP client used for the fetch; find() calls get(), set_timeout() and set_max_redirects() on it
  private $http;

  public function __construct($http) {
    $this->http = $http;
  }

  /**
   * Fetch a URL and list the feeds it offers.
   *
   * @param string $url  http or https URL to inspect
   * @param array  $opts optional 'timeout' and 'max_redirects' are applied to
   *                     the HTTP client; the whole array is also passed on to
   *                     the HTML parser
   * @return array For an unusable URL: 'error' and 'error_description'.
   *               Otherwise: 'url' and 'code' as reported by the HTTP client,
   *               and 'feeds', a list of ['url' => ..., 'type' => ...] entries
   *               ordered microformats, jsonfeed, atom, rss.
   */
  public function find($url, $opts=[]) {
    if(isset($opts['timeout']))
      $this->http->set_timeout($opts['timeout']);
    if(isset($opts['max_redirects']))
      $this->http->set_max_redirects($opts['max_redirects']);

    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return [
        'error' => 'invalid_url',
        'error_description' => 'Only http and https URLs are supported'
      ];
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return [
        'error' => 'invalid_url',
        'error_description' => 'The URL provided was not valid'
      ];
    }

    $url = normalize_url($url);

    $result = $this->http->get($url);
    $body = $result['body'];

    $feeds = [];

    // First check the content type of the response
    $contentType = isset($result['headers']['Content-Type']) ? $result['headers']['Content-Type'] : '';

    // A header that was sent more than once arrives as an array; use the last value
    if(is_array($contentType))
      $contentType = $contentType[count($contentType)-1];

    if(strpos($contentType, 'application/atom+xml') !== false) {
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'atom'
      ];
    } elseif(strpos($contentType, 'application/rss+xml') !== false) {
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'rss'
      ];
    } elseif(strpos($contentType, 'application/json') !== false
      && substr($body, 0, 1) === '{'
      // Plain JSON is only a feed if it declares the JSONFeed version near the top
      && strpos(substr($body, 0, 100), 'https://jsonfeed.org/version/1') !== false) {
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'jsonfeed'
      ];
    } else {
      // Some other document was returned, parse the HTML and look for rel alternates and Microformats

      $mf2 = \mf2\Parse($body, $result['url']);
      if(isset($mf2['alternates'])) {
        foreach($mf2['alternates'] as $alt) {
          // A rel=alternate link need not carry a type attribute; skip entries
          // that cannot be classified instead of reading an undefined index
          if(!isset($alt['type']) || !isset($alt['url']))
            continue;

          if(strpos($alt['type'], 'application/json') !== false) {
            $feeds[] = [
              'url' => $alt['url'],
              'type' => 'jsonfeed'
            ];
          }
          if(strpos($alt['type'], 'application/atom+xml') !== false) {
            $feeds[] = [
              'url' => $alt['url'],
              'type' => 'atom'
            ];
          }
          if(strpos($alt['type'], 'application/rss+xml') !== false) {
            $feeds[] = [
              'url' => $alt['url'],
              'type' => 'rss'
            ];
          }
        }
      }

      $parsed = Formats\HTML::parse($this->http, $body, $result['url'], array_merge($opts, ['expect'=>'feed']));
      if($parsed && isset($parsed['data']['type']) && $parsed['data']['type'] == 'feed') {
        $feeds[] = [
          'url' => $result['url'],
          'type' => 'microformats'
        ];
      }
    }

    // Sort feeds by priority
    $rank = ['microformats'=>0,'jsonfeed'=>1,'atom'=>2,'rss'=>3];
    usort($feeds, function($a, $b) use($rank) {
      // usort needs a negative/zero/positive integer; a boolean can never say "less than"
      return $rank[$a['type']] - $rank[$b['type']];
    });

    return [
      'url' => $result['url'],
      'code' => $result['code'],
      'feeds' => $feeds
    ];
  }

}

+ 3
- 0
public/index.php View File

@ -24,6 +24,9 @@ $router->addRoute('GET', '/parse', 'Parse::parse');
$router->addRoute('POST', '/parse', 'Parse::parse'); $router->addRoute('POST', '/parse', 'Parse::parse');
$router->addRoute('POST', '/token', 'Token::token'); $router->addRoute('POST', '/token', 'Token::token');
// Feed discovery endpoint: GET and POST requests to /feeds are both handled by Feeds::find
$router->addRoute('GET', '/feeds', 'Feeds::find');
$router->addRoute('POST', '/feeds', 'Feeds::find');
$router->addRoute('GET', '/rels', 'Rels::fetch'); $router->addRoute('GET', '/rels', 'Rels::fetch');
$router->addRoute('GET', '/cert', 'Certbot::index'); $router->addRoute('GET', '/cert', 'Certbot::index');

+ 156
- 0
tests/FindFeedsTest.php View File

@ -0,0 +1,156 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * Tests the Feeds controller against the canned HTTP responses stored under
 * tests/data/.
 */
class FindFeedsTest extends PHPUnit_Framework_TestCase {

  // Controller under test, rebuilt for every test by setUp()
  private $client;

  public function setUp() {
    $this->client = new Feeds();
    // Serve responses from the fixture directory instead of the network
    $this->client->http = new p3k\HTTP\Test(dirname(__FILE__).'/data/');
  }

  // Run the controller with the given request parameters and return its response
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->find($request, $response);
  }

  // Look up the feeds for a URL, assert the request succeeded, and return the decoded feed list
  private function findFeeds($url) {
    $response = $this->parse(['url' => $url]);
    $body = $response->getContent();
    $this->assertEquals(200, $response->getStatusCode());
    return json_decode($body)->feeds;
  }

  // Assert that one entry of the feed list has the expected URL and type
  private function assertFeed($expectedUrl, $expectedType, $feed) {
    $this->assertEquals($expectedUrl, $feed->url);
    $this->assertEquals($expectedType, $feed->type);
  }

  // h-feed with no alternates
  public function testMf2HFeed() {
    $feeds = $this->findFeeds('http://feed.example.com/h-feed-with-child-author');

    $this->assertCount(1, $feeds);
    $this->assertFeed('http://feed.example.com/h-feed-with-child-author', 'microformats', $feeds[0]);
  }

  // h-feed that links to Atom alternate
  public function testMf2WithAtomAlternate() {
    $feeds = $this->findFeeds('http://feed.example.com/h-feed-with-atom-alternate');

    $this->assertCount(2, $feeds);
    // Should rank Microformats above Atom
    $this->assertFeed('http://feed.example.com/h-feed-with-atom-alternate', 'microformats', $feeds[0]);
    $this->assertFeed('http://feed.example.com/atom', 'atom', $feeds[1]);
  }

  // h-feed that links to RSS alternate
  public function testMf2WithRSSAlternate() {
    $feeds = $this->findFeeds('http://feed.example.com/h-feed-with-rss-alternate');

    $this->assertCount(2, $feeds);
    // Should rank Microformats above RSS
    $this->assertFeed('http://feed.example.com/h-feed-with-rss-alternate', 'microformats', $feeds[0]);
    $this->assertFeed('http://feed.example.com/podcast.xml', 'rss', $feeds[1]);
  }

  // No mf2 but links to Atom alternate
  public function testNoMf2() {
    $feeds = $this->findFeeds('http://feed.example.com/html-with-atom-alternate');

    $this->assertCount(1, $feeds);
    $this->assertFeed('http://feed.example.com/atom', 'atom', $feeds[0]);
  }

  // No mf2 but links to both JSONFeed and Atom alternates
  public function testNoMf2WithJSONAndAtom() {
    $feeds = $this->findFeeds('http://feed.example.com/html-with-json-and-atom');

    $this->assertCount(2, $feeds);
    // Should rank JSONFeed above Atom
    $this->assertFeed('http://feed.example.com/jsonfeed', 'jsonfeed', $feeds[0]);
    $this->assertFeed('http://feed.example.com/atom', 'atom', $feeds[1]);
  }

  // input URL is an Atom feed
  public function testInputIsAtom() {
    $feeds = $this->findFeeds('http://feed.example.com/atom');

    $this->assertCount(1, $feeds);
    $this->assertFeed('http://feed.example.com/atom', 'atom', $feeds[0]);
  }

  // input URL redirects to an Atom feed
  public function testInputIsRedirectToAtom() {
    $feeds = $this->findFeeds('http://feed.example.com/redirect-to-atom');

    $this->assertCount(1, $feeds);
    // The reported URL is the redirect target, not the URL that was requested
    $this->assertFeed('http://feed.example.com/atom', 'atom', $feeds[0]);
  }

  // input URL is an RSS feed
  public function testInputIsRSS() {
    $feeds = $this->findFeeds('http://feed.example.com/rss');

    $this->assertCount(1, $feeds);
    $this->assertFeed('http://feed.example.com/rss', 'rss', $feeds[0]);
  }

  // input URL is a JSON feed
  public function testInputIsJSONFeed() {
    $feeds = $this->findFeeds('http://feed.example.com/jsonfeed');

    $this->assertCount(1, $feeds);
    $this->assertFeed('http://feed.example.com/jsonfeed', 'jsonfeed', $feeds[0]);
  }

}

+ 36
- 0
tests/data/feed.example.com/h-feed-with-atom-alternate View File

@ -0,0 +1,36 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
<link rel="alternate" type="application/atom+xml" href="/atom">
</head>
<body>
<a href="/author" class="h-card">Author Name</a>
<ul>
<li class="h-entry">
<a href="/1" class="u-url p-name">One</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/2" class="u-url p-name">Two</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/3" class="u-url p-name">Three</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/4" class="u-url p-name">Four</a>
<a href="/author" class="u-author"></a>
</li>
</ul>
</body>
</html>

+ 36
- 0
tests/data/feed.example.com/h-feed-with-rss-alternate View File

@ -0,0 +1,36 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
<link rel="alternate" type="application/rss+xml" href="/podcast.xml">
</head>
<body>
<a href="/author" class="h-card">Author Name</a>
<ul>
<li class="h-entry">
<a href="/1" class="u-url p-name">One</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/2" class="u-url p-name">Two</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/3" class="u-url p-name">Three</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/4" class="u-url p-name">Four</a>
<a href="/author" class="u-author"></a>
</li>
</ul>
</body>
</html>

+ 36
- 0
tests/data/feed.example.com/html-with-atom-alternate View File

@ -0,0 +1,36 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
<link rel="alternate" type="application/atom+xml" href="/atom">
</head>
<body>
<h1><a href="/author">Author Name</a></h1>
<ul>
<li>
<a href="/1">One</a>
<a href="/author"></a>
</li>
<li>
<a href="/2">Two</a>
<a href="/author"></a>
</li>
<li>
<a href="/3">Three</a>
<a href="/author"></a>
</li>
<li>
<a href="/4">Four</a>
<a href="/author"></a>
</li>
</ul>
</body>
</html>

+ 37
- 0
tests/data/feed.example.com/html-with-json-and-atom View File

@ -0,0 +1,37 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
<link rel="alternate" type="application/atom+xml" href="/atom">
<link rel="alternate" type="application/json" href="/jsonfeed">
</head>
<body>
<h1><a href="/author">Author Name</a></h1>
<ul>
<li>
<a href="/1">One</a>
<a href="/author"></a>
</li>
<li>
<a href="/2">Two</a>
<a href="/author"></a>
</li>
<li>
<a href="/3">Three</a>
<a href="/author"></a>
</li>
<li>
<a href="/4">Four</a>
<a href="/author"></a>
</li>
</ul>
</body>
</html>

+ 15
- 0
tests/data/feed.example.com/redirect-to-atom View File

@ -0,0 +1,15 @@
HTTP/1.1 301 Moved Permanently
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
Location: http://feed.example.com/atom
<html>
<head>
<title>Moved</title>
</head>
<body>
This page has moved
</body>
</html>

Loading…
Cancel
Save