Browse Source

add feed discovery API

pull/49/head v1.4.0
Aaron Parecki 6 years ago
parent
commit
206e27ea25
No known key found for this signature in database GPG Key ID: 276C2817346D6056
12 changed files with 545 additions and 4 deletions
  1. +49
    -4
      README.md
  2. +1
    -0
      composer.json
  3. +58
    -0
      controllers/Feeds.php
  4. +5
    -0
      lib/XRay.php
  5. +113
    -0
      lib/XRay/Feeds.php
  6. +3
    -0
      public/index.php
  7. +156
    -0
      tests/FindFeedsTest.php
  8. +36
    -0
      tests/data/feed.example.com/h-feed-with-atom-alternate
  9. +36
    -0
      tests/data/feed.example.com/h-feed-with-rss-alternate
  10. +36
    -0
      tests/data/feed.example.com/html-with-atom-alternate
  11. +37
    -0
      tests/data/feed.example.com/html-with-json-and-atom
  12. +15
    -0
      tests/data/feed.example.com/redirect-to-atom

+ 49
- 4
README.md View File

@ -99,14 +99,13 @@ You can also use XRay to fetch all the rel values on a page, merging the list of
```php
$xray = new p3k\XRay();
$xray->http = $this->http;
$rels = $xray->rels('https://aaronparecki.com/');
```
This will return a similar response to the parser, but instead of a `data` key containing the parsed page, there will be `rels`, an associative array. Each key will contain an array of all the values that match that rel value.
```
$rels = Array
Array
(
[url] => https://aaronparecki.com/
[code] => 200
@ -125,6 +124,41 @@ $rels = Array
```
### Feed Discovery
You can use XRay to discover the types of feeds available at a URL.
```php
$xray = new p3k\XRay();
$feeds = $xray->feeds('http://percolator.today');
```
This will fetch the URL, check for a Microformats feed, and check for `rel=alternate` links pointing to Atom, RSS or JSONFeed URLs. The response will look like the example below.
```
Array
(
[url] => https://percolator.today/
[code] => 200
[feeds] => Array
(
[0] => Array
(
[url] => https://percolator.today/
[type] => microformats
)
[1] => Array
(
[url] => https://percolator.today/podcast.xml
[type] => rss
)
)
)
```
### Customizing the User Agent
To set a unique user agent (some websites require one to be set), you can set the `http` property of the object to a `p3k\HTTP` object.
@ -336,7 +370,8 @@ If the page being parsed represents a feed, then the response will look like the
"data": {
"type": "feed",
"items": [
{...},
{...}
]
}
}
@ -346,7 +381,7 @@ Each object in the `items` array will contain a parsed version of the item, in t
Atom, RSS and JSONFeed will all be normalized to XRay's vocabulary, and only recognized properties will be returned.
## Rels
## Rels API
There is also an API method to parse and return all rel values on the page, including HTTP `Link` headers and HTML rel values.
@ -354,6 +389,16 @@ There is also an API method to parse and return all rel values on the page, incl
GET /rels?url=https://aaronparecki.com/
```
See [above](#rels) for the response format.
## Feed Discovery API
```
GET /feeds?url=https://aaronparecki.com/
```
See [above](#feed-discovery) for the response format.
## Token API

+ 1
- 0
composer.json View File

@ -35,6 +35,7 @@
"controllers/Parse.php",
"controllers/Token.php",
"controllers/Rels.php",
"controllers/Feeds.php",
"controllers/Certbot.php"
]
}

+ 58
- 0
controllers/Feeds.php View File

@ -0,0 +1,58 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * HTTP controller for the feed discovery endpoint.
 *
 * Reads the request parameters, delegates the actual discovery to
 * p3k\XRay::feeds() and serializes the result as a JSON response.
 */
class Feeds {

  /** HTTP client handed to XRay; public so callers (e.g. tests) can swap it out. */
  public $http;

  /** Whether the JSON response should be pretty-printed. */
  private $_pretty = false;

  public function __construct() {
    $this->http = new p3k\HTTP();
  }

  /**
   * Fill in the response with a status code, optional headers and a JSON body.
   *
   * @param Response $response response object to populate
   * @param int      $code     HTTP status code
   * @param array    $params   data to encode as the JSON body
   * @param array    $headers  extra headers, name => value
   * @return Response the same response object
   */
  private function respond(Response $response, $code, $params, $headers=[]) {
    $response->setStatusCode($code);
    foreach($headers as $k=>$v) {
      $response->headers->set($k, $v);
    }
    // Set after the custom headers so the content type is always JSON
    $response->headers->set('Content-Type', 'application/json');

    // JSON options are bit flags, so combine them with a bitwise OR
    $opts = JSON_UNESCAPED_SLASHES;
    if($this->_pretty) $opts |= JSON_PRETTY_PRINT;

    $response->setContent(json_encode($params, $opts)."\n");
    return $response;
  }

  /**
   * Discover the feeds available at the URL given in the `url` parameter.
   *
   * Optional parameters: `timeout`, `max_redirects`, `pretty`.
   * Responds with 400 when the URL is missing or discovery reports an error,
   * and with 200 otherwise.
   *
   * @param Request  $request
   * @param Response $response
   * @return Response
   */
  public function find(Request $request, Response $response) {
    $opts = [];

    // Ignore timeouts that are not positive numbers (e.g. "abc" or "timeout[]=1")
    // instead of letting the division below fail on them.
    $timeout = $request->get('timeout');
    if(is_numeric($timeout) && $timeout > 0) {
      // We might make 2 HTTP requests, so each request gets half the desired timeout
      $opts['timeout'] = (float)$timeout / 2;
    }

    if($request->get('max_redirects')) {
      $opts['max_redirects'] = (int)$request->get('max_redirects');
    }

    if($request->get('pretty')) {
      $this->_pretty = true;
    }

    $url = $request->get('url');

    // A non-string value (e.g. "url[]=x") is treated the same as a missing URL
    if(!$url || !is_string($url)) {
      return $this->respond($response, 400, [
        'error' => 'missing_url',
        'error_description' => 'Provide a URL to fetch'
      ]);
    }

    $xray = new p3k\XRay();
    $xray->http = $this->http;
    $res = $xray->feeds($url, $opts);

    return $this->respond($response, !empty($res['error']) ? 400 : 200, $res);
  }

}

+ 5
- 0
lib/XRay.php View File

@ -13,6 +13,11 @@ class XRay {
return $rels->parse($url, $opts);
}
/**
 * Discover the feeds available at a URL.
 *
 * Builds an XRay\Feeds finder around this object's HTTP client and
 * returns whatever its find() method reports.
 *
 * @param string $url  URL to inspect
 * @param array  $opts options passed through to the finder
 * @return array result of XRay\Feeds::find()
 */
public function feeds($url, $opts=[]) {
  return (new XRay\Feeds($this->http))->find($url, $opts);
}
public function parse($url, $opts_or_body=false, $opts_for_body=[]) {
if(!$opts_or_body || is_array($opts_or_body)) {
$fetch = new XRay\Fetcher($this->http);

+ 113
- 0
lib/XRay/Feeds.php View File

@ -0,0 +1,113 @@
<?php
namespace p3k\XRay;
use p3k\XRay\Formats;
/**
 * Discovers the feeds available at a URL.
 *
 * The URL is fetched once. If the response itself is an Atom, RSS or JSON
 * feed it is reported directly; otherwise the document is parsed as HTML and
 * both its rel=alternate links and its Microformats are inspected.
 */
class Feeds {

  /** HTTP client used to fetch the URL. */
  private $http;

  /**
   * @param object $http HTTP client; this class calls get(), set_timeout()
   *                     and set_max_redirects() on it
   */
  public function __construct($http) {
    $this->http = $http;
  }

  /**
   * Fetch $url and return the list of feeds found there.
   *
   * @param string $url  http or https URL to inspect
   * @param array  $opts optional 'timeout' and 'max_redirects' for the HTTP client
   * @return array on success: 'url', 'code' and 'feeds', where 'feeds' is a list
   *               of ['url' => ..., 'type' => microformats|jsonfeed|atom|rss]
   *               ordered by preference; on failure: 'error' and
   *               'error_description'
   */
  public function find($url, $opts=[]) {
    if(isset($opts['timeout']))
      $this->http->set_timeout($opts['timeout']);
    if(isset($opts['max_redirects']))
      $this->http->set_max_redirects($opts['max_redirects']);

    $scheme = parse_url($url, PHP_URL_SCHEME);
    if(!in_array($scheme, ['http','https'], true)) {
      return [
        'error' => 'invalid_url',
        'error_description' => 'Only http and https URLs are supported'
      ];
    }

    $host = parse_url($url, PHP_URL_HOST);
    if(!$host) {
      return [
        'error' => 'invalid_url',
        'error_description' => 'The URL provided was not valid'
      ];
    }

    $url = normalize_url($url);

    $result = $this->http->get($url);

    // NOTE(review): assumes the HTTP client reports request failures through a
    // non-empty 'error' key - confirm against p3k\HTTP. Returning the error lets
    // callers tell "request failed" apart from "no feeds found".
    if(!empty($result['error'])) {
      return [
        'error' => $result['error'],
        'error_description' => isset($result['error_description']) ? $result['error_description'] : '',
        'url' => isset($result['url']) ? $result['url'] : $url,
        'code' => isset($result['code']) ? $result['code'] : 0
      ];
    }

    $body = isset($result['body']) ? (string)$result['body'] : '';

    $feeds = [];

    // First check the content type of the response
    $contentType = isset($result['headers']['Content-Type']) ? $result['headers']['Content-Type'] : '';

    // If the header was sent more than once, the last value wins
    if(is_array($contentType))
      $contentType = $contentType[count($contentType)-1];

    $isJson = strpos($contentType, 'application/json') !== false
      || strpos($contentType, 'application/feed+json') !== false;

    if(strpos($contentType, 'application/atom+xml') !== false) {
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'atom'
      ];
    } elseif(strpos($contentType, 'application/rss+xml') !== false) {
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'rss'
      ];
    } elseif($isJson
      && substr($body, 0, 1) == '{'
      && strpos(substr($body, 0, 100), 'https://jsonfeed.org/version/1') !== false) {
      // Any JSON document could be served with these content types, so also
      // require the JSONFeed version URL near the start of the body
      $feeds[] = [
        'url' => $result['url'],
        'type' => 'jsonfeed'
      ];
    } else {
      // Some other document was returned, parse the HTML and look for rel alternates and Microformats

      $mf2 = \mf2\Parse($body, $result['url']);

      // MIME types that identify a feed in a rel=alternate link
      $alternateTypes = [
        'application/json' => 'jsonfeed',
        'application/feed+json' => 'jsonfeed',
        'application/atom+xml' => 'atom',
        'application/rss+xml' => 'rss'
      ];

      if(isset($mf2['alternates'])) {
        foreach($mf2['alternates'] as $alt) {
          // A rel=alternate link is not required to carry a type attribute
          if(!isset($alt['type']) || !isset($alt['url']))
            continue;

          foreach($alternateTypes as $mime => $type) {
            if(strpos($alt['type'], $mime) !== false) {
              $feeds[] = [
                'url' => $alt['url'],
                'type' => $type
              ];
            }
          }
        }
      }

      $parsed = Formats\HTML::parse($this->http, $body, $result['url'], array_merge($opts, ['expect'=>'feed']));
      if($parsed && isset($parsed['data']['type']) && $parsed['data']['type'] == 'feed') {
        $feeds[] = [
          'url' => $result['url'],
          'type' => 'microformats'
        ];
      }
    }

    // Sort feeds by priority
    $rank = ['microformats'=>0,'jsonfeed'=>1,'atom'=>2,'rss'=>3];
    usort($feeds, function($a, $b) use($rank) {
      // usort expects a negative, zero or positive integer, not a boolean
      return $rank[$a['type']] - $rank[$b['type']];
    });

    return [
      'url' => $result['url'],
      'code' => $result['code'],
      'feeds' => $feeds
    ];
  }

}

+ 3
- 0
public/index.php View File

@ -24,6 +24,9 @@ $router->addRoute('GET', '/parse', 'Parse::parse');
// Parser endpoint (POST variant), handled by Parse::parse
$router->addRoute('POST', '/parse', 'Parse::parse');
// Token endpoint, handled by Token::token
$router->addRoute('POST', '/token', 'Token::token');
// Feed discovery endpoint, reachable with either GET or POST; both go to Feeds::find
$router->addRoute('GET', '/feeds', 'Feeds::find');
$router->addRoute('POST', '/feeds', 'Feeds::find');
// Rel values endpoint, handled by Rels::fetch
$router->addRoute('GET', '/rels', 'Rels::fetch');
// Handled by Certbot::index
$router->addRoute('GET', '/cert', 'Certbot::index');

+ 156
- 0
tests/FindFeedsTest.php View File

@ -0,0 +1,156 @@
<?php
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;
/**
 * Tests for the feed discovery controller (Feeds::find).
 *
 * Each test requests a fixture URL under feed.example.com through the
 * p3k\HTTP\Test client and checks which feeds are reported, and in what order.
 */
class FindFeedsTest extends PHPUnit_Framework_TestCase {

  /** Controller under test. */
  private $client;

  public function setUp() {
    $this->client = new Feeds();
    // Serve responses from the fixture files instead of the network
    $this->client->http = new p3k\HTTP\Test(dirname(__FILE__).'/data/');
  }

  /**
   * Run the controller with the given request parameters.
   */
  private function parse($params) {
    $request = new Request($params);
    $response = new Response();
    return $this->client->find($request, $response);
  }

  /**
   * Request discovery for $url, assert a 200 response and return the
   * decoded `feeds` list from the JSON body.
   */
  private function findFeeds($url) {
    $response = $this->parse(['url' => $url]);
    $this->assertEquals(200, $response->getStatusCode());
    return json_decode($response->getContent())->feeds;
  }

  // h-feed with no alternates
  public function testMf2HFeed() {
    $feeds = $this->findFeeds('http://feed.example.com/h-feed-with-child-author');

    $this->assertEquals(1, count($feeds));
    $this->assertEquals('http://feed.example.com/h-feed-with-child-author', $feeds[0]->url);
    $this->assertEquals('microformats', $feeds[0]->type);
  }

  // h-feed that links to Atom alternate
  public function testMf2WithAtomAlternate() {
    $feeds = $this->findFeeds('http://feed.example.com/h-feed-with-atom-alternate');

    $this->assertEquals(2, count($feeds));
    // Should rank Microformats above Atom
    $this->assertEquals('http://feed.example.com/h-feed-with-atom-alternate', $feeds[0]->url);
    $this->assertEquals('microformats', $feeds[0]->type);
    $this->assertEquals('http://feed.example.com/atom', $feeds[1]->url);
    $this->assertEquals('atom', $feeds[1]->type);
  }

  // h-feed that links to RSS alternate
  public function testMf2WithRSSAlternate() {
    $feeds = $this->findFeeds('http://feed.example.com/h-feed-with-rss-alternate');

    $this->assertEquals(2, count($feeds));
    // Should rank Microformats above RSS
    $this->assertEquals('http://feed.example.com/h-feed-with-rss-alternate', $feeds[0]->url);
    $this->assertEquals('microformats', $feeds[0]->type);
    $this->assertEquals('http://feed.example.com/podcast.xml', $feeds[1]->url);
    $this->assertEquals('rss', $feeds[1]->type);
  }

  // No mf2 but links to Atom alternate
  public function testNoMf2() {
    $feeds = $this->findFeeds('http://feed.example.com/html-with-atom-alternate');

    $this->assertEquals(1, count($feeds));
    $this->assertEquals('http://feed.example.com/atom', $feeds[0]->url);
    $this->assertEquals('atom', $feeds[0]->type);
  }

  // No mf2 but links to both JSONFeed and Atom alternates
  public function testNoMf2WithJSONAndAtom() {
    $feeds = $this->findFeeds('http://feed.example.com/html-with-json-and-atom');

    $this->assertEquals(2, count($feeds));
    // Should rank JSONFeed above Atom
    $this->assertEquals('http://feed.example.com/jsonfeed', $feeds[0]->url);
    $this->assertEquals('jsonfeed', $feeds[0]->type);
    $this->assertEquals('http://feed.example.com/atom', $feeds[1]->url);
    $this->assertEquals('atom', $feeds[1]->type);
  }

  // input URL is an Atom feed
  public function testInputIsAtom() {
    $feeds = $this->findFeeds('http://feed.example.com/atom');

    $this->assertEquals(1, count($feeds));
    $this->assertEquals('http://feed.example.com/atom', $feeds[0]->url);
    $this->assertEquals('atom', $feeds[0]->type);
  }

  // input URL redirects to an Atom feed
  public function testInputIsRedirectToAtom() {
    $feeds = $this->findFeeds('http://feed.example.com/redirect-to-atom');

    $this->assertEquals(1, count($feeds));
    // The reported URL is the redirect target, not the URL that was requested
    $this->assertEquals('http://feed.example.com/atom', $feeds[0]->url);
    $this->assertEquals('atom', $feeds[0]->type);
  }

  // input URL is an RSS feed
  public function testInputIsRSS() {
    $feeds = $this->findFeeds('http://feed.example.com/rss');

    $this->assertEquals(1, count($feeds));
    $this->assertEquals('http://feed.example.com/rss', $feeds[0]->url);
    $this->assertEquals('rss', $feeds[0]->type);
  }

  // input URL is a JSON feed
  public function testInputIsJSONFeed() {
    $feeds = $this->findFeeds('http://feed.example.com/jsonfeed');

    $this->assertEquals(1, count($feeds));
    $this->assertEquals('http://feed.example.com/jsonfeed', $feeds[0]->url);
    $this->assertEquals('jsonfeed', $feeds[0]->type);
  }

}

+ 36
- 0
tests/data/feed.example.com/h-feed-with-atom-alternate View File

@ -0,0 +1,36 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
<link rel="alternate" type="application/atom+xml" href="/atom">
</head>
<body>
<a href="/author" class="h-card">Author Name</a>
<ul>
<li class="h-entry">
<a href="/1" class="u-url p-name">One</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/2" class="u-url p-name">Two</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/3" class="u-url p-name">Three</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/4" class="u-url p-name">Four</a>
<a href="/author" class="u-author"></a>
</li>
</ul>
</body>
</html>

+ 36
- 0
tests/data/feed.example.com/h-feed-with-rss-alternate View File

@ -0,0 +1,36 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
<link rel="alternate" type="application/rss+xml" href="/podcast.xml">
</head>
<body>
<a href="/author" class="h-card">Author Name</a>
<ul>
<li class="h-entry">
<a href="/1" class="u-url p-name">One</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/2" class="u-url p-name">Two</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/3" class="u-url p-name">Three</a>
<a href="/author" class="u-author"></a>
</li>
<li class="h-entry">
<a href="/4" class="u-url p-name">Four</a>
<a href="/author" class="u-author"></a>
</li>
</ul>
</body>
</html>

+ 36
- 0
tests/data/feed.example.com/html-with-atom-alternate View File

@ -0,0 +1,36 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
<link rel="alternate" type="application/atom+xml" href="/atom">
</head>
<body>
<h1><a href="/author">Author Name</a></h1>
<ul>
<li>
<a href="/1">One</a>
<a href="/author"></a>
</li>
<li>
<a href="/2">Two</a>
<a href="/author"></a>
</li>
<li>
<a href="/3">Three</a>
<a href="/author"></a>
</li>
<li>
<a href="/4">Four</a>
<a href="/author"></a>
</li>
</ul>
</body>
</html>

+ 37
- 0
tests/data/feed.example.com/html-with-json-and-atom View File

@ -0,0 +1,37 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
<html>
<head>
<title>Test</title>
<link rel="alternate" type="application/atom+xml" href="/atom">
<link rel="alternate" type="application/json" href="/jsonfeed">
</head>
<body>
<h1><a href="/author">Author Name</a></h1>
<ul>
<li>
<a href="/1">One</a>
<a href="/author"></a>
</li>
<li>
<a href="/2">Two</a>
<a href="/author"></a>
</li>
<li>
<a href="/3">Three</a>
<a href="/author"></a>
</li>
<li>
<a href="/4">Four</a>
<a href="/author"></a>
</li>
</ul>
</body>
</html>

+ 15
- 0
tests/data/feed.example.com/redirect-to-atom View File

@ -0,0 +1,15 @@
HTTP/1.1 301 Moved Permanently
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive
Location: http://feed.example.com/atom
<html>
<head>
<title>Moved</title>
</head>
<body>
This page has moved
</body>
</html>

Loading…
Cancel
Save