From 5f5392a7b8a5901a9a8b82675529739174df0486 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Sat, 15 Apr 2017 09:36:56 -0700 Subject: [PATCH] deduplicate categories, and strip leading hashtags --- lib/Formats/Mf2.php | 16 +++++++++++--- tests/ParseTest.php | 22 +++++++++++++++++++ .../h-entry-duplicate-categories | 18 +++++++++++++++ .../h-entry-strip-hashtag-from-categories | 19 ++++++++++++++++ 4 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 tests/data/source.example.com/h-entry-duplicate-categories create mode 100644 tests/data/source.example.com/h-entry-strip-hashtag-from-categories diff --git a/lib/Formats/Mf2.php b/lib/Formats/Mf2.php index 09ea6ef..c40710c 100644 --- a/lib/Formats/Mf2.php +++ b/lib/Formats/Mf2.php @@ -195,18 +195,21 @@ class Mf2 { foreach($item['properties'][$p] as $v) { if(is_string($v)) { if(!array_key_exists($p, $data)) $data[$p] = []; - $data[$p][] = $v; + if(!in_array($v, $data[$p])) + $data[$p][] = $v; } elseif(self::isMicroformat($v)) { if(($u=self::getPlaintext($v, 'url')) && self::isURL($u)) { if(!array_key_exists($p, $data)) $data[$p] = []; - $data[$p][] = $u; + if(!in_array($u, $data[$p])) + $data[$p][] = $u; $ref = self::parse(['items'=>[$v]], $u, $http); if($ref) { $refs[$u] = $ref['data']; } } else { if(!array_key_exists($p, $data)) $data[$p] = []; - $data[$p][] = $v['value']; + if(!in_array($v['value'], $data[$p])) + $data[$p][] = $v['value']; } } } @@ -285,6 +288,13 @@ class Mf2 { self::collectSingleValues(['published','summary','rsvp','swarm-coins'], ['url'], $item, $data); // These properties are always returned as arrays and may contain plaintext content + // First strip leading hashtags from category values if present + if(array_key_exists('category', $item['properties'])) { + foreach($item['properties']['category'] as $i=>$c) { + if(is_string($c)) + $item['properties']['category'][$i] = ltrim($c, '#'); + } + } self::collectArrayValues(['category','invitee'], $item, $data, $refs, $http); // These properties are always returned as arrays and always URLs diff --git a/tests/ParseTest.php b/tests/ParseTest.php index 5c328ea..ab2e773 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -153,6 +153,28 @@ class ParseTest extends PHPUnit_Framework_TestCase { $this->assertObjectNotHasAttribute('content', $data->data); } + public function testEntryWithDuplicateCategories() { + $url = 'http://source.example.com/h-entry-duplicate-categories'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertEquals(['indieweb'], $data->data->category); + } + + public function testEntryStripHashtagWithDuplicateCategories() { + $url = 'http://source.example.com/h-entry-strip-hashtag-from-categories'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertContains('indieweb', $data->data->category); + $this->assertContains('xray', $data->data->category); + $this->assertEquals(2, count($data->data->category)); + } + public function testNoHEntryMarkup() { $url = 'http://source.example.com/no-h-entry'; $response = $this->parse(['url' => $url]); diff --git a/tests/data/source.example.com/h-entry-duplicate-categories b/tests/data/source.example.com/h-entry-duplicate-categories new file mode 100644 index 0000000..b845719 --- /dev/null +++ b/tests/data/source.example.com/h-entry-duplicate-categories @@ -0,0 +1,18 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This page is an h-entry and has some duplicate categories #indieweb.

+ + + + diff --git a/tests/data/source.example.com/h-entry-strip-hashtag-from-categories b/tests/data/source.example.com/h-entry-strip-hashtag-from-categories new file mode 100644 index 0000000..a560348 --- /dev/null +++ b/tests/data/source.example.com/h-entry-strip-hashtag-from-categories @@ -0,0 +1,19 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This page is an h-entry and has some duplicate categories #indieweb.

+ + + +