<?php
|
|
namespace Telegraph;
|
|
|
|
use DOMXPath, DOMDocument;
|
|
|
|
class FindLinks {

    /**
     * Find every link in the given input.
     *
     * A string is parsed as HTML. An array is walked recursively, regardless of
     * depth or property name: each leaf value is scanned as plain text for
     * URLs, except leaves whose key is "html", which are parsed as HTML
     * instead. This supports handling the XRay parsed result format.
     * Any other input type yields an empty list.
     *
     * @param mixed $input HTML string, or a (possibly nested) array of values
     * @return string[] de-duplicated links, re-indexed from 0
     */
    public static function all($input) {
        if (is_string($input)) {
            return self::inHTML($input);
        }

        if (!is_array($input)) {
            return [];
        }

        // Collect the per-leaf results and merge once at the end rather than
        // re-copying the accumulated list on every iteration.
        $found = [];
        $leaves = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($input));
        foreach ($leaves as $key => $value) {
            $found[] = $key === 'html' ? self::inHTML($value) : self::inText($value);
        }

        if (!$found) {
            return [];
        }

        // array_unique() preserves keys; re-index so callers always get a list.
        return array_values(array_unique(array_merge(...$found)));
    }

    /**
     * Find all http(s) URLs in a block of plain text.
     *
     * A URL starts at "http://" or "https://" and runs up to the next
     * whitespace character (space, tab, newline, ...).
     *
     * @param mixed $input text block; non-string values yield an empty list
     * @return string[] de-duplicated URLs, re-indexed from 0
     */
    public static function inText($input) {
        if (!is_string($input)) {
            return [];
        }

        // \S+ (rather than [^ ]+) so that URLs separated by newlines or tabs
        // are not glued together into a single match.
        if (!preg_match_all('/https?:\/\/\S+/', $input, $matches)) {
            return [];
        }

        return array_values(array_unique($matches[0]));
    }

    /**
     * Find the href of every <a> element in a block of HTML.
     *
     * @param mixed $html HTML markup; non-string or empty values yield an empty list
     * @return string[] de-duplicated href values, re-indexed from 0
     */
    public static function inHTML($html) {
        // DOMDocument::loadHTML() throws a ValueError on an empty string in PHP 8.
        if (!is_string($html) || $html === '') {
            return [];
        }

        $doc = new \DOMDocument();

        // Collect parse errors internally instead of emitting warnings, and
        // restore the caller's libxml setting afterwards (it is process-global).
        $previous = libxml_use_internal_errors(true);
        try {
            $loaded = $doc->loadHTML(self::toHtmlEntities($html), LIBXML_NOWARNING | LIBXML_NOERROR);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if (!$loaded) {
            return [];
        }

        $xpath = new \DOMXPath($doc);

        $links = [];
        foreach ($xpath->query('//a[@href]') as $anchor) {
            $links[] = $anchor->getAttribute('href');
        }

        return array_values(array_unique($links));
    }

    /**
     * Replace every non-ASCII character with a numeric HTML entity so that
     * DOMDocument::loadHTML() decodes the markup correctly whatever charset it
     * would otherwise assume.
     *
     * Uses mb_encode_numericentity() because the 'HTML-ENTITIES' pseudo-encoding
     * of mb_convert_encoding() is deprecated as of PHP 8.2.
     *
     * @param string $input markup in any encoding mbstring can detect
     * @return string the same markup with characters >= U+0080 entity-encoded
     */
    private static function toHtmlEntities($input) {
        // mb_detect_encoding() returns false when nothing matches; assume UTF-8 then.
        $encoding = mb_detect_encoding($input) ?: 'UTF-8';

        // Convmap: [first code point, last code point, offset, mask].
        return mb_encode_numericentity($input, [0x80, 0x10FFFF, 0, 0x1FFFFF], $encoding);
    }

}
|