You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

69 lines
2.0 KiB

namespace Telegraph;
use DOMXPath, DOMDocument;
/**
 * Collects link URLs from plain text, from HTML markup, or from nested arrays
 * containing either.
 */
class FindLinks {

  /**
   * Find every link in the given input.
   *
   * A string is parsed as HTML. An array is walked recursively (see the
   * comment in the body). Any other type yields an empty list.
   *
   * @param mixed $input HTML string, or a (nested) array of values to search
   * @return array de-duplicated, sequentially indexed list of URLs
   */
  public static function all($input) {
    if(is_string($input)) {
      return self::inHTML($input);
    }

    if(!is_array($input)) {
      return [];
    }

    $links = [];

    // This recursively iterates over the whole input array and searches for
    // everything that looks like a URL regardless of its depth or property name.
    // For items with a key of "html", it parses the value as HTML instead of text.
    // This supports handling the XRay parsed result format
    $leaves = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($input));
    foreach($leaves as $key => $value) {
      $found = ($key === 'html') ? self::inHTML($value) : self::inText($value);
      // Append in place rather than re-merging the whole accumulated list on
      // every leaf.
      foreach($found as $link) {
        $links[] = $link;
      }
    }

    return array_values(array_unique($links));
  }

  /**
   * Find all http(s) URLs in a block of plain text.
   *
   * @param mixed $input text block; any non-string value yields an empty list
   * @return array de-duplicated, sequentially indexed list of URLs
   */
  public static function inText($input) {
    if(!is_string($input)) return [];

    // A URL ends at the first whitespace character of any kind, so a URL that
    // is followed by a newline or tab does not swallow the text after it.
    if(!preg_match_all('/https?:\/\/\S+/', $input, $matches)) return [];

    return array_values(array_unique($matches[0]));
  }

  /**
   * Find the href of every <a> element in a block of HTML.
   *
   * The href values are returned exactly as written in the markup; relative
   * URLs are not resolved.
   *
   * @param mixed $html HTML markup; any non-string value yields an empty list
   * @return array de-duplicated, sequentially indexed list of href values
   */
  public static function inHTML($html) {
    if(!is_string($html)) return [];

    // DOMDocument::loadHTML() rejects an empty document (ValueError on PHP 8,
    // warning on PHP 7), so there is nothing to parse in that case.
    $markup = self::toHtmlEntities($html);
    if(!is_string($markup) || $markup === '') return [];

    // Collect parse errors internally instead of emitting warnings, then
    // discard them and put the global libxml setting back the way it was.
    $previous = libxml_use_internal_errors(true);
    $doc = new \DOMDocument();
    $loaded = $doc->loadHTML($markup, LIBXML_NOWARNING|LIBXML_NOERROR);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if($loaded === false) return [];

    $xpath = new \DOMXPath($doc);
    $links = [];
    foreach($xpath->query('//a[@href]') as $anchor) {
      $links[] = $anchor->getAttribute('href');
    }

    return array_values(array_unique($links));
  }

  /**
   * Rewrite every non-ASCII character as a numeric HTML entity so that the
   * markup handed to DOMDocument::loadHTML() is pure ASCII.
   *
   * This replaces the 'HTML-ENTITIES' pseudo-encoding of mb_convert_encoding(),
   * which is deprecated as of PHP 8.2.
   *
   * @param string $input markup in whatever encoding mb_detect_encoding() reports
   * @return string ASCII-only markup
   */
  private static function toHtmlEntities($input) {
    $encoding = mb_detect_encoding($input);
    if($encoding === false) {
      // Detection failed; treat the input as UTF-8 rather than passing `false`
      // on as an encoding name.
      $encoding = 'UTF-8';
    }

    $utf8 = mb_convert_encoding($input, 'UTF-8', $encoding);

    // Conversion map: code points 0x80..0x10FFFF, no offset, full mask.
    return mb_encode_numericentity($utf8, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
  }
}