You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

69 lines
2.0 KiB

<?php
namespace Telegraph;
use DOMXPath, DOMDocument;
/**
 * Extracts http(s) links from plain text, HTML markup, or nested arrays
 * that contain either.
 *
 * Every public method returns a sequentially indexed array of unique links
 * in order of first appearance, and an empty array for unsupported input.
 */
class FindLinks {

    /**
     * Find all links in the given input.
     *
     * A string is parsed as HTML. An array is walked recursively, regardless
     * of depth or property name: leaf values stored under the key "html" are
     * parsed as HTML, every other leaf is scanned as plain text. This supports
     * handling the XRay parsed result format.
     *
     * @param mixed $input HTML string or (nested) array of values
     * @return array unique links; empty when $input is neither string nor array
     */
    public static function all($input) {
        if (is_string($input)) {
            return self::inHTML($input);
        }
        if (!is_array($input)) {
            return [];
        }

        $links = [];
        $leaves = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($input));
        foreach ($leaves as $key => $value) {
            $found = ($key === 'html') ? self::inHTML($value) : self::inText($value);
            // Append one by one instead of array_merge() in the loop, which
            // would re-copy the whole accumulated list for every leaf.
            foreach ($found as $link) {
                $links[] = $link;
            }
        }

        // array_unique() keeps the original keys; reindex so callers get a list.
        return array_values(array_unique($links));
    }

    /**
     * Find all links in a block of plain text.
     *
     * A link starts at "http://" or "https://" and runs up to the next
     * whitespace character (space, tab, newline, ...) or the end of the text.
     *
     * @param mixed $input text block; non-string values yield an empty array
     * @return array unique links found in the text block
     */
    public static function inText($input) {
        if (!is_string($input)) {
            return [];
        }

        // The whitespace set is spelled out (rather than \s) so that the match
        // never depends on locale tables when scanning multi-byte text.
        // preg_match_all() returns 0 on no match and false on a PCRE error;
        // both mean "no links".
        if (!preg_match_all('/https?:\/\/[^ \t\r\n\f\x0B]+/', $input, $matches)) {
            return [];
        }

        return array_values(array_unique($matches[0]));
    }

    /**
     * Find all links in an HTML document or fragment.
     *
     * Collects the href attribute of every <a> element that has one.
     *
     * @param mixed $html HTML markup; non-string values yield an empty array
     * @return array unique href values found in the markup
     */
    public static function inHTML($html) {
        // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8),
        // and blank markup cannot contain links anyway.
        if (!is_string($html) || trim($html) === '') {
            return [];
        }

        $markup = self::toHtmlEntities($html);
        if ($markup === '') {
            return [];
        }

        // Collect parse errors internally instead of emitting warnings, then
        // restore the previous setting so global libxml state is not leaked.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $loaded = $doc->loadHTML($markup, LIBXML_NOWARNING | LIBXML_NOERROR);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($loaded === false) {
            return [];
        }

        $xpath = new DOMXPath($doc);
        $links = [];
        foreach ($xpath->query('//a[@href]') as $anchor) {
            $links[] = $anchor->getAttribute('href');
        }

        return array_values(array_unique($links));
    }

    /**
     * Encode every non-ASCII character as a numeric HTML entity so that
     * DOMDocument::loadHTML() does not have to guess the input encoding.
     * Markup characters (<, >, &, quotes) are left untouched.
     *
     * @param string $input markup in any encoding mb_detect_encoding() can identify
     * @return string ASCII-only markup
     */
    private static function toHtmlEntities($input) {
        $encoding = mb_detect_encoding($input);
        if ($encoding === false) {
            // Detection failed; assume UTF-8 rather than passing false along.
            $encoding = 'UTF-8';
        }

        $utf8 = mb_convert_encoding($input, 'UTF-8', $encoding);

        // Replacement for the 'HTML-ENTITIES' conversion target, which is
        // deprecated as of PHP 8.2: map U+0080..U+10FFFF to &#NNNN; entities.
        return mb_encode_numericentity($utf8, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
    }
}