You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

66 lines
1.9 KiB

5 years ago
5 years ago
  1. <?php
  2. namespace Telegraph;
  3. use DOMXPath, DOMDocument;
  4. class FindLinks {
  5. public static function all($input) {
  6. if(is_string($input)) {
  7. return self::inHTML($input);
  8. } elseif(is_array($input)) {
  9. $links = [];
  10. // This recursively iterates over the whole input array and searches for
  11. // everything that looks like a URL regardless of its depth or property name.
  12. // For items with a key of "html", it parses the value as HTML instead of text.
  13. // This supports handling the XRay parsed result format
  14. foreach(new \RecursiveIteratorIterator(new \RecursiveArrayIterator($input)) as $key => $value) {
  15. if($key === 'html') {
  16. $links = array_merge($links, self::inHTML($value));
  17. }
  18. else {
  19. $links = array_merge($links, self::inText($value));
  20. }
  21. }
  22. return array_unique($links);
  23. } else {
  24. return [];
  25. }
  26. }
  27. /**
  28. * find all links in text.
  29. * @param $input string text block
  30. * @return mixed array of links in text block.
  31. */
  32. public static function inText(\string $input) {
  33. preg_match_all('/https?:\/\/[^ ]+/', $input, $matches);
  34. return array_unique($matches[0]);
  35. }
  36. /**
  37. * find all links in text.
  38. * @param $input string text block
  39. * @return mixed array of links in text block.
  40. */
  41. public static function inHTML(\string $html) {
  42. $doc = new DOMDocument();
  43. libxml_use_internal_errors(true); # suppress parse errors and warnings
  44. @$doc->loadHTML(self::toHtmlEntities($html), LIBXML_NOWARNING|LIBXML_NOERROR);
  45. libxml_clear_errors();
  46. if(!$doc) return [];
  47. $xpath = new DOMXPath($doc);
  48. $links = [];
  49. foreach($xpath->query('//a[@href]') as $href) {
  50. $links[] = $href->getAttribute('href');
  51. }
  52. return array_unique($links);
  53. }
  54. private static function toHtmlEntities($input) {
  55. return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
  56. }
  57. }