You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

69 lines
2.0 KiB

5 years ago
  1. <?php
  2. namespace Telegraph;
  3. use DOMXPath, DOMDocument;
  4. class FindLinks {
  5. public static function all($input) {
  6. if(is_string($input)) {
  7. return self::inHTML($input);
  8. } elseif(is_array($input)) {
  9. $links = [];
  10. // This recursively iterates over the whole input array and searches for
  11. // everything that looks like a URL regardless of its depth or property name.
  12. // For items with a key of "html", it parses the value as HTML instead of text.
  13. // This supports handling the XRay parsed result format
  14. foreach(new \RecursiveIteratorIterator(new \RecursiveArrayIterator($input)) as $key => $value) {
  15. if($key === 'html') {
  16. $links = array_merge($links, self::inHTML($value));
  17. }
  18. else {
  19. $links = array_merge($links, self::inText($value));
  20. }
  21. }
  22. return array_unique($links);
  23. } else {
  24. return [];
  25. }
  26. }
  27. /**
  28. * find all links in text.
  29. * @param $input string text block
  30. * @return mixed array of links in text block.
  31. */
  32. public static function inText($input) {
  33. if(!is_string($input)) return [];
  34. preg_match_all('/https?:\/\/[^ ]+/', $input, $matches);
  35. return array_unique($matches[0]);
  36. }
  37. /**
  38. * find all links in text.
  39. * @param $input string text block
  40. * @return mixed array of links in text block.
  41. */
  42. public static function inHTML($html) {
  43. if(!is_string($html)) return [];
  44. $doc = new DOMDocument();
  45. libxml_use_internal_errors(true); # suppress parse errors and warnings
  46. @$doc->loadHTML(self::toHtmlEntities($html), LIBXML_NOWARNING|LIBXML_NOERROR);
  47. libxml_clear_errors();
  48. if(!$doc) return [];
  49. $xpath = new DOMXPath($doc);
  50. $links = [];
  51. foreach($xpath->query('//a[@href]') as $href) {
  52. $links[] = $href->getAttribute('href');
  53. }
  54. return array_unique($links);
  55. }
  56. private static function toHtmlEntities($input) {
  57. return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
  58. }
  59. }