p3k
/
Telegraph
mirror of https://github.com/aaronpk/Telegraph.git

<?php
namespace Telegraph;

// DOM classes used by FindLinks for HTML parsing.
use DOMXPath, DOMDocument;
class FindLinks {

  /**
   * Find every link in the given input.
   *
   * A string is treated as HTML and searched for <a href> values. An array is
   * walked recursively and every leaf value is searched for URLs.
   *
   * @param mixed $input HTML string, or a (possibly nested) array of values
   * @return array re-indexed list of unique links; empty for any other input type
   */
  public static function all($input) {
    if(is_string($input)) {
      return self::inHTML($input);
    }

    if(!is_array($input)) {
      return [];
    }

    $links = [];
    // This recursively iterates over the whole input array and searches for
    // everything that looks like a URL regardless of its depth or property name.
    // For items with a key of "html", it parses the value as HTML instead of text.
    // This supports handling the XRay parsed result format
    $leaves = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($input));
    foreach($leaves as $key => $value) {
      $found = ($key === 'html') ? self::inHTML($value) : self::inText($value);
      // Append in place rather than array_merge() per leaf, which re-copies
      // the accumulated list on every iteration.
      foreach($found as $link) {
        $links[] = $link;
      }
    }

    // array_unique() keeps the original keys; re-index so callers get a list.
    return array_values(array_unique($links));
  }

  /**
   * Find all http(s) URLs in a block of plain text.
   *
   * A URL runs from "http://" or "https://" up to the next whitespace
   * character (space, tab, newline, ...).
   *
   * @param mixed $input text block; non-string values yield an empty array
   * @return array re-indexed list of unique URLs found in the text
   */
  public static function inText($input) {
    if(!is_string($input)) return [];

    // \S+ (instead of "anything but a space") stops the match at newlines and
    // tabs too, so URLs on consecutive lines are not glued together.
    // preg_match_all() returns 0 for no match and false on error; both mean
    // there is nothing to report.
    if(!preg_match_all('/https?:\/\/\S+/', $input, $matches)) {
      return [];
    }

    return array_values(array_unique($matches[0]));
  }

  /**
   * Find the href of every <a> element in an HTML fragment or document.
   *
   * @param mixed $html HTML markup; non-string or empty values yield an empty array
   * @return array re-indexed list of unique href attribute values
   */
  public static function inHTML($html) {
    if(!is_string($html)) return [];

    // Convert before touching libxml state, and bail out on empty markup:
    // DOMDocument::loadHTML() throws a ValueError for an empty source on PHP 8.
    $markup = self::toHtmlEntities($html);
    if($markup === '') return [];

    $doc = new \DOMDocument();

    // Collect parse errors internally instead of emitting warnings, then put
    // the process-wide libxml setting back the way the caller had it.
    $previous = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($markup, LIBXML_NOWARNING|LIBXML_NOERROR);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if(!$loaded) return [];

    $xpath = new \DOMXPath($doc);

    $links = [];
    foreach($xpath->query('//a[@href]') as $anchor) {
      $links[] = $anchor->getAttribute('href');
    }

    return array_values(array_unique($links));
  }

  /**
   * Replace every non-ASCII character with a numeric HTML entity so the
   * markup can be handed to DOMDocument::loadHTML() as plain ASCII.
   *
   * Uses mb_encode_numericentity() rather than the "HTML-ENTITIES"
   * pseudo-encoding of mb_convert_encoding(), which is deprecated as of PHP 8.2.
   *
   * @param string $input markup in any encoding mb_detect_encoding() recognises
   * @return string markup with all code points >= 0x80 written as &#N; entities
   */
  private static function toHtmlEntities($input) {
    $encoding = mb_detect_encoding($input);
    if($encoding === false) {
      // Detection failed; fall back to UTF-8 instead of passing false along.
      $encoding = 'UTF-8';
    }

    // Map: [first code point, last code point, offset, mask].
    return mb_encode_numericentity($input, [0x80, 0x10FFFF, 0, 0x1FFFFF], $encoding);
  }
}