p3k
/
Telegraph
mirror of https://github.com/aaronpk/Telegraph.git


								<?php

								namespace Telegraph;


								use DOMXPath, DOMDocument;


								/**
								 * Extracts URLs from HTML markup, plain text, or nested arrays containing either.
								 *
								 * Every public method returns a de-duplicated list indexed sequentially from 0,
								 * in order of first appearance.
								 */
								class FindLinks {

								  /**
								   * find all links in the input, whatever its shape.
								   *
								   * A string is parsed as HTML. An array is walked recursively: every leaf value
								   * is searched for anything that looks like a URL regardless of its depth or
								   * property name, except that leaves stored under the key "html" are parsed as
								   * HTML instead of text. This supports handling the XRay parsed result format.
								   * Any other input type yields an empty array.
								   *
								   * @param mixed $input HTML string or (nested) array of values
								   * @return array list of unique links
								   */
								  public static function all($input) {
								    if(is_string($input)) {
								      return self::inHTML($input);
								    }

								    if(!is_array($input)) {
								      return [];
								    }

								    $links = [];
								    // RecursiveIteratorIterator only yields leaves, so $key is the leaf's own key.
								    foreach(new \RecursiveIteratorIterator(new \RecursiveArrayIterator($input)) as $key => $value) {
								      $found = ($key === 'html') ? self::inHTML($value) : self::inText($value);
								      // Append instead of array_merge() so the accumulated list is not re-copied per leaf.
								      foreach($found as $link) {
								        $links[] = $link;
								      }
								    }

								    // array_unique() keeps the original keys; re-index so callers get a plain list.
								    return array_values(array_unique($links));
								  }

								  /**
								   * find all links in text.
								   *
								   * A link is "http://" or "https://" followed by every character up to the
								   * next whitespace (space, tab, newline, ...). Non-string input yields [].
								   *
								   * @param mixed $input text block
								   * @return array list of unique links in text block.
								   */
								  public static function inText($input) {
								    if(!is_string($input)) return [];

								    // \S rather than [^ ] so that a URL also ends at a newline or tab.
								    // preg_match_all() returns 0 for no match and false on error; both mean "no links".
								    if(!preg_match_all('/https?:\/\/\S+/', $input, $matches)) return [];

								    return array_values(array_unique($matches[0]));
								  }

								  /**
								   * find all links in an HTML fragment or document.
								   *
								   * Collects the href attribute of every <a> element that has one. Values are
								   * returned exactly as written in the markup (relative URLs are not resolved).
								   * Non-string or empty input, and markup that cannot be loaded, yield [].
								   *
								   * @param mixed $html HTML markup
								   * @return array list of unique href values.
								   */
								  public static function inHTML($html) {
								    // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8).
								    if(!is_string($html) || $html === '') return [];

								    $markup = self::toHtmlEntities($html);

								    $doc = new DOMDocument();
								    # collect parse errors internally instead of emitting warnings, then put
								    # the process-wide libxml setting back the way the caller had it
								    $previous = libxml_use_internal_errors(true);
								    $loaded = $doc->loadHTML($markup, LIBXML_NOWARNING|LIBXML_NOERROR);
								    libxml_clear_errors();
								    libxml_use_internal_errors($previous);

								    if(!$loaded) return [];

								    $xpath = new DOMXPath($doc);

								    $links = [];
								    foreach($xpath->query('//a[@href]') as $anchor) {
								      $links[] = $anchor->getAttribute('href');
								    }

								    return array_values(array_unique($links));
								  }

								  /**
								   * Replace every non-ASCII character with a numeric HTML entity.
								   *
								   * The result is plain ASCII, so the HTML parser decodes it the same way
								   * whatever charset it would otherwise have assumed for the input.
								   *
								   * @param string $input markup in any encoding mb_detect_encoding() recognises
								   * @return string the same markup with characters >= U+0080 as &#NNN; entities
								   */
								  private static function toHtmlEntities($input) {
								    // mb_detect_encoding() returns false when nothing matches; assume UTF-8 then.
								    $encoding = mb_detect_encoding($input) ?: 'UTF-8';

								    // Map: encode code points 0x80..0x10FFFF, no offset, full 21-bit mask.
								    // Used instead of the 'HTML-ENTITIES' pseudo-encoding, deprecated in PHP 8.2.
								    return mb_encode_numericentity($input, [0x80, 0x10FFFF, 0, 0x1FFFFF], $encoding);
								  }

								}