Anonymous - 2014-04-10

Firstly, I LOVE this package - wonderful coding, my hat is off to you.

For a number of reasons beyond the scope of this suggestion, we needed the ability to identify good, bad, or unknown links on our site. Below are the changes I made to have PHPCrawl examine the links and decide if they're blacklisted, whitelisted, or "greylisted" (not on the given black or white list).

What prompted us to do this was that Google had flagged our site as hacked. As it turned out, there was some malicious code that was only rendered when the useragent was GoogleBot, so we decided to crawl all of our sites to see if it appeared anywhere else.

Anyway, the code isn't nearly as good as the author's, and might be a little specific for our needs in a couple of places, but I think the core of it will work in most cases.

EXAMPLE.PHP
$crawler->setBlacklist("viagra.com,cialis.com,porn.com");   // Any domains that you DONT want on your site
$crawler->setWhitelist("facebook.com,oursite1.com,oursite2.com");   //Any sites that are definitely approved sites

class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        echo $DocInfo;
    }
}
//Yields (in addition to standard properties already there):
PHPCrawlerDocumentInfo Object
(
    [links_found] => Array
        (
            [0] => Array
                (
                    [url_rebuild] => http://www.yoursite.net/about-us/press/574-2014survey
                    [link_raw] => /about-us/press/574-2014survey
                    [linkcode] => <a href="/about-us/press/574-2014survey">We want to know what you like about us.  Please take our 5-minute survey. </a>
                    [linktext] => We want to know what you like about us.  Please take our 5-minute survey. 
                    [refering_url] => http://www.yoursite.net
                    [is_redirect_url] => 
                    [blacklisted] => 
                    [whitelisted] => 1
                    [greylisted] => 
                )
        )
    [links_found_url_descriptors] => Array
        (
            [0] => PHPCrawlerURLDescriptor Object
                (
                    [url_rebuild] => http://www.yoursite.net/about-us/press/574-2014survey
                    [link_raw] => /about-us/press/574-2014survey
                    [linkcode] => <a href="/about-us/press/574-2014survey">We want to know what you like about us.  Please take our 5-minute survey. </a>
                    [linktext] => We want to know what you like about us.  Please take our 5-minute survey. 
                    [refering_url] => http://www.yoursite.net
                    [is_redirect_url] => 
                )
        )
    [blacklist] => Array
        (
            [0] => viagra.com
            [1] => cialis.com
            [2] => porn.com
        )
    [whitelist] => Array
        (
            [0] => academicearth.org
            [1] => accessmylibrary.com
            [2] => activism.net
        )
)

//Code changes to PHPCrawl:

PHPCrawler.class.php
    protected function processUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
        // Setup HTTP-request
        $this->PageRequest->setUrl($UrlDescriptor);
        $this->PageRequest->setBlackWhiteLists($this->blacklist, $this->whitelist);

    public function getProcessReport()
        $Report->blacklist          = implode(', ',$this->blacklist);
        $Report->whitelist          = implode(', ',$this->whitelist);

    /**
     * Sets the list of blacklisted domains from a comma-separated string.
     *
     * @param string $strBlacklistCsv Comma-separated domains, e.g. "viagra.com,porn.com"
     * @return bool TRUE if the list was set, FALSE for empty or non-string input.
     */
    public function setBlacklist($strBlacklistCsv)
    {
        // Check the type BEFORE calling trim(); the original trimmed first,
        // which errors (or silently coerces) on non-string input.
        if (!is_string($strBlacklistCsv)) return false;

        $strBlacklistCsv = trim($strBlacklistCsv);
        if ($strBlacklistCsv == "") return false;

        // Trim each entry so lists written with spaces ("a.com, b.com")
        // still match during the in_array() lookups later on.
        $this->blacklist = array_map('trim', explode(',', $strBlacklistCsv));
        return true;
    }

    /**
     * Sets the list of whitelisted domains from a comma-separated string.
     * The registrable domain of the starting URL is always appended so the
     * crawled site itself is never flagged as black- or greylisted.
     *
     * @param string $strWhitelistCsv Comma-separated domains
     * @return bool Always TRUE
     */
    public function setWhitelist($strWhitelistCsv)
    {
        $this->whitelist = array();
        if (is_string($strWhitelistCsv))
        {
            $strWhitelistCsv = trim($strWhitelistCsv);
            if ($strWhitelistCsv != "")
            {
                // Trim each entry so "a.com, b.com" (with spaces) matches too.
                $this->whitelist = array_map('trim', explode(',', $strWhitelistCsv));
            }
        }

        // Extract the host of the starting URL. The original pattern only
        // recognized "http://", so an https starting URL produced the bogus
        // host "https:"; accept any URI scheme here. Guard both matches so a
        // failed match cannot cause undefined-index notices.
        if (preg_match('@^(?:[a-z][a-z0-9+.-]*://)?([^/]+)@i', $this->starting_url, $matches))
        {
            $host = $matches[1];
            // Reduce the host to its last two labels ("www.example.com" ->
            // "example.com"). NOTE(review): this is wrong for multi-part TLDs
            // like "example.co.uk" (yields "co.uk") — a public-suffix lookup
            // would be needed for full correctness.
            if (preg_match('/[^.]+\.[^.]+$/', $host, $matches))
                $this->whitelist[] = $matches[0];
        }

        $this->whitelist = array_unique($this->whitelist);
        sort($this->whitelist);
        return true;
    }

PHPCrawlerDocumentInfo.class.php
    /**
    * Examines all found links and classifies each as whitelisted,
    * blacklisted, or greylisted (on neither list).
    *
    * Populates $this->links_found with one array per URL-descriptor in
    * $this->links_found_url_descriptors, adding the boolean keys
    * "blacklisted", "whitelisted" and "greylisted" — exactly one of which
    * is TRUE for every entry.
    *
    * @internal
    */
    public function examineLinksFoundArray()
    {
        $cnt = count($this->links_found_url_descriptors);
        for ($x=0; $x<$cnt; $x++)
        {
            $UrlDescriptor = $this->links_found_url_descriptors[$x];
            // Convert $UrlDescriptor-object to an array
            $object_vars = get_object_vars($UrlDescriptor);

            // Initialize all three flags up front so every entry of
            // links_found carries the same keys. (The original set only
            // "whitelisted" in the file:-branch, leaving the other two
            // keys missing for local-file links.)
            $object_vars['blacklisted'] = false;
            $object_vars['whitelisted'] = false;
            $object_vars['greylisted'] = false;

            if(substr($UrlDescriptor->url_rebuild,0,5)=='file:'){
                // Local files are always treated as trusted.
                $object_vars['whitelisted'] = true;
            }else{
                // Extract the host; accept any URI scheme rather than the
                // original hard-coded http/https/mms/ftp list.
                $this_domain = null;
                if (preg_match('@^(?:[a-z][a-z0-9+.-]*://)?([^/]+)@i', $UrlDescriptor->url_rebuild, $matches))
                {
                    $host = $matches[1];
                    // Last two host labels only; wrong for "co.uk"-style
                    // TLDs — NOTE(review), same limitation as setWhitelist().
                    // Guarding the match also avoids reusing a stale $matches
                    // from the previous preg_match on failure.
                    if (preg_match('/[^.]+\.[^.]+$/', $host, $matches))
                        $this_domain = $matches[0];
                }

                // Strict comparison avoids PHP's loose string/number coercion.
                if($this_domain !== null && in_array($this_domain, $this->blacklist, true)) {
                    $object_vars['blacklisted'] = true;
                }elseif($this_domain !== null && in_array($this_domain, $this->whitelist, true)) {
                    $object_vars['whitelisted'] = true;
                }else{
                    $object_vars['greylisted'] = true;
                }
            }
            $this->links_found[$x] = $object_vars;
        }
    }

PHPCrawlerHTTPRequest.class.php
  /**
  * Stores the black- and whitelist domain arrays on this request object.
  *
  * @param array $blacklist Domains considered malicious
  * @param array $whitelist Domains considered trusted
  */
  public function setBlackWhiteLists($blacklist = array(), $whitelist = array())
  {
    $this->whitelist = $whitelist;
    $this->blacklist = $blacklist;
  }