PHPCrawl / Forum / Help: Resume Crawling on big sites

Hello,

First of all this is the best PHP crawler I've found on the internet, thank you for supporting this project.

The question is about stability when crawling large sites, for example 1 million pages site.
Is it possible to resume crawl session if crawler was suddenly stopped?

I guess the key is in variables "urls_to_crawl", "url_map" of object:

=> Array
(
             => Array
                (
                     => Array
                        (
                             => http://www.google.com/webhp?hl=en&tab=ww
                             => Web
                             => Web
                             => http://www.google.com/search?q=test
                             => http://www.google.com/webhp?hl=en&tab=ww
                             => 0
                        )
)
)

=> Array
(
=> 1
=> 1
=> 1
)

Any suggestion will be appreciated,
Thanks.

Thanks for your reply and support this in upcoming version. Also i hope serving urls_to_crawl data in file will be better for large sites (with thouthsands of urls), due to free memory.

Below is my quick implementation. The main idea was running this script by cron every 10 minutes to check if crawling is alive. If process was hanged - restoring session.

<?php
 /*
  * Extend PHPCrawler class and override the handlePageData() method
  */
 class MyCrawler extends PHPCrawler
 {
    private $_enable_session = false;   // Use or not session restoring (true/false);
    private $_session_name = '';        // Session Name;
    private $_session_path = './protected/tmp'; // Session path;
    private $_session_atime = '';       // Edit/access time for session (timestamp);
    private $_session_status = '';      // Session status ('complete'/'');
    private $_n = 0;                    // Temp iterator;
    /**
     * Function <b>enableSession</b> Setter for enabling/disabling session support, default is disabled
     *
     * @param (bool) $enable Enable/disable session
     * @return (null)
     */
    public function enableSession($enable=false)
    {
        if (is_bool($enable))
            $this->_enable_session = $enable;
    }
    /**
     * Function <b>isAliveSession</b> checks if session exists and alive
     *
     * @return (bool) true if session exists, false if not
     */
    public function isAliveSession()
    {
        if ($this->_enable_session != true) return false;
        $file = $this->_session_path.'/'.$this->_session_name;
        if (file_exists($file) && is_writable($file))
            return true;
        return false;
    }
    /**
     * Function <b>isBuisySession</b> checks if session is buisy and was accessed by another process within n $minutes
     *
     * @param (int) $minutes Minutes
     * @return (bool) true if session is buisy, false if not
     */
    function isBuisySession($minutes=1)
    {
        if (empty($this->_session_name)) exit('isBuisuSession() cant be called before restoreSession() method<br>');
        $minutes = intval($minutes);
        $minutes = $minutes * 60; // 60 seconds * time;
        if ((time() - $this->_session_atime) >= $minutes) //last access to session
        {
            return false;
        }
        else
        {
            return true;
        }
    }
    /**
     * Function <b>saveSession</b> Saves current session, if crawler hangs up - we can resumeSession with saved data
     *
     * @return (bool) true/false
     */
    public function saveSession()
    {
        if ($this->_enable_session != true) return false;
        $this->_session_name = md5($this->url_to_crawl);
        $file = $this->_session_path.'/'.$this->_session_name;
        if (!$this->isAliveSession())
        {
            if (!is_dir($this->_session_path))
                mkdir($this->_session_path);
        }
        $create_time = time();
        $data = serialize(
                      array('urls_to_crawl'=>$this->urls_to_crawl,
                            'url_map'=>$this->url_map,
                            '_session_atime'=>$create_time,
                            '_session_status'=>$this->_session_status,
                          )
                    );
        if (!$fp = fopen($file, 'w')) return false;
        if (!fwrite($fp, $data)) return false;
        fclose($fp);
        $this->_session_atime = $create_time;
        return true;
    }
    /**
     * Function <b>restoreSession</b> Restores saved session, crawler continues work where he stopped last time
     *
     * @return (bool) true if restore was successfull, false on error
     */
    public function restoreSession()
    {
        if ($this->_enable_session != true) return false;
        $this->_session_name = md5($this->url_to_crawl);
        $file = $this->_session_path.'/'.$this->_session_name;
        if (file_exists($file))
        {
            if (!$data = file_get_contents($file)) return false;
            if (strlen($data) < 10) return false;
        }
        else return false;
        $data = unserialize($data);
        $this->_session_status = $data['_session_status'];
        $this->_session_atime = $data['_session_atime'];
        if ($this->_session_status == 'complete') die('Task completed<br> at '.date(DATE_RFC822, $this->_session_atime));
        $this->urls_to_crawl = $data['urls_to_crawl'];
        $this->url_map = $data['url_map'];
        echo ('Restoring session..restoreSession()<br>');
        return true;
    }
    /**
     * Function <b>endSession</b> marks session as completed
     *
     * @return (bool) true/false
     */
    public function endSession()
    {
        if ($this->_enable_session != true) return false;
        if (empty($this->_session_name)) return false;
        $this->_session_status = 'complete';
        $file = $this->_session_path.'/'.$this->_session_name;
        $data = serialize(
                      array('_session_status'=>$this->_session_status,
                            '_session_atime'=>$this->_session_atime,
                          )
                    );
        if (!$fp = fopen($file, 'w')) return false;
        if (!fwrite($fp, $data)) return false;
        fclose($fp);
        return true;
    }
    /**
     * Function <b>handlePageData</b> Perform user operations on every url crawl iteration
     *
     */
    function handlePageData(&$page_data)
    {
        $this->_n++;
        /*
         * SAVE SESSION EACH 10 ITERATIONS ***
         */
        if ($this->_n == 10)
        {
            $this->_n = 0;
            $this->saveSession();
            echo ('Saving session ...saveSession()<br>');
        }
        /*
         * PARSING DATA && PERFORM ANY USER ACTIONS ***
         */
        flush();
    }
    /**
     * Function <b>continueGo</b>, edited copy of go() method, performs crawling after session resuming
     *
     * @return (null)
     */
    public function continueGo()
    {
        echo ('Performing ContinueGo()...<br>');
        $starting_time = $this->getmicrotime();
        // Init, split given URL into host, port, path and file a.s.o.
        $url_parts = PHPCrawlerUtils::splitURL($this->url_to_crawl);
        // Set base-host and base-path "global" for this class,
        // we need it very often (i guess at this point...)
        $this->base_path = $url_parts["path"];
        $this->base_host = $url_parts["host"];
        $this->base_domain = $url_parts["domain"];
        // If the base port wasnt set by the user ->
        // take the one from the given start-URL.
        if ($this->base_port == "") $this->base_port = $url_parts["port"];
        // if the base-port WAS set by the user
        $url_parts["port"] = $this->base_port;
        // Reset the base_url
        $this->url_to_crawl = PHPCrawlerUtils::rebuildURL($url_parts);
        $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($this->url_to_crawl);
        // Init counters
        $links_followed=0;
        $files_received=0;
        // Put the first url into our main-array
/* COMMENTED PART */
/*      $tmp[0]["url_rebuild"] = $this->url_to_crawl;
        PHPCrawlerUtils::removeMatchingLinks($tmp, $this->not_follow_matches);
        if (isset($tmp[0]["url_rebuild"]) &&  $tmp[0]["url_rebuild"] != "")
        {
            PHPCrawlerUtils::addToArray($tmp, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
        }
*/
        // MAIN-LOOP -------------------------------------------------------------------
        // It works like this:
        // The first loop looks through all the "Priority"-arrays and checks if any
        // of these arrays is filled with URLS.
        for ($pri_level = $this->max_priority_level+1; $pri_level > -1; $pri_level--)
        {
            // Yep. Found a priority-array with at least one URL
            if (isset($this->urls_to_crawl[$pri_level]) && !isset($stop_crawling))
            {
                // Now "process" all URLS in this priroity-array
                @reset($this->urls_to_crawl[$pri_level]);
                while (list($key) = @each($this->urls_to_crawl[$pri_level]))
                {
                    $all_start = $this->getmicrotime();
                    $stop_crawling_this_level = false; // init
                    // Request URL (crawl())
                    unset($page_data);
                    if (!isset($this->urls_to_crawl[$pri_level][$key]["referer_url"]))
                    {
                        $this->urls_to_crawl[$pri_level][$key]["referer_url"] = "";
                    }
                    $page_data = $this->pageRequest->receivePage($this->urls_to_crawl[$pri_level][$key]["url_rebuild"],
                    $this->urls_to_crawl[$pri_level][$key]["referer_url"]);
                    // If the request-object just irnored the URL ->
                    // -> Stop and remove URL from Array
                    if ($page_data == false)
                    {
                        unset($this->urls_to_crawl[$pri_level][$key]);
                        continue;
                    }
                    $links_followed++;
                    // Now $page_data["links_found"] contains all found links at this point
                    // Check if a "<base href.."-tag is given in the source and xtract
                    // the base URL
                    // !! Doesnt have to be rebuild cause it only can be a full
                    // qualified URL !!
                    $base_url = PHPCrawlerUtils::getBasePathFromTag($page_data["source"]);
                    if ($base_url == "") $actual_url = &$this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
                    else $actual_url = $base_url;
                    // Set flag "content_found" if..content was found
                    if (isset($page_data["http_status_code"]) && $page_data["http_status_code"]==200) $content_found = true;
                    // Check for a REDIRECT-header and if wanted, put it into the array of found links
                    $redirect = PHPCrawlerUtils::getRedirectLocation($page_data["header"]);
                    if ($redirect && $this->follow_redirects==true)
                    {
                        $tmp_array["link_raw"] = $redirect;
                        $tmp_array["referer_url"] = $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
                        $page_data["links_found"][] = $tmp_array;
                    }
                    // Count files that have been received completly
                    if ($page_data["received"] == true) $files_received++;
                    // If traffic-limit is reached -> stop crawling
                    if ($page_data["traffic_limit_reached"] == true) $stop_crawling = true;
                    // Check if pagelimit is reached if set
                    // (and check WHICH page-limit was set)
                    if ($this->page_limit_all > 0)
                    {
                        if ($this->page_limit_count_ct_only==true && $files_received >= $this->page_limit_all)
                        {
                            $stop_crawling = true;
                        }
                        elseif ($this->page_limit_count_ct_only==false && $links_followed >= $this->page_limit_all)
                        {
                            $stop_crawling = true;
                        }
                    }
                    // Add the actual referer to the page_data array for the handlePageData-method
                    $page_data["refering_linktext"] = &$this->urls_to_crawl[$pri_level][$key]["linktext"];
                    $page_data["refering_link_raw"] = &$this->urls_to_crawl[$pri_level][$key]["link_raw"];
                    $page_data["refering_linkcode"] = &$this->urls_to_crawl[$pri_level][$key]["linkcode"];
                    // build new absolute URLs from found links
                    $page_data["links_found"] = PHPCrawlerUtils::buildURLs($page_data["links_found"], $actual_url);
                    // Call the overridable user-function here, but first
                    // "save" the found links from user-manipulation
                    $links_found = $page_data["links_found"];
                    $user_return = $this->handlePageData($page_data);
                    // Stop crawling if user returned a negative value
                    if ($user_return < 0)
                    {
                        $stop_crawling=true;
                        $page_data["user_abort"] = true;
                    }
                    // Compare the found links with link-priorities set by the user
                    // and add the priority-level to our array $links_found
                    if ($this->benchmark==true) $bm_start = $this->getmicrotime();
                    PHPCrawlerUtils::addURLPriorities ($links_found, $this->link_priorities);
                    if ($this->benchmark==true) echo "addUrlPriorities(): ".($this->getmicrotime() - $bm_start)."<br>";
                    // Here we can delete the tmp-file maybe created by the pageRequest-object
                    if (file_exists($this->pageRequest->tmp_file)) @unlink($this->pageRequest->tmp_file);
                    // Stop everything if a limit was reached
                    if (isset($stop_crawling))
                    {
                        break;
                        $pri_level=1000;
                    }
                    // Remove links to other hosts if follow_mode is 2 or 3
                    if ($this->general_follow_mode == 2 || $this->general_follow_mode == 3)
                    {
                        PHPCrawlerUtils::removeURLsToOtherHosts($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
                    }
                    // Remove links to other domains if follow_mode=1
                    if ($this->general_follow_mode == 1)
                    {
                        PHPCrawlerUtils::removeURLsToOtherDomains($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
                    }
                    // Remove "pathUp"-links if follow_mode=3
                    // (fe: base-site: www.foo.com/bar/index.htm -> dont follow: www.foo.com/anotherbar/xyz)
                    if ($this->general_follow_mode == 3)
                    {
                        PHPCrawlerUtils::removePathUpLinks($links_found, $this->url_to_crawl);
                    }
                    // If given, dont follow "not matching"-links
                    // (dont follow given preg_matches)
                    if (count($this->not_follow_matches) > 0)
                    {
                        PHPCrawlerUtils::removeMatchingLinks($links_found, $this->not_follow_matches);
                    }
                    // If given, just follow "matching"-links
                    // (only follow given preg_matches)
                    if (count($this->follow_matches) > 0)
                    {
                        $links_found=&PHPCrawlerUtils::removeNotMatchingLinks($links_found, $this->follow_matches);
                    }
                    // Add found and filtered links to the main_array urls_to_crawl
                    if ($this->benchmark == true) $bm_start = $this->getmicrotime();
                    PHPCrawlerUtils::addToArray($links_found, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
                    if ($this->benchmark == true) echo "addToArray(): ".($this->getmicrotime() - $bm_start)."<br>";
                    // If there is wasnt any content found so far (code 200) and theres
                    // a redirect location
                    // -> follow it, doesnt matter what follow-mode was choosen !
                    // (put it into the main-array !)
                    if (!isset($content_found) && $redirect != "" && $this->follow_redirects_till_content == true)
                    {
                        $rd[0]["url_rebuild"] = phpcrawlerutils::buildURL($redirect, $actual_url);
                        $rd[0]["priority_level"] = 0;
                        PHPCrawlerUtils::addToArray($rd, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
                    }
                    // Now we remove the actual URL from the priority-array
                    unset($this->urls_to_crawl[$pri_level][$key]);
                    // Now we check if a priority-array with a higher priority
                    // contains URLS and if so, stop processing this pri-array and "switch" to the higher
                    // one
                    for ($pri_level_check = $this->max_priority_level+1; $pri_level_check > $pri_level; $pri_level_check--)
                    {
                        if (isset($this->urls_to_crawl[$pri_level_check]) && $pri_level_check > $pri_level)
                        {
                            $stop_crawling_this_level = true;
                        }
                    }
                    // Stop crawling this level
                    if ($stop_crawling_this_level == true)
                    {
                        $pri_level = $this->max_priority_level+1;
                        break;
                    }
                    // Unset crawled URL, not nedded anymore
                    unset($this->urls_to_crawl[$pri_level][$key]);
                    // echo "All:".($this->getmicrotime()-$all_start);
                } // end of loop over priority-array
                // If a priority_level was crawled completely -> unset the whole array
                if ($stop_crawling_this_level == false)
                {
                    unset($this->urls_to_crawl[$pri_level]);
                }
            } // end if priority-level exists
        } // end of main loop
        // Loop stopped here, build report-array (status_return)
        $this->status_return["links_followed"] = $links_followed;
        $this->status_return["files_received"] = $files_received;
        $this->status_return["bytes_received"] = $this->pageRequest->traffic_all;
        $this->status_return["traffic_limit_reached"] = $page_data["traffic_limit_reached"];
        if (isset($page_data["file_limit_reached"]))
        {
            $this->status_return["file_limit_reached"] = $page_data["file_limit_reached"];
        }
        else $this->status_return["file_limit_reached"] = false;
        if (isset($page_data["user_abort"]))
        {
            $this->status_return["user_abort"] = $page_data["user_abort"];
        }
        else $this->status_return["user_abort"] = false;
        if (isset($stop_crawling))
        {
            $this->status_return["limit_reached"] = true;
        }
        else {
            $this->status_return["limit_reached"] = false;
        }
        // Process-time
        $this->status_return["process_runtime"] = $this->getMicroTime() - $starting_time;
        // Average bandwith / throughput
        $this->status_return["data_throughput"] = round($this->status_return["bytes_received"] / $this->status_return["process_runtime"]);
    }
 }
?>

The user code is below. Of course, it doesn't show correctly total files downloaded and other statistics data.
Also it would be better to include all verifications and other stuff such as isBuisySession(), endSession() etc outside of user code, but here is how i did it:

<?php
/*
 * PREPARE MyCrawler() ***
 */
 $crawl = &new MyCrawler();
 $crawl->setURL('http://big-site.com/');
 $crawl->setPageLimit(10);
 $crawl->enableSession(true);   // enable session support;
 if ($crawl->restoreSession())
 {
    if($crawl->isBuisySession(2)) // If session file was saved within last 2 minutes;
    {
        die('Session is buisy<br>');
    }
    else    // continue crawling with restored session data;
    {
        $crawl->continueGo();
    }
 }
 else // session was not found, perform new crawling (create new session);
 {
    $crawl->go();
 }
/*
 * END SESSION, REQUIRED! ***
 */
 $crawl->endSession();
?>

I hope that someone will find this helpful when operating large sites.
Regards.

Resume Crawling on big sites

Forums

Help

Resume Crawling on big sites document.SUBSCRIPTION_OPTIONS = { "thing": "topic", "subscribed": false, "url": "subscribe", "icon": { "css": "fa fa-envelope-o" } };

Resume Crawling on big sites