Menu

Problem with addPostData

Help
Anonymous
2015-06-06
2015-06-06
  • Anonymous

    Anonymous - 2015-06-06

    Hi,
    I have a problem POSTing data to some URLs: when I run $crawler->go(), the report object looks like this:
    [links_followed] => 1
    [files_received] => 0
    [bytes_received] => 0
    ...
    The site is built with ASP and makes some of its requests via AJAX; every request must send form data such as VIEWSTATE, STATEVALIDATION, etc. in order to retrieve the next page.
    The code is:

    ////////////////////////////////////////////////////////////////////

    /**
     * Starts the crawling process.
     *
     * Under the CLI SAPI the crawl is run multi-processed, using the process
     * count read from the 'crawlerMultiProcessCount' config entry; under any
     * other SAPI it falls back to a plain single-process go().
     *
     * @param mixed $multiProcessMode A PHPCrawlerMultiProcessModes constant;
     *                                only used when running under CLI.
     */
    public function start($multiProcessMode = PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE){
        if(PHP_SAPI === 'cli'){
            // The object operator must be written as a single "->" token.
            $this->goMultiProcessed(getConfig('crawlerMultiProcessCount'), $multiProcessMode);
        }else{
            $this->go();
        }
    }
    

    ////////////////////////////////////////////////////////////////////////////

        /**
         * Applies the base configuration to the crawler: start URL, timeouts,
         * cookie handling, content-type/URL filters, then proxy and agent.
         */
        protected function _init(){
            $crawler = $this->crawler;

            $crawler->setURL($this->url);

            // Timeouts (library defaults are 2 s for the stream, 5 s for the connection).
            $crawler->setStreamTimeout(15);
            $crawler->setConnectionTimeout(20);

            $crawler->enableCookieHandling(true);

            // Only receive the source of pages served as "text/html"; files with
            // any other content-type (e.g. "image/gif") are not received.
            $crawler->addContentTypeReceiveRule("#text/html#");

            // Skip image, stylesheet and script URLs.
            $ignoredUrlPatterns = [
                "#\.(jpg|jpeg|gif|png)$# i",
                "#\.(css|js)$# i",
            ];
            foreach($ignoredUrlPatterns as $ignoredUrlPattern){
                $crawler->addURLFilterRule($ignoredUrlPattern);
            }

            $this->__setProxyAndAgent();
        }
    

    ////////////////////////////////////////////////////////////////////////////

        $this->url = 'http://fa.journals.sid.ir/AdvanceJournal.aspx';
        setConfig('useAgents', true);

        // Step 1: plain GET of the page (retried on failure) so the hidden
        // form fields can be harvested from its source.
        $crawlingTryAgainCount = getConfig('crawlingTryAgainCount'); // for example is 3
        do{
            $this->_init();
            $this->crawler->addURLFollowRule("#!NOTHING!#");
            $this->crawler->start();
            $report = $this->_logCrawlerSummaryReport();
        }while($report->links_followed != $report->files_received && $crawlingTryAgainCount-- != 0);

        // The pattern needs explicit delimiters: without them PCRE takes the
        // leading "<" and trailing ">" as a bracket-style delimiter pair, so the
        // tag brackets themselves are never matched. [^>] / [^"] keep each match
        // inside a single <input> tag and a single attribute value.
        $regex = [
            'hiddenInputs' => '#<input[^>]*?type="hidden"[^>]*?name="([^"]*)"[^>]*?value="([^"]*)"[^>]*>#i',
        ];
        preg_match_all($regex['hiddenInputs'], $this->crawler->plainContent, $hiddenInputsMatch);

        // Attribute values are HTML-encoded in the page source; decode them so
        // the same raw values a browser would submit are posted back.
        $hiddenValues = array_map(function ($value) {
            return html_entity_decode($value, ENT_QUOTES, 'UTF-8');
        }, $hiddenInputsMatch[2]);
        $postData = array_combine($hiddenInputsMatch[1], $hiddenValues);
        $postData = array_merge($postData, [
            'ctl00$ContentPlaceHolder1$ScriptManager1' => 'ctl00$ContentPlaceHolder1$UpdatePanel1|ctl00$ContentPlaceHolder1$Timer1',
            '__ASYNCPOST' => 'true',
        ]);
        $this->crawler->plainContent = '';

        // Step 2: POST the harvested fields back to the same URL.
        // The configuration is applied once, outside the retry loop, so the
        // post data is not registered again on every retry. The URL is quoted
        // because it is used as a regular expression ("." would otherwise match
        // any character).
        $this->crawler->addPostData('#' . preg_quote($this->url, '#') . '#', $postData);
        // NOTE(review): ASP.NET partial-postback (__ASYNCPOST) responses are
        // typically served as "text/plain", which the text/html-only receive
        // rule set in _init() would drop (files_received = 0, bytes_received = 0).
        // Confirm against the actual response headers.
        $this->crawler->addContentTypeReceiveRule("#text/plain#");
        $this->crawler->setFollowMode(1);
        $this->crawler->setFollowRedirects(TRUE);
        $this->crawler->setTrafficLimit(1000 * 1024);

        $crawlingTryAgainCount = getConfig('crawlingTryAgainCount');
        do{
            $this->crawler->start();
            $report = $this->_logCrawlerSummaryReport();
            debug($report);
        }while($report->links_followed != $report->files_received && $crawlingTryAgainCount-- != 0);

        debug($this->crawler->plainContent, 1);
    

    ///////////////////////////////////////////////////////////////////////////////////

    $this->crawler->plainContent is a variable that stores $pageInfo->source.
    In this code I use the same crawler object for both steps.

    But at the last line of the code (debug($this->crawler->plainContent, 1);) the plain content is empty.
    Can anyone please help me?
    Thank you.

     

    Last edit: Anonymous 2015-06-06
    • Anonymous

      Anonymous - 2020-11-13
      Post awaiting moderation.
  • Anonymous

    Anonymous - 2017-08-30

    help

     

Anonymous
Anonymous

Add attachments
Cancel