Menu

Problem with addPostData

Help
Anonymous
2015-06-06
2015-06-06
  • Anonymous

    Anonymous - 2015-06-06

    Hi,
    I have a problem POSTing data to some URLs: when I run $crawler->go(), the report object looks like this:
    [links_followed] => 1
    [files_received] => 0
    [bytes_received] => 0
    ...
    The site is built with ASP.NET and makes some of its requests via AJAX; with every request, form data such as __VIEWSTATE, __STATEVALIDATION, etc. must be sent to retrieve the next page.
    The code is:

    ////////////////////////////////////////////////////////////////////

    /**
     * Starts the crawl, picking the execution mode from the current SAPI.
     *
     * Under the CLI SAPI the crawl is started via goMultiProcessed(), passing
     * the 'crawlerMultiProcessCount' config value and the requested mode;
     * under any other SAPI it falls back to a plain go().
     *
     * @param $multiProcessMode A PHPCrawlerMultiProcessModes constant; only used in CLI mode.
     */
    public function start($multiProcessMode = PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE){
        if(PHP_SAPI === 'cli'){
            // The original had whitespace inside the "->" operator here, which does not parse.
            $this->goMultiProcessed(getConfig('crawlerMultiProcessCount'), $multiProcessMode);
        }else{
            $this->go();
        }
    }
    

    ////////////////////////////////////////////////////////////////////////////

        /**
         * Applies the base configuration to the crawler: start URL, timeouts,
         * cookie handling and content-type/URL filter rules, then calls
         * __setProxyAndAgent().
         */
        protected function _init()
        {
            $crawler = $this->crawler;

            $crawler->setURL($this->url);

            // Timeouts; per the original notes the defaults are 2 s (stream) and 5 s (connection).
            $crawler->setStreamTimeout(15);
            $crawler->setConnectionTimeout(20);

            $crawler->enableCookieHandling(true);

            // Only receive the content/source of pages served as "text/html";
            // documents with other content types (e.g. "image/gif") are not received.
            $crawler->addContentTypeReceiveRule("#text/html#");

            // URL filter rules for image, stylesheet and script extensions.
            $crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
            $crawler->addURLFilterRule("#\.(css|js)$# i");

            $this->__setProxyAndAgent();
        }
    

    ////////////////////////////////////////////////////////////////////////////

        $this->url = 'http://fa.journals.sid.ir/AdvanceJournal.aspx';
        setConfig('useAgents', true);

        // Step 1: fetch the page itself, retrying until every followed link
        // was also received or the configured number of attempts is used up.
        $crawlingTryAgainCount = getConfig('crawlingTryAgainCount'); // for example is 3
        do{
            $this->_init();
            // Follow rule that is intended to match no URL, so only the start URL is requested.
            $this->crawler->addURLFollowRule("#!NOTHING!#");
            $this->crawler->start();
            $report = $this->_logCrawlerSummaryReport();
        }while($report->links_followed != $report->files_received && $crawlingTryAgainCount-- != 0);

        // Collect the hidden form fields (name => value) from the fetched page.
        // The pattern uses explicit "#" delimiters; without them PCRE would
        // treat the leading "<" and trailing ">" as the delimiters. The
        // [^>] / [^"] classes keep each match inside a single <input> tag.
        // Attributes are expected in the order type, name, value.
        $regex = [
            'hiddenInputs' => '#<input[^>]*?type="hidden"[^>]*?name="([^"]*)"[^>]*?value="([^"]*)"[^>]*>#',
        ];
        preg_match_all($regex['hiddenInputs'], $this->crawler->plainContent, $hiddenInputsMatch);
        $postData = array_combine($hiddenInputsMatch[1], $hiddenInputsMatch[2]);
        $postData = array_merge($postData, [
            'ctl00$ContentPlaceHolder1$ScriptManager1' => 'ctl00$ContentPlaceHolder1$UpdatePanel1|ctl00$ContentPlaceHolder1$Timer1',
            '__ASYNCPOST' => 'true',
        ]);
        $this->crawler->plainContent = '';

        // Step 2 setup. This is done once, outside the retry loop, so a retry
        // does not register the same post data and settings again.
        // The URL is escaped because it is used as a regular expression here.
        $this->crawler->addPostData('#' . preg_quote($this->url, '#') . '#', $postData);
        // NOTE(review): _init() only accepts "text/html" bodies. The response to an
        // __ASYNCPOST request is presumably served as "text/plain", which would explain
        // links_followed = 1 with files_received = 0 - confirm against the response header.
        $this->crawler->addContentTypeReceiveRule("#text/plain#");
        $this->crawler->setFollowMode(1);
        $this->crawler->setFollowRedirects(TRUE);
        $this->crawler->setTrafficLimit(1000 * 1024);

        // Step 2: repeat the request with the post data, using the same retry policy as step 1.
        $crawlingTryAgainCount = getConfig('crawlingTryAgainCount');
        do{
            $this->crawler->start();
            $report = $this->_logCrawlerSummaryReport();
            debug($report);
        }while($report->links_followed != $report->files_received && $crawlingTryAgainCount-- != 0);
    
        debug($this->crawler->plainContent, 1);
    

    ///////////////////////////////////////////////////////////////////////////////////

    $this->crawler->plainContent is a variable that stores $pageInfo->source.
    In this code I use the same crawler object for both steps.

    But at the last line of the code (debug($this->crawler->plainContent, 1);) the plain content is empty.
    Can anyone please help me?
    Thank you.

     

    Last edit: Anonymous 2015-06-06
    • Anonymous

      Anonymous - 2020-11-13
      Post awaiting moderation.
  • Anonymous

    Anonymous - 2017-08-30

    help

     

Anonymous
Anonymous

Add attachments
Cancel