Why does Web-Harvest randomly stop?

Sergey
2012-03-14
2013-02-17
  • Sergey

    Sergey - 2012-03-14

    After 25-30 minutes of work, Web-Harvest randomly stopped. Why?

    <?xml version="1.0" encoding="UTF-8"?>
    
    <config>
    
        <!-- Base URL of the scraped site; every request URL below is built as ${host} + path.
             The value is a bare URL on a single line: forum markup such as [url]...[/url]
             or surrounding line breaks would end up inside each request URL. -->
        <var-def name="host">http://www.fragrantica.ru</var-def>
        <!-- Output locations. "Path" is not referenced anywhere else in this configuration. -->
        <var-def name="Path">d:\xml_harvest</var-def>
        <var-def name="ImagePath">d:\xml_harvest\img</var-def>
        <!-- Output file name is built once from the current date and time. -->
        <var-def name="OutFile"><template>d:\xml_harvest\${sys.date()+sys.time()}</template></var-def>
        <!-- Header row: pipe-separated column names, in the same order as the data rows built in AllComm. -->
        <var-def name="AllDesc">
            k0|k1|k2|country_name|country_URI|disainer_URI|disainer_Name|disainer_sImgURL|disainer_sImgName|disainer_lgImgName|disainer_Industia_url|disainer_Industia_name|disainer_WEB_PAGE|disainer_LIC_url|disainer_LIC_name|disainer_DESC|aromat_URI|aromat_Name|aromat_pol|aromat_year|aromat_sImgName|aromat_lgImgName|aromat_name_1|aromat_decr|aromat_group_name|aromat_group_url|note_verh_name|note_verh_id|note_srd_name|note_srd_id|note_niz_name|note_niz_id
        </var-def>
        <!-- Write the header row to the output file before any data row is appended. -->
        <file action="append" type="text" path="${OutFile}">
            <var name="AllDesc" />
        </file>
    
        <!--
            download-file: saves the resource at ImgURL into ImageDir unless it is already there.
            Parameters (supplied through call-param):
              ImgName  : target file name without extension
              ImageDir : target directory
              ImgURL   : URL of the file to download
            The file extension is whatever follows the last "." of ImgURL.
            The whole body is wrapped in <empty>, so the function yields no result.
            An empty ImgURL (for example when the caller's XPath matched nothing) is skipped
            instead of being passed to <http>.
        -->
        <function name="download-file">
        <return>
            <empty>
            <!-- Final file name: ImgName + "." + extension taken from the URL. -->
            <var-def name="ImgName">
                    <script return="name"><![CDATA[
                        name=ImgName.toString()+"."+ImgURL.toString().replaceAll("(.*\\.)", "").trim();
                        // Any "/" left in the extension part is replaced so the name stays a single path segment.
                        name=name.replaceAll("/","(047)");
                    ;]]>
                    </script>
            </var-def>
            <!-- true when there is nothing to fetch (empty URL) or the target file already exists. -->
            <var-def name="skipDownload">
                    <script return="skip"><![CDATA[
                        fil1=new File(ImageDir+"/"+ImgName);
                        if (ImgURL.toString().trim().length()==0) skip=true;
                        else if (fil1.exists()) skip=true;
                        else skip=false;
                    ;]]>
                    </script>
            </var-def>
            <case>
                <if condition="${skipDownload}">
                    <!-- nothing to do -->
                </if>
                <else>
                    <file action="write" type="binary" path="${ImageDir}/${ImgName}">
                        <http url="${ImgURL}"/>
                    </file>
                </else>
            </case>
            </empty>
        </return>
        </function>
    
        <!-- Fetch the designers index page once and keep it, converted to XML, in First_html.
             Wrapped in <empty> so the page itself is not emitted as a result. -->
        <empty>
            <var-def name="First_html">
                <html-to-xml outputtype="pretty">
                    <http url="${host}/designers-1/#A" method="get" charset="utf-8"/>
                </html-to-xml>
            </var-def>
        </empty>
        <var-def name="All">
            <!-- Country    loop-->
            <loop item="item_country" index="k0" filter="unique">
            <list>
                <xpath expression="(//select)[1]/option">
                    <var name="First_html" />
                </xpath>        
            </list>
            <body>
            <case>
            <if condition='${Integer.parseInt(k0.toString())>17}'>      <!-- k0 -->     
                <empty>
                <var-def name="country_URI">
                    <xpath expression="/option/@value">
                        <var name="item_country" />
                    </xpath>            
                </var-def>          
                <var-def name="country_Name">
                    <xpath expression="/option/text()">
                        <var name="item_country" />
                    </xpath>            
                </var-def>
                </empty>
                <case>
                    <if condition='${(country_URI.toString().trim().length()!= 0)}'>
                        <empty>
                        <var-def name="Country_html">
                            <html-to-xml  outputtype="pretty">
                                <http method="get" url="${host}${country_URI}" charset="utf-8" />
                            </html-to-xml>  
                        </var-def>
                        </empty>
                        <loop item="item_disainer" index="k1" filter="unique">
                        <list>
                            <xpath expression="//div[@class='nduList']/p/a">
                                <var name="Country_html" />
                            </xpath>        
                        </list>
                        <body>  
                            <case>
                            <if condition='${Integer.parseInt(k1.toString())>149}'>     <!-- k1 -->                 
                            <empty>
                            <var-def name="disainer_URI">
                                <xpath expression="/a/@href">
                                    <var name="item_disainer" />
                                </xpath>            
                            </var-def>          
                            <var-def name="disainer_Name"><!-- Gabriela Sabatini -->
                                <xpath expression="/a/text()">
                                    <var name="item_disainer" />
                                </xpath>            
                            </var-def>  
                            <var-def name="disainer_sImgURL">
                                <xpath expression="/a/img/@src">
                                    <var name="item_disainer" />
                                </xpath>            
                            </var-def>  
                            <var-def name="disainer_sImgName">
                                    <script return="r"><![CDATA[
                                                r="sml_dis_"+k1.toString()+"_"+disainer_Name.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "");
                                            ;]]>
                                    </script>           
                            </var-def>
                            <empty>
                            <call name="download-file"><!-- -->
                                <call-param name="ImgName"><var name="disainer_sImgName"/></call-param>
                                <call-param name="ImageDir"><var name="ImagePath"/></call-param>
                                <call-param name="ImgURL"><var name="disainer_sImgURL"/></call-param>
                            </call> 
                            <var-def name="disainer_html">
                                <html-to-xml  outputtype="pretty">
                                    <http method="get" url="${host}${disainer_URI}" charset="utf-8" />
                                </html-to-xml>  
                            </var-def>
                            </empty>
                            <var-def name="disainer_Name_a"><!-- Gabriela Sabatini aroma-->
                                <xpath expression="//div[@id='col1']/h1/text()">
                                    <var name="disainer_html" />
                                </xpath>            
                            </var-def>
                            <empty> 
                            <var-def name="disainer_lgImgName">
                                    <script return="r"><![CDATA[
                                                r="lg_dis_"+k1.toString()+"_"+disainer_Name.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "");
                                            ;]]>
                                    </script>           
                            </var-def>                      
                            <call name="download-file"><!-- Большое фото-->
                                <call-param name="ImgName"><var name="disainer_lgImgName" /></call-param>
                                <call-param name="ImageDir"><var name="ImagePath"/></call-param>
                                <call-param name="ImgURL">
                                    <xpath expression="//div[@id='col1']/div/img/@src">
                                            <var name="disainer_html" />
                                    </xpath>
                                </call-param>
                            </call> 
                            <var-def name="disainer_area"><!--har -->
                                <xpath expression="//div[@id='col1']">
                                    <var name="disainer_html" />
                                </xpath>            
                            </var-def>
                            </empty>
                            <!-- Industry link of the designer ("Индустрия:" is the Russian label for "Industry:").
                                 The regexp captures the link target and the link text from disainer_area and
                                 joins them as "url"|+|"name"; the two scripts below split that string again.
                                 The double quotes added by the template remain part of both values. -->
                            <var-def name="disainer_Industia_url"></var-def><!-- industry URL, reset to empty; overwritten below -->    
                            <var-def name="disainer_Industia_name"></var-def><!-- industry name, reset to empty; overwritten below -->   
                            <var-def name="disainer_Industia">  
                                <regexp flag-multiline="True" max="1"> 
                                    <regexp-pattern>Индустрия:.{1,20}&lt;a.{1,3}href="(.*?)".{0,3}&gt;(.*?)&lt;/a&gt;</regexp-pattern>
                                    <regexp-source> <var name="disainer_area"/> </regexp-source>
                                    <regexp-result><template>"${_1}"|+|"${_2}"</template></regexp-result>
                                </regexp>
                            </var-def>  
                            <!-- Everything before the "|+|" separator: the URL part. -->
                            <var-def name="disainer_Industia_url">
                                <script return="r1"><![CDATA[
                                            r1 = disainer_Industia.toString().replaceAll("(?sim)\\|\\+\\|.*$", "");
                                        ;]]>
                                </script>
                            </var-def>
                            <!-- Everything after the "|+|" separator, with any markup tags removed: the name part. -->
                            <var-def name="disainer_Industia_name">
                                <script return="r2"><![CDATA[
                                            r2 = disainer_Industia.toString().replaceAll("(?sim)^.*?\\|\\+\\|", "");
                                            r2 = r2.replaceAll("(?sim)<.*?>", "").trim();
                                        ;]]>
                                </script>
                            </var-def>
    
                            <var-def name="disainer_WEB_PAGE">  <!-- URL disainer_WEB_PAGE-->   
                                <regexp flag-multiline="True" max="1"> 
                                    <regexp-pattern>Веб-страница дизайнера:.{1,20}&lt;a.{1,3}href="(.*?)".*?&gt;</regexp-pattern>
                                    <regexp-source> <var name="disainer_area"/> </regexp-source>
                                    <regexp-result><template>${_1}</template></regexp-result>
                                </regexp>
                            </var-def>
    
                            <!-- Licence holder link of the designer ("Владелец лицензии:" is the Russian label
                                 for "Licence holder:"). The regexp captures the link target and the link text
                                 from disainer_area and joins them as "url"|+|"name"; the two scripts below
                                 split that string again. Both scripts must read disainer_LIC (not the industry
                                 variable), otherwise the licence columns repeat the industry columns. -->
                            <var-def name="disainer_LIC_url"></var-def><!-- licence holder URL, reset to empty; overwritten below -->
                            <var-def name="disainer_LIC_name"></var-def><!-- licence holder name, reset to empty; overwritten below -->
                            <var-def name="disainer_LIC">
                                <regexp flag-multiline="True" max="1">
                                    <regexp-pattern>Владелец лицензии:.{1,20}&lt;a.{1,3}href="(.*?)".{0,3}&gt;(.*?)&lt;/a&gt;</regexp-pattern>
                                    <regexp-source> <var name="disainer_area"/> </regexp-source>
                                    <regexp-result><template>"${_1}"|+|"${_2}"</template></regexp-result>
                                </regexp>
                            </var-def>
                            <!-- Everything before the "|+|" separator: the URL part. -->
                            <var-def name="disainer_LIC_url">
                                <script return="r1"><![CDATA[
                                            r1 = disainer_LIC.toString().replaceAll("(?sim)\\|\\+\\|.*$", "");
                                        ;]]>
                                </script>
                            </var-def>
                            <!-- Everything after the "|+|" separator, with any markup tags removed: the name part. -->
                            <var-def name="disainer_LIC_name">
                                <script return="r2"><![CDATA[
                                            r2 = disainer_LIC.toString().replaceAll("(?sim)^.*?\\|\\+\\|", "");
                                            r2 = r2.replaceAll("(?sim)<.*?>", "").trim();
                                        ;]]>
                                </script>
                            </var-def>
    
                            <var-def name="disainer_DESC">  <!-- Description--> 
                                <regexp flag-multiline="True" max="1"> 
                                    <regexp-pattern>&lt;iframe.*?&lt;/div&gt;(.*?)&lt;div.{1,10}style="clear:</regexp-pattern>
                                    <regexp-source> <var name="disainer_area"/> </regexp-source>
                                    <regexp-result><template>"${_1}"</template></regexp-result>
                                </regexp>
                            </var-def>
                            </empty>        
                            <!-- Aromat loop-->
                            <loop item="item_aromat" index="k2" filter="unique">
                            <list>
                                <xpath expression="//div[@id='col1']/div[@class]/div">
                                    <var name="disainer_html" />
                                </xpath>        
                            </list>
                            <body>  
                                <empty>
                                <var-def name="aromat_URI">
                                    <xpath expression="/div/p/a/@href">
                                        <var name="item_aromat" />
                                    </xpath>            
                                </var-def>  
                                <var-def name="aromat_Name"><!-- Gabriela Sabatini -->
                                    <xpath expression="/div/p/a/text()">
                                        <var name="item_aromat" />
                                    </xpath>            
                                </var-def>      
                                <var-def name="aromat_pol"><!-- Aroma pol -->
                                    <xpath expression="/div/p/span/text()">
                                        <var name="item_aromat" />
                                    </xpath>            
                                </var-def>  
                                <var-def name="aromat_year"><!--Aroma year -->
                                    <xpath expression="/div/p/span/span/strong/text()">
                                        <var name="item_aromat" />
                                    </xpath>            
                                </var-def>  
                                <var-def name="aromat_sImgName">
                                        <script return="r10"><![CDATA[
                                                    r10="sml_arm_"+k1.toString()+"_"+k2.toString()+"_"+disainer_Name.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "")+"_"+aromat_Name.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "");
                                                ;]]>
                                        </script>           
                                </var-def>  
                                <var-def name="aromat_lgImgName">
                                        <script return="r10"><![CDATA[
                                                    r10="lg_arm_"+k1.toString()+"_"+k2.toString()+"_"+disainer_Name.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "")+"_"+aromat_Name.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "");
                                                ;]]>
                                        </script>           
                                </var-def>  
                                <empty>                         
                                <!-- Thumbnail of the aroma as shown in the designer's listing. -->
                                <var-def name="aromat_img_URL">
                                    <xpath expression="/div/p/a/img/@src">
                                        <var name="item_aromat" />
                                    </xpath>
                                </var-def>
                                <!-- Store the thumbnail under the small-image name. Saving it under
                                     aromat_lgImgName would make the later full-size download a no-op,
                                     because download-file skips files that already exist, and the
                                     small-image file listed in the output row would never be created. -->
                                <call name="download-file">
                                    <call-param name="ImgName"><var name="aromat_sImgName" /></call-param>
                                    <call-param name="ImageDir"><var name="ImagePath"/></call-param>
                                    <call-param name="ImgURL"><var name="aromat_img_URL" /></call-param>
                                </call>
    
                                <var-def name="aromat_html">
                                    <html-to-xml  outputtype="pretty">
                                        <http method="get" url="${host}${aromat_URI}" charset="utf-8" />
                                    </html-to-xml>  
                                </var-def>  
                                </empty>                            
                                <var-def name="aromat_lgImgURL">
                                    <xpath expression="//div[@id='mainpicbox']/img/@src">
                                        <var name="aromat_html" />
                                    </xpath>            
                                </var-def>
                                <call name="download-file"><!---->
                                    <call-param name="ImgName"><var name="aromat_lgImgName" /></call-param>
                                    <call-param name="ImageDir"><var name="ImagePath"/></call-param>
                                    <call-param name="ImgURL"><var name="aromat_lgImgURL" /></call-param>
                                </call>                             
                                <var-def name="aromat_name_1">
                                    <xpath expression="(//div[@id='col1']/div)[2]/h1/text()">
                                        <var name="aromat_html" />
                                    </xpath>            
                                </var-def>  
                                <var-def name="aromat_decr">
                                    <xpath expression="(//div[@id='col1']/div)[2]/p[2]">
                                        <var name="aromat_html" />
                                    </xpath>            
                                </var-def>  
                                <var-def name="aromat_group_name">
                                    <xpath expression="(//div[@id='col1']/div)[2]/p[1]/span[2]/span[1]/text()">
                                        <var name="aromat_html" />
                                    </xpath>            
                                </var-def>
                                <var-def name="aromat_group_url">
                                    <xpath expression="(//div[@id='col1']/div)[2]/p[1]/span[2]/span[1]/@title">
                                        <var name="aromat_html" />
                                    </xpath>            
                                </var-def>              
                                <!-- Note   loop  up-->
                                <var-def name="note_verh_name"></var-def>
                                <var-def name="note_verh_id"></var-def>
                                <loop item="item_note" index="n1" filter="unique">
                                <list>
                                    <xpath expression="((//div[@id='col1']/div)[2]/div[@style]/div[@style])[1]/p[1]/span">
                                        <var name="aromat_html" />
                                    </xpath>        
                                </list>
                                <body>
                                    <empty>
                                    <var-def name="id_note">
                                        <xpath expression="/span/@id">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <var-def name="name_note">
                                        <xpath expression="/span/img/@title">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <var-def name="url_note">
                                        <xpath expression="/span/img/@src">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <call name="download-file"><!-- -->
                                        <call-param name="ImgName">
                                            <script return="r"><![CDATA[
                                                        r="note_"+id_note.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "");
                                                    ;]]>
                                            </script>
                                        </call-param>
                                        <call-param name="ImageDir"><var name="ImagePath"/></call-param>
                                        <call-param name="ImgURL"><var name="url_note"/></call-param>
                                    </call>
                                    </empty>
                                    <var-def name="note_verh_name">
                                        <script return="verh_name"><![CDATA[
                                                verh_name =  note_verh_name.toString()+"["+name_note.toString()+"],";
                                        ;]]>
                                        </script>
                                    </var-def>  
                                    <var-def name="note_verh_id">
                                        <script return="verh_id"><![CDATA[
                                                verh_id =  note_verh_id.toString()+"["+id_note.toString()+"],";
                                        ;]]>
                                        </script>
                                    </var-def>                                  
                                </body>
                                </loop>     
                                <!-- Note   loop  mid-->
                                <var-def name="note_srd_name"></var-def>
                                <var-def name="note_srd_id"></var-def>
                                <loop item="item_note" index="n1" filter="unique">
                                <list>
                                    <xpath expression="((//div[@id='col1']/div)[2]/div[@style]/div[@style])[1]/p[2]/span">
                                        <var name="aromat_html" />
                                    </xpath>        
                                </list>
                                <body>
                                    <empty>
                                    <var-def name="id_note">
                                        <xpath expression="/span/@id">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <var-def name="name_note">
                                        <xpath expression="/span/img/@title">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <var-def name="url_note">
                                        <xpath expression="/span/img/@src">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <call name="download-file"><!--  -->
                                        <call-param name="ImgName">
                                            <script return="r"><![CDATA[
                                                        r="note_"+id_note.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "");
                                                    ;]]>
                                            </script>
                                        </call-param>
                                        <call-param name="ImageDir"><var name="ImagePath"/></call-param>
                                        <call-param name="ImgURL"><var name="url_note"/></call-param>
                                    </call>
                                    </empty>
                                    <var-def name="note_srd_name">
                                        <script return="srd_name"><![CDATA[
                                                srd_name =  note_srd_name.toString()+"["+name_note.toString()+"],";
                                        ;]]>
                                        </script>
                                    </var-def>  
                                    <var-def name="note_srd_id">
                                        <script return="srd_id"><![CDATA[
                                                srd_id =  note_srd_id.toString()+"["+id_note.toString()+"],";
                                        ;]]>
                                        </script>
                                    </var-def>                                  
                                </body>
                                </loop>     
                                <!-- Note   loop  foot-->
                                <var-def name="note_niz_name"></var-def>
                                <var-def name="note_niz_id"></var-def>
                                <loop item="item_note" index="n1" filter="unique">
                                <list>
                                    <xpath expression="((//div[@id='col1']/div)[2]/div[@style]/div[@style])[1]/p[3]/span">
                                        <var name="aromat_html" />
                                    </xpath>        
                                </list>
                                <body>
                                    <empty>
                                    <var-def name="id_note">
                                        <xpath expression="/span/@id">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <var-def name="name_note">
                                        <xpath expression="/span/img/@title">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <var-def name="url_note">
                                        <xpath expression="/span/img/@src">
                                            <var name="item_note" />
                                        </xpath>            
                                    </var-def>  
                                    <call name="download-file"><!--  -->
                                        <call-param name="ImgName">
                                            <script return="r"><![CDATA[
                                                        r="note_"+id_note.toString().replaceAll("(?sim)[^a-zA-Z0-9]", "");
                                                    ;]]>
                                            </script>
                                        </call-param>
                                        <call-param name="ImageDir"><var name="ImagePath"/></call-param>
                                        <call-param name="ImgURL"><var name="url_note"/></call-param>
                                    </call>
                                    </empty>
                                    <var-def name="note_niz_name">
                                        <script return="niz_name"><![CDATA[
                                                niz_name =  note_niz_name.toString()+"["+name_note.toString()+"],";
                                        ;]]>
                                        </script>
                                    </var-def>  
                                    <var-def name="note_niz_id">
                                        <script return="niz_id"><![CDATA[
                                                niz_id =  note_niz_id.toString()+"["+id_note.toString()+"],";
                                        ;]]>
                                        </script>
                                    </var-def>                                  
                                </body>
                                </loop>
    
                                <!-- Save   AllComm.toString() + "\r\n" + -->           
                                <!-- Builds one pipe-separated output row for the current aroma. The fields are
                                     concatenated in the same order as the column names in AllDesc.
                                     NOTE(review): the image name columns get a fixed ".jpg" suffix, while
                                     download-file takes the extension from the image URL; confirm they agree. -->
                                <var-def name="AllComm">
                                    <script return="allcomm"><![CDATA[
                                            row =k0.toString()+"|"+k1.toString()+"|"+ k2.toString()+"|"
                                            +country_Name.toString()+"|"+ country_URI.toString()+"|"+disainer_URI.toString()+"|"
                                            +disainer_Name.toString()+"|"+disainer_sImgURL.toString()+"|"+disainer_sImgName.toString()+".jpg|"
                                            +disainer_lgImgName.toString()+".jpg|"+disainer_Industia_url.toString()+"|"+disainer_Industia_name.toString()+"|"
                                            +disainer_WEB_PAGE.toString()+"|"+disainer_LIC_url.toString()+"|"+disainer_LIC_name.toString()+"|"
                                            +disainer_DESC.toString()+"|"+aromat_URI.toString()+"|"+aromat_Name.toString()+"|"+aromat_pol.toString()+"|"
                                            +aromat_year.toString()+"|"+aromat_sImgName.toString()+".jpg|"+aromat_lgImgName.toString()+".jpg|"
                                            +aromat_name_1.toString()+"|"+aromat_decr.toString()+"|"+aromat_group_name.toString()+"|"
                                            +aromat_group_url.toString()+"|"+note_verh_name.toString()+"|"+note_verh_id.toString()+"|"
                                            +note_srd_name.toString()+"|"+note_srd_id.toString()+"|"+note_niz_name.toString()+"|"+note_niz_id.toString();
                                            // Strip CR, LF and TAB so the record stays on a single line.
                                            row=row.replaceAll("(?sim)[\r\n\t]", "");
                                            // Prefix a line break (the row is appended to the file) and turn ';' into ','.
                                            allcomm ="\r\n"+  row.replace(';',',');
                                    ;]]>
                                    </script>
                                </var-def>  
                                </empty>                            
                                <file action="append" type="text" path="${OutFile}">
                                    <var name="AllComm" />
                                </file>
                            </body>
                            </loop> 
                        </if>   
                        <else><!-- k1   -->
                        </else> 
                        </case>                     
                        </body>
                        </loop>
    
                    </if>   
                    <else><!-- No Valid -->
                    </else> 
                </case> 
                <!-- No Valid
                <case>
                <if  condition="${k0}>4">
                    <var name="STOP" /> 
                </if>   
                <else>  
                </else> 
                </case> --> 
            </if>   
            <else><!-- k0-->
            </else> 
            </case>             
            </body>
            </loop>     
        </var-def>  
    </config>
    
     
  • Scott R. Turner

    Scott R. Turner - 2013-02-17

    Probably ran out of memory. It's a known bug.

     

Get latest updates about Open Source Projects, Conferences and News.

Sign up for the SourceForge newsletter:





No, thanks