Menu

xpaths failing in 2.1

Help
2013-05-17
2013-05-23
  • Patrick Cailly

    Patrick Cailly - 2013-05-17

    I've tried several configs, and most of them return an empty XPath result.

    I've modified the google_image example; it runs, but does not collect anything.

     
  • Patrick Cailly

    Patrick Cailly - 2013-05-17

    <config xmlns="http://web-harvest.sourceforge.net/schema/2.1/core" xmlns:var="http://web-harvest.sourceforge.net/schema/2.1/var" xmlns:p="http://web-harvest.sourceforge.net/schema/2.1/param">

    <!--
    Modified google_image example: takes the start URL from a Google Images
    search for the keyword in "search", gathers image URLs by calling
    download-multipage-list, then writes every collected image to
    /home/metronome/harvest/google_images/ as ${search}_${i}.gif.
    -->

    <!-- pulls in an external config; presumably the file that defines download-multipage-list (TODO confirm) -->
    <include path="functions.xml"/>
    
    <!-- defines search keyword and start URL -->
    <set var="search" >platon</set>
    
    <!-- NOTE: as reported in this thread, under 2.1 the xpath below yields an empty result and no warning is raised -->
    
    <set var="url">
        <xpath expression="//noscript//a/@href[1]">
            <html-to-xml>
                <http url="http://images.google.com/images?q=${search}&amp;hl=en&amp;btnG=Search+Images"/>
            </html-to-xml>
        </xpath>
    </set>
    
    <!-- collects all image URLs: src attributes containing 'images?q=tbn', following the 'Next' link, with maxloops set to 5 -->
    <set var="imgLinks">
        <call name="download-multipage-list">
            <call-param name="pageUrl"><get var="url"/></call-param>
            <call-param name="nextXPath">//td[.='Next']/a/@href</call-param>
            <call-param name="itemXPath">//img[contains(@src, 'images?q=tbn')]/@src</call-param>
            <call-param name="maxloops">5</call-param>
        </call>
    </set>
    
    <!-- loops over the collected links with filter="unique", downloads each one (resolved against "url") and saves it as a binary file -->
    <loop item="link" index="i" filter="unique">
        <list>
            <get var="imgLinks"/>
        </list>
        <body>
            <file action="write" type="binary" path="/home/metronome/harvest/google_images/${search}_${i}.gif">
                <http url="${sys.fullUrl(url, link)}"/>
            </file>
        </body>
    </loop>
    

    </config>

     
  • Patrick Cailly

    Patrick Cailly - 2013-05-17

    <config xmlns="http://web-harvest.sourceforge.net/schema/2.1/core" xmlns:var="http://web-harvest.sourceforge.net/schema/2.1/var" xmlns:p="http://web-harvest.sourceforge.net/schema/2.1/param">
    <!--
    Download multi-page list of items.

    Each iteration fetches the page at pageUrl, converts it to XML, evaluates
    nextXPath on it to find the link to the following page, and evaluates
    itemXPath on it to produce the items of the current page. The loop keeps
    running while pageUrl is a non-empty string and is bounded by maxloops.

        @param pageUrl       - URL of starting page
        @param itemXPath     - XPath expression to obtain single item in the list
        @param nextXPath     - XPath expression to URL for the next page
        @param maxloops      - maximum number of pages downloaded
    
        @return list of all downloaded items
     -->
    <function name="download-multipage-list">
        <return>
            <while condition="${pageUrl.toString().length() != 0}" maxloops="${maxloops}" index="i">
                <!-- bookkeeping only: the body of "empty" is executed, but per the Web-Harvest manual it contributes no value to the loop result -->
                <empty>
                    <!-- current page, converted from HTML to XML; reused by both xpath evaluations below -->
                    <set var="content">
                        <html-to-xml>
                            <http url="${pageUrl}"/>
                        </html-to-xml>
                    </set>
    
                    <set var="nextLinkUrl">
                        <xpath expression="${nextXPath}">
                            <get var="content"/>
                        </xpath>
                    </set>
    
                    <!-- overwrites pageUrl with the next link resolved against the current page URL; the while condition tests this new value -->
                    <set var="pageUrl">
                        <template>${sys.fullUrl(pageUrl.toString(), nextLinkUrl.toString())}</template>
                    </set>
                </empty>
    
                <!-- items matched on the current page; this is the value each iteration produces -->
                <xpath expression="${itemXPath}">
                    <get var="content"/>
                </xpath>
            </while>
        </return>
    </function>
    

    </config>

     
  • Patrick Cailly

    Patrick Cailly - 2013-05-17

    This one works, though:


    <?xml version="1.0" encoding="UTF-8"?>

    <config xmlns="http://web-harvest.sourceforge.net/schema/2.1/core" xmlns:var="http://web-harvest.sourceforge.net/schema/2.1/var" xmlns:p="http://web-harvest.sourceforge.net/schema/2.1/param">

    <!--
    Single-page catalog example (reported as working): fetches one Yahoo
    Shopping listing page, selects the product entries with XPath and writes
    a catalog document to /home/metronome/harvest/canon/single_page_catalog.xml.
    -->

    <!-- collects the li elements whose class is "hproduct", "hproduct first" or "hproduct last"; each is handled as one product below -->
    <set var="products">    
        <xpath expression = '//li[@class="hproduct" or @class="hproduct first" or @class="hproduct last"]' >
            <html-to-xml>
                <http url = "http://shopping.yahoo.com/s:Digital%20Cameras:4168-Brand=Canon:browsename=Canon%20Digital%20Cameras:refspaceid=96303108;_ylt=AnHw0Qy0K6smBU.hHvYhlUO8cDMB;_ylu=X3oDMTBrcDE0a28wBF9zAzk2MzAzMTA4BHNlYwNibmF2" />
            </html-to-xml>
        </xpath>
    </set>
    
    <!-- iterates over all collected products; for each one an XQuery reads the elements with class title, desc and price
         and emits a product element holding their whitespace-normalized text, wrapped in literal catalog start/end tags -->
    <file action="write" path="/home/metronome/harvest/canon/single_page_catalog.xml" charset="UTF-8">
        <![CDATA[ <catalog> ]]>
        <loop item="item" index="i">
            <list><get var="products"/></list>
            <body>
                <xquery>
                    <xq-param name="item" type="node()"><get var="item"/></xq-param>
                    <xq-expression><![CDATA[
                            declare variable $item as node() external;
    
                            let $name := data($item//*[@class='title'])
                            let $desc := data($item//*[@class='desc'])
                            let $price := data($item//*[@class='price'])
                                return
                                    <product>
                                        <name>{normalize-space($name)}</name>
                                        <desc>{normalize-space($desc)}</desc>
                                        <price>{normalize-space($price)}</price>
                                    </product>
                    ]]></xq-expression>
                </xquery>
            </body>
        </loop>
        <![CDATA[ </catalog> ]]>
    </file>
    

    </config>

     
  • Patrick Cailly

    Patrick Cailly - 2013-05-22

    In fact, after many tests, the bug seems to be in

    <html-to-xml>

    HtmlCleaner generates incorrect XML documents,

    and then the parsing fails.

     
  • Patrick Cailly

    Patrick Cailly - 2013-05-22

    In fact, after many tests, the bug seems to be in

    <html-to-xml>

    HtmlCleaner generates incorrect XML documents,

    and then the parsing fails.

     
  • Patrick Cailly

    Patrick Cailly - 2013-05-23

    It is in fact the "namespacesaware" attribute of the html-to-xml processor, whose default value is:
    true in the 2.0 doc (manual)
    true in the 1.1 code (xsd)
    but false in the 2.0 code!!

     

Log in to post a comment.