From: <bi...@us...> - 2012-01-26 20:53:09
|
Revision: 3607 http://archive-access.svn.sourceforge.net/archive-access/?rev=3607&view=rev Author: binzino Date: 2012-01-26 20:53:00 +0000 (Thu, 26 Jan 2012) Log Message: ----------- Initial revision of NutchWAX custom version of parse-html plugin. Main diffs are not enforcing robots meta tag nor trying to process redirects. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java 
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/XMLCharacterRecognizer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2012-01-26 20:51:04 UTC (rev 3606) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -92,6 +92,7 @@ <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> <ant dir="parse-pdf2" target="deploy" /> + <ant dir="parse-html2" target="deploy" /> <ant dir="html-decorator" target="deploy" /> </target> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-html2" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="parse-html2" + name="NutchWAX Html Parse Plug-in" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="parse-html2.jar"> + <export name="*"/> + </library> + <library name="tagsoup-1.2.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.parse.html" + name="NutchWAX HTML Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.archive.nutchwax.parse.html.HtmlParser" + class="org.archive.nutchwax.parse.html.HtmlParser"> + <parameter name="contentType" value="text/html"/> + <parameter name="pathSuffix" value=""/> + </implementation> + + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,740 @@ +/* + * XXX ab...@ap...: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to + * avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $ + */ +package org.archive.nutchwax.parse.html; + +import java.util.Stack; + +import org.w3c.dom.Comment; +import org.w3c.dom.Document; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.Text; +import org.w3c.dom.CDATASection; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.ext.LexicalHandler; +/** + * This class takes SAX events (in addition to some extra events + * that SAX doesn't handle yet) and adds the result to a document + * or document fragment. + */ +public class DOMBuilder + implements ContentHandler, LexicalHandler +{ + + /** Root document */ + public Document m_doc; + + /** Current node */ + protected Node m_currentNode = null; + + /** First node of document fragment or null if not a DocumentFragment */ + public DocumentFragment m_docFrag = null; + + /** Vector of element nodes */ + protected Stack m_elemStack = new Stack(); + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document fragment. + * + * @param doc Root document + * @param node Current node + */ + public DOMBuilder(Document doc, Node node) + { + m_doc = doc; + m_currentNode = node; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document fragment. 
+ * + * @param doc Root document + * @param docFrag Document fragment + */ + public DOMBuilder(Document doc, DocumentFragment docFrag) + { + m_doc = doc; + m_docFrag = docFrag; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document. + * + * @param doc Root document + */ + public DOMBuilder(Document doc) + { + m_doc = doc; + } + + /** + * Get the root node of the DOM being created. This + * is either a Document or a DocumentFragment. + * + * @return The root document or document fragment if not null + */ + public Node getRootNode() + { + return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; + } + + /** + * Get the node currently being processed. + * + * @return the current node being processed + */ + public Node getCurrentNode() + { + return m_currentNode; + } + + /** + * Return null since there is no Writer for this class. + * + * @return null + */ + public java.io.Writer getWriter() + { + return null; + } + + /** + * Append a node to the current container. + * + * @param newNode New node to append + */ + protected void append(Node newNode) throws org.xml.sax.SAXException + { + + Node currentNode = m_currentNode; + + if (null != currentNode) + { + currentNode.appendChild(newNode); + + // System.out.println(newNode.getNodeName()); + } + else if (null != m_docFrag) + { + m_docFrag.appendChild(newNode); + } + else + { + boolean ok = true; + short type = newNode.getNodeType(); + + if (type == Node.TEXT_NODE) + { + String data = newNode.getNodeValue(); + + if ((null != data) && (data.trim().length() > 0)) + { + throw new org.xml.sax.SAXException("Warning: can't output text before document element! 
Ignoring..."); + } + + ok = false; + } + else if (type == Node.ELEMENT_NODE) + { + if (m_doc.getDocumentElement() != null) + { + throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!"); + } + } + + if (ok) + m_doc.appendChild(newNode); + } + } + + /** + * Receive an object for locating the origin of SAX document events. + * + * <p>SAX parsers are strongly encouraged (though not absolutely + * required) to supply a locator: if it does so, it must supply + * the locator to the application by invoking this method before + * invoking any of the other methods in the ContentHandler + * interface.</p> + * + * <p>The locator allows the application to determine the end + * position of any document-related event, even if the parser is + * not reporting an error. Typically, the application will + * use this information for reporting its own errors (such as + * character content that does not match an application's + * business rules). The information returned by the locator + * is probably not sufficient for use with a search engine.</p> + * + * <p>Note that the locator will return correct information only + * during the invocation of the events in this interface. The + * application should not attempt to use it at any other time.</p> + * + * @param locator An object that can return the location of + * any SAX document event. + * @see org.xml.sax.Locator + */ + public void setDocumentLocator(Locator locator) + { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of a document. + * + * <p>The SAX parser will invoke this method only once, before any + * other methods in this interface or in DTDHandler (except for + * setDocumentLocator).</p> + */ + public void startDocument() throws org.xml.sax.SAXException + { + + // No action for the moment. + } + + /** + * Receive notification of the end of a document. 
+ * + * <p>The SAX parser will invoke this method only once, and it will + * be the last method invoked during the parse. The parser shall + * not invoke this method until it has either abandoned parsing + * (because of an unrecoverable error) or reached the end of + * input.</p> + */ + public void endDocument() throws org.xml.sax.SAXException + { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of an element. + * + * <p>The Parser will invoke this method at the beginning of every + * element in the XML document; there will be a corresponding + * endElement() event for every startElement() event (even when the + * element is empty). All of the element's content will be + * reported, in order, before the corresponding endElement() + * event.</p> + * + * <p>If the element name has a namespace prefix, the prefix will + * still be attached. Note that the attribute list provided will + * contain only attributes with explicit values (specified or + * defaulted): #IMPLIED attributes will be omitted.</p> + * + * + * @param ns The namespace of the node + * @param localName The local part of the qualified name + * @param name The element name. + * @param atts The attributes attached to the element, if any. + * @see #endElement + * @see org.xml.sax.Attributes + */ + public void startElement( + String ns, String localName, String name, Attributes atts) + throws org.xml.sax.SAXException + { + + Element elem; + + // Note that the namespace-aware call must be used to correctly + // construct a Level 2 DOM, even for non-namespaced nodes. 
+ if ((null == ns) || (ns.length() == 0)) + elem = m_doc.createElementNS(null,name); + else + elem = m_doc.createElementNS(ns, name); + + append(elem); + + try + { + int nAtts = atts.getLength(); + + if (0 != nAtts) + { + for (int i = 0; i < nAtts; i++) + { + + //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) ); + // First handle a possible ID attribute + if (atts.getType(i).equalsIgnoreCase("ID")) + setIDAttribute(atts.getValue(i), elem); + + String attrNS = atts.getURI(i); + + if("".equals(attrNS)) + attrNS = null; // DOM represents no-namespace as null + + // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) + // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); + // Crimson won't let us set an xmlns: attribute on the DOM. + String attrQName = atts.getQName(i); + + // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace + if (attrQName.startsWith("xmlns:")) + attrNS = "http://www.w3.org/2000/xmlns/"; + + // ALWAYS use the DOM Level 2 call! + elem.setAttributeNS(attrNS,attrQName, atts.getValue(i)); + } + } + + // append(elem); + + m_elemStack.push(elem); + + m_currentNode = elem; + + // append(elem); + } + catch(java.lang.Exception de) + { + // de.printStackTrace(); + throw new org.xml.sax.SAXException(de); + } + + } + + /** + + + + * Receive notification of the end of an element. 
+ * + * <p>The SAX parser will invoke this method at the end of every + * element in the XML document; there will be a corresponding + * startElement() event for every endElement() event (even when the + * element is empty).</p> + * + * <p>If the element name has a namespace prefix, the prefix will + * still be attached to the name.</p> + * + * + * @param ns the namespace of the element + * @param localName The local part of the qualified name of the element + * @param name The element name + */ + public void endElement(String ns, String localName, String name) + throws org.xml.sax.SAXException + { + m_elemStack.pop(); + m_currentNode = m_elemStack.isEmpty() ? null : (Node)m_elemStack.peek(); + } + + /** + * Set an ID string to node association in the ID table. + * + * @param id The ID string. + * @param elem The element associated with the ID. + */ + public void setIDAttribute(String id, Element elem) + { + + // Do nothing. This method is meant to be overridden. + } + + /** + * Receive notification of character data. + * + * <p>The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity, so that the Locator + * provides useful information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * <p>Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating + * parsers must do so).</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. 
+ * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + if (m_inCData) + { + cdata(ch, start, length); + + return; + } + + String s = new String(ch, start, length); + Node childNode; + childNode = m_currentNode != null ? m_currentNode.getLastChild(): null; + if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){ + ((Text)childNode).appendData(s); + } + else{ + Text text = m_doc.createTextNode(s); + append(text); + } + } + + /** + * If available, when the disable-output-escaping attribute is used, + * output raw text without escaping. A PI will be inserted in front + * of the node with the name "xslt-next-is-raw" and a value of + * "formatter-to-dom". + * + * @param ch Array containing the characters + * @param start Index to start of characters in the array + * @param length Number of characters in the array + */ + public void charactersRaw(char ch[], int start, int length) + throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + + String s = new String(ch, start, length); + + append(m_doc.createProcessingInstruction("xslt-next-is-raw", + "formatter-to-dom")); + append(m_doc.createTextNode(s)); + } + + /** + * Report the beginning of an entity. + * + * The start and end of the document entity are not reported. + * The start and end of the external DTD subset are reported + * using the pseudo-name "[dtd]". All other events must be + * properly nested within start/end entity events. + * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%'. 
+ * @see #endEntity + * @see org.xml.sax.ext.DeclHandler#internalEntityDecl + * @see org.xml.sax.ext.DeclHandler#externalEntityDecl + */ + public void startEntity(String name) throws org.xml.sax.SAXException + { + + // Almost certainly the wrong behavior... + // entityReference(name); + } + + /** + * Report the end of an entity. + * + * @param name The name of the entity that is ending. + * @see #startEntity + */ + public void endEntity(String name) throws org.xml.sax.SAXException{} + + /** + * Receive notification of an entity reference. + * + * @param name name of the entity reference + */ + public void entityReference(String name) throws org.xml.sax.SAXException + { + append(m_doc.createEntityReference(name)); + } + + /** + * Receive notification of ignorable whitespace in element content. + * + * <p>Validating Parsers must use this method to report each chunk + * of ignorable whitespace (see the W3C XML 1.0 recommendation, + * section 2.10): non-validating parsers may also use this method + * if they are capable of parsing and using content models.</p> + * + * <p>SAX parsers may return all contiguous whitespace in a single + * chunk, or they may split it into several chunks; however, all of + * the characters in any single event must come from the same + * external entity, so that the Locator provides useful + * information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #characters + */ + public void ignorableWhitespace(char ch[], int start, int length) + throws org.xml.sax.SAXException + { + if(isOutsideDocElem()) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + append(m_doc.createTextNode(s)); + } + + /** + * Tell if the current node is outside the document element. 
+ * + * @return true if the current node is outside the document element. + */ + private boolean isOutsideDocElem() + { + return (null == m_docFrag) && m_elemStack.size() == 0 && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); + } + + /** + * Receive notification of a processing instruction. + * + * <p>The Parser will invoke this method once for each processing + * instruction found: note that processing instructions may occur + * before or after the main document element.</p> + * + * <p>A SAX parser should never report an XML declaration (XML 1.0, + * section 2.8) or a text declaration (XML 1.0, section 4.3.1) + * using this method.</p> + * + * @param target The processing instruction target. + * @param data The processing instruction data, or null if + * none was supplied. + */ + public void processingInstruction(String target, String data) + throws org.xml.sax.SAXException + { + append(m_doc.createProcessingInstruction(target, data)); + } + + /** + * Report an XML comment anywhere in the document. + * + * This callback will be used for comments inside or outside the + * document element, including comments in the external DTD + * subset (if read). + * + * @param ch An array holding the characters in the comment. + * @param start The starting position in the array. + * @param length The number of characters to use from the array. + */ + public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException + { + // tagsoup sometimes submits invalid values here + if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) return; + append(m_doc.createComment(new String(ch, start, length))); + } + + /** Flag indicating that we are processing a CData section */ + protected boolean m_inCData = false; + + /** + * Report the start of a CDATA section. 
+ * + * @see #endCDATA + */ + public void startCDATA() throws org.xml.sax.SAXException + { + m_inCData = true; + append(m_doc.createCDATASection("")); + } + + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + public void endCDATA() throws org.xml.sax.SAXException + { + m_inCData = false; + } + + /** + * Receive notification of cdata. + * + * <p>The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity, so that the Locator + * provides useful information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * <p>Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating + * parsers must do so).</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + // XXX ab...@ap...: modified from the original, to accomodate TagSoup. + Node n = m_currentNode.getLastChild(); + if (n instanceof CDATASection) + ((CDATASection)n).appendData(s); + else if (n instanceof Comment) + ((Comment)n).appendData(s); + } + + /** + * Report the start of DTD declarations, if any. + * + * Any declarations are assumed to be in the internal subset + * unless otherwise indicated. + * + * @param name The document type name. 
+   * @param publicId The declared public identifier for the
+   *        external DTD subset, or null if none was declared.
+   * @param systemId The declared system identifier for the
+   *        external DTD subset, or null if none was declared.
+   * @see #endDTD
+   * @see #startEntity
+   */
+  public void startDTD(String name, String publicId, String systemId)
+          throws org.xml.sax.SAXException
+  {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Report the end of DTD declarations.
+   *
+   * @see #startDTD
+   */
+  public void endDTD() throws org.xml.sax.SAXException
+  {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Begin the scope of a prefix-URI Namespace mapping.
+   *
+   * <p>The information from this event is not necessary for
+   * normal Namespace processing: the SAX XML reader will
+   * automatically replace prefixes for element and attribute
+   * names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).</p>
+   *
+   * <p>There are cases, however, when applications need to
+   * use prefixes in character data or in attribute values,
+   * where they cannot safely be expanded automatically; the
+   * start/endPrefixMapping event supplies the information
+   * to the application to expand prefixes in those contexts
+   * itself, if necessary.</p>
+   *
+   * <p>Note that start/endPrefixMapping events are not
+   * guaranteed to be properly nested relative to each other:
+   * all startPrefixMapping events will occur before the
+   * corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event,
+   * but their order is not guaranteed.</p>
+   *
+   * <p>This implementation does nothing with the event; the only
+   * code in the body is commented out.</p>
+   *
+   * @param prefix The Namespace prefix being declared.
+   * @param uri The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+          throws org.xml.sax.SAXException
+  {
+
+    /*
+    // Not sure if this is needed or wanted
+    // Also, it fails in the stree.
+    if((null != m_currentNode)
+       && (m_currentNode.getNodeType() == Node.ELEMENT_NODE))
+    {
+      String qname;
+      if(((null != prefix) && (prefix.length() == 0))
+         || (null == prefix))
+        qname = "xmlns";
+      else
+        qname = "xmlns:"+prefix;
+
+      Element elem = (Element)m_currentNode;
+      String val = elem.getAttribute(qname); // Obsolete, should be DOM2...?
+      if(val == null)
+      {
+        elem.setAttributeNS("http://www.w3.org/XML/1998/namespace",
+                            qname, uri);
+      }
+    }
+    */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   *
+   * <p>See startPrefixMapping for details.  This event will
+   * always occur after the corresponding endElement event,
+   * but the order of endPrefixMapping events is not otherwise
+   * guaranteed.</p>
+   *
+   * <p>This implementation ignores the event.</p>
+   *
+   * @param prefix The prefix that was being mapped.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{}
+
+  /**
+   * Receive notification of a skipped entity.
+   *
+   * <p>The Parser will invoke this method once for each entity
+   * skipped.  Non-validating processors may skip entities if they
+   * have not seen the declarations (because, for example, the
+   * entity was declared in an external DTD subset).  All processors
+   * may skip external entities, depending on the values of the
+   * http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities
+   * properties.</p>
+   *
+   * <p>This implementation ignores the event.</p>
+   *
+   * @param name The name of the skipped entity.  If it is a
+   *        parameter entity, the name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException{}
+}

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java 2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,419 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.parse.html;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.util.Collection;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Stack;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NodeWalker;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.*;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ *
+ * This class holds a few utility methods for pulling content out of
+ * DOM nodes, such as getOutlinks, getText, etc.
+ * + */ +public class DOMContentUtils { + + public static class LinkParams { + public String elName; + public String attrName; + public int childLen; + + public LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap linkParams = new HashMap(); + private Configuration conf; + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection<String> forceTags = new ArrayList<String>(1); + + this.conf = conf; + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", true)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) { + if ( ! forceTags.contains(ignoreTags[i]) ) + linkParams.remove(ignoreTags[i]); + } + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, + * and will append all the content text found beneath the DOM node to + * the <code>StringBuffer</code>. 
+ * + * <p> + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will + * be aborted and the <code>StringBuffer</code> will not contain + * any text encountered after a nested anchor is found. + * + * <p> + * + * @return true if nested anchors were found + */ + public boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + + /** + * This is a convinience method, equivalent to {@link + * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * + */ + public void getText(StringBuffer sb, Node node) { + getText(sb, node, false); + } + + // returns true if abortOnNestedAnchors is true and we find nested + // anchors + private boolean getTextHelper(StringBuffer sb, Node node, + boolean abortOnNestedAnchors, + int anchorDepth) { + boolean abort = false; + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("script".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if ("style".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { + anchorDepth++; + if (anchorDepth > 1) { + abort = true; + break; + } + } + if (nodeType == Node.COMMENT_NODE) { + walker.skipChildren(); + } + if (nodeType == Node.TEXT_NODE) { + // cleanup and trim the value + String text = currentNode.getNodeValue(); + text = text.replaceAll("\\s+", " "); + text = text.trim(); + if (text.length() > 0) { + if (sb.length() > 0) sb.append(' '); + sb.append(text); + } + } + } + + return abort; + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, + * and will append the content text found beneath the first + * <code>title</code> node to the <code>StringBuffer</code>. 
+ * + * @return true if a title node was found, false otherwise + */ + public boolean getTitle(StringBuffer sb, Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return false; + } + + if (nodeType == Node.ELEMENT_NODE) { + if ("title".equalsIgnoreCase(nodeName)) { + getText(sb, currentNode); + return true; + } + } + } + + return false; + } + + /** If Node contains a BASE tag then it's HREF is returned. */ + public URL getBase(Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + // is this node a BASE tag? + if (nodeType == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return null; + } + + if ("base".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + for (int i= 0; i < attrs.getLength(); i++ ) { + Node attr = attrs.item(i); + if ("href".equalsIgnoreCase(attr.getNodeName())) { + try { + return new URL(attr.getNodeValue()); + } catch (MalformedURLException e) {} + } + } + } + } + } + + // no. + return null; + } + + + private boolean hasOnlyWhiteSpace(Node node) { + String val= node.getNodeValue(); + for (int i= 0; i < val.length(); i++) { + if (!Character.isWhitespace(val.charAt(i))) + return false; + } + return true; + } + + // this only covers a few cases of empty links that are symptomatic + // of nekohtml's DOM-fixup process... 
+ private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { + if (childLen == 0) { + // this has no inner structure + if (params.childLen == 0) return false; + else return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // single nested link + return true; + + } else if (childLen == 2) { + + Node c0= children.item(0); + Node c1= children.item(1); + + if ((c0.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c0.getNodeName())) + && (c1.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c1) ) { + // single link followed by whitespace node + return true; + } + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c0) ) { + // whitespace node followed by single link + return true; + } + + } else if (childLen == 3) { + Node c0= children.item(0); + Node c1= children.item(1); + Node c2= children.item(2); + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2) ) { + // single link surrounded by whitespace nodes + return true; + } + } + + return false; + } + + /** + * Handles cases where the url param information is encoded into the base + * url as opposed to the target. + * <p> + * If the taget contains params (i.e. ';xxxx') information then the target + * params information is assumed to be correct and any base params information + * is ignored. If the base contains params information but the tareget does + * not, then the params information is moved to the target allowing it to be + * correctly determined by the java.net.URL class. + * + * @param base The base URL. 
+ * @param target The target path from the base URL. + * + * @return URL A URL with the params information correctly encoded. + * + * @throws MalformedURLException If the url is not a well formed URL. + */ + private URL fixEmbeddedParams(URL base, String target) + throws MalformedURLException{ + + // the target contains params information or the base doesn't then no + // conversion necessary, return regular URL + if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { + return new URL(base, target); + } + + // get the base url and it params information + String baseURL = base.toString(); + int startParams = baseURL.indexOf(';'); + String params = baseURL.substring(startParams); + + // if the target has a query string then put the params information after + // any path but before the query string, otherwise just append to the path + int startQS = target.indexOf('?'); + if (startQS >= 0) { + target = target.substring(0, startQS) + params + + target.substring(startQS); + } + else { + target += params; + } + + return new URL(base, target); + } + + /** + * This method finds all anchors below the supplied DOM + * <code>node</code>, and creates appropriate {@link Outlink} + * records for each (relative to the supplied <code>base</code> + * URL), and adds them to the <code>outlinks</code> {@link + * ArrayList}. + * + * <p> + * + * Links without inner structure (tags, text, etc) are discarded, as + * are links which contain only single nested links and empty text + * nodes (this is a common DOM-fixup artifact, at least with + * nekohtml). + */ + public void getOutlinks(URL base, ArrayList outlinks, + Node node) { + + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + NodeList children = currentNode.getChildNodes(); + int childLen = (children != null) ? 
children.getLength() : 0; + + if (nodeType == Node.ELEMENT_NODE) { + + nodeName = nodeName.toLowerCase(); + LinkParams params = (LinkParams)linkParams.get(nodeName); + if (params != null) { + if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { + + StringBuffer linkText = new StringBuffer(); + getText(linkText, currentNode, true); + + NamedNodeMap attrs = currentNode.getAttributes(); + String target = null; + boolean noFollow = false; + boolean post = false; + for (int i= 0; i < attrs.getLength(); i++ ) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); + } else if ("rel".equalsIgnoreCase(attrName) && + "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; + } else if ("method".equalsIgnoreCase(attrName) && + "post".equalsIgnoreCase(attr.getNodeValue())) { + post = true; + } + } + if (target != null && !noFollow && !post) + try { + + URL url = (base.toString().indexOf(';') > 0) ? + fixEmbeddedParams(base, target) : new URL(base, target); + outlinks.add(new Outlink(url.toString(), + linkText.toString().trim())); + } catch (MalformedURLException e) { + // don't care + } + } + // this should not have any children, skip them + if (params.childLen == 0) continue; + } + } + } + } + +} + Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,213 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.parse.html; + +import java.net.URL; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.w3c.dom.*; + +/** + * Class for parsing META Directives from DOM trees. This class + * handles specifically Robots META directives (all, none, nofollow, + * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache + * instructions. All meta directives are stored in a HTMLMetaTags instance. + */ +public class HTMLMetaProcessor { + + /** + * Utility class with indicators for the robots directives "noindex" + * and "nofollow", and HTTP-EQUIV/no-cache + */ + + /** + * Sets the indicators in <code>robotsMeta</code> to appropriate + * values, based on any META tags found under the given + * <code>node</code>. 
+ */ + public static final void getMetaTags ( + HTMLMetaTags metaTags, Node node, URL currURL) { + + metaTags.reset(); + getMetaTagsHelper(metaTags, node, currURL); + } + + private static final void getMetaTagsHelper( + HTMLMetaTags metaTags, Node node, URL currURL) { + + if (node.getNodeType() == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(node.getNodeName())) { + // META tags should not be under body + return; + } + + if ("meta".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node nameNode = null; + Node equivNode = null; + Node contentNode = null; + // Retrieves name, http-equiv and content attribues + for (int i=0; i<attrs.g... [truncated message content] |