[Archive-access-cvs] SF.net SVN: archive-access:[3607] tags/nutchwax-0_13-JIRA-WAX-75/archive/ src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3607
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3607&view=rev
Author:   binzino
Date:     2012-01-26 20:53:00 +0000 (Thu, 26 Jan 2012)
Log Message:
-----------
Initial revision of NutchWAX custom version of parse-html plugin.  Main diffs are not enforcing robots meta tag nor trying to process redirects.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml

Added Paths:
-----------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/XMLCharacterRecognizer.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml	2012-01-26 20:51:04 UTC (rev 3606)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml	2012-01-26 20:53:00 UTC (rev 3607)
@@ -92,6 +92,7 @@
      <ant dir="scoring-nutchwax" target="deploy" />
      <ant dir="urlfilter-nutchwax" target="deploy" />
      <ant dir="parse-pdf2" target="deploy" />
+     <ant dir="parse-html2" target="deploy" />
      <ant dir="html-decorator" target="deploy" />
 
   </target>

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml	2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-html2" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar
===================================================================
(Binary files differ)


Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt	2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml	2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-html2"
+   name="NutchWAX Html Parse Plug-in"
+   version="1.0.0"
+   provider-name="archive.org">
+
+   <runtime>
+      <library name="parse-html2.jar">
+         <export name="*"/>
+      </library>
+      <library name="tagsoup-1.2.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.archive.nutchwax.parse.html"
+              name="NutchWAX HTML Parser"
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.archive.nutchwax.parse.html.HtmlParser"
+                      class="org.archive.nutchwax.parse.html.HtmlParser">
+        <parameter name="contentType" value="text/html"/>
+        <parameter name="pathSuffix" value=""/>
+      </implementation>
+
+   </extension>
+
+</plugin>

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java	2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,740 @@
+/*
+ * XXX ab...@ap...: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.archive.nutchwax.parse.html;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+/**
+ * This class takes SAX events (in addition to some extra events
+ * that SAX doesn't handle yet) and adds the result to a document
+ * or document fragment.
+ */
+public class DOMBuilder
+        implements ContentHandler, LexicalHandler
+{
+
+  /** Root document          */
+  public Document m_doc;
+
+  /** Current node           */
+  protected Node m_currentNode = null;
+
+  /** First node of document fragment or null if not a DocumentFragment     */
+  public DocumentFragment m_docFrag = null;
+
+  /** Stack of element nodes           */
+  protected Stack m_elemStack = new Stack();
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * as children of the given node.
+   *
+   * @param doc Root document
+   * @param node Current node
+   */
+  public DOMBuilder(Document doc, Node node)
+  {
+    m_doc = doc;
+    m_currentNode = node;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * to the document fragment.
+   *
+   * @param doc Root document
+   * @param docFrag Document fragment
+   */
+  public DOMBuilder(Document doc, DocumentFragment docFrag)
+  {
+    m_doc = doc;
+    m_docFrag = docFrag;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes
+   * to the document.
+   *
+   * @param doc Root document
+   */
+  public DOMBuilder(Document doc)
+  {
+    m_doc = doc;
+  }
+
+  /**
+   * Get the root node of the DOM being created.  This
+   * is either a Document or a DocumentFragment.
+   *
+   * @return The root document or document fragment if not null
+   */
+  public Node getRootNode()
+  {
+    return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
+  }
+
+  /**
+   * Get the node currently being processed.
+   *
+   * @return the current node being processed
+   */
+  public Node getCurrentNode()
+  {
+    return m_currentNode;
+  }
+
+  /**
+   * Return null since there is no Writer for this class.
+   *
+   * @return null
+   */
+  public java.io.Writer getWriter()
+  {
+    return null;
+  }
+
+  /**
+   * Append a node to the current container.
+   *
+   * @param newNode New node to append
+   */
+  protected void append(Node newNode) throws org.xml.sax.SAXException
+  {
+
+    Node currentNode = m_currentNode;
+
+    if (null != currentNode)
+    {
+      currentNode.appendChild(newNode);
+
+      // System.out.println(newNode.getNodeName());
+    }
+    else if (null != m_docFrag)
+    {
+      m_docFrag.appendChild(newNode);
+    }
+    else
+    {
+      boolean ok = true;
+      short type = newNode.getNodeType();
+
+      if (type == Node.TEXT_NODE)
+      {
+        String data = newNode.getNodeValue();
+
+        if ((null != data) && (data.trim().length() > 0))
+        {
+          throw new org.xml.sax.SAXException("Warning: can't output text before document element!  Ignoring...");
+        }
+
+        ok = false;
+      }
+      else if (type == Node.ELEMENT_NODE)
+      {
+        if (m_doc.getDocumentElement() != null)
+        {
+          throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!");
+        }
+      }
+
+      if (ok)
+        m_doc.appendChild(newNode);
+    }
+  }
+
+  /**
+   * Receive an object for locating the origin of SAX document events.
+   *
+   * <p>SAX parsers are strongly encouraged (though not absolutely
+   * required) to supply a locator: if it does so, it must supply
+   * the locator to the application by invoking this method before
+   * invoking any of the other methods in the ContentHandler
+   * interface.</p>
+   *
+   * <p>The locator allows the application to determine the end
+   * position of any document-related event, even if the parser is
+   * not reporting an error.  Typically, the application will
+   * use this information for reporting its own errors (such as
+   * character content that does not match an application's
+   * business rules).  The information returned by the locator
+   * is probably not sufficient for use with a search engine.</p>
+   *
+   * <p>Note that the locator will return correct information only
+   * during the invocation of the events in this interface.  The
+   * application should not attempt to use it at any other time.</p>
+   *
+   * @param locator An object that can return the location of
+   *                any SAX document event.
+   * @see org.xml.sax.Locator
+   */
+  public void setDocumentLocator(Locator locator)
+  {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of a document.
+   *
+   * <p>The SAX parser will invoke this method only once, before any
+   * other methods in this interface or in DTDHandler (except for
+   * setDocumentLocator).</p>
+   */
+  public void startDocument() throws org.xml.sax.SAXException
+  {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the end of a document.
+   *
+   * <p>The SAX parser will invoke this method only once, and it will
+   * be the last method invoked during the parse.  The parser shall
+   * not invoke this method until it has either abandoned parsing
+   * (because of an unrecoverable error) or reached the end of
+   * input.</p>
+   */
+  public void endDocument() throws org.xml.sax.SAXException
+  {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of an element.
+   *
+   * <p>The Parser will invoke this method at the beginning of every
+   * element in the XML document; there will be a corresponding
+   * endElement() event for every startElement() event (even when the
+   * element is empty). All of the element's content will be
+   * reported, in order, before the corresponding endElement()
+   * event.</p>
+   *
+   * <p>If the element name has a namespace prefix, the prefix will
+   * still be attached.  Note that the attribute list provided will
+   * contain only attributes with explicit values (specified or
+   * defaulted): #IMPLIED attributes will be omitted.</p>
+   *
+   *
+   * @param ns The namespace of the node
+   * @param localName The local part of the qualified name
+   * @param name The element name.
+   * @param atts The attributes attached to the element, if any.
+   * @see #endElement
+   * @see org.xml.sax.Attributes
+   */
+  public void startElement(
+          String ns, String localName, String name, Attributes atts)
+            throws org.xml.sax.SAXException
+  {
+
+    Element elem;
+
+	// Note that the namespace-aware call must be used to correctly
+	// construct a Level 2 DOM, even for non-namespaced nodes.
+    if ((null == ns) || (ns.length() == 0))
+      elem = m_doc.createElementNS(null,name);
+    else
+      elem = m_doc.createElementNS(ns, name);
+
+    append(elem);
+
+    try
+    {
+      int nAtts = atts.getLength();
+
+      if (0 != nAtts)
+      {
+        for (int i = 0; i < nAtts; i++)
+        {
+
+          //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) );
+          // First handle a possible ID attribute
+          if (atts.getType(i).equalsIgnoreCase("ID"))
+            setIDAttribute(atts.getValue(i), elem);
+
+          String attrNS = atts.getURI(i);
+
+          if("".equals(attrNS))
+            attrNS = null; // DOM represents no-namespace as null
+
+          // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i)
+          //                   +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
+          // Crimson won't let us set an xmlns: attribute on the DOM.
+          String attrQName = atts.getQName(i);
+
+          // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace
+          if (attrQName.startsWith("xmlns:"))
+            attrNS = "http://www.w3.org/2000/xmlns/";
+
+          // ALWAYS use the DOM Level 2 call!
+          elem.setAttributeNS(attrNS,attrQName, atts.getValue(i));
+        }
+      }
+
+      // append(elem);
+
+      m_elemStack.push(elem);
+
+      m_currentNode = elem;
+
+      // append(elem);
+    }
+    catch(java.lang.Exception de)
+    {
+      // de.printStackTrace();
+      throw new org.xml.sax.SAXException(de);
+    }
+
+  }
+
+  /**
+
+
+
+   * Receive notification of the end of an element.
+   *
+   * <p>The SAX parser will invoke this method at the end of every
+   * element in the XML document; there will be a corresponding
+   * startElement() event for every endElement() event (even when the
+   * element is empty).</p>
+   *
+   * <p>If the element name has a namespace prefix, the prefix will
+   * still be attached to the name.</p>
+   *
+   *
+   * @param ns the namespace of the element
+   * @param localName The local part of the qualified name of the element
+   * @param name The element name
+   */
+  public void endElement(String ns, String localName, String name)
+          throws org.xml.sax.SAXException
+  {
+    m_elemStack.pop();
+    m_currentNode = m_elemStack.isEmpty() ? null : (Node)m_elemStack.peek();
+  }
+
+  /**
+   * Set an ID string to node association in the ID table.
+   *
+   * @param id The ID string.
+   * @param elem The element associated with the ID.
+   */
+  public void setIDAttribute(String id, Element elem)
+  {
+
+    // Do nothing. This method is meant to be overridden.
+  }
+
+  /**
+   * Receive notification of character data.
+   *
+   * <p>The Parser will call this method to report each chunk of
+   * character data.  SAX parsers may return all contiguous character
+   * data in a single chunk, or they may split it into several
+   * chunks; however, all of the characters in any single event
+   * must come from the same external entity, so that the Locator
+   * provides useful information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * <p>Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating
+   * parsers must do so).</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    if(isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return;  // avoid DOM006 Hierarchy request error
+
+    if (m_inCData)
+    {
+      cdata(ch, start, length);
+
+      return;
+    }
+
+    String s = new String(ch, start, length);
+    Node childNode;
+    childNode =  m_currentNode != null ? m_currentNode.getLastChild(): null;
+    if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){
+       ((Text)childNode).appendData(s);
+    }
+    else{
+       Text text = m_doc.createTextNode(s);
+       append(text);
+    }
+  }
+
+  /**
+   * If available, when the disable-output-escaping attribute is used,
+   * output raw text without escaping.  A PI will be inserted in front
+   * of the node with the name "xslt-next-is-raw" and a value of
+   * "formatter-to-dom".
+   *
+   * @param ch Array containing the characters
+   * @param start Index to start of characters in the array
+   * @param length Number of characters in the array
+   */
+  public void charactersRaw(char ch[], int start, int length)
+          throws org.xml.sax.SAXException
+  {
+    final boolean droppable = isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length);
+    if (!droppable)
+    {
+      // Build the string first so a bad range fails before the DOM is touched.
+      final String raw = new String(ch, start, length);
+      // The marker PI is appended first, the raw text node right after it.
+      append(m_doc.createProcessingInstruction("xslt-next-is-raw", "formatter-to-dom"));
+      append(m_doc.createTextNode(raw));
+    }
+  }
+
+  /**
+   * Report the beginning of an entity.
+   *
+   * The start and end of the document entity are not reported.
+   * The start and end of the external DTD subset are reported
+   * using the pseudo-name "[dtd]".  All other events must be
+   * properly nested within start/end entity events.
+   *
+   * @param name The name of the entity.  If it is a parameter
+   *        entity, the name will begin with '%'.
+   * @see #endEntity
+   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+   */
+  public void startEntity(String name) throws org.xml.sax.SAXException
+  {
+    // Deliberately a no-op; the entityReference() call below is left disabled.
+    // Almost certainly the wrong behavior...
+    // entityReference(name);
+  }
+
+  /**
+   * Report the end of an entity.
+   *
+   * @param name The name of the entity that is ending.
+   * @see #startEntity
+   */
+  public void endEntity(String name) throws org.xml.sax.SAXException{ /* no-op: nothing is recorded when an entity ends */ }
+
+  /**
+   * Receive notification of an entityReference.
+   *
+   * @param name name of the entity reference
+   */
+  public void entityReference(String name) throws org.xml.sax.SAXException
+  {
+    append(m_doc.createEntityReference(name));  // new EntityReference node from m_doc, passed to append()
+  }
+
+  /**
+   * Receive notification of ignorable whitespace in element content.
+   *
+   * <p>Validating Parsers must use this method to report each chunk
+   * of ignorable whitespace (see the W3C XML 1.0 recommendation,
+   * section 2.10): non-validating parsers may also use this method
+   * if they are capable of parsing and using content models.</p>
+   *
+   * <p>SAX parsers may return all contiguous whitespace in a single
+   * chunk, or they may split it into several chunks; however, all of
+   * the characters in any single event must come from the same
+   * external entity, so that the Locator provides useful
+   * information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #characters
+   */
+  public void ignorableWhitespace(char ch[], int start, int length)
+          throws org.xml.sax.SAXException
+  {
+    // Outside the document element a text node would trigger the
+    // DOM006 hierarchy request error, so the whitespace is discarded.
+    if (!isOutsideDocElem())
+    {
+      append(m_doc.createTextNode(new String(ch, start, length)));
+    }
+  }
+
+  /**
+   * Tell whether the builder is currently positioned outside the document element.
+   * @return true when no fragment is in use, no element is open, and the current node is unset or is the Document.
+   */
+   private boolean isOutsideDocElem()
+   {
+      if (null != m_docFrag || m_elemStack.size() != 0) return false;
+      return null == m_currentNode || Node.DOCUMENT_NODE == m_currentNode.getNodeType();
+   }
+
+  /**
+   * Receive notification of a processing instruction.
+   *
+   * <p>The Parser will invoke this method once for each processing
+   * instruction found: note that processing instructions may occur
+   * before or after the main document element.</p>
+   *
+   * <p>A SAX parser should never report an XML declaration (XML 1.0,
+   * section 2.8) or a text declaration (XML 1.0, section 4.3.1)
+   * using this method.</p>
+   *
+   * @param target The processing instruction target.
+   * @param data The processing instruction data, or null if
+   *        none was supplied.
+   */
+  public void processingInstruction(String target, String data) throws org.xml.sax.SAXException
+  {
+    final Node pi = m_doc.createProcessingInstruction(target, data);  // data may be null
+    append(pi);
+  }
+
+  /**
+   * Report an XML comment anywhere in the document.
+   *
+   * This callback will be used for comments inside or outside the
+   * document element, including comments in the external DTD
+   * subset (if read).
+   *
+   * @param ch An array holding the characters in the comment.
+   * @param start The starting position in the array.
+   * @param length The number of characters to use from the array.
+   */
+  public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    // TagSoup sometimes submits invalid ranges; one ending exactly at ch.length is still valid.
+    if (ch == null || start < 0 || length < 0 || length > ch.length - start) return;
+    append(m_doc.createComment(new String(ch, start, length)));
+  }
+
+  /** True between startCDATA() and endCDATA(); while set, characters() forwards its data to cdata(). */
+  protected boolean m_inCData = false;
+
+  /**
+   * Report the start of a CDATA section.
+   *
+   * @see #endCDATA
+   */
+  public void startCDATA() throws org.xml.sax.SAXException
+  {
+    m_inCData = true;                       // from now on characters() forwards to cdata()
+    append(m_doc.createCDATASection(""));   // starts empty; cdata() appends data to a trailing CDATASection
+  }
+
+  /**
+   * Report the end of a CDATA section.
+   *
+   * @see #startCDATA
+   */
+  public void endCDATA() throws org.xml.sax.SAXException
+  {
+    m_inCData = false;  // characters() stops forwarding to cdata()
+  }
+
+  /**
+   * Receive notification of cdata.
+   *
+   * <p>The Parser will call this method to report each chunk of
+   * character data.  SAX parsers may return all contiguous character
+   * data in a single chunk, or they may split it into several
+   * chunks; however, all of the characters in any single event
+   * must come from the same external entity, so that the Locator
+   * provides useful information.</p>
+   *
+   * <p>The application must not attempt to read from the array
+   * outside of the specified range.</p>
+   *
+   * <p>Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating
+   * parsers must do so).</p>
+   *
+   * @param ch The characters from the XML document.
+   * @param start The start position in the array.
+   * @param length The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException
+  {
+    if(isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return;  // avoid DOM006 Hierarchy request error
+    String s = new String(ch, start, length);
+
+    // XXX ab...@ap...: modified from the original, to accommodate TagSoup.
+    // m_currentNode may be null (characters() guards for it too); treat that as "no last child".
+    Node n = (m_currentNode != null) ? m_currentNode.getLastChild() : null;
+    if (n instanceof CDATASection)
+      ((CDATASection)n).appendData(s);   // extend the trailing CDATA section
+    else if (n instanceof Comment)
+      ((Comment)n).appendData(s);        // last child is a comment: append there instead
+  }
+
+  /**
+   * Report the start of DTD declarations, if any.
+   *
+   * Any declarations are assumed to be in the internal subset
+   * unless otherwise indicated.
+   *
+   * @param name The document type name.
+   * @param publicId The declared public identifier for the
+   *        external DTD subset, or null if none was declared.
+   * @param systemId The declared system identifier for the
+   *        external DTD subset, or null if none was declared.
+   * @see #endDTD
+   * @see #startEntity
+   */
+  public void startDTD(String name, String publicId, String systemId)
+          throws org.xml.sax.SAXException
+  {
+    // The DTD name and identifiers are ignored; nothing is added to the DOM.
+    // Do nothing for now.
+  }
+
+  /**
+   * Report the end of DTD declarations.
+   *
+   * @see #startDTD
+   */
+  public void endDTD() throws org.xml.sax.SAXException
+  {
+    // Counterpart of startDTD(); likewise has no effect.
+    // Do nothing for now.
+  }
+
+  /**
+   * Begin the scope of a prefix-URI Namespace mapping.
+   *
+   * <p>The information from this event is not necessary for
+   * normal Namespace processing: the SAX XML reader will
+   * automatically replace prefixes for element and attribute
+   * names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).</p>
+   *
+   * <p>There are cases, however, when applications need to
+   * use prefixes in character data or in attribute values,
+   * where they cannot safely be expanded automatically; the
+   * start/endPrefixMapping event supplies the information
+   * to the application to expand prefixes in those contexts
+   * itself, if necessary.</p>
+   *
+   * <p>Note that start/endPrefixMapping events are not
+   * guaranteed to be properly nested relative to each-other:
+   * all startPrefixMapping events will occur before the
+   * corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event,
+   * but their order is not guaranteed.</p>
+   *
+   * @param prefix The Namespace prefix being declared.
+   * @param uri The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+          throws org.xml.sax.SAXException
+  {
+    // No-op: prefix and uri are ignored; the code below is disabled (commented out).
+    /*
+    // Not sure if this is needed or wanted
+    // Also, it fails in the stree.
+    if((null != m_currentNode)
+       && (m_currentNode.getNodeType() == Node.ELEMENT_NODE))
+    {
+      String qname;
+      if(((null != prefix) && (prefix.length() == 0))
+         || (null == prefix))
+        qname = "xmlns";
+      else
+        qname = "xmlns:"+prefix;
+
+      Element elem = (Element)m_currentNode;
+      String val = elem.getAttribute(qname); // Obsolete, should be DOM2...?
+      if(val == null)
+      {
+        elem.setAttributeNS("http://www.w3.org/XML/1998/namespace",
+                            qname, uri);
+      }
+    }
+    */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   *
+   * <p>See startPrefixMapping for details.  This event will
+   * always occur after the corresponding endElement event,
+   * but the order of endPrefixMapping events is not otherwise
+   * guaranteed.</p>
+   *
+   * @param prefix The prefix that was being mapped.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{ /* no-op: prefix mappings are not tracked here */ }
+
+  /**
+   * Receive notification of a skipped entity.
+   *
+   * <p>The Parser will invoke this method once for each entity
+   * skipped.  Non-validating processors may skip entities if they
+   * have not seen the declarations (because, for example, the
+   * entity was declared in an external DTD subset).  All processors
+   * may skip external entities, depending on the values of the
+   * http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities
+   * properties.</p>
+   *
+   * @param name The name of the skipped entity.  If it is a
+   *        parameter entity, the name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException{ /* no-op: skipped entities leave no trace in the DOM */ }
+}

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java	2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,419 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.parse.html;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.util.Collection;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Stack;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NodeWalker;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.*;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ * 
+ * This class holds a few utility methods for pulling content out of 
+ * DOM nodes, such as getOutlinks, getText, etc.
+ *
+ */
+public class DOMContentUtils {
+
+  public static class LinkParams {
+    public String elName;    // element (tag) name
+    public String attrName;  // attribute that carries the link target
+    public int childLen;     // compared with the element's child count in shouldThrowAwayLink
+
+    public LinkParams(String elName, String attrName, int childLen) {
+      this.elName = elName;
+      this.attrName = attrName;
+      this.childLen = childLen;
+    }
+
+    public String toString() {
+      return new StringBuilder("LP[el=").append(elName).append(",attr=").append(attrName).append(",len=").append(childLen).append(']').toString();
+    }
+  }
+  
+  private HashMap linkParams = new HashMap();  // lower-case tag name -> LinkParams, filled by setConf()
+  private Configuration conf;                  // configuration last passed to setConf()
+  // Builds the link-tag table right away by delegating to setConf().
+  public DOMContentUtils(Configuration conf) {
+    setConf(conf);
+  }
+  
+  public void setConf(Configuration conf) {
+    // forceTags is used to override configurable tag ignoring, later on
+    Collection<String> forceTags = new ArrayList<String>(1);
+
+    this.conf = conf;
+    linkParams.clear();
+    linkParams.put("a", new LinkParams("a", "href", 1));
+    linkParams.put("area", new LinkParams("area", "href", 0));
+    if (conf.getBoolean("parser.html.form.use_action", true)) {
+      linkParams.put("form", new LinkParams("form", "action", 1));
+      if (conf.get("parser.html.form.use_action") != null)
+        forceTags.add("form");
+    }
+    linkParams.put("frame", new LinkParams("frame", "src", 0));
+    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+    linkParams.put("script", new LinkParams("script", "src", 0));
+    linkParams.put("link", new LinkParams("link", "href", 0));
+    linkParams.put("img", new LinkParams("img", "src", 0));
+
+    // remove unwanted link tags from the linkParams map
+    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
+    for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) {
+      if ( ! forceTags.contains(ignoreTags[i]) )
+        linkParams.remove(ignoreTags[i]);
+    }
+  }
+  
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node},
+   * and will append all the content text found beneath the DOM node to 
+   * the <code>StringBuffer</code>.
+   *
+   * <p>
+   *
+   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
+   * be aborted and the <code>StringBuffer</code> will not contain
+   * any text encountered after a nested anchor is found.
+   * 
+   * <p>
+   *
+   * @return true if nested anchors were found
+   */
+  public boolean getText(StringBuffer sb, Node node,
+                         boolean abortOnNestedAnchors) {
+    // Start the walk with an anchor depth of zero; the helper's result
+    // (true when it aborted on a nested anchor) is returned unchanged.
+    final boolean nestedAnchorFound = getTextHelper(sb, node, abortOnNestedAnchors, 0);
+    return nestedAnchorFound;
+  }
+
+
+  /**
+   * This is a convenience method, equivalent to {@link
+   * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+   * 
+   */
+  public void getText(StringBuffer sb, Node node) {
+    getText(sb, node, false);  // never abort on nested anchors; the boolean result is discarded
+  }
+
+  // returns true if abortOnNestedAnchors is true and we find nested 
+  // anchors
+  private boolean getTextHelper(StringBuffer sb, Node node, 
+                                             boolean abortOnNestedAnchors,
+                                             int anchorDepth) {
+    boolean abort = false;
+    NodeWalker walker = new NodeWalker(node);
+    
+    while (walker.hasNext()) {
+    
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+      
+      if ("script".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if ("style".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
+        anchorDepth++;
+        if (anchorDepth > 1) {
+          abort = true;
+          break;
+        }        
+      }
+      if (nodeType == Node.COMMENT_NODE) {
+        walker.skipChildren();
+      }
+      if (nodeType == Node.TEXT_NODE) {
+        // cleanup and trim the value
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        text = text.trim();
+        if (text.length() > 0) {
+          if (sb.length() > 0) sb.append(' ');
+        	sb.append(text);
+        }
+      }
+    }
+    
+    return abort;
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node},
+   * and will append the content text found beneath the first
+   * <code>title</code> node to the <code>StringBuffer</code>.
+   *
+   * @return true if a title node was found, false otherwise
+   */
+  public boolean getTitle(StringBuffer sb, Node node) {
+    final NodeWalker nodes = new NodeWalker(node);
+
+    while (nodes.hasNext()) {
+      final Node current = nodes.nextNode();
+      final String name = current.getNodeName();
+
+      // Reaching BODY means HEAD has been passed without seeing a title.
+      if ("body".equalsIgnoreCase(name)) {
+        return false;
+      }
+
+      final boolean isTitleElement =
+          current.getNodeType() == Node.ELEMENT_NODE && "title".equalsIgnoreCase(name);
+      if (isTitleElement) {
+        // Only the first title is used: collect its text and stop walking.
+        getText(sb, current);
+        return true;
+      }
+    }
+
+    // Walked the whole subtree without meeting a title or a body node.
+    return false;
+  }
+
+  /** If Node contains a BASE tag with a parsable HREF then that URL is returned, otherwise null. */
+  public URL getBase(Node node) {
+    final NodeWalker nodes = new NodeWalker(node);
+
+    while (nodes.hasNext()) {
+      final Node current = nodes.nextNode();
+      // only elements can be BASE (or BODY) tags
+      if (current.getNodeType() != Node.ELEMENT_NODE) {
+        continue;
+      }
+      final String name = current.getNodeName();
+      if ("body".equalsIgnoreCase(name)) { // stop after HEAD
+        return null;
+      }
+      if (!"base".equalsIgnoreCase(name)) {
+        continue;
+      }
+      // return the first href attribute of this BASE tag that parses as a URL
+      final NamedNodeMap attributes = current.getAttributes();
+      for (int idx = 0; idx < attributes.getLength(); idx++) {
+        final Node attribute = attributes.item(idx);
+        if (!"href".equalsIgnoreCase(attribute.getNodeName())) {
+          continue;
+        }
+        try {
+          return new URL(attribute.getNodeValue());
+        } catch (MalformedURLException e) {
+          // malformed href: ignored, scanning goes on
+        }
+      }
+    }
+
+    // no.
+    return null;
+  }
+
+
+  private boolean hasOnlyWhiteSpace(Node node) {
+    String val= node.getNodeValue();
+    for (int i= 0; i < val.length(); i++) {
+      if (!Character.isWhitespace(val.charAt(i)))
+        return false;
+    }
+    return true;
+  }
+
+  // this only covers a few cases of empty links that are symptomatic
+  // of nekohtml's DOM-fixup process...
+  private boolean shouldThrowAwayLink(Node node, NodeList children, 
+                                              int childLen, LinkParams params) {
+    if (childLen == 0) {
+      // this has no inner structure 
+      if (params.childLen == 0) return false;
+      else return true;
+    } else if ((childLen == 1) 
+               && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+               && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { 
+      // single nested link
+      return true;
+
+    } else if (childLen == 2) {
+
+      Node c0= children.item(0);
+      Node c1= children.item(1);
+
+      if ((c0.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+          && (c1.getNodeType() == Node.TEXT_NODE) 
+          && hasOnlyWhiteSpace(c1) ) {
+        // single link followed by whitespace node
+        return true;
+      }
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE) 
+          && hasOnlyWhiteSpace(c0) ) {
+        // whitespace node followed by single link
+        return true;
+      }
+
+    } else if (childLen == 3) {
+      Node c0= children.item(0);
+      Node c1= children.item(1);
+      Node c2= children.item(2);
+      
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE) 
+          && (c2.getNodeType() == Node.TEXT_NODE) 
+          && hasOnlyWhiteSpace(c0)
+          && hasOnlyWhiteSpace(c2) ) {
+        // single link surrounded by whitespace nodes
+        return true;
+      }
+    }
+
+    return false;
+  }
+  
+  /**
+   * Handles cases where the url param information is encoded into the base
+   * url as opposed to the target.
+   * <p>
+   * If the target contains params (i.e. ';xxxx') information then the target
+   * params information is assumed to be correct and any base params information
+   * is ignored.  If the base contains params information but the target does
+   * not, then the params information is moved to the target allowing it to be
+   * correctly determined by the java.net.URL class.
+   * 
+   * @param base The base URL.
+   * @param target The target path from the base URL.
+   * 
+   * @return URL A URL with the params information correctly encoded.
+   * 
+   * @throws MalformedURLException If the url is not a well formed URL.
+   */
+  private URL fixEmbeddedParams(URL base, String target) 
+    throws MalformedURLException{
+    
+    // the target contains params information or the base doesn't then no
+    // conversion necessary, return regular URL
+    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+      return new URL(base, target);
+    }
+    
+    // get the base url and it params information
+    String baseURL = base.toString();
+    int startParams = baseURL.indexOf(';');
+    String params = baseURL.substring(startParams);
+    
+    // if the target has a query string then put the params information after
+    // any path but before the query string, otherwise just append to the path
+    int startQS = target.indexOf('?');
+    if (startQS >= 0) {
+      target = target.substring(0, startQS) + params + 
+        target.substring(startQS);
+    }
+    else {
+      target += params;
+    }
+    
+    return new URL(base, target);
+  }
+
+  /**
+   * This method finds all anchors below the supplied DOM
+   * <code>node</code>, and creates appropriate {@link Outlink}
+   * records for each (relative to the supplied <code>base</code>
+   * URL), and adds them to the <code>outlinks</code> {@link
+   * ArrayList}.
+   *
+   * <p>
+   *
+   * Links without inner structure (tags, text, etc) are discarded, as
+   * are links which contain only single nested links and empty text
+   * nodes (this is a common DOM-fixup artifact, at least with
+   * nekohtml).
+   */
+  public void getOutlinks(URL base, ArrayList outlinks, 
+                                       Node node) {
+    // walk the DOM below node with a NodeWalker, looking for link-bearing elements
+    NodeWalker walker = new NodeWalker(node);
+    while (walker.hasNext()) {
+      
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();      
+      NodeList children = currentNode.getChildNodes();
+      int childLen = (children != null) ? children.getLength() : 0; 
+      
+      if (nodeType == Node.ELEMENT_NODE) {
+        // fixed locale: default-locale lower-casing (e.g. Turkish) would turn "IMG" into a key linkParams lacks
+        nodeName = nodeName.toLowerCase(java.util.Locale.ENGLISH);
+        LinkParams params = (LinkParams)linkParams.get(nodeName);
+        if (params != null) {
+          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
+            // anchor text; collection stops as soon as a nested anchor is met
+            StringBuffer linkText = new StringBuffer();
+            getText(linkText, currentNode, true);
+            // pick up the target attribute and the two attributes that veto the link
+            NamedNodeMap attrs = currentNode.getAttributes();
+            String target = null;
+            boolean noFollow = false;
+            boolean post = false;
+            for (int i= 0; i < attrs.getLength(); i++ ) {
+              Node attr = attrs.item(i);
+              String attrName = attr.getNodeName();
+              if (params.attrName.equalsIgnoreCase(attrName)) {
+                target = attr.getNodeValue();
+              } else if ("rel".equalsIgnoreCase(attrName) &&
+                         "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+                noFollow = true;
+              } else if ("method".equalsIgnoreCase(attrName) &&
+                         "post".equalsIgnoreCase(attr.getNodeValue())) {
+                post = true;
+              }
+            }
+            if (target != null && !noFollow && !post)
+              try {
+                // resolve against base, carrying over the base's ';' params when it has any
+                URL url = (base.toString().indexOf(';') > 0) ? 
+                  fixEmbeddedParams(base, target) :  new URL(base, target);
+                outlinks.add(new Outlink(url.toString(),
+                                         linkText.toString().trim()));
+              } catch (MalformedURLException e) {
+                // don't care
+              }
+          }
+          // NOTE: as the last statement of the loop body this continue skips nothing; children are still walked
+          if (params.childLen == 0) continue;
+        }
+      }
+    }
+  }
+
+}
+

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java	2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.parse.html;
+
+import java.net.URL;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees.  This class
+ * handles specifically Robots META directives (all, none, nofollow,
+ * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
+ * instructions. All meta directives are stored in a HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+  /**
+   * Utility class with indicators for the robots directives "noindex"
+   * and "nofollow", and HTTP-EQUIV/no-cache
+   */
+  
+  /**
+   * Sets the indicators in <code>metaTags</code> to appropriate
+   * values, based on any META tags found under the given
+   * <code>node</code>.
+   */
+  public static final void getMetaTags (
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+    // reset() runs first -- presumably so results of an earlier call do not carry over (confirm in HTMLMetaTags)
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  private static final void getMetaTagsHelper(
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) {
+        // META tags should not be under body
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node nameNode = null;
+        Node equivNode = null;
+        Node contentNode = null;
+        // Retrieves name, http-equiv and content attribues
+        for (int i=0; i<attrs.g...
 
[truncated message content]