[Archive-access-cvs] SF.net SVN: archive-access:[3608] tags/nutchwax-0_13-JIRA-WAX-75/archive/src

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3608
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3608&view=rev
Author:   binzino
Date:     2012-01-26 21:57:22 +0000 (Thu, 26 Jan 2012)
Log Message:
-----------
Fix bug: restore extraction of the title, parsed text, and outlinks, which was accidentally removed in previous edits.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java
===================================================================

--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java	2012-01-26 20:53:00 UTC (rev 3607)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java	2012-01-26 21:57:22 UTC (rev 3608)
@@ -156,33 +156,23 @@
       
     // get meta directives
     HTMLMetaProcessor.getMetaTags(metaTags, root, base);
-    /*
-    if (LOG.isTraceEnabled()) {
-      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
-    }
-    // check meta directives
-    if (!metaTags.getNoIndex()) {               // okay to index
-      StringBuffer sb = new StringBuffer();
-      if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
-      utils.getText(sb, root);          // extract text
-      text = sb.toString();
-      sb.setLength(0);
-      if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
-      utils.getTitle(sb, root);         // extract title
-      title = sb.toString().trim();
-    }
-      
-    if (!metaTags.getNoFollow()) {              // okay to follow links
-      ArrayList<Outlink> l = new ArrayList<Outlink>();   // extract outlinks
-      URL baseTag = utils.getBase(root);
-      if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); }
-      utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
-      outlinks = l.toArray(new Outlink[l.size()]);
-      if (LOG.isTraceEnabled()) {
-        LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl());
-      }
-    }
-    */    
+
+    // Extract body text
+    StringBuffer sb = new StringBuffer();
+    utils.getText(sb, root);          // extract text
+    text = sb.toString();
+    sb.setLength(0);
+
+    // Extract title
+    utils.getTitle(sb, root);
+    title = sb.toString().trim();
+
+    // Extract outlinks
+    ArrayList<Outlink> l = new ArrayList<Outlink>();
+    URL baseTag = utils.getBase(root);
+    utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
+    outlinks = l.toArray(new Outlink[l.size()]);
+
     ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
     /*
     if (metaTags.getRefresh()) {

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[Archive-access-cvs] SF.net SVN: archive-access:[3608] tags/nutchwax-0_13-JIRA-WAX-75/archive/src

[Archive-access-cvs] SF.net SVN: archive-access:[3608] tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java