Revision: 3608
http://archive-access.svn.sourceforge.net/archive-access/?rev=3608&view=rev
Author: binzino
Date: 2012-01-26 21:57:22 +0000 (Thu, 26 Jan 2012)
Log Message:
-----------
Fix bug: restore extraction of title, parsed-text and outlinks, which was accidentally removed in previous edits.
Modified Paths:
--------------
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java
Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java 2012-01-26 20:53:00 UTC (rev 3607)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java 2012-01-26 21:57:22 UTC (rev 3608)
@@ -156,33 +156,23 @@
// get meta directives
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
- /*
- if (LOG.isTraceEnabled()) {
- LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
- }
- // check meta directives
- if (!metaTags.getNoIndex()) { // okay to index
- StringBuffer sb = new StringBuffer();
- if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
- utils.getText(sb, root); // extract text
- text = sb.toString();
- sb.setLength(0);
- if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
- utils.getTitle(sb, root); // extract title
- title = sb.toString().trim();
- }
-
- if (!metaTags.getNoFollow()) { // okay to follow links
- ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
- URL baseTag = utils.getBase(root);
- if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); }
- utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
- outlinks = l.toArray(new Outlink[l.size()]);
- if (LOG.isTraceEnabled()) {
- LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl());
- }
- }
- */
+
+ // Extract body text
+ StringBuffer sb = new StringBuffer();
+ utils.getText(sb, root); // extract text
+ text = sb.toString();
+ sb.setLength(0);
+
+ // Extract title
+ utils.getTitle(sb, root);
+ title = sb.toString().trim();
+
+ // Extract outlinks
+ ArrayList<Outlink> l = new ArrayList<Outlink>();
+ URL baseTag = utils.getBase(root);
+ utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
+ outlinks = l.toArray(new Outlink[l.size()]);
+
ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
/*
if (metaTags.getRefresh()) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|