Revision: 3608 http://archive-access.svn.sourceforge.net/archive-access/?rev=3608&view=rev Author: binzino Date: 2012-01-26 21:57:22 +0000 (Thu, 26 Jan 2012) Log Message: ----------- Fix bug to restore title, parsed-text and outlinks. Accidentally removed in previous edits. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java 2012-01-26 20:53:00 UTC (rev 3607) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java 2012-01-26 21:57:22 UTC (rev 3608) @@ -156,33 +156,23 @@ // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); - /* - if (LOG.isTraceEnabled()) { - LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); - } - // check meta directives - if (!metaTags.getNoIndex()) { // okay to index - StringBuffer sb = new StringBuffer(); - if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } - utils.getText(sb, root); // extract text - text = sb.toString(); - sb.setLength(0); - if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); } - utils.getTitle(sb, root); // extract title - title = sb.toString().trim(); - } - - if (!metaTags.getNoFollow()) { // okay to follow links - ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks - URL baseTag = utils.getBase(root); - if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } - utils.getOutlinks(baseTag!=null?baseTag:base, l, root); - outlinks = l.toArray(new Outlink[l.size()]); - if (LOG.isTraceEnabled()) { - LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl()); - } - } - */ 
+ + // Extract body text + StringBuffer sb = new StringBuffer(); + utils.getText(sb, root); // extract text + text = sb.toString(); + sb.setLength(0); + + // Extract title + utils.getTitle(sb, root); + title = sb.toString().trim(); + + // Extract outlinks + ArrayList<Outlink> l = new ArrayList<Outlink>(); + URL baseTag = utils.getBase(root); + utils.getOutlinks(baseTag!=null?baseTag:base, l, root); + outlinks = l.toArray(new Outlink[l.size()]); + ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); /* if (metaTags.getRefresh()) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.