Revision: 2503
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2503&view=rev
Author:   miklosh
Date:     2008-07-26 15:46:55 +0000 (Sat, 26 Jul 2008)

Log Message:
-----------
Normalize extracted image URLs.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java

Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java	2008-07-26 15:42:26 UTC (rev 2502)
+++ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java	2008-07-26 15:46:55 UTC (rev 2503)
@@ -33,6 +33,7 @@
 import org.apache.nutch.analysis.AnalyzerFactory;
 import org.apache.nutch.analysis.NutchAnalyzer;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.util.NodeWalker;
 import org.archive.nutchwax.imagesearch.ImageSearch;
 import org.w3c.dom.*;
@@ -42,7 +43,8 @@
 
 
     public static final Log LOG = LogFactory.getLog(ImageParseFilter.class);
-
+    private URLNormalizers normalizers;
+
     private void findImages(Node doc, URL base, ParseData parentData,
             ParseResult parseResult) {
         // Get language
@@ -87,8 +89,20 @@
                         try {
                             imgSrc = new URL(base, attr.getValue());
                             imgUrl = imgSrc.toString();
+                            // Normalize it
+                            // Replace spaces with %20
+                            imgUrl = imgUrl.replaceAll("\\s", "%20");
+                            imgUrl = normalizers.normalize(imgUrl,
+                                    URLNormalizers.SCOPE_FETCHER);
+                            // TODO: apply NutchWAX specific URL canonicalization
                         } catch (MalformedURLException mue) {
-                            skipNode = true;
+                            if (imgUrl != null) {
+                                if (LOG.isInfoEnabled()) {
+                                    LOG.info("MalformedURL: " + imgUrl);
+                                }
+                            } else {
+                                skipNode = true;
+                            }
                         }
                     } else if ("alt".equalsIgnoreCase(attr.getName())) {
                         altText = attr.getValue();
@@ -162,6 +176,7 @@
 
     public void setConf(Configuration conf) {
         this.conf = conf;
+        this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
     }
 
     public Configuration getConf() {


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.