From: <mi...@us...> - 2008-07-17 10:16:35
|
Revision: 2456 http://archive-access.svn.sourceforge.net/archive-access/?rev=2456&view=rev Author: miklosh Date: 2008-07-17 10:16:27 +0000 (Thu, 17 Jul 2008) Log Message: ----------- Switched to using multiple valued Lucene fields. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java 2008-07-15 01:56:50 UTC (rev 2455) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java 2008-07-17 10:16:27 UTC (rev 2456) @@ -41,30 +41,23 @@ private Configuration conf; - private String concatArray(String[] values, char separator) { - StringBuffer sb = new StringBuffer(); - for (String value : values) { - if (sb.length() > 0) { - sb.append(separator); - } - sb.append(value); - } - return sb.toString(); - } - public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta(); // Handle images - String parentUrl = metadata.get(ImageSearch.PARENT_URL_KEY); - if (parentUrl != null) { - // Store parent's URL - doc.add(new Field(ImageSearch.PARENT_URL_KEY, parentUrl, Field.Store.YES, Field.Index.NO)); + String[] parentUrls = metadata.getValues(ImageSearch.PARENT_URL_KEY); + if (parentUrls.length > 0) { + // Store parent URL(s) + for (String parent : parentUrls) { + doc.add(new Field(ImageSearch.PARENT_URL_KEY, parent, Field.Store.YES, Field.Index.NO)); + } // Index alternate text - String alt = metadata.get(ImageSearch.ALT_TEXT_KEY); - if (alt != null) { - doc.add(new Field(ImageSearch.ALT_TEXT_KEY, alt, Field.Store.NO, Field.Index.TOKENIZED)); + String[] altTexts = metadata.getValues(ImageSearch.ALT_TEXT_KEY); + if (altTexts != null) { + for (String altText : altTexts) { + doc.add(new Field(ImageSearch.ALT_TEXT_KEY, altText, Field.Store.NO, Field.Index.TOKENIZED)); + } } // Index image size //... @@ -83,15 +76,21 @@ return doc; } else { doc.add(new Field(ImageSearch.HAS_IMAGE_KEY, "1", Field.Store.YES, Field.Index.TOKENIZED)); - doc.add(new Field(ImageSearch.IMAGE_POS_KEY, - concatArray(imagePositions, ':'), Field.Store.YES, Field.Index.NO)); + for (String imagePos : imagePositions) { + doc.add(new Field(ImageSearch.IMAGE_POS_KEY, imagePos, + Field.Store.YES, Field.Index.NO)); + } } String[] imageIds = metadata.getValues(ImageSearch.IMAGE_IDS_KEY); - doc.add(new Field(ImageSearch.IMAGE_IDS_KEY, - concatArray(imageIds, ':'), Field.Store.YES, Field.Index.NO)); + for (String imageId : imageIds) { + doc.add(new Field(ImageSearch.IMAGE_IDS_KEY, imageId, + Field.Store.YES, Field.Index.NO)); + } String[] imageUrls = metadata.getValues(ImageSearch.IMAGE_URLS_KEY); - doc.add(new Field(ImageSearch.IMAGE_URLS_KEY, - concatArray(imageUrls, ' '), Field.Store.YES, Field.Index.TOKENIZED)); + for (String imageUrl : imageUrls) { + doc.add(new Field(ImageSearch.IMAGE_URLS_KEY, imageUrl, + Field.Store.YES, Field.Index.TOKENIZED)); + } return doc; } Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java 2008-07-15 01:56:50 UTC (rev 2455) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageParseFilter.java 2008-07-17 10:16:27 UTC (rev 2456) @@ -76,7 +76,7 @@ URL imgSrc = null; String imgUrl = null; - Metadata metadata = new Metadata(); + String altText = null; boolean skipNode = false; NamedNodeMap attributes = currentNode.getAttributes(); @@ -91,22 +91,18 @@ skipNode = true; } } else if ("alt".equalsIgnoreCase(attr.getName())) { - String altText = attr.getValue(); - if (altText.length() > 0) { - metadata.add(ImageSearch.ALT_TEXT_KEY, altText); - } + altText = attr.getValue(); + parentMeta.add(ImageSearch.ALT_TEXT_KEY, + altText.length() > 0 ? altText : " "); } } if (skipNode) { continue; } - // Add parent page's URL - metadata.add(ImageSearch.PARENT_URL_KEY, base.toString()); - // Add parent's title - metadata.add(Metadata.TITLE, parentData.getTitle()); - // Add source URL - metadata.add("img_src", imgUrl); - + if (altText == null) { + parentMeta.add(ImageSearch.ALT_TEXT_KEY, " "); + } + // Add info to parent's parse meta parentMeta.add(ImageSearch.IMAGE_POS_KEY, Integer.toString(currentPosition)); @@ -114,15 +110,9 @@ parentMeta.add(ImageSearch.IMAGE_IDS_KEY, hash.toString()); parentMeta.add(ImageSearch.IMAGE_URLS_KEY, imgUrl); - // Add image to ParseResult - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, - parentData.getTitle(), new Outlink[0], - parentData.getContentMeta(), metadata); - //parseResult.put(imgUrl, new ParseText(""), parseData); - if (LOG.isInfoEnabled()) { LOG.info("flushing " + imgUrl + " at " + currentPosition + - " alt="+metadata.get(ImageSearch.ALT_TEXT_KEY)); + " alt="+altText); } } if (nodeType == Node.COMMENT_NODE) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |