From: <bi...@us...> - 2009-10-30 22:20:34
|
Revision: 2870 http://archive-access.svn.sourceforge.net/archive-access/?rev=2870&view=rev Author: binzino Date: 2009-10-30 22:20:15 +0000 (Fri, 30 Oct 2009) Log Message: ----------- Fix WAX-68. Add use of optional "versions" file in segments directory to declare which segments are NW 0.10 format. Modified Paths: -------------- tags/nutchwax-0_12_9/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java Modified: tags/nutchwax-0_12_9/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java =================================================================== --- tags/nutchwax-0_12_9/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java 2009-10-29 00:25:57 UTC (rev 2869) +++ tags/nutchwax-0_12_9/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java 2009-10-30 22:20:15 UTC (rev 2870) @@ -24,7 +24,9 @@ import java.io.BufferedReader; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import java.util.Iterator; import org.apache.commons.logging.Log; @@ -123,6 +125,7 @@ } private HashMap segments = new HashMap( ); + private Set oldFormatSegments = null; private boolean perCollection = false; private Summarizer summarizer; @@ -175,6 +178,7 @@ } addRemaps( fs, collectionDir, (Map<String,Segment>) perCollectionSegments ); + checkForOldNutchWAXSegmentFormat( fs, collectionDir ); } else { @@ -188,11 +192,64 @@ if ( ! this.perCollection ) { addRemaps( fs, new Path(segmentsDir), (Map<String,Segment>) segments ); + checkForOldNutchWAXSegmentFormat( fs, new Path(segmentsDir) ); } LOG.info( "segments: " + segments ); } + protected void checkForOldNutchWAXSegmentFormat( FileSystem fs, Path segmentDir ) + throws IOException + { + Path versionsFile = new Path( segmentDir, "versions" ); + + if ( ! fs.exists( versionsFile ) ) + { + LOG.info( "Versions file doesn't exist: " + versionsFile ); + + return ; + } + + InputStream is = fs.open( versionsFile ); + + BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + + String line; + while ( (line = reader.readLine()) != null ) + { + String fields[] = line.trim( ).split( "\\s+" ); + + if ( fields.length < 2 ) + { + LOG.warn( "Malformed versions line, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + Segment segment = (Segment) segments.get( fields[0] ); + if ( segment == null ) + { + LOG.warn( "Segment doesn't exist: " + fields[0] ); + continue ; + } + + String version = fields[1]; + if ( ! ( "10".equals( version ) || "12".equals( version ) ) ) + { + LOG.warn( "Malformed versions line, invalid version ("+version+"): " + version ); + continue; + } + + LOG.info( "Version: " + fields[0] + " : " + fields[1] ); + + if ( this.oldFormatSegments == null ) + { + this.oldFormatSegments = new HashSet( ); + } + + this.oldFormatSegments.add( segment ); + } + } + protected void addRemaps( FileSystem fs, Path segmentDir, Map<String,Segment> segments ) throws IOException { @@ -205,7 +262,6 @@ return ; } - // InputStream is = segmentRemapFile.getFileSystem( conf ).open( segmentRemapFile ); InputStream is = fs.open( segmentRemapFile ); BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); @@ -241,20 +297,34 @@ } public byte[] getContent(HitDetails details) throws IOException { - return getSegment(details).getContent(getKey(details)); + // return getSegment(details).getContent(getKey(details)); + Segment s = getSegment( details ); + + return s.getContent( getKey( s, details ) ); } public ParseData getParseData(HitDetails details) throws IOException { - return getSegment(details).getParseData(getKey(details)); + //return getSegment(details).getParseData(getKey(details)); + + Segment s = getSegment( details ); + + return s.getParseData( getKey( s, details ) ); } public long getFetchDate(HitDetails details) throws IOException { - return getSegment(details).getCrawlDatum(getKey(details)) - .getFetchTime(); + //return getSegment(details).getCrawlDatum(getKey(details)).getFetchTime(); + + Segment s = getSegment( details ); + + return s.getCrawlDatum( getKey( s, details ) ).getFetchTime( ); } public ParseText getParseText(HitDetails details) throws IOException { - return getSegment(details).getParseText(getKey(details)); + //return getSegment(details).getParseText(getKey(details)); + + Segment s = getSegment( details ); + + return s.getParseText( getKey( s, details ) ); } public Summary getSummary(HitDetails details, Query query) @@ -269,7 +339,7 @@ { try { - ParseText parseText = segment.getParseText(getKey(details)); + ParseText parseText = segment.getParseText(getKey(segment, details)); text = (parseText != null) ? parseText.getText() : ""; } catch ( Exception e ) @@ -380,11 +450,30 @@ } } - private Text getKey(HitDetails details) { + /* + private Text getKey(HitDetails details) + { String url = details.getValue("url") + " " + details.getValue("digest"); return new Text(url); } + */ + private Text getKey( Segment segment, HitDetails details) + { + String key = null; + if ( this.oldFormatSegments != null && + this.oldFormatSegments.contains( segment ) ) + { + key = "c=" + details.getValue("collection") + ",u=" + details.getValue( "url"); + } + else + { + key = details.getValue("url") + " " + details.getValue("digest"); + } + + return new Text(key); + } + public void close() throws IOException { Iterator iterator = segments.values().iterator(); while (iterator.hasNext()) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |