From: <jle...@us...> - 2007-05-12 07:44:36
|
Revision: 1745 http://archive-access.svn.sourceforge.net/archive-access/?rev=1745&view=rev Author: jlee-archive Date: 2007-05-12 00:44:36 -0700 (Sat, 12 May 2007) Log Message: ----------- Added getArcName() and other small changes to allow subclasses of ImportArcs to specify a collectionName per ARC, not just per import job. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-05-03 06:48:32 UTC (rev 1744) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-05-12 07:44:36 UTC (rev 1745) @@ -202,7 +202,7 @@ job.setMapRunnerClass(job.getClass("wax.import.maprunner", ARCMapRunner.class)); - job.setMapperClass(job.getClass("wax.import.mapper", ImportArcs.class)); + job.setMapperClass(job.getClass("wax.import.mapper", this.getClass())); job.setInputFormat(TextInputFormat.class); @@ -275,7 +275,7 @@ public void onARCClose() { // Nothing to do. } - + public void map(final WritableComparable key, final Writable value, final OutputCollector output, final Reporter r) throws IOException { @@ -286,13 +286,13 @@ ARCReporter reporter = (ARCReporter)r; // Its null first time map is called on an ARC. - if (this.arcName == null) { - this.arcName = getARCName(rec.getMetaData()); + checkArcName(rec); + + if (!isIndex(rec)) + { + return; } - - if (!isIndex(rec)) { - return; - } + checkCollectionName(); final ARCRecordMetaData arcData = rec.getMetaData(); @@ -462,7 +462,22 @@ parse != null ? 
new ParseImpl(parse) : null); output.collect(Nutchwax.generateWaxKey(url, this.collectionName), v); } - + + public void setCollectionName(String collectionName) { + this.collectionName = collectionName; + checkCollectionName(); + } + + public String getArcName() { + return this.arcName; + } + + public void checkArcName(ARCRecord rec) { + if ((this.arcName == null) || this.arcName.length() <= 0) { + this.arcName = getARCName(rec.getMetaData()); + } + } + protected boolean checkCollectionName() { if ((this.collectionName != null) && this.collectionName.length() > 0) { return true; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <jle...@us...> - 2007-09-20 19:01:15
|
Revision: 1999 http://archive-access.svn.sourceforge.net/archive-access/?rev=1999&view=rev Author: jlee-archive Date: 2007-09-20 12:01:12 -0700 (Thu, 20 Sep 2007) Log Message: ----------- Temporarily commented out CDX generation during import phase since the underlying code in org.archive.wayback is in flux. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-09-20 18:30:19 UTC (rev 1998) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-09-20 19:01:12 UTC (rev 1999) @@ -90,7 +90,7 @@ import org.archive.util.Base32; import org.archive.util.MimetypeUtils; import org.archive.util.TextUtils; -import org.archive.wayback.resourceindex.indexer.ArcIndexer; +//import org.archive.wayback.resourceindex.indexer.ArcIndexer; /** * Ingests ARCs writing ARC Record parse as Nutch FetcherOutputFormat. @@ -204,6 +204,8 @@ job.setInputPath(arcUrlsDir); + ARCMapRunner.test(); + job.setMapRunnerClass(job.getClass("wax.import.maprunner", ARCMapRunner.class)); job.setMapperClass(job.getClass("wax.import.mapper", this.getClass())); @@ -466,6 +468,9 @@ mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName)); mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY), new Text(Long.toString(arcData.getOffset()))); + +/* XXX commented out while Wayback is refactored + String cdxLine = null; try @@ -479,6 +484,9 @@ } mw.put(CDXKEY, new Text(cdxLine)); + +*/ + datum.setMetaData(mw); Parse parse = null; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <jle...@us...> - 2007-09-20 19:09:39
|
Revision: 2001 http://archive-access.svn.sourceforge.net/archive-access/?rev=2001&view=rev Author: jlee-archive Date: 2007-09-20 12:09:41 -0700 (Thu, 20 Sep 2007) Log Message: ----------- Removing call to ARCMapRunner.test(). Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-09-20 19:09:21 UTC (rev 2000) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-09-20 19:09:41 UTC (rev 2001) @@ -204,8 +204,6 @@ job.setInputPath(arcUrlsDir); - ARCMapRunner.test(); - job.setMapRunnerClass(job.getClass("wax.import.maprunner", ARCMapRunner.class)); job.setMapperClass(job.getClass("wax.import.mapper", this.getClass())); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <jle...@us...> - 2007-11-03 02:55:37
|
Revision: 2067 http://archive-access.svn.sourceforge.net/archive-access/?rev=2067&view=rev Author: jlee-archive Date: 2007-11-02 19:55:42 -0700 (Fri, 02 Nov 2007) Log Message: ----------- Use the arcname from the ARCRecordMetaData instead of trying to parse it out of the filedesc. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java =================================================================== --- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-11-03 01:29:12 UTC (rev 2066) +++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-11-03 02:55:42 UTC (rev 2067) @@ -306,7 +306,7 @@ } checkCollectionName(); - + final ARCRecordMetaData arcData = rec.getMetaData(); String oldUrl = url; @@ -320,7 +320,7 @@ { LOG.warn("Skipping record. Didn't pass normalization/filter " + oldUrl + ": " + e.toString()); - + return; } @@ -546,7 +546,8 @@ { if ((this.arcName == null) || this.arcName.length() <= 0) { - this.arcName = getARCName(rec.getMetaData()); + this.arcName = rec.getMetaData().getArcFile().getName(); + this.arcName = this.arcName.replace(".arc.gz", ""); } } @@ -571,47 +572,6 @@ (rec.getStatusCode() < 400))); } - /** - * @param firstARCRecordMeta The metadata record of the first record in an - * ARC (the filedesc record). - * @return Trimmed ARCName stripped of path preamble/prefix and suffix - * (At least WERA expects an ARC name without scheme and suffix: i.e. - * IAH-20060315203614-00000-debord). - * @throws NullPointerException If unable to find an ARC name. 
- */ - protected String getARCName(final ARCRecordMetaData firstARCRecordMeta) - { - String result = null; - - if (this.arcNameFromFirstRecord) - { - final Matcher m = FILEDESC_PATTERN.matcher(firstARCRecordMeta.getUrl()); - - if ((m != null) && m.matches()) - { - result = m.group(1); - } - } - else - { - final Matcher m = TAIL_PATTERN.matcher( - firstARCRecordMeta.getReaderIdentifier()); - - if ((m != null) && m.matches()) - { - result = m.group(1); - } - } - - if (result == null || result.length() <= 0) - { - throw new NullPointerException("Failed get of arcname: " + - firstARCRecordMeta); - } - - return result; - } - protected String getStatus(final String url, String oldUrl, final String recordLengthAsStr, final String noSpacesMimetype) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |