From: <jle...@us...> - 2007-11-03 02:55:37
|
Revision: 2067
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2067&view=rev
Author:   jlee-archive
Date:     2007-11-02 19:55:42 -0700 (Fri, 02 Nov 2007)

Log Message:
-----------
Use the arcname from the ARCRecordMetaData instead of trying to parse it out of the filedesc.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java

Modified: trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java
===================================================================
--- trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-11-03 01:29:12 UTC (rev 2066)
+++ trunk/archive-access/projects/nutchwax/nutchwax-core/src/main/java/org/archive/access/nutch/jobs/ImportArcs.java 2007-11-03 02:55:42 UTC (rev 2067)
@@ -306,7 +306,7 @@
     }
 
     checkCollectionName();
-
+
     final ARCRecordMetaData arcData = rec.getMetaData();
     String oldUrl = url;
 
@@ -320,7 +320,7 @@
     {
       LOG.warn("Skipping record. Didn't pass normalization/filter " +
         oldUrl + ": " + e.toString());
-
+
       return;
     }
 
@@ -546,7 +546,8 @@
   {
     if ((this.arcName == null) || this.arcName.length() <= 0)
     {
-      this.arcName = getARCName(rec.getMetaData());
+      this.arcName = rec.getMetaData().getArcFile().getName();
+      this.arcName = this.arcName.replace(".arc.gz", "");
     }
   }
 
@@ -571,47 +572,6 @@
         (rec.getStatusCode() < 400)));
   }
 
-  /**
-   * @param firstARCRecordMeta The metadata record of the first record in an
-   * ARC (the filedesc record).
-   * @return Trimmed ARCName stripped of path preamble/prefix and suffix
-   * (At least WERA expects an ARC name without scheme and suffix: i.e.
-   * IAH-20060315203614-00000-debord).
-   * @throws NullPointerException If unable to find an ARC name.
-   */
-  protected String getARCName(final ARCRecordMetaData firstARCRecordMeta)
-  {
-    String result = null;
-
-    if (this.arcNameFromFirstRecord)
-    {
-      final Matcher m = FILEDESC_PATTERN.matcher(firstARCRecordMeta.getUrl());
-
-      if ((m != null) && m.matches())
-      {
-        result = m.group(1);
-      }
-    }
-    else
-    {
-      final Matcher m = TAIL_PATTERN.matcher(
-        firstARCRecordMeta.getReaderIdentifier());
-
-      if ((m != null) && m.matches())
-      {
-        result = m.group(1);
-      }
-    }
-
-    if (result == null || result.length() <= 0)
-    {
-      throw new NullPointerException("Failed get of arcname: " +
-        firstARCRecordMeta);
-    }
-
-    return result;
-  }
-
   protected String getStatus(final String url, String oldUrl,
     final String recordLengthAsStr, final String noSpacesMimetype)
   {

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|