You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bi...@us...> - 2012-01-24 18:28:20
|
Revision: 3602 http://archive-access.svn.sourceforge.net/archive-access/?rev=3602&view=rev Author: binzino Date: 2012-01-24 18:28:14 +0000 (Tue, 24 Jan 2012) Log Message: ----------- Change command-line option handling to allow for both manifests and naming (w)arc files directly. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:27:24 UTC (rev 3601) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:28:14 UTC (rev 3602) @@ -25,6 +25,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ -693,62 +694,98 @@ return -1; } - JobConf job = new NutchJob( getConf() ); - - // Check for "-e <exclusions>" option. - int pos = 0; - if ( args[0].equals( "-e" ) ) + boolean isManifest = false; + boolean skipExisting = false; + String exclusions = null; + int i = 0; + for ( ; i < (args.length-1) ; i++ ) { - if ( args.length < 2 ) + if ( args[i].equals( "-e" ) ) { - System.out.println( "ERROR: Missing filename for option \"-e\"\n" ); - usage( ); - return -1; + i+=1; + if ( i >= (args.length-1) ) + { + usage(); + return 1; + } + + exclusions = args[i]; } + else if ( args[i].equals( "-m" ) ) + { + isManifest = true; + } + else if ( args[i].equals( "-s" ) ) + { + skipExisting = true; + } + else + { + break ; + } + } - job.set( "nutchwax.urlfilter.wayback.exclusions", args[1] ); - - pos = 2; - } - - if ( args.length - pos < 1 ) + if ( i > (args.length-2) ) { - System.out.println( "ERROR: Missing manifest file.\n" ); - usage( ); - return -1; + usage(); + return 1; } - Path manifestPath = new Path( args[pos++] ); + FileSystem fs = FileSystem.get( getConf() ); - Path segmentPath; - if ( args.length - pos < 1 ) + Path outputDir = new Path( args[args.length-1] ); + + if ( ! fs.getFileStatus( outputDir ).isDir() ) { - segmentPath = new Path( "segments", org.apache.nutch.crawl.Generator.generateSegmentName( ) ); + System.err.println( "ERROR: Output directory is not a directory: " + outputDir ); + return 2; } - else - { - segmentPath = new Path( args[pos] ); - } - + try { - job.setJobName( "Importer " + manifestPath ); - job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() ); + for ( ; i < (args.length-1) ; i++ ) + { + JobConf job = new NutchJob( getConf() ); + + if ( exclusions != null ) job.set( "nutchwax.urlfilter.wayback.exclusions", exclusions ); - FileInputFormat.addInputPath( job, manifestPath ); - job.setInputFormat( TextInputFormat.class ); + Path inputPath = new Path( args[i] ); + Path outputPath = new Path( outputDir, inputPath.getName() ); + + if ( fs.exists( outputPath ) ) + { + System.err.println( "ERROR: Output path already exists: " + outputPath ); + if ( ! skipExisting ) + { + return 3; + } + } + + job.setJobName( "Importer " + inputPath ); + job.set( Nutch.SEGMENT_NAME_KEY, outputPath.getName() ); - job.setMapperClass ( Importer.class ); - job.setReducerClass( Importer.class ); + FileInputFormat.setInputPaths( job, inputPath ); + if ( isManifest ) + { + job.setInputFormat( TextInputFormat.class ); + } + else + { + job.setInputFormat( FilenameInputFormat.class ); + } - FileOutputFormat.setOutputPath( job, segmentPath ); - job.setOutputFormat ( FetcherOutputFormat.class ); - job.setOutputKeyClass ( Text.class ); - job.setOutputValueClass( NutchWritable.class ); + job.setMapperClass ( Importer.class ); + job.setReducerClass( Importer.class ); + + FileOutputFormat.setOutputPath( job, outputPath ); + job.setOutputFormat ( FetcherOutputFormat.class ); + job.setOutputKeyClass ( Text.class ); + job.setOutputValueClass( NutchWritable.class ); + + RunningJob rj = JobClient.runJob( job ); + } - RunningJob rj = JobClient.runJob( job ); - - return rj.isSuccessful( ) ? 0 : 1; + return 0; } catch ( Exception e ) { @@ -765,13 +802,11 @@ public void usage( ) { String usage = - "Usage: Importer [opts] <manifest> [<segment>]\n" + "Usage: Importer [opts] <input> <output_dir>]\n" + "Options:\n" + " -e filename Exclusions file, over-rides configuration property.\n" + + " -m Inputs are manifest files\n" + "\n" - + "If <segment> not specified, a pathname will be automatically generated\n" - + "based on current time in sub-directory 'segments', which is created if\n" - + "necessary. This is to mirror the behavior of other Nutch actions.\n" ; System.out.println( usage ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2012-01-24 18:27:33
|
Revision: 3601 http://archive-access.svn.sourceforge.net/archive-access/?rev=3601&view=rev Author: binzino Date: 2012-01-24 18:27:24 +0000 (Tue, 24 Jan 2012) Log Message: ----------- Initial revision. Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/FilenameInputFormat.java 2012-01-24 18:27:24 UTC (rev 3601) @@ -0,0 +1,113 @@ +/* + * Copyright 2012 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.*; +import java.util.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RecordReader; + +/** + * Weird hack to take a filename of a file in HDFS and return that + * name as the 1 and only 1 record "read" from it. + */ +public class FilenameInputFormat extends FileInputFormat<Text,Text> +{ + /** + * Configure per Hadoop properties + */ + public void configure( JobConf conf ) + { + } + + /** + * By definition, not splitable. + */ + @Override + protected boolean isSplitable(FileSystem fs, Path file) + { + return false; + } + + /** + * Return a RecordReader which returns 1 record: the file path from + * the InputSplit. + */ + public RecordReader<Text, Text> getRecordReader( InputSplit genericSplit, + JobConf job, + Reporter reporter) + throws IOException + { + reporter.setStatus(genericSplit.toString()); + + FileSplit split = (FileSplit) genericSplit; + final Path file = split.getPath(); + + return new RecordReader<Text,Text>() + { + boolean done = false; + + public void close() + { + } + + public Text createKey() + { + return new Text(); + } + + public Text createValue() + { + return new Text(); + } + + public long getPos() + { + return 0; + } + + public float getProgress() + { + return 0.0f; + } + + public boolean next( Text key, Text value) + { + if ( done ) return false; + + key .set( file.toString() ); + value.set( file.toString() ); + + done = true ; + + return true; + } + + }; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-20 20:22:20
|
Wayback-1 - Build # 109 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/109/ to view the results. |
From: <vin...@us...> - 2012-01-20 20:13:13
|
Revision: 3600 http://archive-access.svn.sourceforge.net/archive-access/?rev=3600&view=rev Author: vinaygoel Date: 2012-01-20 20:13:07 +0000 (Fri, 20 Jan 2012) Log Message: ----------- BUGFIX: Passing only non-negative length values to LimitInputStream method in HTTPResponseResource.java (removed redundant check) Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java 2012-01-20 03:15:33 UTC (rev 3599) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java 2012-01-20 20:13:07 UTC (rev 3600) @@ -64,7 +64,7 @@ for(HttpHeader h : response.getHeaders()) { headers.putString(h.getName(),h.getValue()); } - if(forceCheck && (length != -1) && (length >= 0)) { + if(forceCheck && (length >= 0)) { LimitInputStream lis = new LimitInputStream(response, length); countingIS = new CountingInputStream(lis); } else { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-20 03:23:00
|
Wayback-1 - Build # 108 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/108/ to view the results. |
From: <vin...@us...> - 2012-01-20 03:15:39
|
Revision: 3599 http://archive-access.svn.sourceforge.net/archive-access/?rev=3599&view=rev Author: vinaygoel Date: 2012-01-20 03:15:33 +0000 (Fri, 20 Jan 2012) Log Message: ----------- BUGFIX: Passing only non-negative length values to LimitInputStream method in HTTPResponseResource.java Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java 2012-01-19 21:44:58 UTC (rev 3598) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/http/HTTPResponseResource.java 2012-01-20 03:15:33 UTC (rev 3599) @@ -64,7 +64,7 @@ for(HttpHeader h : response.getHeaders()) { headers.putString(h.getName(),h.getValue()); } - if(forceCheck && (length != -1)) { + if(forceCheck && (length != -1) && (length >= 0)) { LimitInputStream lis = new LimitInputStream(response, length); countingIS = new CountingInputStream(lis); } else { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2012-01-19 21:45:04
|
Revision: 3598 http://archive-access.svn.sourceforge.net/archive-access/?rev=3598&view=rev Author: binzino Date: 2012-01-19 21:44:58 +0000 (Thu, 19 Jan 2012) Log Message: ----------- Fix type-o. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2012-01-19 21:44:43 UTC (rev 3597) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2012-01-19 21:44:58 UTC (rev 3598) @@ -184,7 +184,7 @@ <property> <name>encodingdetector.charset.min.confidence</name> <value>1</value> - <description>A integer between 0-100 indicating minimum confidence value + <description>An integer between 0-100 indicating minimum confidence value for charset auto-detection. Any negative value disables auto-detection. </description> </property> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2012-01-19 21:44:49
|
Revision: 3597 http://archive-access.svn.sourceforge.net/archive-access/?rev=3597&view=rev Author: binzino Date: 2012-01-19 21:44:43 +0000 (Thu, 19 Jan 2012) Log Message: ----------- Fix splitting of line to allow for collection names with spaces in them. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-12 22:22:43 UTC (rev 3596) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-19 21:44:43 UTC (rev 3597) @@ -200,7 +200,7 @@ } // Each line of the manifest is "<url> <collection>" where <collection> is optional - String[] parts = line.split( "\\s+" ); + String[] parts = line.split( "\\s+", 2 ); arcUrl = parts[0]; if ( parts.length > 1 ) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-12 22:33:45
|
Wayback-1 - Build # 107 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/107/ to view the results. |
From: <nl...@ar...> - 2012-01-12 22:29:07
|
Wayback-1 - Build # 106 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/106/ to view the results. |
From: <vin...@us...> - 2012-01-12 22:22:52
|
Revision: 3596 http://archive-access.svn.sourceforge.net/archive-access/?rev=3596&view=rev Author: vinaygoel Date: 2012-01-12 22:22:43 +0000 (Thu, 12 Jan 2012) Log Message: ----------- BUGFIX: Fixed CSS bug with URL too short (String out of bounds exception) in ExtractingParseObserver.java. Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java 2012-01-12 22:21:14 UTC (rev 3595) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java 2012-01-12 22:22:43 UTC (rev 3596) @@ -394,7 +394,7 @@ url = url.substring(1, origUrlLength - 1); urlStart += 1; } else if (url.charAt(0) == '\\') { - if(origUrlLength == 2) + if(url.length() == 2) continue; url = url.substring(2, origUrlLength - 2); urlStart += 2; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <vin...@us...> - 2012-01-12 22:21:21
|
Revision: 3595 http://archive-access.svn.sourceforge.net/archive-access/?rev=3595&view=rev Author: vinaygoel Date: 2012-01-12 22:21:14 +0000 (Thu, 12 Jan 2012) Log Message: ----------- BUGFIX: Fixed CSS bug with URL too short (String out of bounds exception) in ExtractingParseObserver.java. Added NumberFormatException handling in GoogleURLCanonicalizer.java Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java 2012-01-12 04:57:09 UTC (rev 3594) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java 2012-01-12 22:21:14 UTC (rev 3595) @@ -394,6 +394,8 @@ url = url.substring(1, origUrlLength - 1); urlStart += 1; } else if (url.charAt(0) == '\\') { + if(origUrlLength == 2) + continue; url = url.substring(2, origUrlLength - 2); urlStart += 2; } Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 2012-01-12 04:57:09 UTC (rev 3594) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 2012-01-12 22:21:14 UTC (rev 3595) @@ -152,8 +152,12 @@ if(m2Group == null) return null; //int octet = Integer.parseInt(m2.group(i+1).substring((i==0)?0:1)); - int octet = Integer.parseInt(m2Group.substring((i==0)?0:1)); - + int octet; + try { + octet = Integer.parseInt(m2Group.substring((i==0)?0:1)); + } catch (NumberFormatException e){ + return null; + } if((octet < 0) || (octet > 255)) { return null; // throw new URIException("Bad Host("+host+")"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-12 05:07:48
|
Wayback-1 - Build # 105 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/105/ to view the results. |
From: <vin...@us...> - 2012-01-12 04:57:15
|
Revision: 3594 http://archive-access.svn.sourceforge.net/archive-access/?rev=3594&view=rev Author: vinaygoel Date: 2012-01-12 04:57:09 +0000 (Thu, 12 Jan 2012) Log Message: ----------- BUGFIX: Added NullPointerException handling in the GoogleURLCanonicalizer and RealCDXExtractorOuput classes Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java 2012-01-08 22:48:33 UTC (rev 3593) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java 2012-01-12 04:57:09 UTC (rev 3594) @@ -311,8 +311,11 @@ URL cUrl = new URL(context); URL resolved = new URL(cUrl,spec); return resolved.toURI().toASCIIString(); + } catch (URISyntaxException e) { } catch (MalformedURLException e) { + } catch (NullPointerException e) { + } return spec; } Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 2012-01-08 22:48:33 UTC (rev 3593) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 2012-01-12 04:57:09 UTC (rev 3594) @@ -147,7 +147,13 @@ } int[] ip = new int[]{0,0,0,0}; for(int i=0; i < parts; i++) { - int octet = Integer.parseInt(m2.group(i+1).substring((i==0)?0:1)); + + String m2Group = m2.group(i+1); + if(m2Group == null) + return null; + //int octet = Integer.parseInt(m2.group(i+1).substring((i==0)?0:1)); + int octet = Integer.parseInt(m2Group.substring((i==0)?0:1)); + if((octet < 0) || (octet > 255)) { return null; // throw new URIException("Bad Host("+host+")"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-08 22:57:17
|
Wayback-1 - Build # 104 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/104/ to view the results. |
From: <vin...@us...> - 2012-01-08 22:48:39
|
Revision: 3593 http://archive-access.svn.sourceforge.net/archive-access/?rev=3593&view=rev Author: vinaygoel Date: 2012-01-08 22:48:33 +0000 (Sun, 08 Jan 2012) Log Message: ----------- Changed default behavior in ResourceExtractor for STRICT_GZ (set to false by default). Added -strict option to enable the option. Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java 2012-01-08 04:57:28 UTC (rev 3592) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java 2012-01-08 22:48:33 UTC (rev 3593) @@ -76,8 +76,8 @@ ExtractorOutput out; int arg = 0; if(args.length > 0) { - if(args[0].equals("-nostrict")) { - ProducerUtils.STRICT_GZ = false; + if(args[0].equals("-strict")) { + ProducerUtils.STRICT_GZ = true; arg++; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-08 05:07:11
|
Wayback-1 - Build # 103 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/103/ to view the results. |
From: <vin...@us...> - 2012-01-08 04:57:35
|
Revision: 3592 http://archive-access.svn.sourceforge.net/archive-access/?rev=3592&view=rev Author: vinaygoel Date: 2012-01-08 04:57:28 +0000 (Sun, 08 Jan 2012) Log Message: ----------- Wrapped DNSParseException(RecoverableRecordException) as ResourceParseException. Added catch for other possible RecoverableRecordException that may be thrown. Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java 2012-01-08 03:09:20 UTC (rev 3591) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java 2012-01-08 04:57:28 UTC (rev 3592) @@ -13,6 +13,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.archive.RecoverableRecordFormatException; import org.archive.format.gzip.GZIPFormatException; import org.archive.resource.Resource; import org.archive.resource.ResourceConstants; @@ -66,7 +67,7 @@ if(args.length < 1) { return USAGE(1); } - if(args.length > 2) { + if(args.length > 3) { return USAGE(1); } int max = Integer.MAX_VALUE; @@ -118,14 +119,31 @@ out.output(r); } catch(GZIPFormatException e) { + LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); + //Log is not coming out for some damn reason....needs to be studied + System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + if(ProducerUtils.STRICT_GZ) { - LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); throw e; } e.printStackTrace(); } catch(ResourceParseException e) { LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); - throw e; + //Log is not coming out for some damn reason....needs to be studied + System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + + if(ProducerUtils.STRICT_GZ) { + throw e; + } + e.printStackTrace(); + } catch(RecoverableRecordFormatException e) { + // this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions... + LOG.severe(String.format("RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage())); + //Log is not coming out for some damn reason....needs to be studied + System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + + e.printStackTrace(); + } } return 0; Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java 2012-01-08 03:09:20 UTC (rev 3591) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java 2012-01-08 04:57:28 UTC (rev 3592) @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.InputStream; +import org.archive.RecoverableRecordFormatException; import org.archive.format.dns.DNSResponse; import org.archive.format.dns.DNSResponseParser; import org.archive.resource.MetaData; @@ -20,7 +21,11 @@ ResourceContainer container) throws ResourceParseException, IOException { DNSResponse response = new DNSResponse(); - parser.parse(is, response); + try { + parser.parse(is, response); + } catch(RecoverableRecordFormatException e) { + throw new ResourceParseException(e); + } parentMetaData.putString(PAYLOAD_CONTENT_TYPE, PAYLOAD_TYPE_DNS); return new DNSResource(parentMetaData.createChild(DNS_METADATA), container, response); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-08 03:21:20
|
Wayback-1 - Build # 102 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/102/ to view the results. |
From: <nl...@ar...> - 2012-01-08 03:17:16
|
Wayback-1 - Build # 101 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/101/ to view the results. |
From: <vin...@us...> - 2012-01-08 03:09:26
|
Revision: 3591 http://archive-access.svn.sourceforge.net/archive-access/?rev=3591&view=rev Author: vinaygoel Date: 2012-01-08 03:09:20 +0000 (Sun, 08 Jan 2012) Log Message: ----------- Fixed CSS bug with URL too short (String out of bounds exception). Added basic test case for CSS extraction Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java Added Paths: ----------- trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java Property Changed: ---------------- trunk/archive-access/projects/archive-commons/ Property changes on: trunk/archive-access/projects/archive-commons ___________________________________________________________________ Added: svn:ignore + target Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java 2011-12-24 02:33:44 UTC (rev 3590) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java 2012-01-08 03:09:20 UTC (rev 3591) @@ -378,6 +378,9 @@ int urlStart = m.start(1); int urlEnd = m.end(1); idx = urlEnd; + if(url.length() < 2) { + continue; + } if ((url.charAt(0) == '(') && (url.charAt(origUrlLength-1) == ')')) { url = url.substring(1, origUrlLength - 1); Added: trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java 2012-01-08 03:09:20 UTC (rev 3591) @@ -0,0 +1,98 @@ +package org.archive.resource.html; + +import org.archive.resource.MetaData; +import org.htmlparser.nodes.TextNode; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; + +import junit.framework.TestCase; + +public class ExtractingParseObserverTest extends TestCase { + + public void testHandleStyleNodeExceptions() throws Exception { + String[] tests = { + "some css", + "url()", + "url () ", + "url ('')", + "url (' ')", + "url('\")", + "url(')", + "url('\"')" + }; + boolean except = false; + HTMLMetaData md = new HTMLMetaData(new MetaData()); + ExtractingParseObserver epo = new ExtractingParseObserver(md); + for(String css : tests) { + try { + TextNode tn = new TextNode(css); + epo.handleStyleNode(tn); + } catch(Exception e) { + System.err.format("And the winner is....(%s)\n", css); + e.printStackTrace(); + except = true; + throw e; + } + assertFalse(except); + } + } + public void testHandleStyleNode() throws Exception { + String[][] tests = { + {""}, + {"url(foo.gif)","foo.gif"}, + {"url('foo.gif')","foo.gif"}, + {"url(\"foo.gif\")","foo.gif"}, + {"url(\\\"foo.gif\\\")","foo.gif"}, + {"url(\\'foo.gif\\')","foo.gif"}, + + }; + for(String[] testa : tests) { + checkExtract(testa); + } + // boolean except = false; +// HTMLMetaData md = new HTMLMetaData(new MetaData()); +// ExtractingParseObserver epo = new ExtractingParseObserver(md); +// for(String css : tests) { +// try { +// TextNode tn = new TextNode(css); +// epo.handleStyleNode(tn); +// } catch(Exception e) { +// System.err.format("And the winner is....(%s)\n", css); +// e.printStackTrace(); +// except = true; +// throw e; +// } +// assertFalse(except); +// } + } + private void checkExtract(String[] data) throws JSONException { +// System.err.format("CSS(%s) want[0](%s)\n",css,want[0]); + String css = data[0]; + boolean except = false; + HTMLMetaData md = new HTMLMetaData(new MetaData()); + ExtractingParseObserver epo = new ExtractingParseObserver(md); + try { + TextNode tn = new TextNode(css); + epo.handleStyleNode(tn); + } catch(Exception e) { + fail("Exception with CSS:" + css); + } + JSONArray a = md.optJSONArray("Links"); + if(data.length > 1) { + assertNotNull(a); + assertEquals(data.length-1,a.length()); + for(int i = 1; i < data.length; i++) { + Object o = a.optJSONObject(i-1); + + assertTrue(o instanceof JSONObject); + JSONObject jo = (JSONObject) o; + assertEquals(data[i],jo.getString("href")); + } + } else { + assertNull(a); + } + } + + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2011-12-24 02:40:36
|
Wayback-1 - Build # 100 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/100/ to view the results. |
From: <ikr...@us...> - 2011-12-24 02:33:50
|
Revision: 3590 http://archive-access.svn.sourceforge.net/archive-access/?rev=3590&view=rev Author: ikreymer Date: 2011-12-24 02:33:44 +0000 (Sat, 24 Dec 2011) Log Message: ----------- BUGFIX: Make CSS 'url' match case-insensitive Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2011-12-24 00:46:42 UTC (rev 3589) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java 2011-12-24 02:33:44 UTC (rev 3590) @@ -75,7 +75,7 @@ private static Pattern cssImportNoUrlPattern = Pattern.compile(cssImportNoUrlPatString); - private static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString); + private static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString, Pattern.CASE_INSENSITIVE); /** * get (and cache) a regex Pattern for locating an HTML attribute value This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2011-12-24 00:55:47
|
Wayback-1 - Build # 99 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/99/ to view the results. |
From: <ikr...@us...> - 2011-12-24 00:46:49
|
Revision: 3589 http://archive-access.svn.sourceforge.net/archive-access/?rev=3589&view=rev Author: ikreymer Date: 2011-12-24 00:46:42 +0000 (Sat, 24 Dec 2011) Log Message: ----------- BUGFIX: Fix adding first character as a char not int! Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java 2011-12-21 20:55:11 UTC (rev 3588) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java 2011-12-24 00:46:42 UTC (rev 3589) @@ -209,7 +209,7 @@ //Skip the UTF-8 BOM 0xFEFF int firstChar = isr.read(); if ((firstChar != '\uFEFF') && (firstChar != -1)) { - sb.append(firstChar); + sb.append((char)firstChar); } for (int r = -1; (r = isr.read(cbuffer, 0, C_BUFFER_SIZE)) != -1;) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |