From: <vin...@us...> - 2012-01-08 04:57:35
|
Revision: 3592 http://archive-access.svn.sourceforge.net/archive-access/?rev=3592&view=rev Author: vinaygoel Date: 2012-01-08 04:57:28 +0000 (Sun, 08 Jan 2012) Log Message: ----------- Wrapped DNSParseException(RecoverableRecordException) as ResourceParseException. Added catch for other possible RecoverableRecordException that may be thrown. Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java 2012-01-08 03:09:20 UTC (rev 3591) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/ResourceExtractor.java 2012-01-08 04:57:28 UTC (rev 3592) @@ -13,6 +13,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.archive.RecoverableRecordFormatException; import org.archive.format.gzip.GZIPFormatException; import org.archive.resource.Resource; import org.archive.resource.ResourceConstants; @@ -66,7 +67,7 @@ if(args.length < 1) { return USAGE(1); } - if(args.length > 2) { + if(args.length > 3) { return USAGE(1); } int max = Integer.MAX_VALUE; @@ -118,14 +119,31 @@ out.output(r); } catch(GZIPFormatException e) { + LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); + //Log is not coming out for some damn reason....needs to be studied + System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + if(ProducerUtils.STRICT_GZ) { - LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); throw e; } e.printStackTrace(); } 
catch(ResourceParseException e) { LOG.severe(String.format("%s: %s",exProducer.getContext(),e.getMessage())); - throw e; + //Log is not coming out for some damn reason....needs to be studied + System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + + if(ProducerUtils.STRICT_GZ) { + throw e; + } + e.printStackTrace(); + } catch(RecoverableRecordFormatException e) { + // this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions... + LOG.severe(String.format("RECOVERABLE - %s: %s",exProducer.getContext(),e.getMessage())); + //Log is not coming out for some damn reason....needs to be studied + System.err.format("%s: %s",exProducer.getContext(),e.getMessage()); + + e.printStackTrace(); + } } return 0; Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java 2012-01-08 03:09:20 UTC (rev 3591) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java 2012-01-08 04:57:28 UTC (rev 3592) @@ -3,6 +3,7 @@ import java.io.IOException; import java.io.InputStream; +import org.archive.RecoverableRecordFormatException; import org.archive.format.dns.DNSResponse; import org.archive.format.dns.DNSResponseParser; import org.archive.resource.MetaData; @@ -20,7 +21,11 @@ ResourceContainer container) throws ResourceParseException, IOException { DNSResponse response = new DNSResponse(); - parser.parse(is, response); + try { + parser.parse(is, response); + } catch(RecoverableRecordFormatException e) { + throw new ResourceParseException(e); + } parentMetaData.putString(PAYLOAD_CONTENT_TYPE, PAYLOAD_TYPE_DNS); return new DNSResource(parentMetaData.createChild(DNS_METADATA), container, response); } This was sent by the SourceForge.net 
collaborative development platform, the world's largest Open Source development site. |
From: <vin...@us...> - 2012-01-12 04:57:15
|
Revision: 3594 http://archive-access.svn.sourceforge.net/archive-access/?rev=3594&view=rev Author: vinaygoel Date: 2012-01-12 04:57:09 +0000 (Thu, 12 Jan 2012) Log Message: ----------- BUGFIX: Added NullPointerException handling in the GoogleURLCanonicalizer and RealCDXExtractorOutput classes Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java 2012-01-08 22:48:33 UTC (rev 3593) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/extract/RealCDXExtractorOutput.java 2012-01-12 04:57:09 UTC (rev 3594) @@ -311,8 +311,11 @@ URL cUrl = new URL(context); URL resolved = new URL(cUrl,spec); return resolved.toURI().toASCIIString(); + } catch (URISyntaxException e) { } catch (MalformedURLException e) { + } catch (NullPointerException e) { + } return spec; } Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 2012-01-08 22:48:33 UTC (rev 3593) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 2012-01-12 04:57:09 UTC (rev 3594) @@ -147,7 +147,13 @@ } int[] ip = new int[]{0,0,0,0}; for(int i=0; i < parts; i++) { - int octet = Integer.parseInt(m2.group(i+1).substring((i==0)?0:1)); + + String m2Group = m2.group(i+1); + if(m2Group == null) + return null; + //int octet = 
Integer.parseInt(m2.group(i+1).substring((i==0)?0:1)); + int octet = Integer.parseInt(m2Group.substring((i==0)?0:1)); + if((octet < 0) || (octet > 255)) { return null; // throw new URIException("Bad Host("+host+")"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <vin...@us...> - 2012-01-12 22:21:21
|
Revision: 3595 http://archive-access.svn.sourceforge.net/archive-access/?rev=3595&view=rev Author: vinaygoel Date: 2012-01-12 22:21:14 +0000 (Thu, 12 Jan 2012) Log Message: ----------- BUGFIX: Fixed CSS bug with URL too short (String out of bounds exception) in ExtractingParseObserver.java. Added NumberFormatException handling in GoogleURLCanonicalizer.java Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java 2012-01-12 04:57:09 UTC (rev 3594) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/ExtractingParseObserver.java 2012-01-12 22:21:14 UTC (rev 3595) @@ -394,6 +394,8 @@ url = url.substring(1, origUrlLength - 1); urlStart += 1; } else if (url.charAt(0) == '\\') { + if(origUrlLength == 2) + continue; url = url.substring(2, origUrlLength - 2); urlStart += 2; } Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 2012-01-12 04:57:09 UTC (rev 3594) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java 2012-01-12 22:21:14 UTC (rev 3595) @@ -152,8 +152,12 @@ if(m2Group == null) return null; //int octet = Integer.parseInt(m2.group(i+1).substring((i==0)?0:1)); - int octet = Integer.parseInt(m2Group.substring((i==0)?0:1)); - + int octet; + try { 
+ octet = Integer.parseInt(m2Group.substring((i==0)?0:1)); + } catch (NumberFormatException e){ + return null; + } if((octet < 0) || (octet > 255)) { return null; // throw new URIException("Bad Host("+host+")"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |