You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2009-11-06 03:36:24
|
Revision: 2905 http://archive-access.svn.sourceforge.net/archive-access/?rev=2905&view=rev Author: bradtofel Date: 2009-11-06 03:36:06 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: moved around all hadoop indexing related code, including significant changes to the maven build Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java 2009-11-06 03:36:06 UTC (rev 2905) @@ -0,0 +1,186 @@ +/* AlphaPartitioner + * + * $Id$ + * + * Created on 6:08:33 PM Mar 29, 2007. + * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-svn; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.hadoop; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.Arrays; + +import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Partitioner; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class AlphaPartitioner implements Partitioner<Text, Text> { + private String boundaries[] = new String[0]; + public static final String DEFAULT_PATH = "_split.txt"; + public static final String SPLIT_PATH_NAME = "alpha-partition.txt"; + public static final String SPLIT_CACHE_NAME = "alpha-partition-cache.txt"; + public static final String CACHE_SPLIT_URI_CONFIG = + "alphapartitioner.cachesplituri"; + public static final String CACHE_SPLIT_PATH_CONFIG = + "alphapartitioner.cachesplitpath"; + public static final String CACHE_SPLIT_STAMP_CONFIG = + "alphapartitioner.cachesplitstamp"; + + /** + * Called by client prior to launching the job. The File argument is a + * split file, which is to be pushed into the FileSystem, and into the + * DistributedCache from there, for use by the Map tasks. 
+ * @throws URISyntaxException + */ + public static void setPartitionFile(JobConf conf, File f) + throws IOException, URISyntaxException { + + FileSystem fs = FileSystem.get(conf); + + Path fsSplitPath = new Path(SPLIT_PATH_NAME); + fs.copyFromLocalFile(new Path(f.getAbsolutePath()), fsSplitPath); + + String cacheURIString = SPLIT_PATH_NAME + "#" + SPLIT_CACHE_NAME; + DistributedCache.addCacheFile(new URI(cacheURIString), conf); + + FileStatus fsStat = fs.getFileStatus(fsSplitPath); + String mtime = String.valueOf(fsStat.getModificationTime()); + System.err.println("Files mtime(" + mtime + ")"); + conf.set(AlphaPartitioner.CACHE_SPLIT_URI_CONFIG,cacheURIString); + conf.set(AlphaPartitioner.CACHE_SPLIT_PATH_CONFIG,SPLIT_CACHE_NAME); + conf.set(AlphaPartitioner.CACHE_SPLIT_STAMP_CONFIG,mtime); + } + + public static void setPartitionFileBad(JobConf conf, File f) + throws IOException, URISyntaxException { + + FileSystem fs = FileSystem.get(conf); + + Path fsSplitPath = new Path(SPLIT_PATH_NAME); + fs.copyFromLocalFile(new Path(f.getAbsolutePath()), fsSplitPath); + + String cacheURIString = SPLIT_PATH_NAME + "#" + SPLIT_CACHE_NAME; + DistributedCache.addCacheFile(new URI(cacheURIString), conf); + + FileStatus fsStat = fs.getFileStatus(fsSplitPath); + String mtime = String.valueOf(fsStat.getModificationTime()); + System.err.println("Files mtime(" + mtime + ")"); + conf.set(AlphaPartitioner.CACHE_SPLIT_URI_CONFIG,cacheURIString); + conf.set(AlphaPartitioner.CACHE_SPLIT_PATH_CONFIG,SPLIT_CACHE_NAME); + conf.set(AlphaPartitioner.CACHE_SPLIT_STAMP_CONFIG,mtime); + } + + /** + * Get a BufferedReader on the alphabetic split file stored in the + * DistributedCache + * @throws IOException + * @throws URISyntaxException + */ + private static BufferedReader getPartitionFile(JobConf conf) + throws IOException, URISyntaxException { + + System.err.println("Loading split partition file..."); + FileSystem fs = FileSystem.getLocal(conf); + Path[] cacheFiles = 
DistributedCache.getLocalCacheFiles(conf); + + +// System.err.println("Local FS:"+fs.toString()); +// URI cacheURI = new URI(conf.get(CACHE_SPLIT_URI_CONFIG)); +// System.err.println("CacheURI: " + cacheURI.toString()); +// long mtime = Long.valueOf(conf.get(CACHE_SPLIT_STAMP_CONFIG)); +// System.err.println("Cache split timestamp: " + mtime); +// Path localSplitPath = DistributedCache.getLocalCache(cacheURI, conf, +// conf.getLocalPath(conf.getJobLocalDir()), false, mtime, +// conf.getWorkingDirectory()); +// System.err.println("LocalSplitPath: " + localSplitPath.toString()); +// FSDataInputStream in = fs.open(localSplitPath); + FSDataInputStream in = fs.open(cacheFiles[0]); + InputStreamReader is = new InputStreamReader(in); + return new BufferedReader(is); + } + + public void configure(JobConf conf) { + try { + System.err.println("Loading split file from cache..."); + loadBoundaries(getPartitionFile(conf)); + System.err.println("Loaded and Sorted split file"); + } catch (IOException e) { + throw new RuntimeException(e); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } + + public void loadBoundaries(BufferedReader bis) throws IOException { + ArrayList<String> l = new ArrayList<String>(); + while (true) { + String line = bis.readLine(); + if (line == null) { + break; + } + l.add(line); + } + boundaries = l.toArray(boundaries); + Arrays.sort(boundaries); + } + + /** + * @return the number of partitions in the configuration file. This is also + * the number of reduce tasks in the job. 
+ */ + public int getNumPartitions() { + return boundaries.length; + } + + /** + * @param key + * @param value + * @param numReduceTasks + * @return int partition index for key + */ + public int getPartition(Text key, Text value, int numPartitions) { + String keyS = key.toString(); + int loc = Arrays.binarySearch(boundaries, keyS); + if (loc < 0) { + loc = (loc * -1) - 2; + if (loc < 0) { + loc = 0; + } + } + return loc; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/AlphaPartitioner.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java 2009-11-06 03:36:06 UTC (rev 2905) @@ -0,0 +1,296 @@ +package org.archive.wayback.hadoop; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.ClusterStatus; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; + +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import 
org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.mapred.lib.IdentityMapper; +import org.apache.hadoop.mapred.lib.IdentityReducer; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +public class CDXSort extends Configured implements Tool { + private RunningJob jobResult = null; + + static int printUsage() { + System.out.println("cdxsort <split> <input> <output>"); + ToolRunner.printGenericCommandUsage(System.out); + return -1; + } + + /** + * The main driver for sort program. Invoke this method to submit the + * map/reduce job. + * + * @throws IOException + * When there is communication problems with the job tracker. + */ + public int run(String[] args) throws Exception { + + boolean compressOutput = false; + boolean dereferenceInputs = false; + boolean canonicalize = false; + + JobConf jobConf = new JobConf(getConf(), CDXSort.class); + jobConf.setJobName("cdxsort"); + + jobConf.setMapperClass(IdentityMapper.class); + jobConf.setReducerClass(IdentityReducer.class); + + JobClient client = new JobClient(jobConf); + ClusterStatus cluster = client.getClusterStatus(); + + List<String> otherArgs = new ArrayList<String>(); + + for (int i = 0; i < args.length; ++i) { + try { + if ("-m".equals(args[i])) { + jobConf.setNumMapTasks(Integer.parseInt(args[++i])); + } else if ("--compress-output".equals(args[i])) { + compressOutput = true; + } else if ("--dereference-inputs".equals(args[i])) { + dereferenceInputs = true; + } else if ("--canonicalize".equals(args[i])) { + canonicalize = true; + } else { + otherArgs.add(args[i]); + } + } catch (NumberFormatException except) { + System.out.println("ERROR: Integer expected instead of " + + args[i]); + return printUsage(); + } catch 
(ArrayIndexOutOfBoundsException except) { + System.out.println("ERROR: Required parameter missing from " + + args[i - 1]); + return printUsage(); // exits + } + } + + // Make sure there are exactly 3 parameters left: split input output + if (otherArgs.size() != 3) { + System.out.println("ERROR: Wrong number of parameters: " + + otherArgs.size() + " instead of 3."); + return printUsage(); + } + + String splitPath = otherArgs.get(0); + String inputPath = otherArgs.get(1); + String outputPath = otherArgs.get(2); + + // load the split file, find and set the number of reduces + AlphaPartitioner partitioner = new AlphaPartitioner(); + File localSplitFile = new File(splitPath); + FileReader is = new FileReader(localSplitFile); + BufferedReader bis = new BufferedReader(is); + try { + partitioner.loadBoundaries(bis); + } catch (IOException except) { + System.err.println("ERROR: Problem loading file " + splitPath); + return printUsage(); // exits + } + jobConf.setNumReduceTasks(partitioner.getNumPartitions()); + + // copy the split file into the FS, add to the DistributedCache: + AlphaPartitioner.setPartitionFile(jobConf, localSplitFile); + System.err.println("uploaded split file to FS and DistributedCache"); + + // Set job configs: + jobConf.setInputFormat(TextInputFormat.class); + + jobConf.setOutputFormat(TextOutputFormat.class); + if (canonicalize) { + jobConf.setMapperClass(CDXCanonicalizerMapClass.class); + } else { + jobConf.setMapperClass(CDXMapClass.class); + } + jobConf.setOutputKeyClass(Text.class); + jobConf.setOutputValueClass(Text.class); + jobConf.set("mapred.textoutputformat.separator", " "); + jobConf.setPartitionerClass(AlphaPartitioner.class); + + int inputCount = 0; + // Set job input: + if (dereferenceInputs) { + + // SO SLOW... can't add one at a time... 
+// FileReader is2 = new FileReader(new File(inputPath)); +// BufferedReader bis2 = new BufferedReader(is2); +// while (true) { +// String line = bis2.readLine(); +// if (line == null) { +// break; +// } +// FileInputFormat.addInputPath(jobConf, new Path(line)); +// inputCount++; +// System.err.println("Added path(" + inputCount + "): " + line); +// } + + FileReader is2 = new FileReader(new File(inputPath)); + BufferedReader bis2 = new BufferedReader(is2); + ArrayList<String> list = new ArrayList<String>(); + + while (true) { + String line = bis2.readLine(); + if (line == null) { + break; + } + list.add(line); + inputCount++; + } + Path arr[] = new Path[list.size()]; + for(int i=0; i < list.size(); i++) { + arr[i] = new Path(list.get(i)); + } + FileInputFormat.setInputPaths(jobConf, arr); + + + } else { + FileInputFormat.setInputPaths(jobConf, new Path(inputPath)); + inputCount = 1; + } + + // Set job output: + FileOutputFormat.setOutputPath(jobConf, new Path(outputPath)); + + if (compressOutput) { + FileOutputFormat.setCompressOutput(jobConf, true); + FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class); + } + + System.out.println("Running on " + cluster.getTaskTrackers() + + " nodes, processing " + inputCount + " files/directories" + + " into " + outputPath + " with " + + partitioner.getNumPartitions() + " reduces."); + Date startTime = new Date(); + System.out.println("Job started: " + startTime); + jobResult = JobClient.runJob(jobConf); + Date end_time = new Date(); + System.out.println("Job ended: " + end_time); + System.out.println("The job took " + + (end_time.getTime() - startTime.getTime()) / 1000 + + " seconds."); + return 0; + } + + /** + * Mapper which reads a canonicalized CDX line, splitting into: key - URL + + * timestamp val - everything else + * + * @author brad + * @version $Date$, $Revision$ + */ + public static class CDXMapClass extends MapReduceBase implements + Mapper<LongWritable, Text, Text, Text> { + + private Text outKey = 
new Text(); + private Text outValue = new Text(); + + public void map(LongWritable lineNumber, Text line, + OutputCollector<Text, Text> output, Reporter reporter) + throws IOException { + + String tmp = line.toString(); + int i1 = tmp.lastIndexOf(' '); + if(i1 > 0) { + outKey.set(tmp.substring(0,i1)); + outValue.set(tmp.substring(i1+1)); + output.collect(outKey, outValue); + } else { + System.err.println("Problem with line(" + tmp + ")"); + } + +// output.collect(line, outValue); + // reporter.setStatus("Running"); + } + } + + /** + * Mapper which reads an identity CDX line, outputting: key - canonicalized + * original URL + timestamp val - everything else + * + * @author brad + * @version $Date$, $Revision$ + */ + public static class CDXCanonicalizerMapClass extends MapReduceBase + implements Mapper<LongWritable, Text, Text, Text> { + + private Text outKey = new Text(); + private Text outValue = new Text(); + AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + private StringBuilder ksb = new StringBuilder(); + + private int i1 = 0; + private int i2 = 0; + private int i3 = 0; + private int i4 = 0; + + public void map(LongWritable lineNumber, Text line, + OutputCollector<Text, Text> output, Reporter reporter) + throws IOException { + String s = line.toString(); + + boolean problems = true; + i1 = s.indexOf(' '); + if(i1 > 0) { + i2 = s.indexOf(' ', i1 + 1); + if(i2 > 0) { + i3 = s.indexOf(' ', i2 + 1); + if(i3 > 0) { + i4 = s.lastIndexOf(' '); + if(i4 > i3) { + ksb.setLength(0); + ksb.append(canonicalizer.urlStringToKey(s.substring(i2 + 1, i3))); + ksb.append(s.substring(i1,i4)); + outKey.set(ksb.toString()); + outValue.set(s.substring(i4+1)); + output.collect(outKey, outValue); + problems = false; + } + } + } + } + if(problems) { + System.err.println("Problem with line("+s+")"); + } + } + } + + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(new Configuration(), new CDXSort(), args); + 
System.exit(res); + } + + /** + * Get the last job that was run using this instance. + * + * @return the results of the last job that was run + */ + public RunningJob getResult() { + return jobResult; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSort.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java 2009-11-06 03:36:06 UTC (rev 2905) @@ -0,0 +1,26 @@ +package org.archive.wayback.hadoop; + +import org.apache.hadoop.util.ProgramDriver; + +public class SortDriver { + + /** + * @param args + */ + public static void main(String[] args) { + int exitCode = -1; + ProgramDriver pgd = new ProgramDriver(); + try { + pgd.addClass("cdxsort", CDXSort.class, + "A map/reduce program that counts the words in the input files."); + pgd.driver(args); + // Success + exitCode = 0; + } + catch(Throwable e){ + e.printStackTrace(); + } + + System.exit(exitCode); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 03:24:48
|
Revision: 2904 http://archive-access.svn.sourceforge.net/archive-access/?rev=2904&view=rev Author: bradtofel Date: 2009-11-06 03:24:42 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: moved around all hadoop indexing related code, including significant changes to the maven build Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/pom.xml trunk/archive-access/projects/wayback/dist/src/main/assembly/distribution.xml trunk/archive-access/projects/wayback/pom.xml trunk/archive-access/projects/wayback/wayback-core/pom.xml trunk/archive-access/projects/wayback/wayback-webapp/pom.xml Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-hadoop/ trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop-java/ trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop-java/src/ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/ Modified: trunk/archive-access/projects/wayback/dist/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/pom.xml 2009-11-06 03:19:32 UTC (rev 2903) +++ trunk/archive-access/projects/wayback/dist/pom.xml 2009-11-06 03:24:42 UTC (rev 2904) @@ -49,26 +49,13 @@ <layout>default</layout> </repository> </repositories> - <dependencies> <dependency> <groupId>org.archive.wayback</groupId> - <artifactId>wayback-webapp</artifactId> + <artifactId>wayback-core</artifactId> <version>1.5.0-SNAPSHOT</version> - <type>war</type> </dependency> - <dependency> - <groupId>org.archive.wayback</groupId> - <artifactId>wayback-mapreduce</artifactId> - <version>1.5.0-SNAPSHOT</version> - </dependency> - <dependency> - <groupId>javax.servlet</groupId> - <artifactId>servlet-api</artifactId> - <version>2.5</version> - </dependency> </dependencies> - <build> 
<plugins> Modified: trunk/archive-access/projects/wayback/dist/src/main/assembly/distribution.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/main/assembly/distribution.xml 2009-11-06 03:19:32 UTC (rev 2903) +++ trunk/archive-access/projects/wayback/dist/src/main/assembly/distribution.xml 2009-11-06 03:24:42 UTC (rev 2904) @@ -14,11 +14,6 @@ <fileSets> <fileSet> <includes> - <include>../wayback-webapp/target/*.war</include> - </includes> - </fileSet> - <fileSet> - <includes> <include>README*</include> <include>LICENSE*</include> </includes> @@ -32,45 +27,18 @@ </includes> </fileSet> <fileSet> - <directory>target/site</directory> - <outputDirectory>/docs</outputDirectory> - </fileSet> - <fileSet> - <directory>../target/site</directory> - <outputDirectory>/docs</outputDirectory> - </fileSet> - </fileSets> - - <!-- - <fileSet> - <directory>wayback-mapreduce/target</directory> - <outputDirectory></outputDirectory> + <directory>../wayback-webapp/target/</directory> + <outputDirectory>/</outputDirectory> <includes> - <include>*.jar</include> - </includes> - </fileSet> - <fileSet> - <directory>wayback-core/target</directory> - <outputDirectory></outputDirectory> - <includes> - <include>*.jar</include> - </includes> - </fileSet> - <fileSet> - <directory>wayback-webapp/target</directory> - <outputDirectory></outputDirectory> - <includes> <include>*.war</include> </includes> </fileSet> - <fileSet> - <directory>wayback-core/src</directory> - <outputDirectory>/src</outputDirectory> - </fileSet> - <fileSet> - <directory>src</directory> - <outputDirectory>/src</outputDirectory> - </fileSet> </fileSets> - --> + <files> + <file> + <source>../wayback-hadoop/target/wayback-hadoop-jar-with-dependencies.jar</source> + <destName>wayback-hadoop-${project.version}.jar</destName> + </file> + </files> + </assembly> Modified: trunk/archive-access/projects/wayback/pom.xml 
=================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2009-11-06 03:19:32 UTC (rev 2903) +++ trunk/archive-access/projects/wayback/pom.xml 2009-11-06 03:24:42 UTC (rev 2904) @@ -1,4 +1,4 @@ -<?xml version="1.0"?> +<?xml version="1.0" encoding="UTF-8"?> <!-- POM reference: http://maven.apache.org/pom.html @@ -43,23 +43,13 @@ <name>Internet Archive</name> <url>http://www.archive.org/</url> </organization> -<!-- + <issueManagement> - <system>SourceForge</system> - <url>http://sourceforge.net/tracker/?group_id=118427</url> - </issueManagement> ---> - <issueManagement> <system>Jira</system> - <url>http://webteam.archive.org/jira/secure/IssueNavigator.jspa?component=10031</url> + <url>http://webarchive.jira.com/secure/IssueNavigator.jspa?component=10031</url> </issueManagement> -<!-- + <ciManagement> - <system>cruisecontrol</system> - <url>http://builds.archive.org:8080/cruisecontrol/</url> - </ciManagement> ---> - <ciManagement> <system>continuum</system> <url>http://builds.archive.org:8081/continuum/</url> </ciManagement> @@ -142,7 +132,7 @@ </scm> <prerequisites> - <maven>2.0.5</maven> + <maven>2.0.9</maven> </prerequisites> <dependencyManagement> @@ -155,12 +145,12 @@ </dependency> <dependency> <groupId>org.archive.wayback</groupId> - <artifactId>wayback-mapreduce-prereq</artifactId> + <artifactId>wayback-hadoop-java</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>org.archive.wayback</groupId> - <artifactId>wayback-mapreduce</artifactId> + <artifactId>wayback-hadoop</artifactId> <version>${project.version}</version> </dependency> <dependency> @@ -173,9 +163,9 @@ <modules> <module>wayback-core</module> - <module>wayback-mapreduce-prereq</module> - <module>wayback-mapreduce</module> <module>wayback-webapp</module> + <module>wayback-hadoop-java</module> + <module>wayback-hadoop</module> <module>dist</module> </modules> @@ -234,15 +224,15 @@ </plugins> </reporting> 
+<!--Needed because we have test code under src/java. <dependencies> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>3.8.1</version> -<!--Needed because we have test code under src/java. <scope>test</scope> - --> </dependency> </dependencies> + --> -</project> +</project> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2009-11-06 03:19:32 UTC (rev 2903) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2009-11-06 03:24:42 UTC (rev 2904) @@ -54,12 +54,19 @@ <version>2.4</version> <scope>provided</scope> </dependency> + <!-- <dependency> <groupId>org.archive.heritrix</groupId> <artifactId>commons</artifactId> - <version>2.0.2-SNAPSHOT</version> + <version>3.0.0-SNAPSHOT</version> </dependency> + --> <dependency> + <groupId>org.archive.heritrix</groupId> + <artifactId>commons</artifactId> + <version>2.0.3-SNAPSHOT</version> + </dependency> + <dependency> <groupId>org.archive.access-control</groupId> <artifactId>access-control</artifactId> <version>0.0.1-SNAPSHOT</version> @@ -84,6 +91,11 @@ <artifactId>bsh</artifactId> <version>2.0b4</version> </dependency> + <dependency> + <groupId>org.htmlparser</groupId> + <artifactId>htmlparser</artifactId> + <version>1.6</version> + </dependency> <!-- Doh... I'm not sure what package is configuring org.apache.commons-logging to use log4j, but it's breaking some command line tools. 
Added: trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2009-11-06 03:24:42 UTC (rev 2904) @@ -0,0 +1,50 @@ +<?xml version="1.0" encoding="UTF-8"?><project> + <parent> + <artifactId>wayback</artifactId> + <groupId>org.archive</groupId> + <version>1.5.0-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>org.archive.wayback</groupId> + <artifactId>wayback-hadoop</artifactId> + <name>Wayback Hadoop Jar Packaging</name> + <version>1.5.0-SNAPSHOT</version> + <url>http://maven.apache.org</url> + <packaging>pom</packaging> + <dependencies> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>3.8.1</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.archive.wayback</groupId> + <artifactId>wayback-hadoop-java</artifactId> + <version>1.5.0-SNAPSHOT</version> + <scope>compile</scope> + </dependency> + </dependencies> + <build> + <plugins> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <version>2.2-beta-1</version> + <configuration> + <descriptorRefs> + <descriptorRef>jar-with-dependencies</descriptorRef> + </descriptorRefs> + <finalName>wayback-hadoop</finalName> + </configuration> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>attached</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2009-11-06 03:24:42 UTC (rev 2904) @@ -0,0 +1,47 @@ +<?xml version="1.0" 
encoding="UTF-8"?><project> + <parent> + <artifactId>wayback</artifactId> + <groupId>org.archive</groupId> + <version>1.5.0-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + <groupId>org.archive.wayback</groupId> + <artifactId>wayback-hadoop-java</artifactId> + <name>Wayback Hadoop Java Code</name> + <version>1.5.0-SNAPSHOT</version> + <url>http://maven.apache.org</url> + <packaging>jar</packaging> + + <dependencies> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>3.8.1</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.mahout.hadoop</groupId> + <artifactId>hadoop-core</artifactId> + <scope>provided</scope> + <version>0.19.1</version> + </dependency> + <dependency> + <groupId>org.archive.wayback</groupId> + <artifactId>wayback-core</artifactId> + <scope>compile</scope> + <version>1.5.0-SNAPSHOT</version> + </dependency> + </dependencies> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <source>1.5</source> + <target>1.5</target> + </configuration> + </plugin> + </plugins> + </build> +</project> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-webapp/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2009-11-06 03:19:32 UTC (rev 2903) +++ trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2009-11-06 03:24:42 UTC (rev 2904) @@ -42,6 +42,17 @@ <version>5.5.15</version> <scope>provided</scope> </dependency> + <dependency> + <groupId>javax.servlet</groupId> + <artifactId>jstl</artifactId> + <version>1.0</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>taglibs</groupId> + <artifactId>standard</artifactId> + <version>1.1.2</version> + </dependency> </dependencies> </project> This was sent by the SourceForge.net collaborative 
development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 03:19:40
|
Revision: 2903 http://archive-access.svn.sourceforge.net/archive-access/?rev=2903&view=rev Author: bradtofel Date: 2009-11-06 03:19:32 +0000 (Fri, 06 Nov 2009) Log Message: ----------- FEATURE: now displays alternate URLs users can try, primarily useful with exactHostMatch AccessPoint property Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/HTMLError.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/HTMLError.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/HTMLError.jsp 2009-11-06 03:15:50 UTC (rev 2902) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/HTMLError.jsp 2009-11-06 03:19:32 UTC (rev 2903) @@ -1,12 +1,18 @@ <%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8"%> +<%@ page import="java.util.List" %> <%@ page import="org.archive.wayback.exception.WaybackException" %> +<%@ page import="org.archive.wayback.exception.ResourceNotInArchiveException"%> <%@ page import="org.archive.wayback.core.UIResults" %> +<%@ page import="org.archive.wayback.core.WaybackRequest" %> <%@ page import="org.archive.wayback.util.StringFormatter" %> <% UIResults results = UIResults.extractException(request); WaybackException e = results.getException(); e.setupResponse(response); +String contextRoot = results.getWbRequest().getContextPrefix(); + %> + <jsp:include page="/WEB-INF/template/UI-header.jsp" flush="true" /> <% @@ -16,4 +22,24 @@ <h2><%= fmt.format(e.getTitleKey()) %></h2> <p><b><%= fmt.format(e.getMessageKey(),e.getMessage()) %></b></p> +<% +if(e instanceof ResourceNotInArchiveException) { + ResourceNotInArchiveException niae = (ResourceNotInArchiveException) e; + List<String> closeMatches = niae.getCloseMatches(); + if(closeMatches != null && !closeMatches.isEmpty()) { 
+%> + Other requests to try:<br> +<% + WaybackRequest tmp = results.getWbRequest().clone(); + for(String closeMatch : closeMatches) { + tmp.setRequestUrl(closeMatch); + String link = tmp.getContextPrefix() + "query?" + + tmp.getQueryArguments(); +%> + <a href="<%= link %>"><%= closeMatch %></a><br> +<% + } + } +} +%> <jsp:include page="/WEB-INF/template/UI-footer.jsp" flush="true" /> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 03:16:00
|
Revision: 2902 http://archive-access.svn.sourceforge.net/archive-access/?rev=2902&view=rev Author: bradtofel Date: 2009-11-06 03:15:50 +0000 (Fri, 06 Nov 2009) Log Message: ----------- Updated comments and provided additional deployment examples. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2009-11-06 02:56:35 UTC (rev 2901) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2009-11-06 03:15:50 UTC (rev 2902) @@ -24,7 +24,6 @@ The ResourceFileLocationDB implementation to use for mapping ARC/WARC names to absolute paths/URLs via a BDBJE database. --> - <bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.BDBResourceFileLocationDB"> <property name="bdbPath" value="${wayback.basedir}/file-db/db/" /> <property name="bdbName" value="DB1" /> @@ -32,6 +31,15 @@ </bean> <!-- + The following bean provides an alternate flat-file based LocationDB + implementation. +--> +<!-- + <bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.FlatFileResourceFileLocationDB"> + <property name="path" value="${wayback.basedir}/path-index.txt" /> + </bean> +--> +<!-- To enable manual management of, or remote access to the above locationDB, uncomment the following bean. --> @@ -47,11 +55,10 @@ Required when using the SimpleResourceStore to access distributed ARC/WARC files over HTTP through a single reverse proxy. 
--> -<!-- + <bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet"> <property name="locationDB" ref="resourcefilelocationdb" /> </bean> ---> <!-- @@ -61,11 +68,21 @@ --> <import resource="BDBCollection.xml"/> <!-- - <import resource="NutchCollection.xml"/> <import resource="CDXCollection.xml"/> <import resource="RemoteCollection.xml"/> + <import resource="NutchCollection.xml"/> --> +<!-- + LiveWeb.xml contains beans that enable fetching content from the live + web, and caching those results in ARC files. This import is needed if you + use the "excluder-factory-robot" exclusionFactory property of the + AccessPoints, which will cause live robots.txt files to be consulted + retroactively before showing archived content. +--> +<!-- + <import resource="LiveWeb.xml"/> +--> <!-- This is the only AccessPoint defined by default within this wayback.xml @@ -80,14 +97,26 @@ --> <import resource="ArchivalUrlReplay.xml"/> <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> + <property name="collection" ref="localbdbcollection" /> +<!-- + An example of a text file CDX collection, with a text file path index. + <property name="collection" ref="localcdxcollection" /> +--> <property name="replay" ref="archivalurlreplay" /> <property name="query"> <bean class="org.archive.wayback.query.Renderer"> <property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" /> +<!-- + This .jsp provides a "search engine" style listing of results vertically + <property name="captureJsp" value="/WEB-INF/query/HTMLCaptureResults.jsp" /> +--> </bean> </property> +<!-- See the LiveWeb.xml import above. 
+ <property name="exclusionFactory" ref="excluder-factory-robot" /> +--> <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> <property name="replayURIPrefix" value="http://localhost.archive.org:8080/wayback/"/> @@ -97,15 +126,64 @@ <property name="parser"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser"> <property name="maxRecords" value="1000" /> - <property name="earliestTimestamp" value="1996" /> + <!-- + <property name="earliestTimestamp" value="1999" /> + <property name="latestTimestamp" value="2004" /> + --> </bean> </property> - + <!-- + The following property will cause only results matching the exact host + the user requested to be displayed. URLs matching other versions of the + same host will be stored in the closeMatches list of the SearchResults, + and can be displayed by query .jsp files. + --> + <!-- + <property name="exactHostMatch" value="true" /> + --> </bean> + <!-- + + All beans defined below here represent examples of alternate + AccessPoint definitions and implementations. + +--> + + + + + + +<!-- The following AccessPoint inherits all configuration from the 8080:wayback + AccessPoint, but provides a OpenSearch format query results. + + Note: the links generated by this AccessPoint drive to the parent + 8080:wayback AccessPoint: presumably users following links from here + will prefer the HTML interface. 
+ --> + <bean name="8080:opensearch" parent="8080:wayback"> + <property name="urlRoot" value="http://localhost.archive.org:8080/wayback/" /> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/OpenSearchCaptureResults.jsp" /> + <property name="urlJsp" value="/WEB-INF/query/OpenSearchUrlResults.jsp" /> + </bean> + </property> + <property name="exception"> + <bean class="org.archive.wayback.exception.BaseExceptionRenderer"> + <property name="xmlErrorJsp" value="/WEB-INF/exception/OpenSearchError.jsp" /> + <property name="errorJsp" value="/WEB-INF/exception/OpenSearchError.jsp" /> + </bean> + </property> + </bean> + + +<!-- + The following AccessPoint inherits all configuration from the 8080:wayback AccessPoint, but provides a DomainPrefix Replay UI to the same collection. These two access points can be used simultaneously on the same Tomcat installation. @@ -144,7 +222,6 @@ Note: using this AccessPoint requires adding a "Connector" on port 8090 in your Tomcat's server.xml file. 
--> - <!-- <import resource="ProxyReplay.xml"/> <bean name="8090" parent="8080:wayback"> <property name="urlRoot" value="http://localhost.archive.org:8090/" /> @@ -155,7 +232,7 @@ </bean> </property> <property name="parser"> - <bean class="org.archive.wayback.proxy.ProxyRequestParser"> + <bean class="org.archive.wayback.proxy.ProxyArchivalRequestParser"> <property name="localhostNames"> <list> <value>localhost.archive.org</value> @@ -165,7 +242,31 @@ </bean> </property> </bean> + + <bean name="8091" parent="8080:wayback"> + <property name="urlRoot" value="http://localhost.archive.org/" /> + <property name="replay" ref="proxyreplay" /> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org/"/> + </bean> +<!-- + <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> + <property name="redirectURI" value="http://localhost.archive.org:8090/jsp/QueryUI/Redirect.jsp" /> + </bean> --> + </property> + <property name="parser"> + <bean class="org.archive.wayback.proxy.ProxyArchivalRequestParser"> + <property name="localhostNames"> + <list> + <value>localhost.archive.org</value> + </list> + </property> + <property name="maxRecords" value="1000" /> + </bean> + </property> + </bean> <!-- The following AccessPoint inherits all configuration from the 8080:wayback This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:56:47
|
Revision: 2901 http://archive-access.svn.sourceforge.net/archive-access/?rev=2901&view=rev Author: bradtofel Date: 2009-11-06 02:56:35 +0000 (Fri, 06 Nov 2009) Log Message: ----------- FEATURE: if "close matches" are included within the search results, this .jsp will provide links to the close matches at the bottom of the resulting page. Especially useful for the new "exactHostMatch" AccessPoint configuration. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/CalendarResults.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/CalendarResults.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/CalendarResults.jsp 2009-11-06 02:55:10 UTC (rev 2900) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/CalendarResults.jsp 2009-11-06 02:56:35 UTC (rev 2901) @@ -1,4 +1,5 @@ <%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8"%> +<%@ page import="java.util.List" %> <%@ page import="java.util.ArrayList" %> <%@ page import="java.util.Date" %> <%@ page import="java.util.Iterator" %> @@ -20,6 +21,7 @@ CaptureSearchResults cResults = results.getCaptureResults(); StringFormatter fmt = wbRequest.getFormatter(); String searchString = wbRequest.getRequestUrl(); +List<String> closeMatches = cResults.getCloseMatches(); Date searchStartDate = wbRequest.getStartDate(); @@ -153,6 +155,22 @@ <% +if(closeMatches != null && !closeMatches.isEmpty()) { + WaybackRequest tmp = wbRequest.clone(); + + + %> + Close Matches:<br> + <% + for(String closeMatch : closeMatches) { + tmp.setRequestUrl(closeMatch); + String link = tmp.getContextPrefix() + "query?" 
+ + tmp.getQueryArguments(); + %> + <a href="<%= link %>"><%= closeMatch %></a><br> + <% + } +} // show page indicators: if(cResults.getNumPages() > 1) { int curPage = cResults.getCurPageNum(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:55:25
|
Revision: 2900 http://archive-access.svn.sourceforge.net/archive-access/?rev=2900&view=rev Author: bradtofel Date: 2009-11-06 02:55:10 +0000 (Fri, 06 Nov 2009) Log Message: ----------- INITIAL REV: placeholder error .jsp for OpenSearch contexts Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/OpenSearchError.jsp Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/OpenSearchError.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/OpenSearchError.jsp (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/OpenSearchError.jsp 2009-11-06 02:55:10 UTC (rev 2900) @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?><%@ + page language="java" pageEncoding="utf-8" contentType="text/xml;charset=utf-8" +%><%@ + page import="org.archive.wayback.exception.WaybackException" +%><%@ + page import="org.archive.wayback.core.UIResults" +%><%@ + page import="org.archive.wayback.util.StringFormatter" +%><% + +UIResults results = UIResults.extractException(request); +WaybackException e = results.getException(); +StringFormatter fmt = results.getWbRequest().getFormatter(); + +%> +<rss version="2.0" xmlns:openSearch="http://a9.com/-/spec/opensearch/1.1/"> + <channel> + <title>Wayback OpenSearch Error</title> + <link>http://archive-access.sourceforge.net/projects/wayback</link> + <description>OpenSearch Error</description> + <openSearch:totalResults>1</openSearch:totalResults> + <openSearch:startIndex>1</openSearch:startIndex> + <openSearch:itemsPerPage>1</openSearch:itemsPerPage> + <item> + <title><%= UIResults.encodeXMLContent(fmt.format(e.getTitleKey())) %></title> + <description><%= UIResults.encodeXMLContent(fmt.format(e.getMessageKey())) %></description> + </item> + </channel> + </rss> \ No newline at end of file This was 
sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:53:28
|
Revision: 2899 http://archive-access.svn.sourceforge.net/archive-access/?rev=2899&view=rev Author: bradtofel Date: 2009-11-06 02:53:20 +0000 (Fri, 06 Nov 2009) Log Message: ----------- FEATURE: extra logic to work around a bug in IE. If an anchor contains an "@" in the *text* of the link, then attempts to modify the href of the link cause the *text* of the link to be set to the URL.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/js/client-rewrite.js Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/js/client-rewrite.js =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/js/client-rewrite.js 2009-11-06 02:50:51 UTC (rev 2898) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/js/client-rewrite.js 2009-11-06 02:53:20 UTC (rev 2899) @@ -4,28 +4,41 @@ image.src = url; return image.src; } +var xWaybackIsIE = (navigator.appName=="Microsoft Internet Explorer"); function xLateUrl(aCollection, sProp) { var i = 0; for(i = 0; i < aCollection.length; i++) { if(aCollection[i].getAttribute(sProp) && (aCollection[i].getAttribute(sProp).length > 0) && - (typeof(aCollection[i][sProp]) == "string")) { + (typeof(aCollection[i][sProp]) == "string") && + (aCollection[i][sProp].indexOf("mailto:") == -1) && + (aCollection[i][sProp].indexOf("javascript:") == -1) && + (aCollection[i][sProp].indexOf(sWayBackCGI) == -1) ) { - if(aCollection[i][sProp].indexOf("mailto:") == -1 && - aCollection[i][sProp].indexOf("javascript:") == -1) { - var wmSpecial = aCollection[i].getAttribute("wmSpecial"); - if(wmSpecial && wmSpecial.length > 0) { + if((wmSpecial && wmSpecial.length > 0)) { } else { - if(aCollection[i][sProp].indexOf(sWayBackCGI) == -1) { - if(aCollection[i][sProp].indexOf("http") == 0) { - aCollection[i][sProp] = sWayBackCGI + aCollection[i][sProp]; - } else { - aCollection[i][sProp] = sWayBackCGI 
+ xResolveUrl(aCollection[i][sProp]); - } + var newUrl; + if(aCollection[i][sProp].indexOf("http") == 0) { + newUrl = sWayBackCGI + aCollection[i][sProp]; + } else { + newUrl = sWayBackCGI + xResolveUrl(aCollection[i][sProp]); + } + if(navigator.appName=="Microsoft Internet Explorer") { + var inTmp = aCollection[i].innerHTML; + aCollection[i][sProp] = newUrl; + if(inTmp && + ( (inTmp.indexOf("@") > 0) + || (inTmp.indexOf("www.") == 0) + || (inTmp.indexOf("http://") == 0) + ) + ) { + aCollection[i].innerHTML = inTmp; } + } else { + aCollection[i][sProp] = newUrl; + } } - } } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:50:58
|
Revision: 2898 http://archive-access.svn.sourceforge.net/archive-access/?rev=2898&view=rev Author: bradtofel Date: 2009-11-06 02:50:51 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: extracted logic to determine if the "current" frame is the largest within a set of frames, and added a method to enable toggling the display of 2 elements depending if the "current" frame is the largest. This new method would enable a smaller "disclaimer" to be used in all other frames besides the largest, as a hint to users that the other frames are still loaded from Wayback. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/js/disclaim-element.js Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/js/disclaim-element.js =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/js/disclaim-element.js 2009-11-06 02:38:40 UTC (rev 2897) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/js/disclaim-element.js 2009-11-06 02:50:51 UTC (rev 2898) @@ -5,25 +5,40 @@ return 0; } +function isLargestFrame() { + if(top == self) { + return true; + } + if(top.document.body.tagName == "BODY") { + return false; + } + largestArea = 0; + largestFrame = null; + for(i=0;i<top.frames.length;i++) { + frame = top.frames[i]; + area = getFrameArea(frame); + if(area > largestArea) { + largestFrame = frame; + largestArea = area; + } + } + return (self == largestFrame); +} + function disclaimElement(element) { - if(top!=self) { - if(top.document.body.tagName == "BODY") { - return; - } - largestArea = 0; - largestFrame = null; - for(i=0;i<top.frames.length;i++) { - frame = top.frames[i]; - area = getFrameArea(frame); - if(area > largestArea) { - largestFrame = frame; - largestArea = area; - } - } - if(self!=largestFrame) { - return; - } - } - element.style.display="block"; - 
document.body.insertBefore(element,document.body.firstChild); + if(isLargestFrame()) { + element.style.display="block"; + document.body.insertBefore(element,document.body.firstChild); + } } + +function disclaimToggle(largest, nonLargest) { + if(isLargestFrame()) { + largest.style.display="block"; + nonLargest.style.display="none"; + } else { + largest.style.display="none"; + nonLargest.style.display="block"; + } +} + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:38:51
|
Revision: 2897 http://archive-access.svn.sourceforge.net/archive-access/?rev=2897&view=rev Author: bradtofel Date: 2009-11-06 02:38:40 +0000 (Fri, 06 Nov 2009) Log Message: ----------- TWEAK: removed two replay .jsp insert references which are not in SVN, so no one gets confused looking for them - should have removed them prior to last check in. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2009-11-06 02:37:07 UTC (rev 2896) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2009-11-06 02:38:40 UTC (rev 2897) @@ -97,11 +97,9 @@ <list> <value>/WEB-INF/replay/Timeline.jsp</value> <!-- - <value>/WEB-INF/replay/DisclaimerHead.jsp</value> <value>/WEB-INF/replay/ArchiveComment.jsp</value> <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> <value>/WEB-INF/replay/Disclaimer.jsp</value> - <value>/WEB-INF/replay/Timeline.jsp</value> <value>/WEB-INF/replay/DebugBanner.jsp</value> --> </list> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:37:22
|
Revision: 2896 http://archive-access.svn.sourceforge.net/archive-access/?rev=2896&view=rev Author: bradtofel Date: 2009-11-06 02:37:07 +0000 (Fri, 06 Nov 2009) Log Message: ----------- Improved comments, and now includes many more examples of the various Archival URL replay options. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlSaxReplay.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2009-11-06 02:28:19 UTC (rev 2895) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2009-11-06 02:37:07 UTC (rev 2896) @@ -2,23 +2,59 @@ <beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.springframework.org/schema/beans - http://www.springframework.org/schema/beans/spring-beans-2.5.xsd"> + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> +<!-- + The following bean controls how HTTP headers from original documents + are returned to clients. This configuration will cause + all original HTTP headers to be prefixed with "X-Archive-Orig-". To return + the original HTTP headers as-is (except for Content-Length) comment out the + "prefix" property. This has significant implications on caching and cookies. 
+ --> <bean id="archivalurlhttpheaderprocessor" class="org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor"> <property name="prefix" value="X-Archive-Orig-" /> </bean> - <bean id="archivaldateredirectingreplayrenderer" class="org.archive.wayback.replay.DateRedirectReplayRenderer" /> +<!-- + Renderer for both CSS and Javascript, causing a comment containing + archive information to be inserted in the returned documents. + --> <bean id="archivalcssreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlCSSReplayRenderer"> <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveCSSComment.jsp</value> + </list> + </property> </bean> + <bean id="archivaljsreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlJSReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="jspInserts"> + <list> + <value>/WEB-INF/replay/ArchiveCSSComment.jsp</value> + </list> + </property> + </bean> +<!-- + Renderer which rewrites mms:// URLs inside ASX mime types to be http:// + --> <bean id="archivalasxreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlASXReplayRenderer"> <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> </bean> + +<!-- + Renderer which returns documents as-is. Suitable for images, binary formats, + and anything else Wayback doesn't know how to handle yet. 
+ --> <bean id="archivaltransparentreplayrenderer" class="org.archive.wayback.replay.TransparentReplayRenderer"> <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> </bean> +<!-- + The following bean is an example of the experimental Regex-Based + server-side HTML rewriting Renderer + --> <bean id="archivalserversidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ServerSideHTMLReplayRenderer"> <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> <property name="jspInserts"> @@ -31,14 +67,40 @@ </property> </bean> +<!-- + Custom rules for rewriting HTML content using a SAX parser, for fine-tuned + server-side rewriting. + --> + <import resource="ArchivalUrlSaxReplay.xml"/> + +<!-- + The following bean is an example of the new SAX based rewriting renderer. It + also uses a pluggable character encoding detector, which could allow clients + to issue special requests to Wayback to alter the detection strategy. + --> + <bean id="archivalsaxreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlSAXRewriteReplayRenderer"> + <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> + <property name="charsetDetector"> + <bean class="org.archive.wayback.replay.charset.RotatingCharsetDetector"/> + </property> + <property name="rules" ref="archivalSAXRules"/> + </bean> + +<!-- + The following bean is an example of the "classic" or most mature ArchivalUrl + Replay system - it uses a combination of server-side regex rewriting and + a client-side javascript insert to rewrite links within an HTML page. 
+--> <bean id="archivalclientsidehtmlreplayrenderer" class="org.archive.wayback.archivalurl.ClientSideHTMLReplayRenderer"> <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> <property name="jspInserts"> <list> + <value>/WEB-INF/replay/Timeline.jsp</value> +<!-- + <value>/WEB-INF/replay/DisclaimerHead.jsp</value> <value>/WEB-INF/replay/ArchiveComment.jsp</value> <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> <value>/WEB-INF/replay/Disclaimer.jsp</value> -<!-- <value>/WEB-INF/replay/Timeline.jsp</value> <value>/WEB-INF/replay/DebugBanner.jsp</value> --> @@ -46,15 +108,36 @@ </property> </bean> +<!-- + The main Archival URL replay dispatcher. It uses a list of Selectors to + determine which ReplayRenderer should be used for each document. + --> <bean id="archivalurlreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher"> <property name="selectors"> <list> <!-- REDIRECT IF NOT EXACT DATE --> <bean class="org.archive.wayback.replay.selector.DateMismatchSelector"> - <property name="renderer" ref="archivaldateredirectingreplayrenderer"/> + <property name="renderer"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlDateRedirectReplayRenderer" /> + </property> </bean> + <!-- Explicit (via "cs_" flag) CSS REPLAY --> + <bean class="org.archive.wayback.replay.selector.CSSRequestSelector"> + <property name="renderer" ref="archivalcssreplayrenderer"/> + </bean> + + <!-- Explicit (via "js_" flag) JS REPLAY --> + <bean class="org.archive.wayback.replay.selector.JSRequestSelector"> + <property name="renderer" ref="archivaljsreplayrenderer"/> + </bean> + + <!-- Explicit (via "im_" flag) IMG REPLAY --> + <bean class="org.archive.wayback.replay.selector.IMGRequestSelector"> + <property name="renderer" ref="archivaltransparentreplayrenderer"/> + </bean> + <!-- HTML REPLAY --> <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> <property name="mimeContains"> @@ -63,8 +146,12 @@ <value>application/xhtml</value> 
</list> </property> + <property name="renderer" ref="archivalsaxreplayrenderer"/> +<!-- <property name="renderer" ref="archivalclientsidehtmlreplayrenderer"/> + --> </bean> + <!-- CSS REPLAY --> <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlSaxReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlSaxReplay.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlSaxReplay.xml 2009-11-06 02:37:07 UTC (rev 2896) @@ -0,0 +1,163 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + + + <bean id="anchorURLRewriter" + class="org.archive.wayback.replay.html.transformer.URLStringTransformer"> + </bean> + <bean id="imageURLRewriter" + class="org.archive.wayback.replay.html.transformer.URLStringTransformer"> + <property name="flags" value="im_" /> + </bean> + <bean id="jsURLRewriter" + class="org.archive.wayback.replay.html.transformer.URLStringTransformer"> + <property name="flags" value="js_" /> + </bean> + <bean id="cssURLRewriter" + class="org.archive.wayback.replay.html.transformer.URLStringTransformer"> + <property name="flags" value="cs_" /> + </bean> + <bean id="baseHrefHandler" + class="org.archive.wayback.replay.html.transformer.BaseHrefStringTransformer"> + </bean> + <bean id="cssAttributeHandler" + class="org.archive.wayback.replay.html.transformer.InlineCSSStringTransformer"> + </bean> + <bean id="cssBlockHandler" + class="org.archive.wayback.replay.html.transformer.BlockCSSStringTransformer"> + </bean> + + + + <bean 
id="archivalSAXRules" + class="org.archive.wayback.replay.html.ReplayParseEventDelegator"> + <property name="parserVisitors"> + <list> + <bean class="org.archive.wayback.replay.html.rules.CommentRule"> + </bean> + <bean + class="org.archive.wayback.replay.html.rules.AfterBodyStartTagJSPExecRule"> + <property name="jspPath" value="/WEB-INF/replay/DebugBanner.jsp" /> + </bean> + + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="A" /> + <property name="modifyAttributeName" value="HREF" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="IMG" /> + <property name="modifyAttributeName" value="SRC" /> + <property name="transformer" ref="imageURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="SCRIPT" /> + <property name="modifyAttributeName" value="SRC" /> + <property name="transformer" ref="jsURLRewriter" /> + </bean> + + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="BASE" /> + <property name="modifyAttributeName" value="HREF" /> + <property name="transformer" ref="baseHrefHandler" /> + </bean> + + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="modifyAttributeName" value="BACKGROUND" /> + <property name="transformer" ref="imageURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="FRAME" /> + <property name="modifyAttributeName" value="SRC" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="INPUT" /> + <property name="modifyAttributeName" value="SRC" /> + <property 
name="transformer" ref="imageURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="IFRAME" /> + <property name="modifyAttributeName" value="SRC" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="META" /> + <property name="modifyAttributeName" value="URL" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="FORM" /> + <property name="modifyAttributeName" value="ACTION" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + + <property name="tagName" value="LINK" /> + <property name="whereAttributeName" value="rel" /> + <property name="whereAttributeValue" value="Stylesheet" /> + + <property name="modifyAttributeName" value="HREF" /> + <property name="transformer" ref="cssURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="LINK" /> + <property name="whereAttributeName" value="rel" /> + <property name="whereAttributeValue" value="shortcut icon" /> + <property name="modifyAttributeName" value="HREF" /> + <property name="transformer" ref="imageURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="LINK" /> + <property name="modifyAttributeName" value="HREF" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="AREA" /> + <property name="modifyAttributeName" value="HREF" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + 
+ <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="OBJECT" /> + <property name="modifyAttributeName" value="CODEBASE" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="OBJECT" /> + <property name="modifyAttributeName" value="CDATA" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="APPLET" /> + <property name="modifyAttributeName" value="CODEBASE" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="APPLET" /> + <property name="modifyAttributeName" value="ARCHIVE" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="tagName" value="EMBED" /> + <property name="modifyAttributeName" value="SRC" /> + <property name="transformer" ref="anchorURLRewriter" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.StyleContentRule"> + <property name="transformer" ref="cssBlockHandler" /> + </bean> + <bean class="org.archive.wayback.replay.html.rules.AttributeModifyingRule"> + <property name="modifyAttributeName" value="style" /> + <property name="transformer" ref="cssAttributeHandler" /> + </bean> + </list> + </property> + </bean> + +</beans> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:28:38
|
Revision: 2895 http://archive-access.svn.sourceforge.net/archive-access/?rev=2895&view=rev Author: bradtofel Date: 2009-11-06 02:28:19 +0000 (Fri, 06 Nov 2009) Log Message: ----------- INITIAL REV: a .jsp which creates a comment suitable for insertion in either CSS or javascript documents. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveCSSComment.jsp Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveCSSComment.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveCSSComment.jsp (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveCSSComment.jsp 2009-11-06 02:28:19 UTC (rev 2895) @@ -0,0 +1,21 @@ +<%@ page language="java" pageEncoding="utf-8" contentType="text/css;charset=utf-8"%> +<%@ page import="java.util.Date" %> +<%@ page import="org.archive.wayback.core.UIResults" %> +<%@ page import="org.archive.wayback.util.StringFormatter" %> +<% +UIResults results = UIResults.extractReplay(request); +StringFormatter fmt = results.getWbRequest().getFormatter(); +Date exactDate = results.getResult().getCaptureDate(); +Date now = new Date(); +String prettyDateFormat = "{0,date,H:mm:ss MMM d, yyyy}"; +String prettyArchiveString = fmt.format(prettyDateFormat,exactDate); +String prettyRequestString = fmt.format(prettyDateFormat,now); +%> +/* + FILE ARCHIVED ON <%= prettyArchiveString %> AND RETRIEVED FROM THE + INTERNET ARCHIVE ON <%= prettyRequestString %>. + JAVASCRIPT APPENDED BY WAYBACK MACHINE, COPYRIGHT INTERNET ARCHIVE. + + ALL OTHER CONTENT MAY ALSO BE PROTECTED BY COPYRIGHT (17 U.S.C. + SECTION 108(a)(3)). +*/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:17:21
|
Revision: 2894 http://archive-access.svn.sourceforge.net/archive-access/?rev=2894&view=rev Author: bradtofel Date: 2009-11-06 02:17:12 +0000 (Fri, 06 Nov 2009) Log Message: ----------- BUGFIX: uses date from actual SearchResult being returned, which may be more accurate than the search result date Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp 2009-11-06 02:07:11 UTC (rev 2893) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp 2009-11-06 02:17:12 UTC (rev 2894) @@ -21,8 +21,8 @@ StringFormatter fmt = wbRequest.getFormatter(); CaptureSearchResults cResults = results.getCaptureResults(); -String exactDateStr = wbRequest.getReplayTimestamp(); -Date exactDate = wbRequest.getReplayDate(); +String exactDateStr = results.getResult().getCaptureTimestamp(); +Date exactDate = results.getResult().getCaptureDate(); String searchUrl = wbRequest.getRequestUrl(); String resolution = wbRequest.getTimelineResolution(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:07:18
|
Revision: 2893 http://archive-access.svn.sourceforge.net/archive-access/?rev=2893&view=rev Author: bradtofel Date: 2009-11-06 02:07:11 +0000 (Fri, 06 Nov 2009) Log Message: ----------- TWEAK: added Robots HTTP header string Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java 2009-11-06 02:05:52 UTC (rev 2892) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/WaybackConstants.java 2009-11-06 02:07:11 UTC (rev 2893) @@ -53,4 +53,8 @@ */ public final static String LOCATION_HTTP_HEADER = "Location"; + /** + * HTTP Header for robot instructions. See http://noarchive.net/xrobots/ + */ + public final static String X_ROBOTS_HTTP_HEADER = "X-Robots-Tag"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:06:02
|
Revision: 2892 http://archive-access.svn.sourceforge.net/archive-access/?rev=2892&view=rev Author: bradtofel Date: 2009-11-06 02:05:52 +0000 (Fri, 06 Nov 2009) Log Message: ----------- INITIAL REV: code we use at IA when extracting records from existing WARC files into a new WARC - this tool builds a WARC header record for the new WARC file. Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/scripts/warc-header trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java Added: trunk/archive-access/projects/wayback/dist/src/scripts/warc-header =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/warc-header (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/warc-header 2009-11-06 02:05:52 UTC (rev 2892) @@ -0,0 +1,78 @@ +#!/usr/bin/env sh +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## WAYBACK_HOME Pointer to your wayback install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. +## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set WAYBACK_HOME. +if [ -z "$WAYBACK_HOME" ] +then + WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + JAVACMD=$JAVA_HOME/bin/java +fi + +# Ignore previous classpath. 
Build one that contains heritrix jar and content +# of the lib directory into the variable CP. +for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +# Main ArcIndexer class. +if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.wayback.util.WARCHeader' +fi + +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" + Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java 2009-11-06 02:05:52 UTC (rev 2892) @@ -0,0 +1,63 @@ +package org.archive.wayback.util; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.archive.io.warc.WARCWriter; +import org.archive.util.anvl.ANVLRecord; + +public class WARCHeader { + private void writeHeaderRecord(File target, File fieldsSrc, String id) + throws IOException { + + WARCWriter writer = null; + + BufferedOutputStream bos = + new BufferedOutputStream(new FileOutputStream(target)); + + FileInputStream is = new FileInputStream(fieldsSrc); + ANVLRecord ar = ANVLRecord.load(is); + + List<String> metadata = new ArrayList<String>(1); + metadata.add(ar.toString()); + + writer = new WARCWriter(null, bos, target, true, null, + metadata); + // Write a warcinfo record with description about how this WARC + // was made. 
+ writer.writeWarcinfoRecord(target.getName(), "Made from " + + id + " by " + + this.getClass().getName()); + + } + + public static void main(String[] args) { + if (args.length != 3) { + System.err.println("USAGE: tgtWarc fieldsSrc id"); + System.err.println("\ttgtWarc is the path to the target WARC.gz"); + System.err.println("\tfieldsSrc is the path to the text of the record"); + System.err.println("\t\tmake sure each line is terminated by \\r\\n"); + System.err.println("\t\tand that the file ends with a blank, \\r\\n terminiated line"); + System.err.println("\tid is the XXX in:"); + System.err.println("\t\tContent-Description: Made from XXX by org.archive.wayback.util.WARCHeader"); + System.err.println("\t\tof the header record... header..."); + System.exit(1); + } + File target = new File(args[0]); + File fieldSrc = new File(args[1]); + String id = args[2]; + WARCHeader header = new WARCHeader(); + try { + header.writeHeaderRecord(target, fieldSrc, id); + } catch (IOException e) { + e.printStackTrace(); + System.exit(2); + } + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:03:14
|
Revision: 2891 http://archive-access.svn.sourceforge.net/archive-access/?rev=2891&view=rev Author: bradtofel Date: 2009-11-06 02:02:57 +0000 (Fri, 06 Nov 2009) Log Message: ----------- INITIAL REV: some basic tests for new code Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/cdx/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/cdx/format/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatTest.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/filters/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/filters/FileRegexFilterTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java 2009-11-06 02:02:57 UTC (rev 2891) @@ -0,0 +1,131 @@ +/* ReplayRequestParserTest + * + * $Id$ + * + * Created on 12:03:48 PM Feb 12, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.archivalurl.requestparser; + +import org.archive.wayback.archivalurl.ArchivalUrlRequestParser; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.requestparser.BaseRequestParser; + +import junit.framework.TestCase; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class ReplayRequestParserTest extends TestCase { + + /** + * Test method for {@link org.archive.wayback.archivalurl.requestparser.ReplayRequestParser#parse(java.lang.String)}. 
+ */ + public void testParseString() { + BaseRequestParser wrapped = new ArchivalUrlRequestParser(); + ReplayRequestParser p = new ReplayRequestParser(wrapped); + WaybackRequest r; + r = p.parse(""); + assertNull("Should not parse empty string", r); + r = p.parse("20070101000000/foo.com"); + assertNotNull("Should parse legit request sans scheme", r); + assertEquals("parsed request Url",r.getRequestUrl(),"http://foo.com"); + assertEquals("Parsed timestamp","20070101000000",r.getReplayTimestamp()); + + r = p.parse("20070101000000/foo.com/"); + assertEquals("parsed request Url, maintaining trailing slash", + "http://foo.com/",r.getRequestUrl()); + + r = p.parse("200701010000/foo.com"); + assertEquals("parsed partial date", + "http://foo.com",r.getRequestUrl()); + assertEquals("Parsed partial timestamp to earliest", + "20070101000000",r.getReplayTimestamp()); + + r = p.parse("20070101000000/http://foo.com"); + assertEquals("parsed request Url with scheme", + "http://foo.com",r.getRequestUrl()); + + r = p.parse("20070101000000/http://foo.com/"); + assertEquals("parsed request Url with scheme and trailing slash", + "http://foo.com/",r.getRequestUrl()); + + r = p.parse("20070101000000/ftp://foo.com/"); + assertEquals("parsed request Url with ftp scheme", + "ftp://foo.com/",r.getRequestUrl()); + + r = p.parse("20070101000000/https://foo.com/"); + assertEquals("parsed request Url with https scheme", + "https://foo.com/",r.getRequestUrl()); + + r = p.parse("20070101000000js_/http://foo.com/"); + assertEquals("parsed request Url with js_ flag", + "http://foo.com/",r.getRequestUrl()); + assertTrue("parsed js_ flag",r.isJSContext()); + assertFalse("css not set",r.isCSSContext()); + + r = p.parse("20070101000000cs_/http://foo.com/"); + assertEquals("parsed request Url with cs_ flag", + "http://foo.com/",r.getRequestUrl()); + assertTrue("parsed cs_ flag",r.isCSSContext()); + assertFalse("js not set",r.isJSContext()); + + r = p.parse("20070101000000cs_js_/http://foo.com/"); + 
assertEquals("parsed request Url with cs_ and js_ flags", + "http://foo.com/",r.getRequestUrl()); + assertTrue("parsed cs_ flag",r.isCSSContext()); + assertTrue("parsed js_ flag",r.isJSContext()); + + r = p.parse("20070101000000js_cs_/http://foo.com/"); + assertEquals("parsed request Url with cs_ and js_ flags, backvards", + "http://foo.com/",r.getRequestUrl()); + assertTrue("parsed cs_ flag",r.isCSSContext()); + assertTrue("parsed js_ flag",r.isJSContext()); + + r = p.parse("20070101000000un_/http://foo.com/"); + assertEquals("parsed request Url with unknown flag", + "http://foo.com/",r.getRequestUrl()); + assertFalse("no cs_ flag",r.isCSSContext()); + assertFalse("no js_ flag",r.isJSContext()); + + r = p.parse("20070101000000un_js_cs_/http://foo.com/"); + assertEquals("parsed request Url with falgs and unknown flag", + "http://foo.com/",r.getRequestUrl()); + assertTrue("parsed cs_ flag",r.isCSSContext()); + assertTrue("parsed js_ flag",r.isJSContext()); + + r = p.parse("20070101000000js_cs_un_/http://foo.com/"); + assertEquals("parsed request Url with falgs and unknown flag at end", + "http://foo.com/",r.getRequestUrl()); + assertTrue("parsed cs_ flag",r.isCSSContext()); + assertTrue("parsed js_ flag",r.isJSContext()); + + r = p.parse("20070101000000un_js_cs_un_/http://foo.com/"); + assertEquals("parsed request Url with falgs and unknown flags", + "http://foo.com/",r.getRequestUrl()); + assertTrue("parsed cs_ flag",r.isCSSContext()); + assertTrue("parsed js_ flag",r.isJSContext()); + + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatTest.java =================================================================== 
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatTest.java 2009-11-06 02:02:57 UTC (rev 2891) @@ -0,0 +1,69 @@ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; + +import junit.framework.TestCase; + +public class CDXFormatTest extends TestCase { + public void testParseSpec() { + CaptureSearchResult c; + CDXFormat f = OKFormat(" CDX a V"); + c = OKParse(f,"http://foo.com 12"); + assertEquals("http://foo.com",c.getOriginalUrl()); + assertEquals(c.getOffset(), 12); + + + f = OKFormat(" CDX a V k"); + c = OKParse(f,"http://foo.com 12 10"); + assertEquals("http://foo.com",c.getOriginalUrl()); + assertEquals(12,c.getOffset()); + assertEquals("10",c.getDigest()); + + + exceptionFormat("CDX a k"); + exceptionFormat("\tCDX a k"); + exceptionFormat("\tCDX a k "); + exceptionFormat(" CDX\ta k"); + exceptionFormat(" CDX\ta k\t"); + exceptionFormat(" CDX\ta\tk\t"); + + f = OKFormat(" CDX\ta\tV\tk"); + c = OKParse(f,"http://foo.com\t12\t10"); + assertEquals("http://foo.com",c.getOriginalUrl()); + assertEquals(12,c.getOffset()); + assertEquals("10",c.getDigest()); + + c = OKParse(f,"http://foo .com\t12\t10"); + assertEquals("http://foo .com",c.getOriginalUrl()); + assertEquals(12,c.getOffset()); + assertEquals("10",c.getDigest()); + } + private CaptureSearchResult OKParse(CDXFormat f, String line) { + CaptureSearchResult r = null; + try { + r = f.parseResult(line); + } catch (CDXFormatException e) { + fail(e.getLocalizedMessage()); + } + return r; + } + private CDXFormat OKFormat(String format) { + CDXFormat f = null; + try { + f = new CDXFormat(format); + } catch 
(CDXFormatException e) { + fail("Format '" + format + "' should NOT have thrown exception"); + } + return f; + } + private void exceptionFormat(String format) { + try { + new CDXFormat(format); + fail("Format '" + format + "' should have thrown exception"); + } catch (CDXFormatException e) { + } + + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/filters/FileRegexFilterTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/filters/FileRegexFilterTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/filters/FileRegexFilterTest.java 2009-11-06 02:02:57 UTC (rev 2891) @@ -0,0 +1,49 @@ +package org.archive.wayback.resourceindex.filters; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +import junit.framework.TestCase; + +public class FileRegexFilterTest extends TestCase { + String[] patterns = {"^one-", "^two-"}; + + public void testGetSetPatterns() { + FileRegexFilter f = new FileRegexFilter(); + List<String> in = Arrays.asList(patterns); + f.setPatterns(in); + List<String> out = f.getPatterns(); + assertTrue(listCmp(in,out)); + } + + public void testFilterObject() { + List<String> in = Arrays.asList(patterns); + FileRegexFilter f = new FileRegexFilter(); + f.setPatterns(in); + CaptureSearchResult c = new CaptureSearchResult(); + c.setFile("one-11"); + assertEquals(f.filterObject(c), ObjectFilter.FILTER_INCLUDE); + 
c.setFile("onedd-11"); + assertEquals(f.filterObject(c), ObjectFilter.FILTER_EXCLUDE); + c.setFile("two-11"); + assertEquals(f.filterObject(c), ObjectFilter.FILTER_INCLUDE); + f.setPatterns(new ArrayList<String>()); + assertEquals(f.filterObject(c), ObjectFilter.FILTER_EXCLUDE); + } + private boolean listCmp(List<String> one, List<String> two) { + if(one.size() != two.size()) { + return false; + } + int size = one.size(); + for(int i = 0; i < size; i++) { + if(!one.get(i).equals(two.get(i))) { + return false; + } + } + return true; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/filters/FileRegexFilterTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 02:01:59
|
Revision: 2890 http://archive-access.svn.sourceforge.net/archive-access/?rev=2890&view=rev Author: bradtofel Date: 2009-11-06 02:01:45 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REMOVE: this accidentally got checked in -- nothing but empty tests, but the code it's testing is not ready for prime-time Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/partition/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 01:57:47
|
Revision: 2889 http://archive-access.svn.sourceforge.net/archive-access/?rev=2889&view=rev Author: bradtofel Date: 2009-11-06 01:57:37 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REMOVED: empty test cases Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/FileDownloaderTest.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/FileDownloaderTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/FileDownloaderTest.java 2009-11-06 01:54:34 UTC (rev 2888) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/FileDownloaderTest.java 2009-11-06 01:57:37 UTC (rev 2889) @@ -1,69 +0,0 @@ -/* FileDownloaderTest - * - * $Id$ - * - * Created on 3:46:13 PM Jan 25, 2007. - * - * Copyright (C) 2007 Internet Archive. - * - * This file is part of wayback-svn. - * - * wayback-svn is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback-svn is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with wayback-svn; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.util; - -//import java.io.File; -//import java.net.URL; - -import junit.framework.TestCase; - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class FileDownloaderTest extends TestCase { - /** - * @throws Exception - */ - public void testDownload() throws Exception { -// URL url = new URL("http://homeserver.us.archive.org/~brad/tmp.del.gz"); -// String wantHex = "01051ca0aabef856e9bdcee4ac23f66f"; -// File tmp = File.createTempFile("tmp","del"); -// FileDownloader downloader = new FileDownloader(); -// downloader.setDigest(true); -// downloader.download(url,tmp); -// assertTrue(tmp.exists()); -// assertEquals(downloader.getLastDigest(),wantHex); -// assertTrue(tmp.delete()); - } - - /** - * @throws Exception - */ - public void testDownloadGZ() throws Exception { -// URL url = new URL("http://homeserver.us.archive.org/~brad/tmp.del.gz"); -// String wantHex = "765dcbfb102670a6e75859599cb38fe4"; -// File tmp = File.createTempFile("tmp","del"); -// FileDownloader downloader = new FileDownloader(); -// downloader.setDigest(true); -// downloader.downloadGZ(url,tmp); -// assertTrue(tmp.exists()); -// assertEquals(downloader.getLastDigest(),wantHex); -// assertTrue(tmp.delete()); - } - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 01:54:46
|
Revision: 2888 http://archive-access.svn.sourceforge.net/archive-access/?rev=2888&view=rev Author: bradtofel Date: 2009-11-06 01:54:34 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: CDX to SearchResult adapters now use CDXFormat Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXFormatAdapter.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java 2009-11-06 01:54:34 UTC (rev 2888) @@ -0,0 +1,44 @@ +package org.archive.wayback.resourceindex.cdx; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.Iterator; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.CloseableIterator; + +public class CDXFormatIndex extends CDXIndex { + public final static String CDX_HEADER_MAGIC = " CDX N b a m s k r M V g"; + + private CDXFormat cdx = null; + private long lastMod = -1; + + protected CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr) + throws IOException { + + long nowMod = file.lastModified(); + if(nowMod > 
lastMod) { + try { + // BUGBUG: I don't think java will let us do much better than + // this... No way to stat() a filehandle, right? + BufferedReader fr = new BufferedReader(new FileReader(file)); + cdx = new CDXFormat(fr.readLine()); + lastMod = nowMod; + fr.close(); + } catch (CDXFormatException e) { + lastMod = -1; + try { + cdx = new CDXFormat(CDX_HEADER_MAGIC); + } catch (CDXFormatException e1) { + throw new IOException(e1); + } + } + } + return new AdaptedIterator<String,CaptureSearchResult>(itr, + new CDXFormatToSearchResultAdapter(cdx)); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatIndex.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java 2009-11-06 01:54:34 UTC (rev 2888) @@ -0,0 +1,27 @@ +package org.archive.wayback.resourceindex.cdx; + +import java.util.logging.Logger; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; +import org.archive.wayback.util.Adapter; + +public class CDXFormatToSearchResultAdapter implements Adapter<String,CaptureSearchResult> { + private static final Logger LOGGER = Logger.getLogger( + CDXFormatToSearchResultAdapter.class.getName()); + + private CDXFormat cdx = null; + public CDXFormatToSearchResultAdapter(CDXFormat cdx) { + 
this.cdx = cdx; + } + + public CaptureSearchResult adapt(String line) { + try { + return cdx.parseResult(line); + } catch (CDXFormatException e) { + LOGGER.warning("CDXFormat(" + line + "):"+e.getLocalizedMessage()); + } + return null; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/CDXFormatToSearchResultAdapter.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXFormatAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXFormatAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXFormatAdapter.java 2009-11-06 01:54:34 UTC (rev 2888) @@ -0,0 +1,28 @@ +package org.archive.wayback.resourceindex.cdx; + +import java.util.Iterator; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; + +public class SearchResultToCDXFormatAdapter implements + Adapter<CaptureSearchResult, String> { + + private CDXFormat cdx = null; + + public SearchResultToCDXFormatAdapter(CDXFormat cdx) { + this.cdx = cdx; + } + + public String adapt(CaptureSearchResult o) { + return cdx.serializeResult(o); + } + public static Iterator<String> adapt(Iterator<CaptureSearchResult> input, + CDXFormat cdx) { + SearchResultToCDXFormatAdapter adapter = + new SearchResultToCDXFormatAdapter(cdx); + return new AdaptedIterator<CaptureSearchResult,String>(input,adapter); + } +} Property changes on: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/SearchResultToCDXFormatAdapter.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 01:53:33
|
Revision: 2887 http://archive-access.svn.sourceforge.net/archive-access/?rev=2887&view=rev Author: bradtofel Date: 2009-11-06 01:53:23 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: Moved common HTTP header parsing code into HTTPRecordAnnotater FEATURE: HTML content is now parsed using the SAX parser, to search for META robots tags FEATURE: Now HTTP headers are inspected for Robot related instructions Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2009-11-06 01:50:20 UTC (rev 2886) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -36,7 +36,6 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.url.IdentityUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; /** * @@ -50,13 +49,14 @@ // private 
static final Logger LOGGER = Logger.getLogger( // ARCRecordToSearchResultAdapter.class.getName()); + private HTTPRecordAnnotater annotater = null; private UrlCanonicalizer canonicalizer = null; public ARCRecordToSearchResultAdapter() { canonicalizer = new IdentityUrlCanonicalizer(); + annotater = new HTTPRecordAnnotater(); } -// public static SearchResult arcRecordToSearchResult(final ARCRecord rec) -// throws IOException, ParseException { + /* (non-Javadoc) * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) */ @@ -68,7 +68,7 @@ return null; } } - + private CaptureSearchResult adaptInner(ARCRecord rec) throws IOException { rec.close(); ARCRecordMetaData meta = rec.getMetaData(); @@ -84,12 +84,14 @@ // initialize with default HTTP code... result.setHttpCode("-"); + result.setRedirectUrl("-"); result.setDigest(rec.getDigestStr()); - result.setMimeType(meta.getMimetype()); result.setCaptureTimestamp(meta.getDate()); - String uriStr = meta.getUrl(); + result.setOriginalUrl(uriStr); + + if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) { // skip filedesc record altogether... return null; @@ -97,49 +99,20 @@ if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { // skip URL + HTTP header processing for dns records... - result.setOriginalUrl(uriStr); - result.setRedirectUrl("-"); result.setUrlKey(uriStr); - + result.setMimeType("text/dns"); + result.setEndOffset(rec.compressedBytes); + } else { - result.setOriginalUrl(uriStr); + result.setUrlKey(canonicalizer.urlStringToKey(uriStr)); - String statusCode = (meta.getStatusCode() == null) ? 
"-" : meta .getStatusCode(); result.setHttpCode(statusCode); - String redirectUrl = "-"; Header[] headers = rec.getHttpHeaders(); - if (headers != null) { - - for (int i = 0; i < headers.length; i++) { - if (headers[i].getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = headers[i].getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? - // right now, we're ignoring "Content-Location" - redirectUrl = UrlOperations.resolveUrl(uriStr, - locationStr); - - break; - } - } - result.setRedirectUrl(redirectUrl); - - String urlKey = canonicalizer.urlStringToKey(meta.getUrl()); - result.setUrlKey(urlKey); - } + annotater.annotateHTTPContent(result, rec, headers, meta.getMimetype()); } return result; } @@ -149,4 +122,18 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } + + /** + * @return the annotater + */ + public HTTPRecordAnnotater getAnnotater() { + return annotater; + } + + /** + * @param annotater the annotater to set + */ + public void setAnnotater(HTTPRecordAnnotater annotater) { + this.annotater = annotater; + } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,144 @@ +package 
org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.Header; +import org.archive.wayback.WaybackConstants; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.htmllex.ContextAwareLexer; +import org.archive.wayback.util.htmllex.ParseEventDelegator; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.url.UrlOperations; +import org.htmlparser.Node; +import org.htmlparser.lexer.Lexer; +import org.htmlparser.lexer.Page; +import org.htmlparser.util.ParserException; + +public class HTTPRecordAnnotater { + private RobotMetaRule rule = null; + private ParseEventDelegator rules = null; + private RobotMetaFlags robotFlags; + private static final Logger LOGGER = + Logger.getLogger(HTTPRecordAnnotater.class.getName()); + + private final static String[] mimes = { + "html" + }; + public HTTPRecordAnnotater() { + rules = new ParseEventDelegator(); + rules.init(); + rule = new RobotMetaRule(); + robotFlags = new RobotMetaFlags(); + rule.setRobotFlags(robotFlags); + rule.visit(rules); + } + public boolean isHTML(String mimeType) { + String mimeLower = mimeType.toLowerCase(); + for(String mime : mimes) { + if(mimeLower.contains(mime)) { + return true; + } + } + return false; + } + + private String escapeSpaces(final String input) { + if(input.contains(" ")) { + return input.replace(" ", "%20"); + } + return input; + } + + public String transformHTTPMime(String input) { + int semiIdx = input.indexOf(";"); + if(semiIdx > 0) { + return escapeSpaces(input.substring(0,semiIdx).trim()); + } + return escapeSpaces(input.trim()); + } + + public void annotateHTTPContent(CaptureSearchResult result, + InputStream is, Header[] headers, String mimeGuess) { + robotFlags.reset(); + String mimeType = null; + if (headers != null) { + + for (Header httpHeader : 
headers) { + if (httpHeader.getName().equals( + WaybackConstants.LOCATION_HTTP_HEADER)) { + + String locationStr = httpHeader.getValue(); + // TODO: "Location" is supposed to be absolute: + // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) + // (section 14.30) but Content-Location can be + // relative. + // is it correct to resolve a relative Location, as + // we are? + // it's also possible to have both in the HTTP + // headers... + // should we prefer one over the other? + // right now, we're ignoring "Content-Location" + result.setRedirectUrl( + UrlOperations.resolveUrl(result.getOriginalUrl(), + locationStr)); + + } else if(httpHeader.getName().toLowerCase().equals("content-type")) { + mimeType = transformHTTPMime(httpHeader.getValue()); + } else if(httpHeader.getName().toLowerCase().equals( + WaybackConstants.X_ROBOTS_HTTP_HEADER)) { + + robotFlags.parse(httpHeader.getValue()); + } + } + } + + // TODO: get the encoding: + String encoding = "utf-8"; + if(mimeType == null) { + // nothing present in the HTTP headers.. 
Use the WARC field: + mimeType = transformHTTPMime(mimeGuess); + } + result.setMimeType(mimeType); + // Now the sticky part: If it looks like an HTML document, look for + // robot meta tags: + if(isHTML(mimeType)) { + String fileContext = result.getFile() + ":" + result.getOffset(); + annotateHTMLContent(is, encoding, fileContext, result); + } + robotFlags.apply(result); + + } + + public void annotateHTMLContent(InputStream is, String charSet, String fileContext, + CaptureSearchResult result) { + + ParseContext context = new ParseContext(); + + Node node; + try { + ContextAwareLexer lex = new ContextAwareLexer( + new Lexer(new Page(is,charSet)),context); + while((node = lex.nextNode()) != null) { +// System.err.println("\nDEBUG-Node:js("+context.isInJS()+")css("+context.isInCSS()+"):"); +// System.err.println("-------------------/START"); +// System.err.println(node.toHtml(true)); +// System.err.println("-------------------/END"); + rules.handleNode(context, node); + } + rules.handleParseComplete(context); + } catch (ParserException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } catch (UnsupportedEncodingException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } catch (IOException e) { + LOGGER.warning(fileContext + " " + e.getLocalizedMessage()); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,44 @@ +package org.archive.wayback.resourcestore.indexer; + +import org.archive.wayback.core.CaptureSearchResult; + +public class RobotMetaFlags { + private static String NO_NOTHIN_MATCH = "NONE"; + private static String NO_FOLLOW_MATCH = "NOFOLLOW"; + private static String NO_INDEX_MATCH = "NOINDEX"; + private static String NO_ARCHIVE_MATCH = "NOARCHIVE"; + + private boolean noArchive = false; + private boolean noIndex = false; + private boolean noFollow = false; + public void reset() { + noArchive = false; + noIndex = false; + noFollow = false; + } + public void parse(String content) { + if(content == null) { + return; + } + String up = content.replaceAll("-", "").toUpperCase(); + if(up.contains(NO_FOLLOW_MATCH)) { + noFollow = true; + } + if(up.contains(NO_ARCHIVE_MATCH)) { + noArchive = true; + } + if(up.contains(NO_INDEX_MATCH)) { + noIndex = true; + } + if(up.contains(NO_NOTHIN_MATCH)) { + noFollow = true; + noArchive = true; + noIndex = true; + } + } + public void apply(CaptureSearchResult result) { + if(noFollow) result.setRobotNoFollow(); + if(noIndex) result.setRobotNoIndex(); + if(noArchive) result.setRobotNoArchive(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -0,0 +1,47 @@ +package org.archive.wayback.resourcestore.indexer; + +import java.io.IOException; + +import org.archive.wayback.util.htmllex.ParseEventDelegator; +import org.archive.wayback.util.htmllex.ParseEventDelegatorVisitor; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.handlers.OpenTagHandler; +import org.htmlparser.nodes.TagNode; + +public class RobotMetaRule implements ParseEventDelegatorVisitor, OpenTagHandler { + + private RobotMetaFlags robotFlags = null; + + public void visit(ParseEventDelegator rules) { + // register for <META> Start tags: + rules.addOpenTagHandler(this, "META"); + } + + public void handleOpenTagNode(ParseContext context, TagNode node) + throws IOException { + String nameVal = node.getAttribute("name"); + if(nameVal != null) { + if(nameVal.toUpperCase().equals("ROBOTS")) { + String content = node.getAttribute("content"); + if(content != null) { + robotFlags.parse(content); + } + } + } + } + + /** + * @return the robotFlags + */ + public RobotMetaFlags getRobotFlags() { + return robotFlags; + } + + /** + * @param robotFlags the robotFlags to set + */ + public void setRobotFlags(RobotMetaFlags robotFlags) { + this.robotFlags = robotFlags; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2009-11-06 01:50:20 UTC (rev 2886) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2009-11-06 01:53:23 UTC (rev 2887) @@ -2,23 +2,23 @@ import java.io.File; import java.io.IOException; -import java.util.logging.Logger; +//import java.util.logging.Logger; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpParser; import org.apache.commons.httpclient.StatusLine; +import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.EncodingUtil; +import org.apache.log4j.Logger; import org.archive.io.ArchiveRecordHeader; import org.archive.io.RecoverableIOException; import org.archive.io.arc.ARCConstants; import org.archive.io.warc.WARCConstants; import org.archive.io.warc.WARCRecord; import org.archive.wayback.UrlCanonicalizer; -import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** * Adapts certain WARCRecords into SearchResults. 
DNS and response records are @@ -33,29 +33,23 @@ */ public class WARCRecordToSearchResultAdapter implements Adapter<WARCRecord,CaptureSearchResult>{ + private static final Logger LOGGER = Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName()); private final static String DEFAULT_VALUE = "-"; - private UrlCanonicalizer canonicalizer = null; + private HTTPRecordAnnotater annotater = null; private boolean processAll = false; - public boolean isProcessAll() { - return processAll; - } - - public void setProcessAll(boolean processAll) { - this.processAll = processAll; - } - public WARCRecordToSearchResultAdapter() { - canonicalizer = new AggressiveUrlCanonicalizer(); + canonicalizer = new IdentityUrlCanonicalizer(); + annotater = new HTTPRecordAnnotater(); } - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + /* + * This just calls adaptInner, returning null if an Exception is thrown: */ public CaptureSearchResult adapt(WARCRecord rec) { try { @@ -65,121 +59,94 @@ return null; } } - - /* - * Transform input date to 14-digit timestamp: - * 2007-08-29T18:00:26Z => 20070829180026 - */ - private static String transformDate(final String input) { + + private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { - StringBuilder output = new StringBuilder(14); - - output.append(input.substring(0,4)); - output.append(input.substring(5,7)); - output.append(input.substring(8,10)); - output.append(input.substring(11,13)); - output.append(input.substring(14,16)); - output.append(input.substring(17,19)); - - return output.toString(); - } - - private static String escapeSpaces(final String input) { - if(input.contains(" ")) { - return input.replace(" ", "%20"); - } - return input; - } - - private static String transformHTTPMime(String input) { - int semiIdx = input.indexOf(";"); - if(semiIdx > 0) { - return escapeSpaces(input.substring(0,semiIdx).trim()); - } - return escapeSpaces(input.trim()); - } + ArchiveRecordHeader 
header = rec.getHeader(); - private String transformWarcFilename(String readerIdentifier) { - String warcName = readerIdentifier; - int index = warcName.lastIndexOf(File.separator); - if (index > 0 && (index + 1) < warcName.length()) { - warcName = warcName.substring(index + 1); + String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); + if(type.equals(WARCConstants.WARCINFO)) { + LOGGER.info("Skipping record type : " + type); + return null; } - return warcName; - } - private String transformDigest(final Object o) { - if(o == null) { - return DEFAULT_VALUE; + CaptureSearchResult result = genericResult(rec); + + if(type.equals(WARCConstants.RESPONSE)) { + String mime = annotater.transformHTTPMime(header.getMimetype()); + if(mime.equals("text/dns")) { + // close to complete reading, then the digest is legit + // TODO: DO we want to use the WARC header digest for this? + rec.close(); + result.setDigest(transformWARCDigest(rec.getDigestStr())); + result.setMimeType(mime); + } else { + result = adaptWARCHTTPResponse(result,rec); + } + } else if(type.equals(WARCConstants.REVISIT)) { + // also set the mime type: + result.setMimeType("warc/revisit"); + + } else if(type.equals(WARCConstants.REQUEST)) { + + if(processAll) { + // also set the mime type: + result.setMimeType("warc/request"); + } else { + result = null; + } + } else if(type.equals(WARCConstants.METADATA)) { + + if(processAll) { + // also set the mime type: + result.setMimeType("warc/metadata"); + } else { + result = null; + } + } else { + LOGGER.info("Skipping record type : " + type); } - String orig = o.toString(); - if(orig.startsWith("sha1:")) { - return orig.substring(5); - } - return orig; + + return result; } - private CaptureSearchResult getBlankSearchResult() { + // ALL HELPER METHODS BELOW: + + /* + * Extract all common WARC fields into a CaptureSearchResult. 
This is the + * same for all WARC record types: + * + * file, offset, timestamp, digest, urlKey, originalUrl + */ + private CaptureSearchResult genericResult(WARCRecord rec) { + CaptureSearchResult result = new CaptureSearchResult(); - result.setUrlKey(DEFAULT_VALUE); - result.setOriginalUrl(DEFAULT_VALUE); - result.setCaptureTimestamp(DEFAULT_VALUE); - result.setDigest(DEFAULT_VALUE); result.setMimeType(DEFAULT_VALUE); result.setHttpCode(DEFAULT_VALUE); result.setRedirectUrl(DEFAULT_VALUE); - result.setFile(DEFAULT_VALUE); - result.setOffset(0); - return result; - } - - private void addUrlDataToSearchResult(CaptureSearchResult result, String urlStr) - throws IOException { - result.setOriginalUrl(urlStr); - String urlKey = canonicalizer.urlStringToKey(urlStr); - result.setUrlKey(urlKey); - } + ArchiveRecordHeader header = rec.getHeader(); - private CaptureSearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - - CaptureSearchResult result = getBlankSearchResult(); - - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); + String file = transformWARCFilename(header.getReaderIdentifier()); + long offset = header.getOffset(); - String uriStr = header.getUrl(); - - result.setMimeType(header.getMimetype()); - - result.setOriginalUrl(uriStr); - result.setUrlKey(uriStr); - - rec.close(); - result.setDigest(rec.getDigestStr()); - - return result; - } - - private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header, - WARCRecord rec, String mime) - throws IOException { - - CaptureSearchResult result = getBlankSearchResult(); - - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); - result.setDigest(transformDigest(header.getHeaderValue( + 
result.setCaptureTimestamp(transformWARCDate(header.getDate())); + result.setFile(file); + result.setOffset(offset); + result.setDigest(transformWARCDigest(header.getHeaderValue( WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - addUrlDataToSearchResult(result,header.getUrl()); - - result.setMimeType(mime); - + String origUrl = header.getUrl(); + result.setOriginalUrl(origUrl); + try { + String urlKey = canonicalizer.urlStringToKey(origUrl); + result.setUrlKey(urlKey); + } catch (URIException e) { + LOGGER.warn("FAILED canonicalize(" + origUrl + "):" + + file + " " + offset); + result.setUrlKey(origUrl); + } return result; } @@ -200,19 +167,55 @@ } return count; } - - private CaptureSearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) - throws IOException { - CaptureSearchResult result = getBlankSearchResult(); + private String transformWARCFilename(String readerIdentifier) { + String warcName = readerIdentifier; + int index = warcName.lastIndexOf(File.separator); + if (index > 0 && (index + 1) < warcName.length()) { + warcName = warcName.substring(index + 1); + } + return warcName; + } - result.setCaptureTimestamp(transformDate(header.getDate())); - result.setFile(transformWarcFilename(header.getReaderIdentifier())); - result.setOffset(header.getOffset()); + private String transformWARCDigest(final Object o) { + if(o == null) { + return DEFAULT_VALUE; + } + String orig = o.toString(); + if(orig.startsWith("sha1:")) { + return orig.substring(5); + } + return orig; + } + + /* + * Transform input date to 14-digit timestamp: + * 2007-08-29T18:00:26Z => 20070829180026 + */ + private static String transformWARCDate(final String input) { - String origUrl = header.getUrl(); - addUrlDataToSearchResult(result,origUrl); + StringBuilder output = new StringBuilder(14); + + output.append(input.substring(0,4)); + output.append(input.substring(5,7)); + output.append(input.substring(8,10)); + output.append(input.substring(11,13)); + 
output.append(input.substring(14,16)); + output.append(input.substring(17,19)); + + return output.toString(); + } + /* + * Currently the WARCReader doesn't parse HTTP headers. This method parses + * them then calls the common ARC/WARC shared record parsing code, which + * addresses HTTP headers, and possibly even parses HTML content to look + * for Robot Meta tags. + */ + private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, + WARCRecord rec) throws IOException { + + ArchiveRecordHeader header = rec.getHeader(); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. @@ -234,66 +237,13 @@ Header[] headers = HttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); - rec.close(); - result.setDigest(transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); - - if (headers != null) { - - for (Header httpHeader : headers) { - if (httpHeader.getName().equals( - WaybackConstants.LOCATION_HTTP_HEADER)) { - - String locationStr = httpHeader.getValue(); - // TODO: "Location" is supposed to be absolute: - // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) - // (section 14.30) but Content-Location can be - // relative. - // is it correct to resolve a relative Location, as - // we are? - // it's also possible to have both in the HTTP - // headers... - // should we prefer one over the other? 
- // right now, we're ignoring "Content-Location" - result.setRedirectUrl( - UrlOperations.resolveUrl(origUrl, locationStr)); - } else if(httpHeader.getName().toLowerCase().equals("content-type")) { - result.setMimeType(transformHTTPMime(httpHeader.getValue())); - } - } - } - return result; - } - - private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { - CaptureSearchResult result = null; - ArchiveRecordHeader header = rec.getHeader(); - String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); - if(type.equals(WARCConstants.RESPONSE)) { - String mime = header.getMimetype(); - if(mime.equals("text/dns")) { - result = adaptDNS(header,rec); - } else { - result = adaptResponse(header,rec); - } - } else if(type.equals(WARCConstants.REVISIT)) { - result = adaptGeneric(header,rec,"warc/revisit"); - } else if(type.equals(WARCConstants.REQUEST)) { - if(processAll) { - result = adaptGeneric(header,rec,"warc/request"); - } - } else if(type.equals(WARCConstants.METADATA)) { - if(processAll) { - result = adaptGeneric(header,rec,"warc/metadata"); - } - } else { - LOGGER.info("Skipping record type : " + type); - } + annotater.annotateHTTPContent(result,rec,headers,header.getMimetype()); return result; } + public UrlCanonicalizer getCanonicalizer() { return canonicalizer; } @@ -301,4 +251,25 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } + + public boolean isProcessAll() { + return processAll; + } + + public void setProcessAll(boolean processAll) { + this.processAll = processAll; + } + /** + * @return the annotater + */ + public HTTPRecordAnnotater getAnnotater() { + return annotater; + } + + /** + * @param annotater the annotater to set + */ + public void setAnnotater(HTTPRecordAnnotater annotater) { + this.annotater = annotater; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 01:50:37
|
Revision: 2886 http://archive-access.svn.sourceforge.net/archive-access/?rev=2886&view=rev Author: bradtofel Date: 2009-11-06 01:50:20 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: moved main() from ArcIndexer and WarcIndexer into IndexWorker - leaving Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/scripts/cdx-indexer Modified: trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer 2009-11-06 01:49:32 UTC (rev 2885) +++ trunk/archive-access/projects/wayback/dist/src/scripts/arc-indexer 2009-11-06 01:50:20 UTC (rev 2886) @@ -75,7 +75,7 @@ # Main ArcIndexer class. if [ -z "$CLASS_MAIN" ] then - CLASS_MAIN='org.archive.wayback.resourcestore.indexer.ArcIndexer' + CLASS_MAIN='org.archive.wayback.resourcestore.indexer.IndexWorker' fi CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" Added: trunk/archive-access/projects/wayback/dist/src/scripts/cdx-indexer =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/cdx-indexer (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/cdx-indexer 2009-11-06 01:50:20 UTC (rev 2886) @@ -0,0 +1,82 @@ +#!/usr/bin/env sh +## +## This script creates a CDX file for all ARC files in a directory +## PUTs those CDX files into a remote pipeline, and informs a remote +## LocationDB of the locations of all the ARC files. +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## WAYBACK_HOME Pointer to your wayback install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. 
Default setting is '-Xmx256m'. +## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set WAYBACK_HOME. +if [ -z "$WAYBACK_HOME" ] +then + WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + JAVACMD=$JAVA_HOME/bin/java +fi + +# Ignore previous classpath. Build one that contains heritrix jar and content +# of the lib directory into the variable CP. +for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +# Main ArcIndexer class. +if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.wayback.resourcestore.indexer.IndexWorker' +fi + +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" + Modified: trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer 2009-11-06 01:49:32 UTC (rev 2885) +++ trunk/archive-access/projects/wayback/dist/src/scripts/warc-indexer 2009-11-06 01:50:20 UTC (rev 2886) @@ -75,7 +75,7 @@ # Main ArcIndexer class. 
if [ -z "$CLASS_MAIN" ] then - CLASS_MAIN='org.archive.wayback.resourcestore.indexer.WarcIndexer' + CLASS_MAIN='org.archive.wayback.resourcestore.indexer.IndexWorker' fi CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 01:49:43
|
Revision: 2885 http://archive-access.svn.sourceforge.net/archive-access/?rev=2885&view=rev Author: bradtofel Date: 2009-11-06 01:49:32 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: moved main() from ArcIndexer and WarcIndexer into IndexWorker Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-11-06 01:42:28 UTC (rev 2884) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -25,9 +25,7 @@ package org.archive.wayback.resourcestore.indexer; import java.io.File; -import java.io.PrintWriter; import java.io.IOException; -import java.util.Iterator; import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCReader; @@ -35,12 +33,10 @@ import org.archive.io.arc.ARCRecord; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** * Transforms an ARC file into Iterator<CaptureSearchResult>. 
@@ -50,10 +46,6 @@ */ public class ArcIndexer { - /** - * CDX Header line for these fields. not very configurable.. - */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; private UrlCanonicalizer canonicalizer = null; public ArcIndexer() { @@ -113,51 +105,6 @@ this.canonicalizer = canonicalizer; } - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("arc-indexer [-identity] ARCFILE"); - System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); - System.err.println("With -identity, perform no url canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - ArcIndexer indexer = new ArcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if(args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if(args.length == (idx + 1)) { - pw = new PrintWriter(args[idx]); - } else { - USAGE(); - } - Iterator<CaptureSearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while(lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - private class ArchiveRecordToARCRecordAdapter implements Adapter<ArchiveRecord,ARCRecord> { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2009-11-06 01:42:28 UTC (rev 2884) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -24,16 +24,23 @@ */ package org.archive.wayback.resourcestore.indexer; +import java.io.FileNotFoundException; import java.io.IOException; +import java.io.PrintWriter; +import java.util.Iterator; import java.util.logging.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.cdx.CDXFormatIndex; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXFormatAdapter; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; import org.archive.wayback.resourceindex.updater.IndexClient; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import org.archive.wayback.util.CloseableIterator; -//import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** @@ -112,6 +119,7 @@ } } catch(IOException e) { LOGGER.severe("FAILED to index or upload (" + name + ")"); + e.printStackTrace(); } } return worked; @@ -133,7 +141,86 @@ } return itr; } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("cdx-indexer [-format FORMAT|-identity] FILE"); + System.err.println("cdx-indexer [-format FORMAT|-identity] FILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index from ARC or WARC file"); + System.err.println("FILE at CDXFILE or to STDOUT."); + System.err.println("With -identity, perform no url canonicalization."); + System.err.println("With -format, output CDX in format FORMAT."); + System.exit(1); + } + /** + * @param args + */ + public static void main(String[] 
args) { + String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC; + PrintWriter pw = new PrintWriter(System.out); + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + boolean setFormat = false; + boolean isIdentity = false; + String path = null; + for(int idx = 0; idx < args.length; idx++) { + if(args[idx].equals("-identity")) { + canonicalizer = new IdentityUrlCanonicalizer(); + isIdentity = true; + } else if(args[idx].equals("-format")) { + idx++; + if(idx >= args.length) { + USAGE(); + } + cdxSpec = args[idx]; + setFormat = true; + } else { + // either input filename: + if(path == null) { + path = args[idx]; + } else { + // or if that's already been specified, then target file: + if(idx+1 != args.length){ + USAGE(); + } + try { + pw = new PrintWriter(args[idx]); + } catch (FileNotFoundException e) { + e.printStackTrace(); + System.exit(1); + } + break; + } + } + } + if(!setFormat && isIdentity) { + cdxSpec = cdxSpec.replace(" N ", " a "); + } + IndexWorker worker = new IndexWorker(); + worker.canonicalizer = canonicalizer; + worker.interval = 0; + worker.init(); + try { + CloseableIterator<CaptureSearchResult> itr = worker.indexFile(path); + CDXFormat cdxFormat = new CDXFormat(cdxSpec); + Iterator<String> lines = + SearchResultToCDXFormatAdapter.adapt(itr, cdxFormat); + pw.println(cdxSpec); + while(lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } catch (CDXFormatException e) { + e.printStackTrace(); + System.exit(1); + } + + } private class WorkerThread extends Thread { private long runInterval = 120000; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-11-06 01:42:28 UTC (rev 2884) 
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -2,8 +2,6 @@ import java.io.File; import java.io.IOException; -import java.io.PrintWriter; -import java.util.Iterator; import org.archive.io.ArchiveRecord; import org.archive.io.warc.WARCReader; @@ -11,20 +9,13 @@ import org.archive.io.warc.WARCRecord; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; public class WarcIndexer { - /** - * CDX Header line for these fields. not very configurable.. - */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - private UrlCanonicalizer canonicalizer = null; private boolean processAll = false; public WarcIndexer() { @@ -89,60 +80,7 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("warc-indexer [-identity] [-all] WARCFILE"); - System.err.println("warc-indexer [-identity] [-all] WARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); - System.err.println("With -identity, perform no url canonicalization."); - System.err.println("With -all, output request and metadata records."); - System.exit(1); - } - /** - * @param args - */ - public static void main(String[] args) { - WarcIndexer indexer = new WarcIndexer(); - int idx = 0; - while(args[idx] != null) { - if(args[idx].equals("-identity")) { - indexer.setCanonicalizer(new 
IdentityUrlCanonicalizer()); - } else if(args[idx].equals("-all")) { - indexer.setProcessAll(true); - } else { - break; - } - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if (args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if (args.length == (idx+1)) { - pw = new PrintWriter(args[1]); - } else { - USAGE(); - } - Iterator<CaptureSearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while (lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - private class ArchiveRecordToWARCRecordAdapter implements Adapter<ArchiveRecord, WARCRecord> { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 01:42:42
|
Revision: 2884 http://archive-access.svn.sourceforge.net/archive-access/?rev=2884&view=rev Author: bradtofel Date: 2009-11-06 01:42:28 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REMOVE: this was a temp file used for testing svn:keywords Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-06 00:03:54 UTC (rev 2883) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/surt/SvnTest.java 2009-11-06 01:42:28 UTC (rev 2884) @@ -1,45 +0,0 @@ -/* SvnTest - * - * $Id$ : - * - * Created on Nov 5, 2009. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -package org.archive.wayback.surt; - -/** - * @author brad - * - */ -public class SvnTest { - public static String CURRENT_SVN_VERSION = "$Rev$"; - /** - * - */ - public String foo() { - String tmp = CURRENT_SVN_VERSION.substring(5); - return tmp.substring(0,tmp.length()-2); - } - public static void main(String[] args) { - SvnTest s = new SvnTest(); - System.out.println("SvnTest version is " + s.foo()); - } -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-06 00:04:02
|
Revision: 2883 http://archive-access.svn.sourceforge.net/archive-access/?rev=2883&view=rev Author: bradtofel Date: 2009-11-06 00:03:54 +0000 (Fri, 06 Nov 2009) Log Message: ----------- INITIAL REV: classes which enable indirection in serialization of CaptureSearchResults Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatException.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/DigestCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/EndOffsetCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/FilenameCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/HTTPCodeCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/OriginalURLCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RedirectURLCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RobotFlagsCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/StartOffsetCDXField.java 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/TimestampCDXField.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/URLKeyCDXField.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,34 @@ +/* CDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +public interface CDXField { + public static String DEFAULT_VALUE = "-"; + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException; + public String serialize(CaptureSearchResult result); +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,200 @@ +/* CDXFormat + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** + * Class which allows serialization/deserialization of CaptureSearchResult + * objects into/out of a single line String representation. + * + * + * @author brad + * + */ +public class CDXFormat { + + /* + * A canonized url + * B news group + * C rulespace category *** + * D compressed dat file offset + * F canonized frame + * G multi-column language description (* soon) + * H canonized host + * I canonized image + * J canonized jump point + * K Some weird FBIS what's changed kinda thing + * L canonized link + * M meta tags (AIF) * + * N massaged url + * P canonized path + * Q language string + * R canonized redirect + * U uniqueness *** + * V compressed arc file offset * + * X canonized url in other href tags + * Y canonized url in other src tags + * Z canonized url found in script + * a original url ** + * b date ** + * c old style checksum * + * d uncompressed dat file offset + * e IP ** + * f frame * + * g file name + * h original host + * i image * + * j original jump point + * k new style checksum * + * l link * + * m mime type of original document * + * n arc document length * + * o port + * p original path + * r redirect * + * s response code * + * t title * + * v uncompressed arc file offset * + * x url in other href tags * + * y url in other src tags * + * z url found in script * + * # comment + * + * * in alexa-made dat file + * ** in alexa-made dat file meta-data line + * *** future data + */ + + /** Field handlers, in the order given by the spec. */ private CDXField[] fields = null; + /** Delimiter character taken from the spec. */ private char delimiter = ' '; + /** The delimiter as a String, used when splitting lines. */ private String delimiterS = null; + + /** Prefix every spec must start with. */ public static String CDX_MAGIC = " CDX"; + + /* Single-character field codes from the table above. */ public static char URL_KEY = 'A'; + public static char TIMESTAMP = 'b'; + public 
static char ORIGINAL_URL = 'a'; + public static char MIME_TYPE = 'm'; + public static char HTTP_CODE = 's'; + public static char DIGEST = 'k'; + public static char REDIRECT = 'r'; + public static char ROBOT_FLAGS = 'M'; + public static char COMPRESSED_OFFSET = 'V'; + public static char COMPRESSED_LENGTH = 'n'; + public static char FILE = 'g'; + + /** + * Construct a CDXFormat reader/writer based on the specification argument + * @param cdxSpec the CDX_MAGIC prefix, then one delimiter character, then single-character field codes separated by that delimiter + * @throws CDXFormatException if the spec lacks the magic prefix, has an extra trailing character, uses another character between field codes, or names an unknown field code + */ + public CDXFormat(String cdxSpec) throws CDXFormatException { + if(!cdxSpec.startsWith(CDX_MAGIC)) { + throw new CDXFormatException("Spec '" + cdxSpec + + "' does not start with '" + CDX_MAGIC + "'"); + } + delimiter = cdxSpec.charAt(CDX_MAGIC.length()); + String fieldsString = cdxSpec.substring(CDX_MAGIC.length()+1); + /* field codes sit at even indexes, delimiters at odd ones, so a valid field string has odd length */ int fieldCount = (fieldsString.length() + 1) / 2; + if(fieldsString.length() != (fieldCount * 2) - 1) { + throw new CDXFormatException("Extra char after spec '" + + cdxSpec + "'"); + } + fields = new CDXField[fieldCount]; + for(int i = 0; i < fieldCount; i++) { + char f = fieldsString.charAt(i * 2); + if(i < fieldCount - 1) { + char d = fieldsString.charAt((i*2)+1); + if(d != delimiter) { + throw new CDXFormatException("Non-delimiter char in '" + + fieldsString + "'"); + } + } + fields[i] = getField(f); + } + delimiterS = new String(""+delimiter); + } + + /** Maps a spec field code to a new CDXField handler; throws CDXFormatException for codes without a handler. */ private CDXField getField(char fieldChar) throws CDXFormatException { + CDXField field = null; + switch (fieldChar) { + case 'A': field = new URLKeyCDXField(); break; + // backwards compat with Alexa tools: + case 'N': field = new URLKeyCDXField(); break; + case 'b': field = new TimestampCDXField(); break; + case 'a': field = new OriginalURLCDXField(); break; + case 'm': field = new MIMETypeCDXField(); break; + case 's': field = new HTTPCodeCDXField(); break; + case 'k': field = new DigestCDXField(); break; + case 'r': field = new RedirectURLCDXField(); break; + case 'M': field = new RobotFlagsCDXField(); break; + case 
'V': field = new StartOffsetCDXField(); break; + // NOT IMPLEMENTED in ARC/WARCReaders... +// case 'n': field = new EndOffsetCDXField(); break; + case 'g': field = new FilenameCDXField(); break; + } + if(field == null) { + throw new CDXFormatException("Unknown field '"+fieldChar+"'"); + } + return field; + } + + /** + * @param line one CDX line whose columns are separated by this format's delimiter + * @return CaptureSearchResult containing data from the 'line' argument + * parsed according to the specification for this CDXFormat + * @throws CDXFormatException if the number of columns differs from the number of fields in the spec, or a field rejects its column text + */ + public CaptureSearchResult parseResult(String line) + throws CDXFormatException { + CaptureSearchResult result = new CaptureSearchResult(); + /* String.split() takes a regular expression; quote the delimiter so characters such as '|' or '.' are matched literally */ String[] parts = line.split(java.util.regex.Pattern.quote(delimiterS)); + + if(parts.length != fields.length) { + throw new CDXFormatException("Wrong number of fields"); + } + for(int i = 0; i < fields.length; i++) { + fields[i].apply(parts[i], result); + } + return result; + } + + /** + * @param result + * @return String representation of the data in 'result' formatted according + * to the specification for this CDXFormat + */ + public String serializeResult(CaptureSearchResult result) { + StringBuilder sb = new StringBuilder(100); + for(int i = 0; i < fields.length; i++) { + sb.append(fields[i].serialize(result)); + if(i < fields.length - 1) { + sb.append(delimiter); + } + } + return sb.toString(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatException.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatException.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,38 @@ +/* CDXFormatException + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +/** Checked exception reporting a CDX specification or CDX line that cannot be parsed. */ public class CDXFormatException extends Exception { + + /** Creates the exception with the given detail message. */ public CDXFormatException(String string) { + super(string); + } + + /** + * Serialization version identifier. + */ + private static final long serialVersionUID = 1L; + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormatException.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/DigestCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/DigestCDXField.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/DigestCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* DigestCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the capture digest: apply() passes the column text to setDigest(); serialize() returns getDigest(), or DEFAULT_VALUE when that is null. */ public class DigestCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setDigest(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getDigest(); + return r == null ? 
DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/DigestCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/EndOffsetCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/EndOffsetCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/EndOffsetCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,47 @@ +/* EndOffsetCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the capture end offset: apply() parses the column text with Long.parseLong() and stores it via setEndOffset(), rethrowing a NumberFormatException as a CDXFormatException that carries only its localized message; serialize() returns DEFAULT_VALUE when getEndOffset() is -1, otherwise the decimal value. NOTE(review): apply() rejects the DEFAULT_VALUE text that serialize() can emit - confirm whether that asymmetry is intended. */ public class EndOffsetCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + try { + result.setEndOffset(Long.parseLong(field)); + } catch(NumberFormatException e) { + throw new CDXFormatException(e.getLocalizedMessage()); + } + } + + public String serialize(CaptureSearchResult result) { + long r = result.getEndOffset(); + if(r == -1) { + return DEFAULT_VALUE; + } + return String.valueOf(r); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/EndOffsetCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/FilenameCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/FilenameCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/FilenameCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* FilenameCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the file name: apply() passes the column text to setFile(); serialize() returns getFile(), or DEFAULT_VALUE when that is null. */ public class FilenameCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setFile(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getFile(); + return r == null ? 
DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/FilenameCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/HTTPCodeCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/HTTPCodeCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/HTTPCodeCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* HTTPCodeCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the HTTP response code: apply() passes the column text to setHttpCode(); serialize() returns getHttpCode(), or DEFAULT_VALUE when that is null. */ public class HTTPCodeCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setHttpCode(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getHttpCode(); + return r == null ? DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/HTTPCodeCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* MIMETypeCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the MIME type: apply() passes the column text to setMimeType(); serialize() returns getMimeType(), or DEFAULT_VALUE when that is null. */ public class MIMETypeCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setMimeType(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getMimeType(); + return r == null ? DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/OriginalURLCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/OriginalURLCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/OriginalURLCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* OriginalURLCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the original URL: apply() passes the column text to setOriginalUrl(); serialize() returns getOriginalUrl(), or DEFAULT_VALUE when that is null. */ public class OriginalURLCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setOriginalUrl(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getOriginalUrl(); + return r == null ? 
DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/OriginalURLCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RedirectURLCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RedirectURLCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RedirectURLCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* RedirectURLCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the redirect URL: apply() passes the column text to setRedirectUrl(); serialize() returns getRedirectUrl(), or DEFAULT_VALUE when that is null. */ public class RedirectURLCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setRedirectUrl(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getRedirectUrl(); + return r == null ? DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RedirectURLCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RobotFlagsCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RobotFlagsCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RobotFlagsCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* RobotFlagsCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the robot flags: apply() passes the column text to setRobotFlags(); serialize() returns getRobotFlags(), or DEFAULT_VALUE when that is null. */ public class RobotFlagsCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setRobotFlags(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getRobotFlags(); + return r == null ? DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/RobotFlagsCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/StartOffsetCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/StartOffsetCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/StartOffsetCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,46 @@ +/* StartOffsetCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the capture start offset: apply() parses the column text with Long.parseLong() and stores it via setOffset(), rethrowing a NumberFormatException as a CDXFormatException that carries only its localized message; serialize() returns DEFAULT_VALUE when getOffset() is -1, otherwise the decimal value. NOTE(review): apply() rejects the DEFAULT_VALUE text that serialize() can emit - confirm whether that asymmetry is intended. */ public class StartOffsetCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + try { + result.setOffset(Long.parseLong(field)); + } catch(NumberFormatException e) { + throw new CDXFormatException(e.getLocalizedMessage()); + } + } + public String serialize(CaptureSearchResult result) { + long r = result.getOffset(); + if(r == -1) { + return DEFAULT_VALUE; + } + return String.valueOf(r); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/StartOffsetCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/TimestampCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/TimestampCDXField.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/TimestampCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* TimestampCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the capture timestamp: apply() passes the column text to setCaptureTimestamp(); serialize() returns getCaptureTimestamp(), or DEFAULT_VALUE when that is null. */ public class TimestampCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setCaptureTimestamp(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getCaptureTimestamp(); + return r == null ? 
DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/TimestampCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/URLKeyCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/URLKeyCDXField.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/URLKeyCDXField.java 2009-11-06 00:03:54 UTC (rev 2883) @@ -0,0 +1,40 @@ +/* URLKeyCDXField + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.cdx.format; + +import org.archive.wayback.core.CaptureSearchResult; + +/** CDX column for the URL key: apply() passes the column text to setUrlKey(); serialize() returns getUrlKey(), or DEFAULT_VALUE when that is null. */ public class URLKeyCDXField implements CDXField { + + public void apply(String field, CaptureSearchResult result) + throws CDXFormatException { + result.setUrlKey(field); + } + + public String serialize(CaptureSearchResult result) { + String r = result.getUrlKey(); + return r == null ? DEFAULT_VALUE : r; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/URLKeyCDXField.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-05 23:50:24
|
Revision: 2882 http://archive-access.svn.sourceforge.net/archive-access/?rev=2882&view=rev Author: bradtofel Date: 2009-11-05 23:50:17 +0000 (Thu, 05 Nov 2009) Log Message: ----------- INITIAL REV: SAX based, configurable server side rewriting of HTML content. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java 2009-11-05 23:50:17 UTC (rev 2882) @@ -0,0 +1,52 @@ +/* ArchivalUrlContextResultURIConverterFactory + * + * $Id$: + * + * Created on Nov 5, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.archivalurl; + +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.replay.html.ContextResultURIConverterFactory; + +/** + * Factory that supplies a ResultURIConverter for a given flags string, backed by one ArchivalUrlResultURIConverter. + * @author brad + */ +public class ArchivalUrlContextResultURIConverterFactory + implements ContextResultURIConverterFactory { + /** Converter returned as-is for null flags and wrapped for any other flags. */ private ArchivalUrlResultURIConverter converter = null; + /** Creates a factory around the given converter. */ public ArchivalUrlContextResultURIConverterFactory( + ArchivalUrlResultURIConverter converter) { + this.converter = converter; + } + /* (non-Javadoc) + * @see org.archive.wayback.replay.html.ContextResultURIConverterFactory#getContextConverter(java.lang.String) + */ + /** Returns the wrapped converter when flags is null, otherwise a new ArchivalUrlSpecialContextResultURIConverter built from the wrapped converter and flags. */ public ResultURIConverter getContextConverter(String flags) { + if(flags == null) { + return converter; + } + return new ArchivalUrlSpecialContextResultURIConverter(converter,flags); + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2009-11-05 23:50:17 UTC (rev 2882) @@ -0,0 +1,174 @@ +/* ArchivalUrlSAXRewriteReplayRenderer + * + * $Id$ + * + * Created on 12:15:33 PM Feb 12, 2009. 
+ * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.archivalurl; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.ReplayRenderer; +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.WaybackException; +import org.archive.wayback.replay.HttpHeaderOperation; +import org.archive.wayback.replay.HttpHeaderProcessor; +import org.archive.wayback.replay.JSPExecutor; +import org.archive.wayback.replay.charset.CharsetDetector; +import org.archive.wayback.replay.charset.StandardCharsetDetector; +import org.archive.wayback.replay.html.ReplayParseEventDelegator; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.util.htmllex.ContextAwareLexer; +import org.htmlparser.Node; +import 
org.htmlparser.lexer.Lexer; +import org.htmlparser.lexer.Page; +import org.htmlparser.util.ParserException; + +public class ArchivalUrlSAXRewriteReplayRenderer implements ReplayRenderer { + private ReplayParseEventDelegator delegator = null; + private HttpHeaderProcessor httpHeaderProcessor; + private CharsetDetector charsetDetector = new StandardCharsetDetector(); + private final static String OUTPUT_CHARSET = "utf-8"; + + public ArchivalUrlSAXRewriteReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { + this.httpHeaderProcessor = httpHeaderProcessor; + } + + // assume this is only called for appropriate doc types: html + public void renderResource(HttpServletRequest httpRequest, + HttpServletResponse httpResponse, WaybackRequest wbRequest, + CaptureSearchResult result, Resource resource, + ResultURIConverter uriConverter, CaptureSearchResults results) + throws ServletException, IOException, WaybackException { + + // copy the HTTP response code: + HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse); + + // transform the original headers according to our headerProcessor: + Map<String,String> headers = HttpHeaderOperation.processHeaders( + resource, result, uriConverter, httpHeaderProcessor); + + // prepare several objects for the parse: + + // a JSPExecutor: + JSPExecutor jspExec = new JSPExecutor(uriConverter, httpRequest, + httpResponse, wbRequest, results, result, resource); + + // The URL of the page, for resolving in-page relative URLs: + URL url = null; + try { + url = new URL(result.getOriginalUrl()); + } catch (MalformedURLException e1) { + // TODO: this shouldn't happen... + throw new IOException(e1); + } + + // To make sure we get the length, we have to buffer it all up... 
+ ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + ArchivalUrlContextResultURIConverterFactory fact = + new ArchivalUrlContextResultURIConverterFactory( + (ArchivalUrlResultURIConverter) uriConverter); + // set up the context: + ReplayParseContext context = + new ReplayParseContext(fact,url,result.getCaptureTimestamp()); + context.setOutputCharset(OUTPUT_CHARSET); + context.setOutputStream(baos); + context.setJspExec(jspExec); + + // determine the character set used to encode the document bytes: + String charSet = charsetDetector.getCharset(resource, wbRequest); + + // and finally, parse, using the special lexer that knows how to + // handle javascript blocks containing unescaped HTML entities: + Page lexPage = new Page(resource,charSet); + ContextAwareLexer lex = new ContextAwareLexer(new Lexer(lexPage), + context); + Node node; + try { + while((node = lex.nextNode()) != null) { + delegator.handleNode(context, node); + } + delegator.handleParseComplete(context); + } catch (ParserException e) { + e.printStackTrace(); + throw new IOException(e); + } + + // At this point, baos contains the utf-8 encoded bytes of our result: + byte[] utf8Bytes = baos.toByteArray(); + // set the corrected length: + headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, + String.valueOf(utf8Bytes.length)); + headers.put("X-Wayback-Guessed-Charset", charSet); + + // send back the headers: + HttpHeaderOperation.sendHeaders(headers, httpResponse); + // Tomcat will always send a charset... It's trying to be smarter than + // we are. If the original page didn't include a "charset" as part of + // the "Content-Type" HTTP header, then Tomcat will use the default.. + // who knows what that is, or what that will do to the page.. 
+ // let's try explicitly setting it to what we used: + httpResponse.setCharacterEncoding(OUTPUT_CHARSET); + + httpResponse.getOutputStream().write(utf8Bytes); + } + + /** + * @return the charsetDetector + */ + public CharsetDetector getCharsetDetector() { + return charsetDetector; + } + + /** + * @param charsetDetector the charsetDetector to set + */ + public void setCharsetDetector(CharsetDetector charsetDetector) { + this.charsetDetector = charsetDetector; + } + + /** + * @return the delegator + */ + public ReplayParseEventDelegator getDelegator() { + return delegator; + } + + /** + * @param delegator the delegator to set + */ + public void setDelegator(ReplayParseEventDelegator delegator) { + this.delegator = delegator; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java 2009-11-05 23:50:17 UTC (rev 2882) @@ -0,0 +1,63 @@ +/* ArchivalUrlSpecialContextResultURIConverter + * + * $Id$ + * + * Created on 12:15:33 PM Feb 12, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. 
+ * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.archivalurl; + +import org.archive.wayback.ResultURIConverter; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class ArchivalUrlSpecialContextResultURIConverter +implements ResultURIConverter { + + + private String replayURIPrefix = null; + private String context; + + public ArchivalUrlSpecialContextResultURIConverter( + ArchivalUrlResultURIConverter converter, String context) { + replayURIPrefix = converter.getReplayURIPrefix(); + this.context = context; + } + + /* (non-Javadoc) + * @see org.archive.wayback.ResultURIConverter#makeReplayURI(java.lang.String, java.lang.String) + */ + public String makeReplayURI(String datespec, String url) { + String suffix = datespec + context + "/" + url; + if(replayURIPrefix == null) { + return suffix; + } else { + if(url.startsWith(replayURIPrefix)) { + return url; + } + return replayURIPrefix + suffix; + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest 
Open Source development site. |
From: <bra...@us...> - 2009-11-05 23:48:11
|
Revision: 2881 http://archive-access.svn.sourceforge.net/archive-access/?rev=2881&view=rev Author: bradtofel Date: 2009-11-05 23:48:03 +0000 (Thu, 05 Nov 2009) Log Message: ----------- INITIAL REV: helper class to encapsulate context needed to invoke a .jsp within a replay context. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/JSPExecutor.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/JSPExecutor.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/JSPExecutor.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/JSPExecutor.java 2009-11-05 23:48:03 UTC (rev 2881) @@ -0,0 +1,77 @@ +/* JSPExecutor + * + * $Id$ + * + * Created on 4:00:41 PM Apr 13, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.replay; + +import java.io.IOException; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.UIResults; +import org.archive.wayback.core.WaybackRequest; + +/** + * Class which encapsulates all Replay context information needed to execute + * a .jsp file in the "context" of a particular replay request. + * + * This class then manages converting a jsp path into the String it produces. + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class JSPExecutor { + + private HttpServletRequest httpRequest = null; + private HttpServletResponse httpResponse = null; + private UIResults uiResults = null; + + public JSPExecutor(ResultURIConverter uriConverter, + HttpServletRequest httpRequest, HttpServletResponse httpResponse, + WaybackRequest wbRequest, CaptureSearchResults results, + CaptureSearchResult result, Resource resource) { + + this.httpRequest = httpRequest; + this.httpResponse = httpResponse; + uiResults = + new UIResults(wbRequest, uriConverter, results, result, resource); + } + + + public String jspToString(String jspPath) + throws ServletException, IOException { + + StringHttpServletResponseWrapper wrappedResponse = + new StringHttpServletResponseWrapper(httpResponse); + uiResults.forward(httpRequest, wrappedResponse, jspPath); + return wrappedResponse.getStringResponse(); + } + +} Property changes on: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/JSPExecutor.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |