You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2008-08-18 23:00:33
|
Revision: 2555 http://archive-access.svn.sourceforge.net/archive-access/?rev=2555&view=rev Author: bradtofel Date: 2008-08-18 23:00:38 +0000 (Mon, 18 Aug 2008) Log Message: ----------- RESOURCE: .dia and .png illustrating automatic indexing. Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/site/resources/dia/AutoIndexing.dia trunk/archive-access/projects/wayback/dist/src/site/resources/images/AutoIndexing.png Property changes on: trunk/archive-access/projects/wayback/dist/src/site/resources/dia/AutoIndexing.dia ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Property changes on: trunk/archive-access/projects/wayback/dist/src/site/resources/images/AutoIndexing.png ___________________________________________________________________ Added: svn:mime-type + application/octet-stream This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mi...@us...> - 2008-08-16 11:50:15
|
Revision: 2554 http://archive-access.svn.sourceforge.net/archive-access/?rev=2554&view=rev Author: miklosh Date: 2008-08-16 11:50:24 +0000 (Sat, 16 Aug 2008) Log Message: ----------- Changed to using metadata keys defined in ImageSearch.java. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ThumbnailGenerator.java trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ThumbnailGenerator.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ThumbnailGenerator.java 2008-08-16 11:48:50 UTC (rev 2553) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ThumbnailGenerator.java 2008-08-16 11:50:24 UTC (rev 2554) @@ -77,8 +77,8 @@ } return null; } - metadata.add("width", Integer.toString(imageWidth)); - metadata.add("height", Integer.toString(imageHeight)); + metadata.add(ImageSearch.IMAGE_WIDTH_KEY, Integer.toString(imageWidth)); + metadata.add(ImageSearch.IMAGE_HEIGHT_KEY, Integer.toString(imageHeight)); // Do we need to scale down at all? 
if (thumbWidth > imageWidth && thumbHeight > imageHeight) { thumbWidth = imageWidth; Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp 2008-08-16 11:48:50 UTC (rev 2553) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/web/jsp/results.jsp 2008-08-16 11:50:24 UTC (rev 2554) @@ -169,7 +169,8 @@ Metadata meta = image.getMetadata(); int sizeInt = Integer.parseInt(meta.get(ImageSearch.SIZE_KEY)); size = Integer.toString((int)Math.round(sizeInt / 1024.0)) + "k"; - dimensions = meta.get("width") + "x" + meta.get("height"); + dimensions = meta.get(ImageSearch.IMAGE_WIDTH_KEY) + "x" + + meta.get(ImageSearch.IMAGE_HEIGHT_KEY); imgWidth = ""; } else { size = "??k"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mi...@us...> - 2008-08-16 11:48:41
|
Revision: 2553 http://archive-access.svn.sourceforge.net/archive-access/?rev=2553&view=rev Author: miklosh Date: 2008-08-16 11:48:50 +0000 (Sat, 16 Aug 2008) Log Message: ----------- Added image metadata indexing and image size filtering. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java 2008-08-14 03:26:36 UTC (rev 2552) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/DocIndexer.java 2008-08-16 11:48:50 UTC (rev 2553) @@ -22,9 +22,11 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.*; import org.apache.hadoop.util.*; import org.apache.nutch.parse.*; @@ -41,9 +43,11 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.LinkDb; import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.crawl.MapWritable; import org.apache.lucene.index.*; import org.apache.lucene.document.*; + import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.indexer.IndexingFilters; import org.apache.nutch.indexer.NutchSimilarity; @@ -57,6 +61,12 @@ Mapper<Text, Writable, Text, 
NutchWritable> { public static final String DONE_NAME = "index.done"; + // Keys of indexed image metadata + public static final String[] indexedImageMetadata = { + ImageSearch.IMAGE_WIDTH_KEY, + ImageSearch.IMAGE_HEIGHT_KEY, + ImageSearch.SIZE_KEY + }; public static final Log LOG = LogFactory.getLog(DocIndexer.class); /** A utility class used to pass a lucene document from Indexer.reduce @@ -192,6 +202,7 @@ ParseData parseData = null; ParseText parseText = null; Metadata imageUrlMapping = new Metadata(); + Map<String, Metadata> imageMetadata = new TreeMap<String, Metadata>(); while (values.hasNext()) { Writable value = values.next().get(); // unwrap if (value instanceof Inlinks) { @@ -215,14 +226,21 @@ parseData = (ParseData) value; } else if (value instanceof ParseText) { parseText = (ParseText) value; - } else if (value instanceof Metadata) { - // Add image URL->digest mapping - Metadata mapping = (Metadata) value; - String[] imageUrls = mapping.names(); - for (String imageUrl : imageUrls) { + } else if (value instanceof MapWritable) { + MapWritable map = (MapWritable) value; + Set<Writable> mapping = map.keySet(); + Iterator<Writable> keys = mapping.iterator(); + while (keys.hasNext()) { + Text keyValue = (Text) keys.next(); + String imageUrl = keyValue.toString(); + MapWritable metaMap = (MapWritable) map.get(keyValue); + // Add image URL->digest mapping if (imageUrlMapping.get(imageUrl) == null) { - imageUrlMapping.add(imageUrl, mapping.get(imageUrl)); + imageUrlMapping.add(imageUrl, metaMap.get(new Text(Metadata.SIGNATURE_KEY)).toString()); } + // Convert from MapWritable to Metadata + imageMetadata.put(imageUrl, + convertMapWritableToMetadata(metaMap)); } } else if (LOG.isWarnEnabled()) { LOG.warn("Unrecognized type: " + value.getClass()); @@ -256,6 +274,18 @@ continue; } parseMeta.add(ImageSearch.IMAGE_IDS_KEY, mappedTo); + if (imageMetadata.containsKey(imageUrl)) { + Metadata imageMeta = imageMetadata.get(imageUrl); + // copy metadata into parseMeta + for 
(String name : indexedImageMetadata) { + String value = imageMeta.get(name); + if (value != null) { + parseMeta.add(name, value); + } else { + parseMeta.add(name, "-"); + } + } + } } } @@ -308,16 +338,56 @@ output.collect(key, new LuceneDocumentWrapper(doc)); } + + /** + * Converts a MapWritable object into a Metadata object. + * @param map MapWritable to convert + * @return Metadata object + */ + private static Metadata convertMapWritableToMetadata(MapWritable map) { + Metadata result = new Metadata(); + Iterator<Writable> metaKey = map.keySet().iterator(); + while (metaKey.hasNext()) { + Writable keyWritable = metaKey.next(); + //Text keyWritable = (Text) metaKey.next(); + String keyString = keyWritable.toString(); + Writable metaValue = map.get(keyWritable); + result.add(keyString, metaValue.toString()); + } + return result; + } + /** - * Emits image URLs as keys and their URL+digest as values. + * Tries to find out the digest of an image from a Content object. + * @param content Content object to retrieve info from + * @return null if this Content does not hold a recognized image format */ + private static String getImageDigestFromContent(Content content) { + // Check MIME type + if (content.getContentType().contains("image/")) { + Metadata meta = content.getMetadata(); + // Using NutchWax.DIGEST_KEY here + String digest = meta.get("digest"); + if (digest == null) { + digest = meta.get(Metadata.SIGNATURE_KEY); + } + return digest; + } else { + return null; + } + } + + /** + * Emits parent URLs as keys and their associated image's data + * (URL+digest and metadata) as values. 
+ */ public static class ImageUrlEmitter - implements Mapper<Text, Writable, Text, Text>, - Reducer<Text, Text, Text, Metadata> { + implements Mapper<Text, Writable, Text, NutchWritable>, + Reducer<Text, NutchWritable, Text, MapWritable> { public void map(Text key, Writable value, - OutputCollector<Text, Text> output, Reporter reporter) + OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException { if (value instanceof ParseData) { @@ -326,49 +396,58 @@ String[] imageUrls = parseMeta.getValues(ImageSearch.IMAGE_URLS_KEY); if (imageUrls.length > 0) { for (String url : imageUrls) { - output.collect(new Text(url), key); + output.collect(new Text(url), new NutchWritable(key)); } } } else if (value instanceof Content) { Content content = (Content) value; - if (content.getContentType().contains("image/")) { - Metadata meta = content.getMetadata(); - // Using NutchWax.DIGEST_KEY here - String digest = meta.get("digest"); - if (digest == null) { - digest = meta.get(Metadata.SIGNATURE_KEY); - } - output.collect(new Text(content.getUrl()), new Text(digest)); + String digest = getImageDigestFromContent(content); + if (digest != null) { + output.collect(new Text(content.getUrl()), new NutchWritable(new Text(digest))); } + } else if (value instanceof MapWritable) { + output.collect(key, new NutchWritable(value)); } } - public void reduce(Text key, Iterator<Text> values, - OutputCollector<Text, Metadata> output, Reporter reporter) + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, MapWritable> output, Reporter reporter) throws IOException { - + Vector<Text> parents = new Vector<Text>(); - String imageUrl = key.toString(); - String imageDigest = null; + Text imageDigest = null; + MapWritable metaMap = null; while (values.hasNext()) { - Text data = values.next(); - String value = data.toString(); - // Determine type of value - if (value.contains("/")) { - // This value is a parent's key - parents.add(data); - } else { - 
// This value is a digest - imageDigest = value.toString(); + Writable value = values.next().get(); + if (value instanceof Text) { + Text data = (Text) value; + String content = data.toString(); + // Determine type of value + if (content.contains("/")) { + // This value is a parent's key + parents.add(data); + } else { + // This value is a digest + imageDigest = data; + } + } else if (value instanceof MapWritable) { + metaMap = (MapWritable) value; } } if (imageDigest != null) { - Metadata meta = new Metadata(); - meta.add(imageUrl, imageDigest); + MapWritable resultMap = null; + if (metaMap != null) { + resultMap = metaMap; + } else { + resultMap = new MapWritable(); + } + resultMap.put(new Text(Metadata.SIGNATURE_KEY), imageDigest); + MapWritable imageInfo = new MapWritable(); + imageInfo.put(key, resultMap); Iterator<Text> it = parents.iterator(); while (it.hasNext()) { Text parentKey = it.next(); - output.collect(parentKey, meta); + output.collect(parentKey, imageInfo); } } } @@ -377,6 +456,60 @@ public void close() {} } + /** + * Emits image URLs as keys and their associated metadata as values. 
+ */ + public static class ImageMetaEmitter + implements Mapper<Text, Writable, Text, NutchWritable>, + Reducer<Text, NutchWritable, Text, MapWritable> { + + public void map(Text key, Writable value, + OutputCollector<Text, NutchWritable> output, Reporter reporter) + throws IOException { + + if (value instanceof ImageWritable) { + ImageWritable imageData = (ImageWritable) value; + output.collect(key, new NutchWritable(imageData.getMetadata())); + } else if (value instanceof Content) { + Content content = (Content) value; + String digest = getImageDigestFromContent(content); + if (digest != null) { + output.collect(new Text(digest), new NutchWritable( + new Text(content.getUrl()))); + } + } + } + + public void reduce(Text key, Iterator<NutchWritable> values, + OutputCollector<Text, MapWritable> output, Reporter reporter) + throws IOException { + + Text imageUrl = null; + Metadata imageMeta = null; + while (values.hasNext()) { + Writable value = values.next().get(); + if (value instanceof Text) { + imageUrl = (Text) value; + } else if (value instanceof Metadata) { + imageMeta = (Metadata) value; + } + } + if (imageUrl != null && imageMeta != null) { + // Convert Metadata into MapWritable + MapWritable metaMap = new MapWritable(); + String[] names = imageMeta.names(); + for (String name : names) { + String value = imageMeta.get(name); + metaMap.put(new Text(name), new Text(value)); + } + output.collect(imageUrl, metaMap); + } + } + + public void configure(JobConf job) {} + public void close() {} + } + public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments) throws IOException { @@ -385,28 +518,65 @@ LOG.info("DocIndexer: linkdb: " + linkDb); } + FileSystem fs = FileSystem.get(getConf()); /* + * Optional phase: adding image metadata + */ + Path metaDir = null; + JobConf job = new NutchJob(getConf()); + job.setJobName("imagemeta " + indexDir); + boolean haveImageData = false; + for (int i = 0; i < segments.length; i++) { + Path imageDataDir = new 
Path(segments[i], ImageWritable.IMAGE_DATA_DIR); + if (fs.exists(imageDataDir)) { + job.addInputPath(imageDataDir); + job.addInputPath(new Path(segments[i], Content.DIR_NAME)); + haveImageData = true; + } + } + if (haveImageData) { + metaDir = new Path("imgmeta-" + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + + job.setInputFormat(SequenceFileInputFormat.class); + job.setMapperClass(ImageMetaEmitter.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(NutchWritable.class); + job.setReducerClass(ImageMetaEmitter.class); + + job.setOutputPath(metaDir); + job.setOutputFormat(SequenceFileOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(MapWritable.class); + + JobClient.runJob(job); + } + + /* * First phase: determining image keys */ Path outDir = new Path("imgkeys-"+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); - JobConf job = new NutchJob(getConf()); + job = new NutchJob(getConf()); job.setJobName("imagekeys " + indexDir); for (int i = 0; i < segments.length; i++) { job.addInputPath(new Path(segments[i], ParseData.DIR_NAME)); job.addInputPath(new Path(segments[i], Content.DIR_NAME)); } + if (metaDir != null) { + job.addInputPath(metaDir); + } job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(ImageUrlEmitter.class); job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(Text.class); + job.setMapOutputValueClass(NutchWritable.class); job.setReducerClass(ImageUrlEmitter.class); job.setOutputPath(outDir); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); - job.setOutputValueClass(Metadata.class); + job.setOutputValueClass(MapWritable.class); JobClient.runJob(job); @@ -441,8 +611,8 @@ JobClient.runJob(job); - FileSystem fs = FileSystem.get(getConf()); - fs.delete(outDir); + //fs.delete(metaDir); + //fs.delete(outDir); if (LOG.isInfoEnabled()) { LOG.info("DocIndexer: done"); Modified: 
trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-08-14 03:26:36 UTC (rev 2552) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/java/org/archive/nutchwax/imagesearch/ImageSearch.java 2008-08-16 11:48:50 UTC (rev 2553) @@ -24,6 +24,9 @@ public static final String IMAGE_IDS_KEY = "image_ids"; public static final String IMAGE_POS_KEY = "image_pos"; public static final String IMAGE_URLS_KEY = "image_urls"; + public static final String IMAGE_WIDTH_KEY = "image_width"; + public static final String IMAGE_HEIGHT_KEY = "image_height"; + public static final String IMAGE_SIZE_KEY = "image_size"; // Image size category public static final String HAS_IMAGE_KEY = "has_image"; - public static final String SIZE_KEY = "size"; + public static final String SIZE_KEY = "size"; // File size of the image } Modified: trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java 2008-08-14 03:26:36 UTC (rev 2552) +++ trunk/archive-access/projects/nutchwax/imagesearch/src/plugin/src/java/org/archive/nutchwax/imagesearch/plugin/ImageIndexingFilter.java 2008-08-16 11:48:50 UTC (rev 2553) @@ -33,12 +33,15 @@ import org.apache.hadoop.conf.Configuration; import java.util.*; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.nutch.net.protocols.Response; import org.archive.nutchwax.imagesearch.ImageSearch; /** Adds image search related fields to a document. 
*/ public class ImageIndexingFilter implements IndexingFilter { + public static final Log LOG = LogFactory.getLog(IndexingFilter.class); private Configuration conf; public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) @@ -59,8 +62,6 @@ doc.add(new Field(ImageSearch.ALT_TEXT_KEY, altText, Field.Store.NO, Field.Index.TOKENIZED)); } } - // Index image size - //... return doc; } else { String contentType = parse.getData().getMeta(Response.CONTENT_TYPE); @@ -74,23 +75,50 @@ if (imagePositions.length == 0) { // No images in this document doc.add(new Field(ImageSearch.HAS_IMAGE_KEY, "0", Field.Store.YES, Field.Index.TOKENIZED)); return doc; + } + // Filter based on image size + BitSet filteredIndexes = new BitSet(imagePositions.length); + int minWidth = conf.getInt("imagesearcher.indexer.minWidth", 10); + int minHeight = conf.getInt("imagesearcher.indexer.minHeight", 10); + String[] widths = metadata.getValues(ImageSearch.IMAGE_WIDTH_KEY); + String[] heights = metadata.getValues(ImageSearch.IMAGE_HEIGHT_KEY); + String[] imageUrls = metadata.getValues(ImageSearch.IMAGE_URLS_KEY); + + if (widths.length > 0 && widths.length == heights.length) { + for (int i = 0; i < widths.length; i++) { + if (Integer.parseInt(widths[i]) < minWidth || + Integer.parseInt(heights[i]) < minHeight) { + filteredIndexes.set(i); + if (LOG.isDebugEnabled()) { + LOG.debug("Filtered image " + imageUrls[i] + " " + + widths[i] + "x" + heights[i]); + } + } + } + } + + // Check if all images have been filtered + if (filteredIndexes.cardinality() == imagePositions.length) { + doc.add(new Field(ImageSearch.HAS_IMAGE_KEY, "0", Field.Store.YES, Field.Index.TOKENIZED)); + return doc; } else { doc.add(new Field(ImageSearch.HAS_IMAGE_KEY, "1", Field.Store.YES, Field.Index.TOKENIZED)); - for (String imagePos : imagePositions) { - doc.add(new Field(ImageSearch.IMAGE_POS_KEY, imagePos, + } + + // Add other image search related fields to the document + String[] imageIds = 
metadata.getValues(ImageSearch.IMAGE_IDS_KEY); + for (int i = 0; i < imagePositions.length; i++) { + if (filteredIndexes.get(i)) { + continue; + } else { + doc.add(new Field(ImageSearch.IMAGE_POS_KEY, imagePositions[i], Field.Store.YES, Field.Index.NO)); + doc.add(new Field(ImageSearch.IMAGE_IDS_KEY, imageIds[i], + Field.Store.YES, Field.Index.NO)); + doc.add(new Field(ImageSearch.IMAGE_URLS_KEY, imageUrls[i], + Field.Store.YES, Field.Index.TOKENIZED)); } } - String[] imageIds = metadata.getValues(ImageSearch.IMAGE_IDS_KEY); - for (String imageId : imageIds) { - doc.add(new Field(ImageSearch.IMAGE_IDS_KEY, imageId, - Field.Store.YES, Field.Index.NO)); - } - String[] imageUrls = metadata.getValues(ImageSearch.IMAGE_URLS_KEY); - for (String imageUrl : imageUrls) { - doc.add(new Field(ImageSearch.IMAGE_URLS_KEY, imageUrl, - Field.Store.YES, Field.Index.TOKENIZED)); - } return doc; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-14 03:26:26
|
Revision: 2552 http://archive-access.svn.sourceforge.net/archive-access/?rev=2552&view=rev Author: bradtofel Date: 2008-08-14 03:26:36 +0000 (Thu, 14 Aug 2008) Log Message: ----------- DOC: updated for 1.4 release.. Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2008-08-14 03:25:42 UTC (rev 2551) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2008-08-14 03:26:36 UTC (rev 2552) @@ -79,435 +79,164 @@ <section name="Wayback Configuration Overview"> - <p> - The wayback software provides Search and Replay access to documents - contained in a WaybackCollection. Search access allows users to - query a collection to locate documents, and is presently limited - to URL based queries. Replay access allows users to view archived - content in collections within a web browser. A WaybackCollection is - a combination of a ResourceStore, which contains the actual archived - documents, and a ResourceIndex, which provides URL based search of the - documents in the ResourceStore. - </p> - <p> - The Wayback machine is configured using Spring IOC, to specify and - configure concrete implementations of several basic modules. For - information about using Spring, please see - <a href="http://www.springframework.org/docs/reference/beans.html"> - this page - </a>. - </p> - </section> - - - - <section name="Defining WaybackCollections"> - <p> - The XML configuration template for a Wayback collection follows: - <pre> - -<bean id="localbdbcollection" - class="org.archive.wayback.webapp.WaybackCollection"> - <property name="resourceStore" ... /> - <property name="resourceIndex" ... /> - <property name="shutdownables" ... 
/> -</bean> - - </pre> - </p> <p> - The resourceStore property refers to a bean implementing - <a href="resource_store.html">org.archive.wayback.ResourceStore</a>. + The wayback software provides Query and Replay access to archived + documents. Query access allows users to locate particular documents + within the collection by URL and date. Replay access allows users to + view archived pages within their web browsers. Some Replay modes + require altering the original pages so embedded content is also loaded + from the wayback service, and not from the live web. </p> <p> - The resourceIndex property refers to a bean implementing - <a href="resource_index.html">org.archive.wayback.ResourceIndex</a>. + A WaybackCollection defines a set of archived documents and an index + which allows documents to be located within the collection. A + WaybackCollection may be exposed to end users through one or more + AccessPoints, which define: + <ul> + <li>the WaybackCollection itself</li> + <li>the URL where users can access the collection</li> + <li>how users can query the collection (the Query UI)</li> + <li>how documents are returned to users so they appear correctly in + their web browsers (the Replay UI)</li> + <li>the look and feel of the wayback user interface</li> + <li>who can access the documents in the collection</li> + <li>which documents from the collection are available</li> + </ul> </p> <p> - The shutdownables property refers to a list of beans implementing org.archive.wayback.Shutdownable, typically worker Threads performing automatic updates of the Collection. + Wayback is configured using Spring IOC, to specify and configure + concrete implementations of several basic modules. For information + about using Spring, please see + <a href="http://www.springframework.org/docs/reference/beans.html"> + this page + </a>. 
</p> - </section> - - <section name="org.archive.wayback.ResourceIndex implementations"> - - - <subsection name="LocalResourceIndex"> + <subsection name="AccessPoint configuration options"> <p> - This ResourceIndex implementation allows wayback to search one of - several index formats hosted on the same machine as the wayback - application. See below for details on which specific index formats - are available. + An AccessPoint's configuration must specify the following + implementations: + <ul> + <li><a href="WaybackCollection_Configuration"><b>collection</b></a> + the specific WaybackCollection being exposed via this + AccessPoint. + </li> + <li><a href="Query_UI"><b>query</b></a> responsible for generating + user visible content in response to user Queries, HTML, XML, + etc.</li> + <li><a href="Replay_Modes"><b>replay</b></a> responsible for + determining the appropriate ReplayRenderer implementation based + on the users request and the particular document to be + Replayed.</li> + <li><b>uriConverter</b> responsible for constructing Replay URLs + from records matching users queries. See Replay Modes below. + </li> + <li><b>parser</b> - responsible for translating incoming requests + into WaybackRequests. See Replay Modes below.</li> + </ul> </p> <p> - The XML configuration template for a LocalResourceIndex follows: - <pre> - -<property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> - <property name="source" ... /> - <property name="maxRecords" value="10000" /> - <property name="dedupeRecords" value="false" /> - </bean> -</property> - - </pre> - </p> - <p> - <b> - maxRecords - </b> - specifies the maximum number of records to process, and thus that can - be returned, during a single query. - </p> - <p> - <b> - dedupeRecords - </b> - set to true if you are using WARC files created by Heritrix 1.12 or - higher and configured the duplicate reduction features. 
See the - section Duplicate Reduction below for more information. - </p> - <p> - <b> - source - </b> - defines the format to be used for storing and searching records in - the ResourceIndex. There are several possible implementations - available: + An AccessPoint's configuration may optionally specify the following: <ul> + <li><a href="Exception_Rendering"><b>exception</b></a> - an + implementation responsible for generating error pages to users + </li> <li> - <b> - BDBIndex - </b> - This implementation is good for smaller scale installations, up - to 10's of millions of documents, and allows for fast incremental - updates to the index. It also allows for automated index updating. - <pre> - -<bean class="org.archive.wayback.resourceindex.bdb.BDBIndex" - init-method="init"> - <property name="bdbName" value="DB1" /> - <property name="bdbPath" value="/tmp/wayback/index/" /> - <property name="updater"> - <bean class="org.archive.wayback.resourceindex.bdb.BDBIndexUpdater"> - <property name="incoming" value="/tmp/wayback/index-data/incoming/" /> - <property name="failed" value="/tmp/wayback/index-data/failed/" /> - <property name="merged" value="/tmp/wayback/index-data/merged/" /> - <property name="runInterval" value="10000" /> - </bean> - </property> -</bean> - - </pre> - The <b>updater</b> property is optional. If used, a background - index merging thread will be started. Every <b>runInterval</b> - milliseconds, the thread will look for new files in the - <b>incoming</b> directory. Any files present are assumed to be - in CDX file format, and will be merged into the index and - immediately available for access. Files that are not successfully - merged with the index are left in place (or moved to the - <b>failed</b> directory, if it is specified.) Files that are - successfully merged are deleted (or moved to the <b>merged</b> - directory, if it is specified.) 
- <br></br> + <a href="Adding_Additional_Configurations_to_an_AccessPoint"> + <b>configs</b> + </a> - a Properties associating arbitrary key-value pairs which + are accessible to .jsp files responsible for generating the UI </li> <li> - <b> - CDXIndex - </b> - This implementation is good for larger scale installations, - bounded mostly by the size of the index you can (first create, - and later) store on a single machine. Using the command line tool - <b>arc-indexer</b> or <b>warc-indexer</b>, and the standard UNIX - <b>sort</b> tool (see note below on LC_ALL), you create a sorted - flat text file that is searched on each request. Building these - sorted files, and updating the index are manual operations - presently. - <pre> - -<bean id="cdxsearchresultsource" class="org.archive.wayback.resourceindex.cdx.CDXIndex"> - <property name="path" value="/tmp/wayback/cdx-index/index.cdx" /> -</bean> - - </pre> + <a href="Excluding_Documents_within_an_AccessPoint"> + <b>exclusionFactory</b> + </a> - an implementation specifying what documents should be + accessible within this AccessPoint </li> <li> - <b> - CompositeSearchResultSource - </b> - This implementation allows for searching multiple CDXIndex text - files for each request. For optimal search efficiency, multiple - index files should be merged (sort -mu) prior to production use, - but this implementation allows a trade-off in simplified index - management for a decrease in search performance. 
- <pre> - -<bean id="compositecdxresultsource" class="org.archive.wayback.resourceindex.CompositeSearchResultSource"> - <property name="CDXSources"> - <list> - <value>/tmp/wayback/cdx-index/index.cdx.1</value> - <value>/tmp/wayback/cdx-index/index.cdx.2</value> - </list> - </property> -</bean> - - </pre> + <a href="Restricting_who_can_interact_with_an_AccessPoint"> + <b>authentication</b> + </a> - an implementation specifying who is allowed to connect to + this AccessPoint </li> + <li><b>urlRoot</b> - a String URL prefix under which all UI + elements should be referenced. + </li> + <li><b>locale</b> - A specific Locale to use for all requests + within this AccessPoint, overriding the users preferred Locale + as specified by their web browser. + </li> </ul> </p> - - </subsection> - - - <subsection name="RemoteResourceIndex configuration"> <p> - This ResourceIndex option allows hosting of a ResourceIndex on a - machine other than the machine hosting the Wayback webapp. + AccessPoints can be used to provide different levels and types of + access to the same collection for different users. For example, you + can provide both Proxy and Archival URL mode access to a single + collection by defining 2 AccessPoints with different Replay User + Interfaces but the same WaybackCollection. Using AccessPoints, you can + also provide different levels of access to a collection. For example, + users within a particular subnet may be able to access all documents + within a collection via one AccessPoint, but users outside that subnet + may be restricted to viewing documents allowed by a web sites current + robots.txt file. 
</p> <p> - The XML configuration template for a RemoteResourceIndex follows: - <pre> - -<bean id="remoteindex" class="org.archive.wayback.resourceindex.RemoteResourceIndex" init-method="init"> - <property name="searchUrlBase" value="http://wayback-index.archive.org:8080/wayback/xmlquery" /> -</bean> - - </pre> - <b>searchUrlBase</b> indicates the URL prefix to which OpenSearchQuery - parameters are appended to access a Wayback AccessPoint running a - LocalResourceIndex on a remote host to the Wayback application. + Please refer to + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml">wayback.xml</a> + within the wayback .war file for detailed example AccessPoint + configurations. </p> - </subsection> - - - <subsection name="NutchResourceIndex configuration"> + <subsection name="WaybackCollection Configuration"> <p> - This ResourceIndex option allows the wayback to query a Nutch - full-text search engine. This ResourceIndex option is highly - experimental. For help setting up a NutchResourceIndex, please see - <a href="http://archive-access.sourceforge.net/projects/nutch/wayback.html"> - this page. 
- </a> + A WaybackCollection's configuration must specify the following + implementations: + <ul> + <li><a href="resource_store.html">resourceStore</a> the specific + implementation used to specify the set of documents within this + collection, and how to access them for Replay requests.</li> + <li><a href="resource_index.html">resourceIndex</a> the specific + implementation responsible for locating documents within the + collection.</li> + </ul> </p> <p> - The XML configuration template for a NutchResourceIndex follows: - <pre> - - <property name="remotenutchindex"> - <bean class="org.archive.wayback.resourceindex.NutchResourceIndex" init-method="init"> - <property name="searchUrlBase" value="http://webteam-ws.us.archive.org:8080/katrina/opensearch" /> - <property name="maxRecords" value="100" /> - </bean> - </property> - - </pre> - <b>searchUrlBase</b> indicates the URL prefix to which OpenSearchQuery - parameters are appended to access a Nutch servers XML query interface. - + A WaybackCollection's configuration may optionally specify the + following: + <ul> + <li>shutdownables - a List of one or more beans implementing + org.archive.wayback.Shutdownable needed to maintain this + WaybackCollection, typically Daemon Threads which perform + automatic indexing operations on the resourceStore and the + resourceIndex.</li> + </ul> </p> + <p> + For more information on WaybackCollection configuration options and + automatic indexing, please refer to the following documentation pages + and to the example Spring .xml configuration files within the wayback + .war: + <ul> + <li><a href="resource_store.html">ResourceStore configuration and + automatic indexing</a></li> + <li><a href="resource_index.html">ResourceIndex configuration</a></li> + <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml">BDBCollection.xml</a></li> + <li><a
href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml">CDXCollection.xml</a></li> + <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml">RemoteCollection.xml</a></li> + <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml">NutchCollection.xml</a></li> + </ul> + </p> </subsection> </section> - - - <section name="Defining AccessPoints for WaybackCollections"> + <section name="Replay Modes"> <p> - Once you have defined one or more WaybackCollections, you need to - specify how those collections are exposed to end users. Collections are - exposed by defining an AccessPoint for that collection. + There are presently 3 Replay modes supported by the Wayback software, + Archival URL mode, Proxy mode, and an experimental DomainPrefix mode. </p> - <p> - An AccessPoint is a combination of a WaybackCollection, a Query User - Interface, a Replay User Interface, and a URL by which users interact - with that AccessPoint. AccessPoints can also describe mechanisms for - excluding documents, and for limiting what users are allowed to - interact with the AccessPoint. - </p> - <p> - AccessPoints can be used to provide different levels and types of - access to the same collection for different users. For example, you - can provide both Proxy and Archival URL mode access to a single - collection by defining 2 AccessPoints with different Replay User - Interfaces but the same WaybackCollection. Using AccessPoints, you can - also provide different levels of access to a collection. 
For example, - users within a particular subnet may be able to access all documents - within a collection via one AccessPoint, but users outside that subnet - may be restricted to viewing documents allowed by a web sites current - robots.txt file. - </p> - <p> - The XML configuration template for an AccessPoint follows: - <pre> - -<bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> - <property name="collection" ... /> - <property name="query" ... /> - <property name="replay" ... /> - <property name="parser" ... /> - <property name="uriConverter" ... /> - <property name="exclusionFactory" ... /> - <property name="authentication" ... /> - <property name="configs" ... /> -</bean> - - </pre> - </p> - <p> - Required property configurations: - <ul> - <li> - <b> - collection - </b> - is a reference to the WaybackCollection for this AccessPoint. - </li> - <li> - <b> - query - </b> - defines what .jsp files to use to render results for queries to - this AccessPoint. See the section "Query .jsp configuration" for - more information. - </li> - <li> - <b> - replay - </b> - defines what Replay User Interface to use for this AccessPoint. See - the section "Setting up the Replay User Interface within an - AccessPoint" for more information. - </li> - <li> - <b> - parser - </b> - defines how incoming requests are parsed and subsequently processed, - and is usually dependent on the Replay User Interface being used - with this AccessPoint.See the section "Setting up the Replay User - Interface within an AccessPoint" for more information. - </li> - <li> - <b> - uriConverter - </b> - defines how public URLs are constructed to provide Replay access - to this AccessPoint. This is usually dependant on the Replay User - Interface used with this AccessPoint. See the section "Setting up - the Replay User Interface within an AccessPoint" for more - information. 
- </li> - </ul> - </p> - <p> - Optional property configurations: - <ul> - <li> - <b> - exclusionFactory - </b> - defines how documents are excluded within this AccessPoint. See the - section "Excluding Documents within an AccessPoint" for more - information. - </li> - <li> - <b> - authentication - </b> - defines who is allowed to interact with this AccessPoint. See the - section "Limiting Access to an AccessPoint" for more information. - </li> - <li> - <b> - configs - </b> - Allows additional customizations within this AccessPoint. See the - section "Adding Additional Configurations to an AccessPoint" for - more information. - </li> - </ul> - </p> - </section> - - - <section name="Query .jsp configuration"> - <p> - Wayback provides query results to a .jsp handler page, which is - responsible for rendering final output to users. The actual .jsp file - invoked for the various response types can be configured as described - below. Included with the Wayback package are several reference .jsp - implementations, including one which outputs XML. This XML interface is - used by the Wayback software in distributed index configurations, but - can also be used as an extension point for further user interface - customizations. - </p> - <br></br> - <p> - The XML configuration template for the query Renderer follows below, - including the default configuration for each value. The values indicate - the path to the .jsp file that will be executed to generate the output - for each class of query. 
- <pre> - -<bean class="org.archive.wayback.query.Renderer"> - <property name="errorJsp" value="/jsp/HTMLError.jsp" /> - <property name="xmlErrorJsp" value="/jsp/XMLError.jsp" /> - <property name="captureJsp" value="/jsp/HTMLResults.jsp" /> - <property name="urlJsp" value="/jsp/HTMLResults.jsp" /> - <property name="xmlJsp" value="/jsp/XMLResults.jsp" /> -</bean> - - </pre> - The following list indicates when each .jsp is executed: - <ul> - <li> - <b> - errorJsp - </b> - will be executed when any type of expected error condition occurs - during handling of a request. - </li> - <li> - <b> - xmlErrorJsp - </b> - will be executed when any type of expected error condition occurs - during handling of a request indicating that xml response data is - desired. - </li> - <li> - <b> - captureJsp - </b> - will be executed when results listing captures for a specific, - single URL are requested in HTML format. - </li> - <li> - <b> - urlJsp - </b> - will be executed when results listing captures for multiple URLs, - each URL having one or more captures, are requested in HTML format. - </li> - <li> - <b> - xmlJsp - </b> - will be executed when results are requested in XML format. - </li> - </ul> - </p> - </section> - - <section name="Setting up the Replay User Interface within an AccessPoint"> - <p> - There are presently 2 Replay modes supported by the Wayback software, - Archival URL mode, and Proxy mode. - </p> - <subsection name="Archival URL"> + <subsection name="Archival URL Replay Mode"> <p> Archival URL Replay mode uses a modified URL to designate - documents stored in ARC files. The general form of an + documents stored in ARC/WARC files. The general form of an Archival URL is: <br></br> <div> @@ -519,7 +248,7 @@ where <ul> <li> - <b>HOSTNAME</b> is the host where the Wayback Machine is + <b>HOSTNAME</b> is the host where the Wayback software is running. </li> <li> @@ -528,9 +257,9 @@ the Access Point. See below for example CONTEXT mappings. 
</li> <li> - <b>CONTEXT</b> is the context where the Wayback Machine - webapp has been deployed, plus the name of the Access Point. See - below for example CONTEXT mappings. + <b>CONTEXT</b> is the context where the Wayback webapp has been + deployed, plus the name of the Access Point. See below for + example CONTEXT mappings. </li> <li> <b>TIMESTAMP</b> is 0 to 14 digits of a date, possibly @@ -724,24 +453,11 @@ </table> </p> <p> - The properties <b>replay</b>, <b>parser</b>, and <b>uriConverter</b> + The properties <b>parser</b> and <b>uriConverter</b> for Archival URL Access Points must be set to the following implementations: <pre> - <property name="replay"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlReplayDispatcher"> - <property name="serverSideRendering" value="false" /> - <property name="jspInserts"> - <list> - <value>/replay/ArchiveComment.jsp</value> - <value>/replay/ClientSideJSInsert.jsp</value> - <value>/replay/Timeline.jsp</value> - </list> - </property> - </bean> - </property> - <property name="parser"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" init-method="init"> @@ -772,55 +488,6 @@ </tr> <tr> <td> - serverSideRendering - </td> - <td> - required - </td> - <td> - When set to true, all URL rewriting occurs on the server, - eliminating the need for client side Javascript rewriting. If this - option is set to false, then the <i>ClientSideJSInsert.jsp</i> - <b>jspInsert</b> should be used. If this option is true, and - you're attempting to set up an entirely JavaScript free - installation which includes an embedded Timeline in replayed - HTML documents, you can use the <i>JSLessTimeline.jsp</i> - <b>jspInsert</b>. - </td> - </tr> - <tr> - <td> - jspInserts - </td> - <td> - optional - </td> - <td> - If any values are included here, then those .jsp files will be - invoked for every replayed document, and the resulting output - will be included in replayed HTML pages. 
The example included - here will result in: - <ul> - <li> - An HTML comment embedded inside replayed web pages indicating - the dates the document was captured and the date it was served - by wayback. - </li> - <li> - A reference to a javascript file, client-rewrite.js, which - will attempt to modify URLs within the users browser to make - them direct back into wayback. - </li> - <li> - A timeline banner embedded in the top of HTML pages that - allows navigation between other versions of the currently - viewed document. - </li> - </ul> - </td> - </tr> - <tr> - <td> maxRecords </td> <td> @@ -857,36 +524,39 @@ </tr> </table> <p> - Note that the old <b>jsInserts</b> configuration has been deprecated, - in favor of including references to JavaScript files using jspInserts. - Also note that the use of the ClientSideJSInsert.jsp is required when - serverSideRendering is set to false. + For additional configuration examples and information about + ArchivalUrl Replay mode, please see the file + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml">ArchivalUrlReplay.xml</a> </p> </subsection> - <subsection name="Proxy"> + <subsection name="Proxy Replay Mode"> <p> Wayback can be configured to act as an HTTP proxy server. To utilize this mode, the wayback webapp must be deployed as the ROOT context, and client browser must be configured to proxy all HTTP requests through the Wayback Machine application. Instead of retrieving documents from the live web, the Wayback Machine will retrieve - documents from the local repository of ARC files. + documents from the configured WaybackCollection. </p> <p> Proxy Replay mode does not suffer from the shortcomings of - the inserted Javascript that the Archival URL mode uses, - but it has one major drawback: there is no way to - specify which version of a captured document should - be replayed. 
Only the URL to be replayed is sent from the - client browser to the Wayback Machine - no date information - is sent with the request. + the inserted Javascript that the Archival URL mode uses, all URLs + function as they did originally, but there can be another drawback + to using this feature: no date information is sent with each request. + Wayback attempts to address this problem by associating the date + clicked on query pages when a Replay session is begun, with the + user's IP address. This can fail to work properly in situations where + multiple users are behind a NAT system which causes them to appear to + have the same IP address. </p> <p> - In Proxy Replay mode, the Wayback Machine will return the - most recent version captured of any requested page. This - behavior can be changed by using the experimental Firefox-specific - plugin developed by Oskar Grenholm. You can find out more about + Additionally, there is an experimental Firefox-specific plugin + developed by Oskar Grenholm, which provides a novel interface + to navigate between different captured versions of a page within + Proxy mode, and also sends a special HTTP header which allows Wayback + to uniquely associate the correct date with browsers, even those + behind a NAT system. 
You can find out more about this plugin and download it <a href="http://archive-access.sourceforge.net/projects/waxtoolbar/"> here @@ -905,17 +575,15 @@ <pre> <bean name="8090" parent="8080:wayback"> - <property name="useServerName" value="true" /> - <property name="replay"> - <bean class="org.archive.wayback.proxy.ProxyReplayDispatcher" /> - </property> + <property name="urlRoot" value="http://wayback.somehost.org/" /> + <property name="replay" ref="proxyreplay" /> <property name="uriconverter"> <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> - <property name="redirectURI" value="http://wayback.somehost.org:8090/jsp/Redirect.jsp" /> + <property name="redirectURI" value="http://wayback.somehost.org/jsp/Redirect.jsp" /> </bean> </property> <property name="parser"> - <bean class="org.archive.wayback.proxy.ProxyRequestParser" init-method="init"> + <bean class="org.archive.wayback.proxy.ProxyRequestParser" > <property name="localhostNames"> <list> <value>wayback.somehost.org</value> @@ -934,13 +602,256 @@ primary name of the machine running the Wayback application, then you may need to also specify the hostname used for the Wayback application in the <b>localhostNames</b> configuration list. - </p> + </p> + <p> + For additional configuration examples and information about + Proxy Replay mode, please see the file + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ProxyReplay.xml">ProxyReplay.xml</a> + </p> </subsection> + <subsection name="DomainPrefix Replay Mode"> + <p> + Wayback includes an additional, experimental Replay mode which is + similar to Archival URL mode, in that any document can be referenced + as a global URL, without any browser configuration requirements. 
This + mode requires deploying the Wayback webapp in ROOT context, and a + special DNS wildcard aliasing, so that all hostnames with a common + suffix will be directed to your host running Wayback. + </p> + <p> + The general form of a DomainPrefix URL is: + <br></br> + <div> + <code> + http://TIMESTAMP.ARCHIVE-HOSTNAME.WAYBACK-HOSTNAME:PORT/ARCHIVE-PATH + </code> + </div> + </p> + <p> + Here is an example DomainPrefix URL, on an assumed host + <b>wayback.somehost.org</b>, with a wayback webapp deployed as + <b>ROOT</b>, via the Access Point named <b>8081</b> (which indicates the + port Wayback requests will be received on) for the + page <b>http://www.yahoo.com/foo.gif</b> on Dec 31, 1999 at 12:00:00 UTC. + <br></br> + <div> + <code> + http://19991231120000.www.yahoo.com.wayback.somehost.org:8081/foo.gif + </code> + </div> + </p> + <p> + This mode performs all URL rewriting on the server side, so needs no + client-side Javascript to execute, and also does not suffer from some + of the request leakage problems present in Archival URL mode. It + presently is somewhat naive about rewriting links within returned + documents, and will also rewrite URLs in the text of pages + (not desired), as well as URLs referenced within the page (desired). + </p> + <p> + For additional configuration examples and information about + Domain Prefix Replay mode, please see the files + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml">wayback.xml</a> + and + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml">DomainPrefixReplay.xml</a> + . 
+ </p> + </subsection> </section> + <section name="Wayback UI customization options"> + <p> + Wayback provides several opportunities for customizing the user + interface presented to users, which can be grouped into 4 categories: + <ul> + <li>Query UI rendering .jsp files.</li> + <li>Replay insert .jsp files.</li> + <li>Exception rendering .jsp files.</li> + <li>Localization .properties files.</li> + </ul> + </p> + <subsection name="Query UI"> + <p> + All content returned by Wayback in response to Query requests is + generated by .jsp files, which are executed and provided access to + the results found within the ResourceIndex. Wayback is distributed + with several sample implementations. + </p> + <p> + To alter the default behavior, you may either provide your own .jsp + files, and configure the Renderer to use them instead of the + default .jsp files, or the default .jsp files may be modified + directly. + <ul> + <li> + <b>captureJsp</b> - used when the request indicates that + a listing of all dates available for a single URL should be + returned. Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLCaptureResults.jsp">/WEB-INF/query/HTMLCaptureResults.jsp</a>. + An alternate implementation, /WEB-INF/query/CalendarResults.jsp + will generate HTML output similar to the global Wayback Machine + service. + </li> + <li> + <b>urlJsp</b> - used when the request indicates that a summary + of captures available for a number of URLs should be returned. + Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLUrlResults.jsp">/WEB-INF/query/HTMLUrlResults.jsp</a> + </li> + <li> + <b>xmlCaptureJsp</b> - used when the request indicates that + a listing of all dates available for a single URL should be + returned in XML format. 
Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/XMLCaptureResults.jsp">/WEB-INF/query/XMLCaptureResults.jsp</a>. + </li> + <li> + <b>xmlUrlJsp</b> - used when the request indicates that a + summary of captures available for a number of URLs should be + returned in XML format. + Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/XMLUrlResults.jsp">/WEB-INF/query/XMLUrlResults.jsp</a> + </li> + </ul> + </p> + </subsection> + <subsection name="Replay Inserts"> + <p> + Wayback allows for embedding additional content within replayed HTML + pages in all Replay modes. This is accomplished by executing one or + more .jsp files with access to context information about the request, + the results, and the actual Resource being returned. The output of + each .jsp file is included within the returned page. + </p> + <p> + Wayback is distributed with several example .jsp insert files that + can be used as is, modified to suit installation requirements, or + used as examples for more elaborate customizations: + <ul> + <li> + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ArchiveComment.jsp">/WEB-INF/replay/ArchiveComment.jsp</a> + inserts an HTML comment indicating when the document was + captured and retrieved. 
+ </li> + <li> + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ClientSideJSInsert.jsp">/WEB-INF/replay/ClientSideJSInsert.jsp</a> + inserts some Javascript into the returned HTML page that updates + links, images, and other embedded content, attempting to make + all URL references within the page point back into the Wayback + service. + </li> + <li> + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/DebugBanner.jsp">/WEB-INF/replay/DebugBanner.jsp</a> + Not intended for production use, but a slightly more complex + jsp insert example that demonstrates how to access various + request context data, and is sometimes useful for debugging. + </li> + <li> + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Disclaimer.jsp">/WEB-INF/replay/Disclaimer.jsp</a> + Inserts a small banner at the top of replayed HTML pages, + alerting users that they are viewing an archived page, and + providing some information about the particular capture. + </li> + <li> + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/JSLessTimeline.jsp">/WEB-INF/replay/JSLessTimeline.jsp</a> + Inserts a banner in the top of replayed documents which allows + users to navigate directly between other captures of the current + page they are viewing. This version does not use Javascript to + place the banner, so it will appear in all HTML pages within a + frameset. 
+ </li> + <li> + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp">/WEB-INF/replay/Timeline.jsp</a> + Inserts a banner in the top of replayed documents which allows + users to navigate directly between other captures of the current + page they are viewing. This version uses Javascript to + place the banner, attempting to only place the banner in the + largest frame within a frameset. + </li> + </ul> + </p> + </subsection> + <subsection name="Exception Rendering"> + <p> + Wayback is distributed with a default ExceptionRenderer that allows + customization of several types of anticipated exceptions that can + occur through normal operations. The BaseExceptionRenderer allows + installations to provide alternate .jsp files which are executed, and + the output of these .jsp files are returned to end users. To alter + the default behavior, you may either provide your own .jsp files, and + configure the BaseExceptionRenderer to use them instead of the + default .jsp files, or the default .jsp files may be modified + directly. + <ul> + <li> + <b>xmlErrorJsp</b> - used when the request indicates that XML + data should be returned. Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/XMLError.jsp">/WEB-INF/exception/XMLError.jsp</a> + </li> + <li> + <b>errorJsp</b> - used for HTML Replay exceptions, and for all + Query exceptions. Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/HTMLError.jsp">/WEB-INF/exception/HTMLError.jsp</a> + </li> + <li> + <b>imageErrorJsp</b> - used when the request appears to be an + embedded Replay request that expects an image to be returned. 
+ Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/HTMLError.jsp">/WEB-INF/exception/HTMLError.jsp</a> + which produces HTML output. This may be desirable over + returning an actual image, since web browsers will usually show + any HTML alternate text associated with the image in place of + the image when image data is not returned. Wayback also + includes a 1x1 pixel gif, error_image.gif, which can be used to + display a gray box in place of image requests that result in + an exception. + </li> + <li> + <b>javascriptErrorJsp</b> - used when the request appears to be an + embedded Replay request that expects Javascript content to be + returned. Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/JavaScriptError.jsp">/WEB-INF/exception/JavaScriptError.jsp</a> + </li> + <li> + <b>cssErrorJsp</b> - used when the request appears to be an + embedded Replay request that expects CSS content to be returned. + Default is + <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/exception/CSSError.jsp">/WEB-INF/exception/CSSError.jsp</a> + </li> + </ul> + </p> + </subsection> + <subsection name="Localization .properties files."> + <p> + Wayback is packaged with a set of reference implementation .jsp files + for generating Query, Replay, and Exception user interface pages. + References to actual user visible text are abstracted within these + .jsp files so the specific text to display in various pages is read + from a .properties file. Wayback will automatically search for a + Locale-specific .properties file from which these text values should + be loaded, allowing the language presented to users to be changed. 
+ </p> + <p> + By default, Wayback will use the language preference indicated by the + user's web browser to find an appropriate .properties file, + defaulting to the standard English text if the user's preferred + language is not available. Particular AccessPoints can be forced to a + particular Locale using the AccessPoint.locale property. + </p> + <p> + Several language customization .properties files have already been + contributed by users in the community and are now included with the + standard Wayback distribution. We plan for a completely new and + improved UI implementation for version 1.6, and plan a more active + outreach program to create customizations in as many languages as + possible once this new UI is completed, and the required text + elements are determined. + </p> + </subsection> + </section> + <section name="Excluding Documents within an AccessPoint"> <subsection name="Excluding Documents with live Robots.txt"> Documents may be excluded from access within an Access Point by @@ -1193,7 +1104,9 @@ <p> The <b>-identity</b> option causes the tools to skip canonicalization of URLs. See the documentation for the <b>url-client</b> tool, and - the URL Canonicalization section below for more information. + the <a href="resource_index.html#URL_Canonicalization"> + URL Canonicalization + </a> section for more information. </p> </subsection> @@ -1224,7 +1137,7 @@ <i> LOCATION_URL </i> - is the absolute URL where the ArcProxy can be + is the absolute URL where the FileProxy can be accessed. ex. <b> http://wayback-webapp.your-archive.org:8080/locationdb/locationDB @@ -1271,7 +1184,10 @@ canonicalization function, but can also be used, if the canonicalization function is altered, to update an existing CDX index, without recreating CDX files from original ARCs. See the - seciond URL Canonicalization for more information. + section + <a href="resource_index.html#URL_Canonicalization"> + URL Canonicalization + </a> for more information. 
</p> <p> <code> @@ -1297,30 +1213,31 @@ </section> - <section name="ArcProxy and LocationDB application"> + <section name="FileProxy and LocationDB application"> <p> - The Wayback software includes an additional application, the ArcProxy, + The Wayback software includes an additional application, the FileProxy, which can simplify some distributed ResourceStore implementations. The - ArcProxy application exposes two external services, one used to - configure the underlying database mapping ARC filenames to the actual, - fully qualified HTTP 1.1 URL, and a second service which reverse proxies - incoming HTTP 1.1 range requests to appropriate back-end storage nodes. + FileProxy application exposes two external services, one used to + configure the underlying database mapping ARC/WARC filenames to the + actual, fully qualified HTTP 1.1 URL or local path, and a second + service which reverse proxies incoming HTTP 1.1 range requests to + appropriate back-end storage nodes. </p> <p> - The <b>arcproxy</b> reverse proxy service allows one or more HttpARCResourceStore - instances to configure a single URL prefix where all ARC files are - assumed to be located. This reverse proxy then uses a BDB JE to find the - actual current location of the ARC file, and forward the request to the - actual host holding the ARC file. + The <b>fileproxy</b> reverse proxy service allows one or more + SimpleResourceStore instances to configure a single URL prefix where + all ARC/WARC files are assumed to be located. This reverse proxy then + uses a BDB JE to find the actual current location of the ARC/WARC file, + and forward the request to the actual host holding the ARC/WARC file. </p> <p> The <b>locationdb</b> service allows population and management of the - BDB JE database(the <i>locationDB</i>) used by the <b>arcproxy</b> + BDB JE database(the <i>locationDB</i>) used by the <b>fileproxy</b> service. 
There is also a command line tool, <b>location-client</b> described elsewhere in this document which provides command line access to the management of the locationDB. @@ -1328,210 +1245,27 @@ <p> Adding the following configuration to wayback.xml will expose the - arcproxy and locationdb services: + fileproxy and locationdb services: </p> <pre> -<bean id="filelocationdb" class="org.archive.wayback.resourcestore.http.FileLocationDB" +<bean id="filelocationdb" class="org.archive.wayback.resourcestore.locationdb.BDBResourceFileLocationDB" init-method="init"> - <property name="bdbPath" value="/tmp/wayback/arc-db" /> + <property name="bdbPath" value="/tmp/wayback/file-db/db/" /> <property name="bdbName" value="DB1" /> - <property name="logPath" value="/tmp/wayback/arc-db.log" /> + <property name="logPath" value="/tmp/wayback/file-db/db.log" /> </bean> -<bean name="8080:arcproxy" class="org.archive.wayback.resourcestore.http.ArcProxyServlet"> +<bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet"> <property name="locationDB" ref="filelocationdb" /> </bean> -<bean name="8080:locationdb" class="org.archive.wayback.resourcestore.http.FileLocationDBServlet"> +<bean name="8080:locationdb" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBServlet"> <property name="locationDB" ref="filelocationdb" /> </bean> </pre> </section> - <section name="URL Canonicalization"> - <subsection name="Introduction and Concepts"> - <p> - Sometimes URLs found in the field can have multiple forms, for - example: - <pre> - http://www.example.com/img/foo.gif - http://www.example.com/docs/../img/foo.gif - </pre> - are both valid representations of the exact same URL. Another, less - certain example would be: - <pre> - http://www.example.com/Interview.html - http://www.example.com/interview.html - </pre> - which differ only in the capitalization of the letter "i". 
On some - operating systems, these two URLs legitimately specify two distinct - documents. On Windows platforms, they refer to the same document. If - the document on a web server is actually named "Interview.html", but - a web designer creates a web page that refers to this document using - the lowercase "interview.html", then the link will work, and they and - the web site visitors may never notice the difference. The same - situation on a different operating system would probably not work - (although some web server plugins and modules will also correct this - problem transparently) and the web designer would probably notice and - correct the problem. In practice, we have found that it is very rare - for the two URLs above with different capitalization to refer to - different documents, and they can be treated as equivalent in most - situations. - </p> - <p> - Another example, which occurs far more often in the real world, - involves web servers injecting a session ID inside paths to documents - hosted on that web server. These session IDs allow the web server to - track individual user's states. Here are some example URLs - demonstrating path session ID injection: - <pre> - http://www.example.com/(S(4hqa0555fwsecu455xqckv45))/page1.aspx - http://www.example.com/(S(4hqa0555fwsecu455xqckv45))/page2.aspx - http://www.example.com/(S(a63098d96360a63098d96360))/page3.aspx - </pre> - In these examples, the first two URLs are using one session ID, and - the third uses a different session ID. 
If <b>page3.aspx</b> refers to - <b>page1.aspx</b> using an anchor like this: - <pre> - <a href="page1.aspx">page1</a> - </pre> - and a user visiting <b>page3.aspx</b> clicks the link to page1, then - the wayback will recieve a request for the URL: - <pre> - http://www.example.com/(S(a63098d96360a63098d96360))/page1.aspx - </pre> - If page1.aspx was captured using the different session ID, then the - wayback will be unable to locate this document in the index, even - though it was captured. - </p> - <p> - This session ID problem can be mitigated by <i>canonicalizing</i> the - URLs as they are placed in the index, so the index would contain the - following URLs, instead of the original form, which the crawler - captured: - <pre> - http://www.example.com/page1.aspx - http://www.example.com/page2.aspx - http://www.example.com/page3.aspx - </pre> - If the same canonicalization scheme is used to transform incoming - requests, before attempting to lookup URLs in the index, then the - software is able to locate and return the documents correctly. - </p> - </subsection> - <subsection name="Current Status within Wayback"> - <p> - Currently the Wayback includes only a single reference implementation - of a canonicalization scheme, which is currently called - <b>AggressiveUrlCanonicalizer</b>. This implementation provides the - following canonicalization: - <ul> - <li> - <b>www# removal</b> - http://www.example.com => example.com, - http://www13.example.com => example.com - </li> - <li> - <b>user info removal</b> - http://us...@ex... => example.com, - http://user:pas...@ex... => example.com, - </li> - <li> - <b>session ID removal</b> - http://www.example.com/(S(a63098d96360a63098d96360))/page1.aspx - ... [truncated message content] |
From: <bra...@us...> - 2008-08-14 03:25:33
|
Revision: 2551 http://archive-access.svn.sourceforge.net/archive-access/?rev=2551&view=rev Author: bradtofel Date: 2008-08-14 03:25:42 +0000 (Thu, 14 Aug 2008) Log Message: ----------- INITIAL REV: split out and updated documentation for ResourceIndex configuration options, including Canonicalization Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_index.xml Added: trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_index.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_index.xml (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_index.xml 2008-08-14 03:25:42 UTC (rev 2551) @@ -0,0 +1,377 @@ +<?xml version="1.0" encoding="ISO-8859-1"?> +<document> + <properties> + <title>Resource Index Configuration</title> + <author email="brad at archive dot org">Brad Tofel</author> + <revision>$$Id$$</revision> + </properties> + + <body> + <section name="ResourceIndex configuration options"> + <subsection name="Overview"> + <p> + A ResourceIndex locates documents within a WaybackCollection through + a single method: + <pre> + public SearchResults query(final WaybackRequest request) + throws ResourceIndexNotAvailableException, + ResourceNotInArchiveException, BadQueryException, + AccessControlException; + </pre> + The ResourceIndex is responsible for deciding which SearchResults + subclass, <b>CaptureSearchResults</b> or <b>UrlSearchResults</b>, is + appropriate for the WaybackRequest argument, and for populating the + returned SearchResults object with matching records. + </p> + <p> + When the request indicates the user wishes to find specific captures + of a single URL, CaptureSearchResults should be returned. 
When the + request may return results for multiple URLs, for example a query + attempting to locate all URLs beginning with a given prefix within + the WaybackCollection, a URLSearchResults object should be + returned. + </p> + </subsection> + <subsection name="LocalResourceIndex configuration options"> + <p> + This ResourceIndex implementation assumes a local database of all + documents within the WaybackCollection. The type of database is + specified with the <b>source</b> property. + </p> + <p> + The following configuration is required for a LocalResourceIndex: + <ul> + <li>source - a bean implementing SearchResultSource, which can be + one of the following: + <ul> + <li>BDBIndex - a BDBJE database holding records for all + documents within the WaybackCollection. This + implementation allows for fast incremental updates to the + index, and is required when using automatic indexing. + This implementation scales well to 10's of millions of + records.</li> + <li>CDXIndex - a sorted flat file containing one line per + document within the WaybackCollection. This + implementation requires that the CDX file be manually + maintained, but scales to very large sizes, limited + primarily by the size of file you can build and store. + CDX files can be built using the command line tool + <b>arc-indexer</b> or <b>warc-indexer</b>, and the + standard UNIX <b>sort</b> tool. + </li> + <li>CompositeSearchResultSource - an implementation allowing + aggregation of multiple SearchResultSources into a single + logical SearchResultSource. Use of BDBIndex + SearchResultSources within this class is experimental, + but this implementation has been used successfully in + production installations to serve results from several + CDXIndex files. For optimal search efficiency, multiple + index files should be merged (sort -mu) prior to + production use, but this implementation allows a + trade-off in simplified index management for a decrease + in search performance. 
A useful strategy for managing + large scale collections is to use several CDX files of + increasing size. Updates to the set of CDX files are + always performed against the smallest CDX file, and + occasionally this small file is merged with one of the + larger files, minimizing the amount of data that needs to + be read, sorted, and written back to disk to update the + set of CDX files.</li> + </ul> + </li> + </ul> + </p> + <p> + The following configurations are optional for LocalResourceIndexes: + <ul> + <li>maxRecords - integer maximum number of records to process for a + single request. Useful to prevent a single request from using + too much Disk and CPU resources.</li> + <li>dedupeRecords - boolean value that should be set to <i>true</i> + when using deduplicated WARC records. This causes Wayback to + modify search results as they are read from the index, so + records indicating a resource was inspected but not saved are + accessible within the Wayback. Please see the + <a href="#Duplicate_Reduction">Duplicate Reduction</a> section + below for more information.</li> + <li>annotater - experimental hook for modifying or omitting records + as they are read from the index. For example, additional + metadata could be associated with each record from an external + datasource, and this extra metadata could then be exposed to + end users through a .jsp customization.</li> + <li>canonicalizer - an implementation of UrlCanonicalizer. 
See the + section labeled URL Canonicalization below for more + information.</li> + </ul> + </p> + <p> + For specific Spring configuration examples of these ResourceIndex + options, please refer to the following files distributed within the + wayback .war file: + <ul> + <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml">BDBCollection.xml</a></li> + <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml">CDXCollection.xml</a></li> + </ul> + </p> + </subsection> + <subsection name="RemoteResourceIndex configuration options"> + <p> + This ResourceIndex implementation requests an external Wayback + installation to satisfy index requests, and can be useful for + distributed installations, as well as for experimenting with new + Wayback configurations and installations using an existing + ResourceIndex. For example, a development system can be configured to + use a production index remotely, minimizing the requirements and + setup required to test new configurations. + </p> + <p> + The actual index must be stored on another Wayback installation, and + is requested as XML through this implementation. + </p> + <p> + The following configuration is required for a RemoteResourceIndex: + <ul> + <li>searchUrlBase - the URL prefix indicating the AccessPoint + actually holding the ResourceIndex. + </li> + </ul> + </p> + <p> + The following configurations are optional for RemoteResourceIndexes: + <ul> + <li>canonicalizer - an implementation of UrlCanonicalizer. 
See the + section labeled URL Canonicalization below for more + information.</li> + </ul> + </p> + <p> + For a Spring configuration example of this ResourceIndex option, + please refer to the following files distributed within the wayback + .war file: + <ul> + <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml">RemoteCollection.xml</a></li> + </ul> + </p> + </subsection> + <subsection name="NutchResourceIndex configuration options"> + <p> + This implementation, similar to the RemoteResourceIndex, accesses + index functionality from an external NutchWAX installation. This mode + of operation is considered experimental, and is not used within + Internet Archive production installations for performance reasons. + IA enables search within archived collections using NutchWax, but a + separate CDX index is used for Wayback Query and Replay + functionality. NutchWax is customized in these installations to + generate links within its internal UI that direct to the appropriate + pages within the corresponding Wayback installation. + </p> + <p> + The following configuration is required for a NutchResourceIndex: + <ul> + <li>searchUrlBase - the URL prefix indicating the opensearch API to + be used for queries. 
+ </li> + </ul> + </p> + <p> + The following configurations are optional for NutchResourceIndexes: + <ul> + <li>maxRecords - integer maximum number of records to request from + the external NutchWax opensearch API.</li> + </ul> + </p> + <p> + For a Spring configuration example of this ResourceIndex option, + please refer to the following files distributed within the wayback + .war file: + <ul> + <li><a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml">NutchCollection.xml</a></li> + </ul> + </p> + </subsection> + </section> + <section name="URL Canonicalization"> + <subsection name="Introduction and Concepts"> + <p> + Sometimes URLs found in the field can have multiple forms, for + example: + <pre> + http://www.example.com/img/foo.gif + http://www.example.com/docs/../img/foo.gif + </pre> + are both valid representations of the exact same URL. Another, less + certain example would be: + <pre> + http://www.example.com/Interview.html + http://www.example.com/interview.html + </pre> + which differ only in the capitalization of the letter "i". On some + operating systems, these two URLs legitimately specify two distinct + documents. On Windows platforms, they refer to the same document. If + the document on a web server is actually named "Interview.html", but + a web designer creates a web page that refers to this document using + the lowercase "interview.html", then the link will work, and they and + the web site visitors may never notice the difference. The same + situation on a different operating system would probably not work + (although some web server plugins and modules will also correct this + problem transparently) and the web designer would probably notice and + correct the problem. 
In practice, we have found that it is very rare + for the two URLs above with different capitalization to refer to + different documents, and they can be treated as equivalent in most + situations. + </p> + <p> + Another example, which occurs far more often in the real world, + involves web servers injecting a session ID inside paths to documents + hosted on that web server. These session IDs allow the web server to + track individual user's states. Here are some example URLs + demonstrating path session ID injection: + <pre> + http://www.example.com/(S(4hqa0555fwsecu455xqckv45))/page1.aspx + http://www.example.com/(S(4hqa0555fwsecu455xqckv45))/page2.aspx + http://www.example.com/(S(a63098d96360a63098d96360))/page3.aspx + </pre> + In these examples, the first two URLs are using one session ID, and + the third uses a different session ID. If <b>page3.aspx</b> refers to + <b>page1.aspx</b> using an anchor like this: + <pre> + <a href="page1.aspx">page1</a> + </pre> + and a user visiting <b>page3.aspx</b> clicks the link to page1, then + the wayback will receive a request for the URL: + <pre> + http://www.example.com/(S(a63098d96360a63098d96360))/page1.aspx + </pre> + If page1.aspx was captured using the different session ID, then the + wayback will be unable to locate this document in the index, even + though it was captured. + </p> + <p> + This session ID problem can be mitigated by <i>canonicalizing</i> the + URLs as they are placed in the index, so the index would contain the + following URLs, instead of the original form, which the crawler + captured: + <pre> + http://www.example.com/page1.aspx + http://www.example.com/page2.aspx + http://www.example.com/page3.aspx + </pre> + If the same canonicalization scheme is used to transform incoming + requests, before attempting to look up URLs in the index, then the + software is able to locate and return the documents correctly. 
+ </p> + </subsection> + <subsection name="Current Status within Wayback"> + <p> + Currently the Wayback includes only a single reference implementation + of a canonicalization scheme, which is currently called + <b>AggressiveUrlCanonicalizer</b>. This implementation provides the + following canonicalization: + <ul> + <li> + <b>www# removal</b> + http://www.example.com => example.com, + http://www13.example.com => example.com + </li> + <li> + <b>user info removal</b> + http://us...@ex... => example.com, + http://user:pas...@ex... => example.com, + </li> + <li> + <b>session ID removal</b> + http://www.example.com/(S(a63098d96360a63098d96360))/page1.aspx + => + example.com/page1.aspx + <br></br> + <i>(and other common session ID path injection schemes)</i> + </li> + <li> + <b>path and CGI argument lowercasing</b> + http://www.example.com/Interviews.cgi?Interview=Left + => + example.com/interviews.cgi?interview=left + </li> + <li> + <b>extra query argument delimiter removal</b> + http://www.example.com/Interviews.cgi?Interview=Left& + => + example.com/interviews.cgi?interview=left + </li> + <li> + <b>unneeded query specifier removal</b> + http://www.example.com/Interviews.cgi? + => + example.com/interviews.cgi + </li> + </ul> + These heuristics generally lead to correcting many common URL lookup + problems, but in some cases, these operations do the wrong thing, + typically by making content which is actually different appear to be + the same thing. + </p> + <p> + At the IA, we have recently switched to building CDX files using the + <b>-identity</b> option on the <b>arc-indexer</b> and + <b>warc-indexer</b> tools, and have added an additional step in our + CDX creation processes which uses the <b>url-client</b> tool before + sorting and merging CDX files. By keeping the original "identity" CDX + files, we have been able to test various URL canonicalization + strategies without the overhead of re-processing all the source + materials. 
+ </p> + </subsection> + <subsection name="Future Directions within Wayback"> + <p> + In upcoming wayback releases, we intend to provide more + canonicalization implementations, including a configurable + implementation that will allow broad customization capabilities. + </p> + <p> + We also intend to alter the format of wayback indexes significantly. + Using this new format will be optional, but once indexes are created + in the new format, other indexes with different + canonicalization strategies can be built from them without requiring + a complete reindex of the original ARC/WARC content. + </p> + <p> + The new format will also allow a degree of dynamic canonicalization + at run-time, meaning different strategies can be tested using the + same indexes, and site-specific canonicalization strategies may be + possible. + </p> + <p> + We anticipate that allowing (advanced) users to easily change between + canonicalization strategies within the same wayback session will + promote better community understanding of the impacts of different + strategies, and will enable the community to build a set of best + practices for URL canonicalization. + </p> + </subsection> + </section> + <section name="Duplicate Reduction"> + <p> + Heritrix 1.12 and above have the capability to write WARC files, which + omit storing documents that have not changed since a previous visit. For + specifics on activating these features, please refer to the Heritrix + documentation. When Heritrix is using these features, and notices that + a document has not changed since the last time it was visited, it + creates an abbreviated WARC record, indicating that the document was + retrieved but not stored. In this abbreviated WARC record is an + indicator of the SHA1 digest of the document. + </p> + <p> + The wayback uses these identical SHA1 digests to map the location + (ARC/WARC + offset) of the original record that was stored to subsequent + records that were not. 
When a request for a subsequent capture that was + not stored is received by wayback, it will return the content of the + previous stored record. + </p> + <p> + The matching of these digests occurs at query time, and is configured + by setting the "dedupeRecords" option of the LocalResourceIndex to + "true". + </p> + </section> + </body> +</document> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-14 03:24:41
|
Revision: 2550 http://archive-access.svn.sourceforge.net/archive-access/?rev=2550&view=rev Author: bradtofel Date: 2008-08-14 03:24:49 +0000 (Thu, 14 Aug 2008) Log Message: ----------- TWEAK: tested & finalized for 1.4 release. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2008-08-14 02:00:33 UTC (rev 2549) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2008-08-14 03:24:49 UTC (rev 2550) @@ -37,8 +37,8 @@ <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> <value>/WEB-INF/replay/Disclaimer.jsp</value> <!-- + <value>/WEB-INF/replay/Timeline.jsp</value> <value>/WEB-INF/replay/DebugBanner.jsp</value> - <value>/WEB-INF/replay/Timeline.jsp</value> --> </list> </property> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-14 02:00:33 UTC (rev 2549) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-14 03:24:49 UTC (rev 2550) @@ -58,9 +58,9 @@ The XML files indicated in the following import tags contain alternate example implementations of WaybackCollections. 
--> + <import resource="BDBCollection.xml"/> +<!-- <import resource="NutchCollection.xml"/> -<!-- - <import resource="BDBCollection.xml"/> <import resource="CDXCollection.xml"/> <import resource="RemoteCollection.xml"/> --> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-14 02:00:23
|
Revision: 2549 http://archive-access.svn.sourceforge.net/archive-access/?rev=2549&view=rev Author: bradtofel Date: 2008-08-14 02:00:33 +0000 (Thu, 14 Aug 2008) Log Message: ----------- BUGFIX (unreported) was not explicitly setting UTF-8 encoding.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/advanced_search.jsp trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/help.jsp trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/advanced_search.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/advanced_search.jsp 2008-08-14 01:59:48 UTC (rev 2548) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/advanced_search.jsp 2008-08-14 02:00:33 UTC (rev 2549) @@ -1,3 +1,4 @@ +<%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8"%> <%@ page import="org.archive.wayback.core.UIResults" %> <%@ page import="org.archive.wayback.util.StringFormatter" %> <jsp:include page="/WEB-INF/template/UI-header.jsp" flush="true" /> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/help.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/help.jsp 2008-08-14 01:59:48 UTC (rev 2548) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/help.jsp 2008-08-14 02:00:33 UTC (rev 2549) @@ -1,3 +1,4 @@ +<%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8"%> <%@ page import="org.archive.wayback.core.UIResults" %> <%@ page import="org.archive.wayback.util.StringFormatter" %> <jsp:include page="/WEB-INF/template/UI-header.jsp" flush="true" /> Modified: 
trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp 2008-08-14 01:59:48 UTC (rev 2548) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp 2008-08-14 02:00:33 UTC (rev 2549) @@ -1,3 +1,4 @@ +<%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8"%> <%@ page import="java.util.ArrayList" %> <%@ page import="org.archive.wayback.core.UIResults" %> <%@ page import="org.archive.wayback.util.StringFormatter" %> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-14 01:59:38
|
Revision: 2548 http://archive-access.svn.sourceforge.net/archive-access/?rev=2548&view=rev Author: bradtofel Date: 2008-08-14 01:59:48 +0000 (Thu, 14 Aug 2008) Log Message: ----------- FEATURE: Now include an AnchorWindow select, and update the anchordate cookie when entering replay mode. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/CalendarResults.jsp trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLCaptureResults.jsp trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLUrlResults.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/CalendarResults.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/CalendarResults.jsp 2008-08-14 01:58:44 UTC (rev 2547) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/CalendarResults.jsp 2008-08-14 01:59:48 UTC (rev 2548) @@ -38,13 +38,8 @@ <%= fmt.format("PathQueryClassic.searchedFor",searchString) %> </td> <td align="right"> - <select onchange="SetAnchorWindow(this.value)"> - <option value="86400">1 day</option> - <option value="604800">1 week</option> - <option value="2592000">1 month</option> - <option value="31536000">1 year</option> - <option value="315360000">10 years</option> - </select> + Set Anchor Window: + <jsp:include page="/WEB-INF/template/AnchorWindow.jsp" flush="true" /> <%= fmt.format("PathQueryClassic.resultsSummary",resultCount) %> </td> </tr> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLCaptureResults.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLCaptureResults.jsp 2008-08-14 01:58:44 UTC (rev 2547) +++ 
trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLCaptureResults.jsp 2008-08-14 01:59:48 UTC (rev 2548) @@ -27,6 +27,8 @@ <%= fmt.format("PathQuery.resultsSummary",resultCount,searchString) %> <br></br> <%= fmt.format("PathQuery.resultRange",searchStartDate,searchEndDate) %> + Set Anchor Window: + <jsp:include page="/WEB-INF/template/AnchorWindow.jsp" flush="true" /> <hr></hr> <% boolean first = false; @@ -59,7 +61,7 @@ } if(updated) { %> - <a href="<%= replayUrl %>"><%= prettyDate %></a> + <a onclick="SetAnchorDate('<%= result.getCaptureTimestamp() %>');" href="<%= replayUrl %>"><%= prettyDate %></a> <span style="color:black;"><%= origHost %></span> <span style="color:gray;"><%= httpResponse %></span> <span style="color:brown;"><%= mimeType %></span> @@ -74,7 +76,7 @@ <% } else { %> - <a href="<%= replayUrl %>"><%= prettyDate %></a> + <a onclick="SetAnchorDate('<%= result.getCaptureTimestamp() %>');" href="<%= replayUrl %>"><%= prettyDate %></a> <span style="color:green;"><%= origHost %></span> <!-- <span style="color:red;"><%= arcFile %></span> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLUrlResults.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLUrlResults.jsp 2008-08-14 01:58:44 UTC (rev 2547) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/HTMLUrlResults.jsp 2008-08-14 01:59:48 UTC (rev 2548) @@ -56,7 +56,7 @@ String ts = result.getFirstCaptureTimestamp(); String anchor = uriConverter.makeReplayURI(ts,originalUrl); %> - <a href="<%= anchor %>"> + <a onclick="SetAnchorDate('<%= ts %>');" href="<%= anchor %>"> <%= urlKey %> </a> <span class="mainSearchText"> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-14 01:58:34
|
Revision: 2547 http://archive-access.svn.sourceforge.net/archive-access/?rev=2547&view=rev Author: bradtofel Date: 2008-08-14 01:58:44 +0000 (Thu, 14 Aug 2008) Log Message: ----------- FEATURE: now updates AnchorDate when new values are selected on the timeline. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp 2008-08-14 01:58:05 UTC (rev 2546) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/Timeline.jsp 2008-08-14 01:58:44 UTC (rev 2547) @@ -11,6 +11,7 @@ <%@ page import="org.archive.wayback.query.resultspartitioner.ResultsTimelinePartitionsFactory" %> <%@ page import="org.archive.wayback.query.resultspartitioner.ResultsPartition" %> <%@ page import="org.archive.wayback.util.StringFormatter" %> +<jsp:include page="/WEB-INF/template/CookieJS.jsp" flush="true" /> <% String contextRoot = request.getScheme() + "://" + request.getServerName() + ":" @@ -185,7 +186,7 @@ titleString = "title=\"" + fmt.format("TimelineView.firstVersionTitle", first.getCaptureDate()) + "\""; - %><a wmSpecial="1" href="<%= results.resultToReplayUrl(first) %>"><% + %><a wmSpecial="1" onclick="SetAnchorDate('<%= first.getCaptureTimestamp() %>');" href="<%= results.resultToReplayUrl(first) %>"><% } %><img <%= titleString %> wmSpecial="1" border=0 width=19 height=20 src="<%= contextRoot %>/images/first.jpg"><% if(first != null) { @@ -196,7 +197,7 @@ titleString = "title=\"" + fmt.format("TimelineView.prevVersionTitle", prev.getCaptureDate()) + "\""; - %><a wmSpecial="1" href="<%= results.resultToReplayUrl(prev) %>"><% + %><a wmSpecial="1" onclick="SetAnchorDate('<%= prev.getCaptureTimestamp() %>');" 
href="<%= results.resultToReplayUrl(prev) %>"><% } %><img <%= titleString %> wmSpecial="1" border=0 width=13 height=20 src="<%= contextRoot %>/images/prev.jpg"><% if(first != null) { @@ -212,17 +213,20 @@ String imageUrl = contextRoot + "/images/line.jpg"; String replayUrl = null; String prettyDateTime = null; + String ts = null; if(numResults == 1) { imageUrl = contextRoot + "/images/mark_one.jpg"; CaptureSearchResult result = (CaptureSearchResult) partitionResults.get(0); replayUrl = results.resultToReplayUrl(result); prettyDateTime = fmt.format("TimelineView.markDateTitle",result.getCaptureDate()); + ts = result.getCaptureTimestamp(); } else if (numResults > 1) { imageUrl = contextRoot + "/images/mark_several.jpg"; CaptureSearchResult result = (CaptureSearchResult) partitionResults.get(numResults - 1); replayUrl = results.resultToReplayUrl(result); prettyDateTime = fmt.format("TimelineView.markDateTitle",result.getCaptureDate()); + ts = result.getCaptureTimestamp(); } if((i > 0) && (i < numPartitions)) { @@ -237,7 +241,7 @@ } else { -%><a wmSpecial="1" href="<%= replayUrl %>"><img wmSpecial="1" border=0 width=7 height=16 title="<%= prettyDateTime %>" src="<%= imageUrl %>"></a><% +%><a wmSpecial="1" onclick="SetAnchorDate('<%= ts %>');" href="<%= replayUrl %>"><img wmSpecial="1" border=0 width=7 height=16 title="<%= prettyDateTime %>" src="<%= imageUrl %>"></a><% } } @@ -249,10 +253,10 @@ titleString = "title=\"" + fmt.format("TimelineView.nextVersionTitle", next.getCaptureDate()) + "\""; - %><a wmSpecial="1" href="<%= results.resultToReplayUrl(next) %>"><% + %><a wmSpecial="1" onclick="SetAnchorDate('<%= next.getCaptureTimestamp() %>');" href="<%= results.resultToReplayUrl(next) %>"><% } %><img wmSpecial="1" <%= titleString %> border=0 width=13 height=20 src="<%= contextRoot %>/images/next.jpg"><% - if(first != null) { + if(next != null) { %></a><% } titleString = ""; @@ -260,10 +264,10 @@ titleString = "title=\"" + fmt.format("TimelineView.lastVersionTitle", 
last.getCaptureDate()) + "\""; - %><a wmSpecial="1" href="<%= results.resultToReplayUrl(last) %>"><% + %><a wmSpecial="1" onclick="SetAnchorDate('<%= last.getCaptureTimestamp() %>');" href="<%= results.resultToReplayUrl(last) %>"><% } %><img wmSpecial="1" <%= titleString %> border=0 width=19 height=20 src="<%= contextRoot %>/images/last.jpg"><% - if(first != null) { + if(last != null) { %></a><% } %></td> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-14 01:57:55
|
Revision: 2546 http://archive-access.svn.sourceforge.net/archive-access/?rev=2546&view=rev Author: bradtofel Date: 2008-08-14 01:58:05 +0000 (Thu, 14 Aug 2008) Log Message: ----------- INITIAL REV: template .jsp usable in Query, Exception, Replay insert context to draw an HTML select with date options for AnchorWindow. Automatically selects the correct option based on cookie information, and uses Javascript to update the cookie value when the select value is changed. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/template/AnchorWindow.jsp Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/template/AnchorWindow.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/template/AnchorWindow.jsp (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/template/AnchorWindow.jsp 2008-08-14 01:58:05 UTC (rev 2546) @@ -0,0 +1,20 @@ +<%@ + page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8" +%><%@ + page import="org.archive.wayback.core.UIResults" +%><%@ + page import="org.archive.wayback.core.WaybackRequest" +%><%@ page import="org.archive.wayback.util.html.SelectHTML" +%><% +SelectHTML window = new SelectHTML("foo"); +window.setProps("onchange=\"SetAnchorWindow(this.value); location.reload(true);\""); +window.addOption("none","0"); +window.addOption("1 day","86400"); +window.addOption("1 week","604800"); +window.addOption("1 month","2592000"); +window.addOption("1 year","31536000"); +window.addOption("10 years","315360000"); +UIResults results = UIResults.getGeneric(request); +WaybackRequest wbr = results.getWbRequest(); +window.setActive(wbr.get(WaybackRequest.REQUEST_ANCHOR_WINDOW)); +%><jsp:include page="/WEB-INF/template/CookieJS.jsp" flush="true" /><%= window.draw() %> \ No newline at end of file This was sent by the 
SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-14 00:51:46
|
Revision: 2545 http://archive-access.svn.sourceforge.net/archive-access/?rev=2545&view=rev Author: bradtofel Date: 2008-08-14 00:51:55 +0000 (Thu, 14 Aug 2008) Log Message: ----------- INITIAL REV: Czech UI .properties file. Was lost in my mail box for over a year, but better late than never. Thanks Lukáš Matějka! Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI_cs.properties Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI_cs.properties =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI_cs.properties (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI_cs.properties 2008-08-14 00:51:55 UTC (rev 2545) @@ -0,0 +1,104 @@ +Exception.wayback.title=Wayback Exception +Exception.wayback.message=Nastala nezn\u00e1m\u00e1 v\u00fdjimka. {0} +Exception.accessControl.title=Access Control Exception +Exception.accessControl.message=P\u0159\u00edstup do tohoto obsahu byl zablokov\u00e1n. {0} +Exception.badQuery.title=Bad Query Exception +Exception.badQuery.message=Vyhled\u00e1vac\u00ed dotaz nebyl \u00fapln\u00fd, nebo mu server nerozum\u00ed. +Exception.betterRequest.title=\u0160patn\u011b zadan\u00fd dotaz +Exception.betterRequest.message=Zadan\u00fd po\u017eadavek m\u016f\u017ee b\u00fdt l\u00e9pe vyj\u00e1d\u0159en jin\u00fdm po\u017eadavkem. {0} +Exception.configuration.title=Configuration Exception +Exception.configuration.message=Tato slu\u017eba nebyla spr\u00e1vn\u011b nakonfigurov\u00e1na. {0} +Exception.resourceIndexNotAvailable.title=Resource Index Not Available Exception +Exception.resourceIndexNotAvailable.message=Po\u017eadovan\u00fd Resource Index po\u017eadavku je do\u010dasn\u011b nedostupn\u00fd. Pros\u00edme, zkusto to pozd\u011bji. 
+Exception.resourceNotAvailable.title=Adresa nen\u00ed dostupn\u00e1 +Exception.resourceNotAvailable.message=Adresa, kterou po\u017eadujete, je do\u010dasn\u011b nedostupn\u00e1. Pros\u00edm, zkuste to pozd\u011bji. +Exception.resourceNotInArchive.title=Adresa nen\u00ed v archivu +Exception.resourceNotInArchive.message=Adresa, kterou hled\u00e1te, se v archivu nenach\u00e1z\u00ed. + +UIGlobal.pageTitle=Webarchiv +UIGlobal.helpLink=N\u00e1pov\u011bda +UIGlobal.enterWebAddress=Zadejte webovou adresu: +UIGlobal.selectYearAll=V\u0161echny +UIGlobal.urlSearchButton=Vyhledat +UIGlobal.advancedSearchLink=Pokro\u010dil\u00e9 vyhled\u00e1v\u00e1n\u00ed +UIGlobal.homeLink=Dom\u016f +UIGlobal.indexPage=Toto je nov\u00fd prototyp Wayback Machine. Umo\u017e\u0148uje v archivu naj\u00edt jak\u00e9koli URL, kter\u00e9 je v n\u011bm obsa\u017eeno. +UIGlobal.helpPage=Pros\u00edm, p\u0159e\u010dt\u011bte si \u010dasto pokl\u00e1dan\u00e9 dotazy (<a href="{0}">FAQ</a>) syst\u00e9mu Wayback. + +MetaReplay.title=Metadata dokumentu +MetaReplay.HTTPHeaders=Hlavi\u010dky HTTP +MetaReplay.originalURL=P\u016fvodn\u00ed URL +MetaReplay.URLKey=URL Key +MetaReplay.captureDate=Datum z\u00edsk\u00e1n\u00ed +MetaReplay.captureDateDisplay={0,date,H:mm:ss d. M. yyyy} +MetaReplay.archiveID=Archiva\u010dn\u00ed ID +MetaReplay.MIMEType=Mime typ +MetaReplay.digest=P\u0159ehled + +TimelineView.viewingVersion=Prohl\u00ed\u017een\u00e1 verze {0,number,integer} z {1,number,integer} +TimelineView.viewingVersionDate={0,date,H:mm:ss d. M. yyyy} +TimelineView.timeRange=\u010casov\u00e9 rozmez\u00ed +TimelineView.timeRange.years=Roky +TimelineView.timeRange.months=M\u011bs\u00edce +TimelineView.timeRange.days=Dny +TimelineView.timeRange.hours=Hodiny +TimelineView.timeRange.unknown=Nezn\u00e1m\u00fd +TimelineView.timeRange.auto=Auto({0}) +TimelineView.metaDataCheck=Metadata: +TimelineView.markDateTitle={0,date,H:mm:ss d. M. yyyy} +TimelineView.firstVersionTitle=Prvn\u00ed verze ({0,date,H:mm:ss d. M. 
yyyy}) +TimelineView.prevVersionTitle=P\u0159edchoz\u00ed verze ({0,date,H:mm:ss d. M. yyyy}) +TimelineView.nextVersionTitle=N\u00e1sleduj\u00edc\u00ed verze ({0,date,H:mm:ss d. M. yyyy}) +TimelineView.lastVersionTitle=Posledn\u00ed verze ({0,date,H:mm:ss d. M. yyyy}) +TimelineView.frameSetTitle=WB-Timeline +TimelineView.frameSetNoFramesMessage=K prohl\u00ed\u017een\u00ed tohoto obsahu pot\u0159ebujete prohl\u00ed\u017ee\u010d podporuj\u00edc\u00ed r\u00e1mce, well not exactly _this_, but what would have been here if you had a frames-capable browser. + + +ReplayView.banner=Wayback - extern\u00ed odkazy, formul\u00e1\u0159e a vyhled\u00e1vac\u00ed pol\u00ed\u010dka nemus\u00ed fungovat s touto kolekc\u00ed. Url: {0} \u010das: {1,date,H:mm:ss d. M. yyyy} +ReplayView.bannerHideLink=[skr\u00fdt] + +PathQuery.resultsSummary=Po\u010det v\u00fdsledk\u016f pro {1}: {0,number,integer} +PathQuery.resultRange=mezi {0,date, d. M. yyyy} a {1,date, d. M. yyyy} +PathQuery.newVersionIndicator=(nov\u00e1 verze) +PathQuery.redirectIndicator=(p\u0159esm\u011brovat) +PathQuery.classicResultLinkText={0,date,d. M. yyyy} + +PathPrefixQuery.showingResults=Prohl\u00ed\u017een\u00ed {0,number,integer} - {1,number,integer} z {2,number,integer} v\u00fdsledk\u016f pro {3} +PathPrefixQuery.unchangedIndicator=nezm\u011bn\u011bno + +PathQueryClassic.searchedFor=Vyhled\u00e1n\u00ed pro <a href="{0}"><b>{0}</b></a> +PathQueryClassic.searchResults=Vyhledan\u00e9 v\u00fdsledky pro {0,date,d. M. yyyy} - {1,date,d. M. yyyy} +PathQueryClassic.resultsSummary={0,choice,0#0 Results|1#1 Result|1<{0,number,integer} v\u00fdsledk\u016f} + +ResultPartition.columnSummary={0,choice,0#0 pages|1#1 page|1<{0,number,integer} str\u00e1nek} +ResultPartitions.day={0,date,d. M.} +ResultPartitions.hour={0,date,h a} +ResultPartitions.month={0,date,M yyyy} +ResultPartitions.twoMonth={0,date,M yyyy} - {1,date,M yyyy} +ResultPartitions.week={0,date,d. M.} - {1,date,d. 
M.} +ResultPartitions.year={0,date,yyyy} + +ReplayView.javaScriptComment=\ +// Dokument archivov\u00e1n {0,date,H:mm:ss d. M. yyyy} a z\u00edsk\u00e1n z \n\ +// WebArchiv.cz {1,date,H:mm:ss MMM d, yyyy}.\n\ +// JavaScript p\u0159id\u00e1n pomoc\u00ed Wayback Machine, copyright WebArchiv.cz.\n\ +//\n\ +// ALL OTHER CONTENT MAY ALSO BE PROTECTED BY COPYRIGHT +\n + +AdvancedSearch.url=Webov\u00e1 adresa: +AdvancedSearch.exactDate=P\u0159esn\u00e9 datum: +AdvancedSearch.earliestDate=Nejd\u0159\u00edv\u011bj\u0161\u00ed datum: +AdvancedSearch.latestDate=Nejpozd\u011bj\u0161\u00ed datum: +AdvancedSearch.submitButton=Vyhledat + +header.h1.title=WebArchiv - archiv \u010desk\u00e9ho webu +header.h1.nkp=N\u00e1rodn\u00ed knihovna +header.rss.title=Webarchiv RSS +header.logo.back=zp\u011bt +header.logo.title=Webarchiv.cz + +footer.up=Nahoru +footer.contact=kontakt +footer.mailto=ma...@we... + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-13 01:39:30
|
Revision: 2544 http://archive-access.svn.sourceforge.net/archive-access/?rev=2544&view=rev Author: bradtofel Date: 2008-08-13 01:39:39 +0000 (Wed, 13 Aug 2008) Log Message: ----------- REMOVE: all templates are now present in other spring config examples. Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback-templates.xml Deleted: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback-templates.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback-templates.xml 2008-08-13 01:37:38 UTC (rev 2543) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback-templates.xml 2008-08-13 01:39:39 UTC (rev 2544) @@ -1,155 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd"> -<beans> - -<!-- SearchResultSource templates --> - <bean id="bdbsearchresultsource" - class="org.archive.wayback.resourceindex.bdb.BDBIndex" - init-method="init"> - <property name="bdbName" value="DB1" /> - <property name="bdbPath" value="/tmp/wayback/index/" /> - </bean> - - <bean id="cdxsearchresultsource" class="org.archive.wayback.resourceindex.cdx.CDXIndex"> - <property name="path" value="/tmp/wayback/cdx-index/index.cdx" /> - </bean> - - <bean id="compositecdxresultsource" class="org.archive.wayback.resourceindex.CompositeSearchResultSource"> - <property name="CDXSources"> - <list> - <value>/tmp/wayback/cdx-index/index.cdx.1</value> - <value>/tmp/wayback/cdx-index/index.cdx.2</value> - </list> - </property> - </bean> - - -<!-- - <property name="annotater"> - <bean class="org.archive.wayback.resourceindex.filters.OracleAnnotationFilter"> - <property name="oracleUrl" value="http://localhost:8180/oracle/" /> - <property name="who" value="annotation" /> - </bean> - </property> ---> 
- - -<!-- ResourceIndex templates --> - - <bean id="remoteindex" class="org.archive.wayback.resourceindex.RemoteResourceIndex" init-method="init"> - <property name="searchUrlBase" value="http://webdata010.us.archive.org:8080/wayback/xmlquery" /> - </bean> - - <bean id="localbdbindex" class="org.archive.wayback.resourceindex.LocalResourceIndex"> - <property name="source" ref="bdbsearchresultsource" /> - <property name="maxRecords" value="10000" /> - </bean> - <bean id="localcdxindex" class="org.archive.wayback.resourceindex.LocalResourceIndex"> - <property name="source" ref="cdxsearchresultsource" /> - <property name="maxRecords" value="10000" /> - </bean> - - <bean id="remotenutchindex" class="org.archive.wayback.resourceindex.NutchResourceIndex" init-method="init"> - <property name="searchUrlBase" value="http://webteam-ws.us.archive.org:8080/katrina/opensearch" /> - <property name="maxRecords" value="100" /> - </bean> - -<!-- ResourceStore templates --> - - <bean id="localstore" class="org.archive.wayback.resourcestore.LocalesourceStore"> - <property name="dataDir" value="/tmp/wayback/arcs/" /> - </bean> - - <bean id="remotestore" class="org.archive.wayback.resourcestore.Http11ResourceStore"> - <property name="urlPrefix" value="http://localhost:8080/arcproxy/" /> - </bean> - -<!-- WaybackCollection templates --> - - <bean id="localcollection" class="org.archive.wayback.webapp.WaybackCollection"> - <property name="index" ref="localbdbindex" /> - <property name="store" ref="localstore" /> - </bean> - -<!-- QueryUI templates --> - <bean id="standardquery" class="org.archive.wayback.query.Renderer"> - <property name="captureJsp" value="/jsp/HTMLResults.jsp" /> - </bean> - <bean id="calendarquery" class="org.archive.wayback.query.Renderer"> - <property name="captureJsp" value="/jsp/CalendarResults.jsp" /> - </bean> - -<!-- ArchivalURL ReplayUI templates --> - <bean id="archivalurlreplay" class="org.archive.wayback.archivalurl.ArchivalUrlReplayDispatcher"> - <property 
name="jsInserts"> - <list> - <value>http://localhost:8080/wm.js</value> - </list> - </property> - </bean> - <bean id="archivalurluriconverter" - class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost:8080/wayback/" /> - </bean> - <bean id="archivalurlparser" class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" init-method="init"> - <property name="maxRecords" value="1000" /> - </bean> - -<!-- Proxy ReplayUI templates --> - <bean id="proxyreplay" class="org.archive.wayback.proxy.ProxyReplayDispatcher"> - </bean> - <bean id="proxyuriconverter" class="org.archive.wayback.proxy.RedirectResultURIConverter"> - <property name="redirectURI" value="http://localhost:8090/jsp/Redirect.jsp" /> - </bean> - <bean id="proxyparser" class="org.archive.wayback.proxy.ProxyRequestParser" init-method="init"> - <property name="localhostNames"> - <list> - <value>foo.archive.org</value> - </list> - </property> - <property name="maxRecords" value="1000" /> - </bean> - -<!--IP-base authentication template --> - - <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> - <property name="allowedRanges"> - <list> - <value>192.168.1.16/24</value> - </list> - </property> - </bean> - - -<!-- AccessPoint templates --> - - <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> - <property name="collection" ref="localcollection" /> - <property name="query" ref="standardquery" /> - <property name="replay" ref="archivalurlreplay" /> - <property name="parser" ref="archivalurlparser" /> - <property name="uriConverter" ref="archivalurluriconverter" /> - </bean> - - <bean name="8081" parent="8080:wayback"> - <property name="useServerName" value="true" /> - <property name="replay"> - <bean class="org.archive.wayback.domainprefix.DomainPrefixReplayDispatcher" /> - </property> - - <property name="parser"> - <bean 
class="org.archive.wayback.domainprefix.DomainPrefixCompositeRequestParser" - init-method="init"> - <property name="hostPort" value="localhost.archive.org:8081" /> - <property name="maxRecords" value="1000" /> - <property name="earliestTimestamp" value="1996" /> - </bean> - </property> - - <property name="uriConverter"> - <bean class="org.archive.wayback.domainprefix.DomainPrefixResultURIConverter"> - <property name="hostPort" value="localhost.archive.org:8081" /> - </bean> - </property> - </bean> -</beans> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-13 01:37:30
|
Revision: 2543 http://archive-access.svn.sourceforge.net/archive-access/?rev=2543&view=rev Author: bradtofel Date: 2008-08-13 01:37:38 +0000 (Wed, 13 Aug 2008) Log Message: ----------- TWEAK: tested & finalized for 1.4 release. NutchWax 0.12.1 does not currently work with Wayback as WERA did for replay: several problems, primarily that it does not index images, css, and likely several other formats. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,129 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + + This file contains the default WaybackCollection implementation shipped + with Wayback. 
It consists of a LocalResourceIndex using a BDBIndex, and + uses several Worker Threads to implement automatic indexing. + + Several beans defined in this file reference the "resourcefilelocationdb" + bean defined in wayback.xml. + + To customize where the automatic indexing system looks for ARC/WARC files, + see "resourcefilesourceupdater.sources" within this file. + + To customize the location where the automatic indexing state data is stored + you can modify "wayback.basedir" in wayback.xml, or replace the properties + in this file for further flexibility. + + For more information about the auto-indexing system, please see: + + http://archive-access.sourceforge.net/projects/wayback/resource_store.html + +--> + +<!-- + A LocalResourceIndex bean using a BDBIndex SearchResultSource. +--> + <bean id="localbdbresourceindex" class="org.archive.wayback.resourceindex.LocalResourceIndex"> + <property name="source"> + <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex"> + <property name="bdbName" value="DB1" /> + <property name="bdbPath" value="${wayback.basedir}/index/" /> + </bean> + </property> + <property name="maxRecords" value="10000" /> + </bean> + +<!-- + An IndexQueue implementation required for automatic indexing. 
+--> + <bean id="indexqueue" class="org.archive.wayback.resourcestore.indexer.DirectoryIndexQueue"> + <property name="path" value="${wayback.basedir}/index-data/queue" /> + </bean> + + <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean id="localresourcestore" class="org.archive.wayback.resourcestore.LocationDBResourceStore"> + <property name="db" ref="resourcefilelocationdb" /> + </bean> + </property> + + <property name="resourceIndex" ref="localbdbresourceindex"/> + + <property name="shutdownables"> + <list> + <!-- This thread notices new files appearing in your resourcefilesources --> + <bean id="resourcefilesourceupdater" class="org.archive.wayback.resourcestore.resourcefile.ResourceFileSourceUpdater"> + <property name="target" value="${wayback.basedir}/file-db/incoming" /> + <property name="interval" value="100000" /> + <property name="sources"> + <list> + <!-- + This example looks for ARC/WARC files recursively under 2 + directories: /tmp/wayback/files1 and /tmp/wayback/files2 + You can specify as few or as many ResourceFileSource instances + as needed, but each must have a unique 'name' property. 
+ --> + <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> + <property name="name" value="files1" /> + <property name="prefix" value="/tmp/wayback/files1/" /> + </bean> + <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> + <property name="name" value="files2" /> + <property name="prefix" value="/tmp/wayback/files2/" /> + </bean> + </list> + </property> + </bean> + + <!-- This thread updates the location db with updates from resourcefilesourceupdater --> + <bean id="resourcefilelocationdbupdater" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBUpdater"> + <property name="interval" value="100000" /> + <property name="db" ref="resourcefilelocationdb" /> + <property name="incomingDir" value="${wayback.basedir}/file-db/incoming" /> + <property name="stateDir" value="${wayback.basedir}/file-db/state" /> + </bean> + + <!-- This thread notices new files arriving in the filelocationdb, and queues them for indexing --> + <bean id="indexqueueupdater" class="org.archive.wayback.resourcestore.indexer.IndexQueueUpdater"> + <property name="db" ref="resourcefilelocationdb" /> + <property name="queue" ref="indexqueue" /> + <property name="interval" value="1000" /> + <property name="lastMark" value="${wayback.basedir}/index-data/queue.mark" /> + </bean> + + <!-- This thread checks the to-be-indexed queue for files needing indexing, indexes them, and hands off the results for merging with the ResourceIndex --> + <bean id="indexworker" class="org.archive.wayback.resourcestore.indexer.IndexWorker"> + <property name="db" ref="resourcefilelocationdb" /> + <property name="queue" ref="indexqueue" /> + <property name="interval" value="1000" /> + <property name="target"> + <bean class="org.archive.wayback.resourceindex.updater.IndexClient"> + <property name="tmpDir" value="${wayback.basedir}/index-data/tmp/" /> + <property name="target" value="${wayback.basedir}/index-data/incoming/" /> + 
</bean> + </property> + </bean> + + <!-- This thread merges updates from the indexworker into the ResourceIndex --> + <bean class="org.archive.wayback.resourceindex.updater.LocalResourceIndexUpdater"> + + <property name="index" ref="localbdbresourceindex" /> + <property name="incoming" value="${wayback.basedir}/index-data/incoming/" /> + <property name="failed" value="${wayback.basedir}/index-data/failed/" /> + <property name="merged" value="${wayback.basedir}/index-data/merged/" /> + <property name="runInterval" value="10000" /> + </bean> + </list> + </property> + </bean> + + +</beans> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,69 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + + This file contains a WaybackCollection implementation using a manually + created and administered CDX index file(s). It also uses the experimental + FlatFileResourceFileLocationDB, which enables mapping between ARC/WARC file + names and their absolute paths/URLs using a sorted text file + "path-index.txt". + + The format of the path-index.txt is + <NAME><TAB><PATH-OR-URL> + + Be sure to set the environment variable LC_ALL to "C" before sorting CDX + files and path-index.txt files. + + All paths in this file reference the Spring property placeholder + "wayback.basedir" defined in wayback.xml. 
That value may be changed to + alter top-level prefixes for these paths, or the values in this file can + be changed directly for further flexibility. + +--> + + <bean id="localcdxcollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.LocationDBResourceStore"> + <property name="db"> + <bean class="org.archive.wayback.resourcestore.locationdb.FlatFileResourceFileLocationDB"> + <property name="path" value="${wayback.basedir}/path-index.txt" /> + </bean> + </property> + </bean> + </property> + + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> + <property name="source"> + + <bean class="org.archive.wayback.resourceindex.cdx.CDXIndex"> + <property name="path" value="${wayback.basedir}/cdx-index/index.cdx" /> + </bean> + +<!-- + A CompositeSearchResultSource example, that allows searching through + multiple sorted CDX files. +--> +<!-- + <bean class="org.archive.wayback.resourceindex.CompositeSearchResultSource"> + <property name="CDXSources"> + <list> + <value>${wayback.basedir}/cdx-index/index-1.cdx</value> + <value>${wayback.basedir}/cdx-index/index-2.cdx</value> + </list> + </property> + </bean> +--> + </property> + <property name="maxRecords" value="10000" /> + </bean> + </property> + </bean> + +</beans> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ComplexAccessPoint.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,87 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+ xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + + This file contains an alternate "8080:wayback" AccessPoint demonstrating + several optional AccessPoint configurations. +--> + + <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> + <property name="collection" ref="localbdbcollection" /> + <property name="replay" ref="archivalurlreplay" /> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" /> + </bean> + </property> + + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/wayback/"/> + </bean> + </property> + + <property name="parser"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser"> + <property name="maxRecords" value="1000" /> + <property name="earliestTimestamp" value="1996" /> + </bean> + </property> + + <!-- + The following configuration enables free String key-value pairs to be + associated. These values can be used within customized .jsp UI files. + Here is some example .jsp code demonstrating access of the "Institution" + value: + UIResults results = UIResults.getGeneric(request); + String institution = results.getContextConfig("Institution"); + ... + + --> + <property name="configs"> + <props> + <prop key="Institution">Sample Institution</prop> + <prop key="Collection">Sample Collection</prop> + </props> + </property> + + <!-- + The following is a rather complex configuration example demonstrating + context specific AccessControl configuration. Specifically, it causes any + request NOT originating INSIDE the 192.168.1.16/24 IP space to use the + specified Access Control Oracle to determine which documents are + accessible. 
Requests originating INSIDE the IP space have no access + control restrictions. + --> + + <property name="authentication"> + <bean class="org.archive.wayback.authenticationcontrol.AccessControlSettingOperation"> + <property name="operator"> + <bean class="org.archive.wayback.util.operator.NotBooleanOperator"> + <property name="operand"> + <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> + <property name="allowedRanges"> + <list> + <value>192.168.1.16/24</value> + </list> + </property> + </bean> + </property> + </bean> + </property> + <property name="factory"> + <bean class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> + <property name="oracleUrl" value="http://localhost:8180/oracle/" /> + <property name="accessGroup" value="ia_archiver" /> + </bean> + </property> + </bean> + </property> + </bean> + +</beans> Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/NutchCollection.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> +<!-- + + The following WaybackCollection bean is an example using a NutchWAX + full-text index with Wayback, using a SimpleResourceStore to access + replayed documents. You will need to change searchUrlBase to your local + NutchWAX installation. + + Please note that Wayback is supported for use with NutchWax version 0.14.1 + or higher. 
+ + Please see RemoteCollection.xml for information on the meaning of the + SimpleResourceStore definition and options. + + You also need to ensure that the maxRecords on your RequestParser is not + greater than the maxRecords configured on the RemoteNutchResourceIndex. + +--> + + <bean id="remotenutchcollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.SimpleResourceStore"> + <property name="prefix" value="http://wayback.archive-it.org/fileproxy/" /> + </bean> + </property> + + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.NutchResourceIndex"> + <property name="searchUrlBase" value="http://ia400138.us.archive.org:8080/nutch-1.0-dev/opensearch" /> + <property name="maxRecords" value="100" /> + </bean> + </property> + </bean> + +</beans> \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/RemoteCollection.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<beans xmlns="http://www.springframework.org/schema/beans" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.springframework.org/schema/beans + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> + +<!-- + + This file contains a WaybackCollection implementation using a + RemoteResourceIndex and a SimpleResourceStore. + + The RemoteResourceIndex implementation assumes that a Wayback is running on + the machine indicated by "searchUrlBase", which provides an XML-HTTP search + interface to the ResourceIndex on that machine. 
+ + The SimpleResourceStore implementation assumes that all ARC/WARC files are + accessible under the path/URL named in "prefix". + + When a path is specified as the "prefix", it is assumed that there exists a + single local directory containing all ARC/WARC files. + + When a URL is specified as the "prefix", it is assumed that all ARC/WARC + files are HTTP 1.1 exported under the directory denoted. The + FileProxyServlet defined in wayback.xml may be useful in installations + where ARC/WARC files are distributed across many machines, and it is + desirable to route all ARC/WARC resource requests through a single machine. + +--> + + <bean id="remotecollection" class="org.archive.wayback.webapp.WaybackCollection"> + + <property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.SimpleResourceStore"> + <property name="prefix" value="http://wayback.archive-it.org/fileproxy/" /> + </bean> + </property> + + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex"> + <property name="searchUrlBase" value="http://wayback.archive-it.org/1055/xmlquery" /> + </bean> + </property> + </bean> + +</beans> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-13 01:07:20 UTC (rev 2542) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-13 01:37:38 UTC (rev 2543) @@ -53,16 +53,16 @@ </bean> --> - <import resource="BDBCollection.xml"/> <!-- The XML files indicated in the following import tags contain alternate example implementations of WaybackCollections. 
--> + <import resource="NutchCollection.xml"/> +<!-- + <import resource="BDBCollection.xml"/> <import resource="CDXCollection.xml"/> <import resource="RemoteCollection.xml"/> - <import resource="NutchCollection.xml"/> -<!-- --> @@ -79,7 +79,7 @@ --> <import resource="ArchivalUrlReplay.xml"/> <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> - <property name="collection" ref="remotecollection" /> + <property name="collection" ref="localbdbcollection" /> <property name="replay" ref="archivalurlreplay" /> <property name="query"> <bean class="org.archive.wayback.query.Renderer"> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2542 http://archive-access.svn.sourceforge.net/archive-access/?rev=2542&view=rev Author: bradtofel Date: 2008-08-13 01:07:20 +0000 (Wed, 13 Aug 2008) Log Message: ----------- FEATURE: nutchwax 0.12.1 now supports exacturl again... still not always working.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2008-08-13 00:06:02 UTC (rev 2541) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2008-08-13 01:07:20 UTC (rev 2542) @@ -286,11 +286,11 @@ ms.append("date%3A").append(startDateStr).append('-').append(endDateStr); ms.append('+'); // Add 'url:URL'. -// if(wbRequest.isUrlQueryRequest()) { + if(wbRequest.isUrlQueryRequest()) { ms.append("url%3A"); -// } else { -// ms.append("exacturl%3A"); -// } + } else { + ms.append("exacturl%3A"); + } try { ms.append(java.net.URLEncoder.encode("\""+urlStr+"\"", "UTF-8")); } catch (UnsupportedEncodingException e) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-13 00:05:56
|
Revision: 2541 http://archive-access.svn.sourceforge.net/archive-access/?rev=2541&view=rev Author: bradtofel Date: 2008-08-13 00:06:02 +0000 (Wed, 13 Aug 2008) Log Message: ----------- TWEAK: unset DebugBanner, added Disclaimer.jsp as default Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2008-08-12 23:03:30 UTC (rev 2540) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2008-08-13 00:06:02 UTC (rev 2541) @@ -6,7 +6,7 @@ <bean id="archivalurlhttpheaderprocessor" class="org.archive.wayback.replay.RedirectRewritingHttpHeaderProcessor" /> - <bean id="archivaldateredirectingreplayrenderer" class="org.archive.wayback.replay.DateRedirectReplayRenderer" /> + <bean id="archivaldateredirectingreplayrenderer" class="org.archive.wayback.replay.DateRedirectReplayRenderer" /> <bean id="archivalcssreplayrenderer" class="org.archive.wayback.archivalurl.ArchivalUrlCSSReplayRenderer"> <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> </bean> @@ -33,25 +33,25 @@ <constructor-arg><ref bean="archivalurlhttpheaderprocessor"/></constructor-arg> <property name="jspInserts"> <list> - <value>/WEB-INF/replay/ArchiveComment.jsp</value> - <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> + <value>/WEB-INF/replay/ArchiveComment.jsp</value> + <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> + <value>/WEB-INF/replay/Disclaimer.jsp</value> +<!-- 
<value>/WEB-INF/replay/DebugBanner.jsp</value> -<!-- - <value>/WEB-INF/replay/Disclaimer.jsp</value> <value>/WEB-INF/replay/Timeline.jsp</value> --> </list> </property> </bean> - + <bean id="archivalurlreplay" class="org.archive.wayback.replay.SelectorReplayDispatcher"> <property name="selectors"> <list> <!-- REDIRECT IF NOT EXACT DATE --> - <bean class="org.archive.wayback.replay.selector.DateMismatchSelector"> - <property name="renderer" ref="archivaldateredirectingreplayrenderer"/> - </bean> + <bean class="org.archive.wayback.replay.selector.DateMismatchSelector"> + <property name="renderer" ref="archivaldateredirectingreplayrenderer"/> + </bean> <!-- HTML REPLAY --> <bean class="org.archive.wayback.replay.selector.MimeTypeSelector"> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml 2008-08-12 23:03:30 UTC (rev 2540) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml 2008-08-13 00:06:02 UTC (rev 2541) @@ -16,6 +16,7 @@ <property name="jspInserts"> <list> <value>/WEB-INF/replay/ArchiveComment.jsp</value> + <value>/WEB-INF/replay/Disclaimer.jsp</value> <!-- <value>/WEB-INF/replay/DebugBanner.jsp</value> <value>/WEB-INF/replay/JSLessTimeline.jsp</value> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-12 23:03:30 UTC (rev 2540) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-13 00:06:02 UTC (rev 2541) @@ -2,319 +2,104 @@ <beans xmlns="http://www.springframework.org/schema/beans" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.springframework.org/schema/beans - http://www.springframework.org/schema/beans/spring-beans-2.5.xsd"> + http://www.springframework.org/schema/beans/spring-beans-2.5.xsd" + default-init-method="init"> <!-- - The following 3 beans are required when using the ArcProxy for providing - HTTP 1.1 remote access to ARC files distributed across multiple computers - or directories. + Macro-like substitutions for the overall file: + wayback.basedir: default top level directory for all index, state, + locationdb storage. --> - - <bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.BDBResourceFileLocationDB" - init-method="init"> - <property name="bdbPath" value="/tmp/wayback/file-db/db/" /> - <property name="bdbName" value="DB1" /> - <property name="logPath" value="/tmp/wayback/file-db/db.log" /> - </bean> - <bean name="8080:locationdb" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBServlet"> - <property name="locationDB" ref="resourcefilelocationdb" /> - </bean> - <bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet"> - <property name="locationDB" ref="resourcefilelocationdb" /> - </bean> - <bean id="localbdbresourceindex" class="org.archive.wayback.resourceindex.LocalResourceIndex"> - <property name="source"> - <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex" - init-method="init"> - <property name="bdbName" value="DB1" /> - <property name="bdbPath" value="/tmp/wayback/index/" /> - </bean> + <bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer"> + <property name="properties"> + <value> + wayback.basedir=/tmp/wayback + </value> </property> - <property name="maxRecords" value="10000" /> </bean> - <bean id="indexqueue" class="org.archive.wayback.resourcestore.indexer.DirectoryIndexQueue"> - <property name="path" value="/tmp/wayback/index-data/queue" /> 
- </bean> - <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection"> - <property name="resourceStore"> - <bean id="localresourcestore" class="org.archive.wayback.resourcestore.LocalResourceFileResourceStore"> - <property name="db" ref="resourcefilelocationdb" /> - </bean> - </property> - - <property name="resourceIndex" ref="localbdbresourceindex"/> - - <property name="shutdownables"> - <list> - <!-- This thread notices new files appearing in your resourcefilesources --> - <bean id="resourcefilesourceupdater" class="org.archive.wayback.resourcestore.resourcefile.ResourceFileSourceUpdater" - init-method="init"> - <property name="target" value="/tmp/wayback/file-db/incoming" /> - <property name="interval" value="100000" /> - <property name="sources"> - <list> - <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> - <property name="name" value="files1" /> - <property name="prefix" value="/tmp/wayback/files1/" /> - </bean> - <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> - <property name="name" value="files2" /> - <property name="prefix" value="/tmp/wayback/files2/" /> - </bean> - </list> - </property> - </bean> - - <!-- This thread updates the location db with updates from resourcefilesourceupdater --> - <bean id="resourcefilelocationdbupdater" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBUpdater" - init-method="init"> - <property name="interval" value="100000" /> - <property name="db" ref="resourcefilelocationdb" /> - <property name="incomingDir" value="/tmp/wayback/file-db/incoming" /> - <property name="stateDir" value="/tmp/wayback/file-db/state" /> - </bean> - - <!-- This thread notices new files arriving in the filelocationdb, and queues them for indexing --> - <bean id="indexqueueupdater" class="org.archive.wayback.resourcestore.indexer.IndexQueueUpdater" - init-method="init"> - <property name="db" ref="resourcefilelocationdb" 
/> - <property name="queue" ref="indexqueue" /> - <property name="interval" value="1000" /> - <property name="lastMark" value="/tmp/wayback/index-data/queue.mark" /> - </bean> - - <!-- This thread checks the to-be-indexed queue for files needing indexing, indexes them, and hands off the results for merging with the ResourceIndex --> - <bean id="indexworker" class="org.archive.wayback.resourcestore.indexer.IndexWorker" - init-method="init"> - <property name="db" ref="resourcefilelocationdb" /> - <property name="queue" ref="indexqueue" /> - <property name="interval" value="1000" /> - <property name="target"> - <bean class="org.archive.wayback.resourceindex.updater.IndexClient"> - <property name="tmpDir" value="/tmp/wayback/index-data/tmp/" /> - <property name="target" value="/tmp/wayback/index-data/incoming/" /> - </bean> - </property> - </bean> - - <!-- This thread merges updates from the indexworker into the ResourceIndex --> - <bean class="org.archive.wayback.resourceindex.updater.LocalResourceIndexUpdater" - init-method="init"> - - <property name="index" ref="localbdbresourceindex" /> - <property name="incoming" value="/tmp/wayback/index-data/incoming/" /> - <property name="failed" value="/tmp/wayback/index-data/failed/" /> - <property name="merged" value="/tmp/wayback/index-data/merged/" /> - <property name="runInterval" value="10000" /> - </bean> - </list> - </property> - </bean> - - - <!-- - The following WaybackCollection bean template is required when using a - manually built local CDX index. + The ResourceFileLocationDB implementation to use for mapping ARC/WARC names + to absolute paths/URLs via a BDBJE database. 
--> -<!-- - <bean id="localcdxcollection" class="org.archive.wayback.webapp.WaybackCollection"> - <property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.LocalResourceStore" - init-method="init"> - <property name="dataDir" value="/tmp/wayback/arcs/" /> - </bean> - </property> - - <property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> - <property name="source"> - <bean id="cdxsearchresultsource" class="org.archive.wayback.resourceindex.cdx.CDXIndex"> - <property name="path" value="/tmp/wayback/cdx-index/index.cdx" /> - </bean> - </property> - <property name="maxRecords" value="10000" /> - </bean> - </property> + <bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.BDBResourceFileLocationDB"> + <property name="bdbPath" value="${wayback.basedir}/file-db/db/" /> + <property name="bdbName" value="DB1" /> + <property name="logPath" value="${wayback.basedir}/file-db/db.log" /> </bean> ---> - <bean id="localcdxcollection2" class="org.archive.wayback.webapp.WaybackCollection"> - - <property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.LocalResourceFileResourceStore"> - <property name="db"> - <bean class="org.archive.wayback.resourcestore.locationdb.FlatFileResourceFileLocationDB"> - <property name="path" value="/tmp/wayback/path-index.txt" /> - </bean> - </property> - </bean> - </property> - - <property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> - <property name="source"> - <bean id="cdxsearchresultsource" class="org.archive.wayback.resourceindex.cdx.CDXIndex"> - <property name="path" value="/tmp/wayback/cdx-index/index.1" /> - </bean> - </property> - <property name="maxRecords" value="10000" /> - </bean> - </property> - </bean> - <!-- - The following WaybackCollection bean template is required when using a - remote ResourceIndex and ResourceStore implementation. 
This will also - require setting up an arcproxy and locationdb on the host specified by - the resourceStore:urlPrefix configuration, and an additional AccessPoint - on the host specified by the resourceIndex:searchUrlBase configuration. + To enable manual management of, or remote access to the above locationDB, + uncomment the following bean. --> - <!-- - <bean id="remotecollection" class="org.archive.wayback.webapp.WaybackCollection"> - - <property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.Http11ResourceStore"> - <property name="urlPrefix" value="http://wayback.archive-it.org/fileproxy/" /> - </bean> - </property> - - <property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex" - init-method="init"> - <property name="searchUrlBase" value="http://wayback.archive-it.org/1055/xmlquery" /> - </bean> - </property> + <bean name="8080:locationdb" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBServlet"> + <property name="locationDB" ref="resourcefilelocationdb" /> </bean> --> <!-- - The following WaybackCollection bean template is an example for using a NutchWAX - full-text index with Wayback, using a RemoteResourceStore to access - replayed documents. You will need to change searchUrlBase to your local NutchWAX - installation. You may also need to ensure that the maxRecords on your RequestParser is - not greater than the maxRecords configured on the RemoteNutchResourceIndex. + The FileProxyServlet uses a ResourceFileLocationDB to make all ARC/WARC + files appear to reside within a single HTTP 1.1 exported directory. + Required when using the SimpleResourceStore to access distributed ARC/WARC + files over HTTP through a single reverse proxy. 
--> <!-- - <bean id="remotenutchcollection" class="org.archive.wayback.webapp.WaybackCollection"> - - <property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.Http11ResourceStore"> - <property name="urlPrefix" value="http://webapp101.us.archive.org/arcproxy/" /> - </bean> - </property> - - <property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.NutchResourceIndex" init-method="init"> - <property name="searchUrlBase" value="http://webapp101.us.archive.org/e04/xmlquery" /> - <property name="maxRecords" value="100" /> - </bean> - </property> + <bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet"> + <property name="locationDB" ref="resourcefilelocationdb" /> </bean> --> -<!-- - The following bean is an example using the Access Control Oracle, thanks - Alex Osborne and NLA. Currently this is pretty undocumented, but here is a - place to get started: + <import resource="BDBCollection.xml"/> - http://webteam.archive.org/confluence/display/wayback/Exclusions+API +<!-- + The XML files indicated in the following import tags contain alternate + example implementations of WaybackCollections. --> + <import resource="CDXCollection.xml"/> + <import resource="RemoteCollection.xml"/> + <import resource="NutchCollection.xml"/> <!-- - <bean id="excluder-factory-oracle" class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> - <property name="oracleUrl" value="http://localhost:8180/oracle/" /> - <property name="accessGroup" value="ia_archiver" /> - </bean> --> + <!-- This is the only AccessPoint defined by default within this wayback.xml Spring configuration file, providing an ArchivalURL Replay UI to the - "localbdbcollection" by providing ArchivalURL-specific implementations - of the replay, parser, and uriConverter. 
- + "localbdbcollection", defined in "BDBCollection.xml" by providing + ArchivalURL-specific implementations of the replay, parser, and + uriConverter. + This AccessPoint currently will provide access only from the machine - running Tomcat. To provide external access, replace "localhost" with your - fully qualified hostname of the computer running Tomcat. + running Tomcat. To provide external access, replace "localhost.archive.org" + with your fully qualified hostname of the computer running Tomcat. --> <import resource="ArchivalUrlReplay.xml"/> <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> - <!-- - <property name="exclusionFactory" ref="excluder-factory-oracle" /> - --> - <property name="collection" ref="localbdbcollection" /> - <property name="configs"> - <props> - <prop key="inst">foo</prop> - <prop key="coll">supreme court</prop> - </props> - </property> - - <property name="uriConverter"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost:8080/wayback/"/> - </bean> - </property> - + <property name="collection" ref="remotecollection" /> + <property name="replay" ref="archivalurlreplay" /> <property name="query"> <bean class="org.archive.wayback.query.Renderer"> <property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" /> </bean> </property> - <property name="replay" ref="archivalurlreplay" /> - - <property name="parser"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" - init-method="init"> - <property name="maxRecords" value="1000" /> - <property name="earliestTimestamp" value="1996" /> + <property name="uriConverter"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/wayback/"/> </bean> </property> - </bean> - <bean name="8080:rwayback" parent="8080:wayback"> - <property name="collection" 
ref="localcdxcollection2" /> <property name="parser"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" - init-method="init"> - <property name="maxRecords" value="100" /> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser"> + <property name="maxRecords" value="1000" /> <property name="earliestTimestamp" value="1996" /> </bean> </property> - <property name="exception"> - <bean class="org.archive.wayback.exception.CustomNotInArchiveExceptionRenderer"> - <property name="hosts"> - <list> - <value>www.aladems.org</value> - </list> - </property> - <property name="jspHandler" value="/exception/GrayBlank.jsp"/> - </bean> - </property> - <property name="uriConverter"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost:8080/rwayback/" /> - </bean> - </property> - <!-- - <property name="collection"> - <bean class="org.archive.wayback.webapp.WaybackCollection"> - <property name="resourceStore" ref="fancyresourcestore" /> - <property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex" - init-method="init"> - <property name="searchUrlBase" value="http://localhost:8080/wayback/xmlquery" /> - </bean> - </property> - </bean> - </property> - --> + </bean> <!-- @@ -325,52 +110,65 @@ <bean name="8080:netsecure" parent="8080:wayback"> <property name="authentication"> - <bean class="org.archive.wayback.authenticationcontrol.AccessControlSettingOperation"> - <property name="operator"> - <bean class="org.archive.wayback.util.operator.NotBooleanOperator"> - <property name="operand"> - <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> - <property name="allowedRanges"> - <list> - <value>192.168.1.16/24</value> - </list> - </property> - </bean> - </property> - </bean> + <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> + <property name="allowedRanges"> + <list> 
+ <value>192.168.1.16/24</value> + </list> </property> - <property name="factory" ref="excluder-factory-robot"/> </bean> </property> <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://192.168.1.16:8080/netsecure/" /> + <property name="replayURIPrefix" value="http://localhost.archive.org:8080/netsecure/" /> </bean> </property> </bean> --> + <!-- The following AccessPoint inherits all configuration from the 8080:wayback - AccessPoint, but checks live web robots.txt documents to determine if - archived content should be accessible. - - Note: using this AccessPoint requires enabling the "livewebcache" and - "excluder-factory-robot" beans declared at the top of this file. + AccessPoint, but uses an Access Control Oracle to determine if archived + content should be accessible. + + The Access Control Oracle was developed by Alex Osborne of the NLA. + + Some documentation for this project is available at: + + http://webteam.archive.org/confluence/display/wayback/Exclusions+API --> + <!-- - <bean name="8080:robots" parent="8080:wayback"> - <property name="exclusionFactory" ref="excluder-factory-robot" /> + <bean name="8080:exclusion" parent="8080:wayback"> + <property name="exclusionFactory"> + <bean class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> + <property name="oracleUrl" value="http://localhost:8180/oracle/" /> + <property name="accessGroup" value="ia_archiver" /> + </bean> + </property> <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> - <property name="replayURIPrefix" value="http://localhost:8080/robots/" /> + <property name="replayURIPrefix" value="http://localhost:8080/exclusion/" /> </bean> </property> </bean> --> +<!-- + The following AccessPoint inherits all configuration from the 8080:wayback + AccessPoint, but provides a DomainPrefix Replay UI to the same 
collection. + These two access points can be used simultaneously on the same Tomcat + installation. + + Note: using this AccessPoint requires adding a "Connector" on port 8081 + in your Tomcat's server.xml file. + + Note: the hostname suffix localhost.archive.org has a special DNS wildcard + entry, so all hostnames suffixed with this value resolve to 127.0.0.1 + --> <import resource="DomainPrefixReplay.xml"/> <bean name="8081" parent="8080:wayback"> - <property name="useServerName" value="true" /> + <property name="urlRoot" value="http://localhost.archive.org:8081/" /> <property name="replay" ref="domainprefixreplay" /> <property name="uriConverter"> <bean class="org.archive.wayback.domainprefix.DomainPrefixResultURIConverter"> @@ -378,7 +176,7 @@ </bean> </property> <property name="parser"> - <bean class="org.archive.wayback.domainprefix.DomainPrefixCompositeRequestParser" init-method="init"> + <bean class="org.archive.wayback.domainprefix.DomainPrefixCompositeRequestParser"> <property name="hostPort" value="localhost.archive.org:8081" /> <property name="maxRecords" value="1000" /> </bean> @@ -397,18 +195,18 @@ --> <import resource="ProxyReplay.xml"/> <bean name="8090" parent="8080:wayback"> - <property name="useServerName" value="true" /> + <property name="urlRoot" value="http://localhost.archive.org/" /> <property name="replay" ref="proxyreplay" /> <property name="uriConverter"> <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> - <property name="redirectURI" value="http://brad.archive.org/jsp/Redirect.jsp" /> + <property name="redirectURI" value="http://localhost.archive.org/jsp/Redirect.jsp" /> </bean> </property> <property name="parser"> - <bean class="org.archive.wayback.proxy.ProxyRequestParser" init-method="init"> + <bean class="org.archive.wayback.proxy.ProxyRequestParser"> <property name="localhostNames"> <list> - <value>brad.archive.org</value> + <value>localhost.archive.org</value> </list> </property> <property name="maxRecords" 
value="1000" /> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2540 http://archive-access.svn.sourceforge.net/archive-access/?rev=2540&view=rev Author: bradtofel Date: 2008-08-12 23:03:30 +0000 (Tue, 12 Aug 2008) Log Message: ----------- FEATURE: DomainPrefix HTML Replay did not allow embedded replay .jsp files.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixTextReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixTextReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixTextReplayRenderer.java 2008-08-12 23:01:21 UTC (rev 2539) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixTextReplayRenderer.java 2008-08-12 23:03:30 UTC (rev 2540) @@ -25,6 +25,8 @@ package org.archive.wayback.domainprefix; import java.io.IOException; +import java.util.Iterator; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -83,5 +85,20 @@ page.sb.setLength(0); page.sb.ensureCapacity(replaced.length()); page.sb.append(replaced); + + List<String> jspInserts = getJspInserts(); + + StringBuilder toInsert = new StringBuilder(300); + + if(jspInserts != null) { + Iterator<String> itr = jspInserts.iterator(); + while(itr.hasNext()) { + toInsert.append(page.includeJspString(itr.next(), httpRequest, + httpResponse, wbRequest, results, result, resource)); + } + } + + page.insertAtEndOfBody(toInsert.toString()); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-12 23:01:35
|
Revision: 2538 http://archive-access.svn.sourceforge.net/archive-access/?rev=2538&view=rev Author: bradtofel Date: 2008-08-12 23:00:57 +0000 (Tue, 12 Aug 2008) Log Message: ----------- TWEAK: whitespace Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/DebugBanner.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/DebugBanner.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/DebugBanner.jsp 2008-08-12 22:41:36 UTC (rev 2537) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/DebugBanner.jsp 2008-08-12 23:00:57 UTC (rev 2538) @@ -1,18 +1,30 @@ -<%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8"%> -<%@ page import="java.util.Date" %> -<%@ page import="java.util.Map" %> -<%@ page import="java.util.Set" %> -<%@ page import="java.util.Iterator" %> -<%@ page import="org.archive.wayback.WaybackConstants" %> -<%@ page import="org.archive.wayback.core.CaptureSearchResult" %> -<%@ page import="org.archive.wayback.core.CaptureSearchResults" %> -<%@ page import="org.archive.wayback.core.SearchResult" %> -<%@ page import="org.archive.wayback.core.UIResults" %> -<%@ page import="org.archive.wayback.core.WaybackRequest" %> -<%@ page import="org.archive.wayback.util.StringFormatter" %> -<%@ page import="org.archive.wayback.util.html.SelectHTML" %> -<jsp:include page="/WEB-INF/template/CookieJS.jsp" flush="true" /> -<% +<%@ + page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8" +%><%@ + page import="java.util.Date" +%><%@ + page import="java.util.Map" +%><%@ + page import="java.util.Set" +%><%@ + page import="java.util.Iterator" +%><%@ + page import="org.archive.wayback.WaybackConstants" +%><%@ + page import="org.archive.wayback.core.CaptureSearchResult" +%><%@ + page 
import="org.archive.wayback.core.CaptureSearchResults" +%><%@ + page import="org.archive.wayback.core.SearchResult" +%><%@ + page import="org.archive.wayback.core.UIResults" +%><%@ + page import="org.archive.wayback.core.WaybackRequest" +%><%@ + page import="org.archive.wayback.util.StringFormatter" +%><%@ + page import="org.archive.wayback.util.html.SelectHTML" +%><jsp:include page="/WEB-INF/template/CookieJS.jsp" flush="true" /><% SelectHTML window = new SelectHTML("foo"); window.setProps("onchange=\"SetAnchorWindow(this.value); location.reload(true);\""); window.addOption("1 day","86400"); @@ -158,4 +170,4 @@ </script> <!-- End of DebugBanner.jsp output ---> +--> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-12 23:01:35
|
Revision: 2539 http://archive-access.svn.sourceforge.net/archive-access/?rev=2539&view=rev Author: bradtofel Date: 2008-08-12 23:01:21 +0000 (Tue, 12 Aug 2008) Log Message: ----------- TWEAK: whitespace Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/template/CookieJS.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/template/CookieJS.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/template/CookieJS.jsp 2008-08-12 23:00:57 UTC (rev 2538) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/template/CookieJS.jsp 2008-08-12 23:01:21 UTC (rev 2539) @@ -1,6 +1,8 @@ -<%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8"%> -<%@ page import="org.archive.wayback.core.WaybackRequest" %> -<script type="text/javascript"> +<%@ + page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8" +%><%@ + page import="org.archive.wayback.core.WaybackRequest" +%><script type="text/javascript"> function SetCookie(cookieName,cookieValue,nDays) { var today = new Date(); var expire = new Date(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-12 22:41:28
|
Revision: 2537 http://archive-access.svn.sourceforge.net/archive-access/?rev=2537&view=rev Author: bradtofel Date: 2008-08-12 22:41:36 +0000 (Tue, 12 Aug 2008) Log Message: ----------- TWEAK: Updated getFormatter() from WaybackRequest, not UIResults. fixed typo in text. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp 2008-08-11 23:57:40 UTC (rev 2536) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp 2008-08-12 22:41:36 UTC (rev 2537) @@ -4,14 +4,14 @@ <jsp:include page="/WEB-INF/template/UI-header.jsp" flush="true" /> <% UIResults results = UIResults.getGeneric(request); -StringFormatter fmt = results.getFormatter(); +StringFormatter fmt = results.getWbRequest().getFormatter(); Object names = request.getAttribute("AccessPointNames"); if(names != null) { if(names instanceof ArrayList) { ArrayList<String> accessPoints = (ArrayList<String>) names; if(accessPoints.size() > 0) { %> - You seems to be accessing this Wayback via an incorrect URL. Please try one of the following AccessPoints:<br></br> + You seem to be accessing this Wayback via an incorrect URL. Please try one of the following AccessPoints:<br></br> <% } for(String accessPoint : accessPoints) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-11 23:57:31
|
Revision: 2536 http://archive-access.svn.sourceforge.net/archive-access/?rev=2536&view=rev Author: bradtofel Date: 2008-08-11 23:57:40 +0000 (Mon, 11 Aug 2008) Log Message: ----------- FEATURE: now pulls any PropertyPlaceHolderConfigurer specified in the spring config, and allows them to process the config for macro substitution. TWEAK: cleaned up code dealing with classes to make it more concise and legible. removed some commented out code. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestMapper.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestMapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestMapper.java 2008-08-11 23:53:31 UTC (rev 2535) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestMapper.java 2008-08-11 23:57:40 UTC (rev 2536) @@ -27,7 +27,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; -import java.util.Iterator; import java.util.Map; import java.util.logging.Logger; @@ -35,6 +34,7 @@ import javax.servlet.http.HttpServletRequest; import org.archive.wayback.exception.ConfigurationException; +import org.springframework.beans.factory.config.PropertyPlaceholderConfigurer; import org.springframework.beans.factory.xml.XmlBeanFactory; import org.springframework.core.io.FileSystemResource; import org.springframework.core.io.Resource; @@ -55,12 +55,7 @@ private final static String PORT_SEPARATOR = ":"; - private final static String ACCESS_POINT_CLASSNAME = - "org.archive.wayback.webapp.AccessPoint"; - private final static String CONFIG_PATH = "config-path"; -// private WaybackContext defaultContext = null; -// private ServletContext servletContext = null; private XmlBeanFactory factory = null; /** 
@@ -68,9 +63,9 @@ * @param servletContext * @throws ConfigurationException */ + @SuppressWarnings("unchecked") public RequestMapper(ServletContext servletContext) throws ConfigurationException { -// this.servletContext = servletContext; String configPath = servletContext.getInitParameter(CONFIG_PATH); if(configPath == null) { throw new ConfigurationException("Missing " + CONFIG_PATH @@ -79,6 +74,13 @@ String resolvedPath = servletContext.getRealPath(configPath); Resource resource = new FileSystemResource(resolvedPath); factory = new XmlBeanFactory(resource); + Map map = factory.getBeansOfType(PropertyPlaceholderConfigurer.class); + if(map != null) { + Collection<PropertyPlaceholderConfigurer> macros = map.values(); + for(PropertyPlaceholderConfigurer macro : macros) { + macro.postProcessBeanFactory(factory); + } + } factory.preInstantiateSingletons(); } @@ -130,21 +132,14 @@ return context; } - @SuppressWarnings("unchecked") public ArrayList<String> getAccessPointNamesOnPort(String portStr) { ArrayList<String> names = new ArrayList<String>(); - try { - Class accessPointClass = Class.forName(ACCESS_POINT_CLASSNAME); - String[] apNames = factory.getBeanNamesForType(accessPointClass); - String portStrColon = portStr + ":"; - for(String apName : apNames) { - if(apName.startsWith(portStrColon)) { - names.add(apName.substring(portStrColon.length())); - } + String[] apNames = factory.getBeanNamesForType(AccessPoint.class); + String portStrColon = portStr + ":"; + for(String apName : apNames) { + if(apName.startsWith(portStrColon)) { + names.add(apName.substring(portStrColon.length())); } - } catch (ClassNotFoundException e) { - // boy, we're in trouble now.. 
- e.printStackTrace(); } return names; } @@ -154,16 +149,13 @@ @SuppressWarnings("unchecked") public void destroy() { LOGGER.info("shutting down contexts..."); - Class accessPointClass; - try { - accessPointClass = Class.forName(ACCESS_POINT_CLASSNAME); - Map beanMap = factory.getBeansOfType(accessPointClass); - Iterator beanNameItr = beanMap.keySet().iterator(); - Collection accessPoints = beanMap.values(); - while(beanNameItr.hasNext()) { - String apName = (String) beanNameItr.next(); - AccessPoint ap = (AccessPoint) beanMap.get(apName); + Map beanMap = factory.getBeansOfType(AccessPoint.class); + Collection accessPoints = beanMap.values(); + for(Object o : accessPoints) { + if(o instanceof AccessPoint) { + AccessPoint ap = (AccessPoint) o; try { + String apName = ap.getBeanName(); LOGGER.info("Shutting down AccessPoint " + apName); ap.shutdown(); LOGGER.info("Successfully shut down " + apName); @@ -171,21 +163,6 @@ e.printStackTrace(); } } - for(Object o : accessPoints) { - if(o instanceof AccessPoint) { - AccessPoint ap = (AccessPoint) o; - try { - ap.shutdown(); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - } catch (ClassNotFoundException e) { - // TODO Auto-generated catch block - e.printStackTrace(); } - } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-11 23:53:23
|
Revision: 2535 http://archive-access.svn.sourceforge.net/archive-access/?rev=2535&view=rev Author: bradtofel Date: 2008-08-11 23:53:31 +0000 (Mon, 11 Aug 2008) Log Message: ----------- TWEAK: now ferrets away original beanName, which can be accessed through the new getter Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2008-08-11 23:25:34 UTC (rev 2534) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2008-08-11 23:53:31 UTC (rev 2535) @@ -81,6 +81,7 @@ private boolean useServerName = false; private int contextPort = 0; private String contextName = null; + private String beanName = null; private WaybackCollection collection = null; private ReplayDispatcher replay = null; private ExceptionRenderer exception = new BaseExceptionRenderer(); @@ -112,7 +113,7 @@ * @see org.springframework.beans.factory.BeanNameAware#setBeanName(java.lang.String) */ public void setBeanName(String beanName) { - // TODO Auto-generated method stub + this.beanName = beanName; this.contextName = ""; int idx = beanName.indexOf(":"); if(idx > -1) { @@ -126,6 +127,9 @@ } } } + public String getBeanName() { + return beanName; + } /** * @param httpRequest * @return the prefix of paths recieved by this server that are handled by This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-11 23:25:27
|
Revision: 2534 http://archive-access.svn.sourceforge.net/archive-access/?rev=2534&view=rev Author: bradtofel Date: 2008-08-11 23:25:34 +0000 (Mon, 11 Aug 2008) Log Message: ----------- TWEAK: ResourceStore.retrieveResource() no longer throws IOException: this is now converted into a ResourceNotAvailableException Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/ResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/ResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/ResourceStore.java 2008-08-09 02:37:52 UTC (rev 2533) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/ResourceStore.java 2008-08-11 23:25:34 UTC (rev 2534) @@ -41,11 +41,10 @@ * * @param result * @return Resource object retrieved for the SearchResult - * @throws IOException * @throws ResourceNotAvailableException */ - public Resource retrieveResource(CaptureSearchResult result) throws IOException, - ResourceNotAvailableException; + public Resource retrieveResource(CaptureSearchResult result) + throws ResourceNotAvailableException; public void shutdown() throws IOException; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java 2008-08-09 02:37:52 UTC (rev 2533) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java 2008-08-11 23:25:34 UTC (rev 2534) @@ -49,15 +49,21 @@ /* (non-Javadoc) * @see org.archive.wayback.ResourceStore#retrieveResource(org.archive.wayback.core.SearchResult) */ - public Resource retrieveResource(CaptureSearchResult result) throws IOException, - ResourceNotAvailableException { + public Resource retrieveResource(CaptureSearchResult result) + throws ResourceNotAvailableException { // extract ARC filename String fileName = result.getFile(); if(fileName == null || fileName.length() < 1) { - throw new IOException("No ARC/WARC name in search result..."); + throw new ResourceNotAvailableException("No ARC/WARC name in search result..."); } - String urls[] = db.nameToUrls(fileName); + String urls[]; + try { + urls = db.nameToUrls(fileName); + } catch (IOException e1) { + e1.printStackTrace(); + throw new ResourceNotAvailableException(e1.getLocalizedMessage()); + } if(urls == null || urls.length == 0) { throw new ResourceNotAvailableException("Unable to locate(" + fileName + ")"); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2008-08-09 02:37:52 UTC (rev 2533) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2008-08-11 23:25:34 UTC (rev 2534) @@ -48,13 +48,13 @@ private String prefix = null; - public Resource retrieveResource(CaptureSearchResult result) throws IOException, - ResourceNotAvailableException { + public Resource retrieveResource(CaptureSearchResult result) + throws ResourceNotAvailableException { // extract ARC filename String fileName = result.getFile(); if(fileName == 
null || fileName.length() < 1) { - throw new IOException("No ARC/WARC name in search result..."); + throw new ResourceNotAvailableException("No ARC/WARC name in search result..."); } final long offset = result.getOffset(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-09 02:37:44
|
Revision: 2533 http://archive-access.svn.sourceforge.net/archive-access/?rev=2533&view=rev Author: bradtofel Date: 2008-08-09 02:37:52 +0000 (Sat, 09 Aug 2008) Log Message: ----------- TWEAK: interim checkin of current configs -- not suitable for deployment, but a baseline for several alternate configurations. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback-templates.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2008-08-09 02:36:45 UTC (rev 2532) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/ArchivalUrlReplay.xml 2008-08-09 02:37:52 UTC (rev 2533) @@ -35,7 +35,7 @@ <list> <value>/WEB-INF/replay/ArchiveComment.jsp</value> <value>/WEB-INF/replay/ClientSideJSInsert.jsp</value> - <value>/WEB-INF/replay/DebugBanner.jsp</value> + <value>/WEB-INF/replay/DebugBanner.jsp</value> <!-- <value>/WEB-INF/replay/Disclaimer.jsp</value> <value>/WEB-INF/replay/Timeline.jsp</value> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml 2008-08-09 02:36:45 UTC (rev 2532) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/DomainPrefixReplay.xml 2008-08-09 02:37:52 UTC (rev 2533) @@ -16,8 +16,8 @@ <property 
name="jspInserts"> <list> <value>/WEB-INF/replay/ArchiveComment.jsp</value> +<!-- <value>/WEB-INF/replay/DebugBanner.jsp</value> -<!-- <value>/WEB-INF/replay/JSLessTimeline.jsp</value> --> </list> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback-templates.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback-templates.xml 2008-08-09 02:36:45 UTC (rev 2532) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback-templates.xml 2008-08-09 02:37:52 UTC (rev 2533) @@ -23,6 +23,17 @@ </property> </bean> + +<!-- + <property name="annotater"> + <bean class="org.archive.wayback.resourceindex.filters.OracleAnnotationFilter"> + <property name="oracleUrl" value="http://localhost:8180/oracle/" /> + <property name="who" value="annotation" /> + </bean> + </property> +--> + + <!-- ResourceIndex templates --> <bean id="remoteindex" class="org.archive.wayback.resourceindex.RemoteResourceIndex" init-method="init"> Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-09 02:36:45 UTC (rev 2532) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2008-08-09 02:37:52 UTC (rev 2533) @@ -9,131 +9,12 @@ HTTP 1.1 remote access to ARC files distributed across multiple computers or directories. --> -<!-- ---> - - -<!-- - The following 2 beans are required when using exclusions based on live - robots.txt documents. 
---> -<!-- - <bean id="livewebcache" class="org.archive.wayback.liveweb.LiveWebCache"> - - <property name="arcCacheDir"> - <bean class="org.archive.wayback.liveweb.ARCCacheDirectory" - init-method="init"> - <property name="arcDir" value="/tmp/wayback/liveweb/arcs/" /> - <property name="arcPrefix" value="live" /> - </bean> - </property> - - <property name="cacher"> - <bean class="org.archive.wayback.liveweb.URLCacher"> - <property name="tmpDir" value="/tmp/wayback/liveweb/tmp/" /> - </bean> - </property> - - <property name="index"> - <bean class="org.archive.wayback.liveweb.LiveWebLocalResourceIndex"> - - <property name="source"> - <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex" - init-method="init"> - - <property name="bdbName" value="DB1" /> - <property name="bdbPath" value="/tmp/wayback/liveweb/db/" /> - </bean> - </property> - </bean> - </property> - </bean> - - <bean id="excluder-factory-robot" class="org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory"> - <property name="maxCacheMS" value="86400000" /> - <property name="userAgent" value="ia_archiver" /> - <property name="webCache" ref="livewebcache" /> - </bean> ---> - -<!-- - The following bean is an example using the Access Control Oracle, thanks - Alex Osborne and NLA. 
Currently this is pretty undocumented, but here is a - place to get started: - - http://webteam.archive.org/confluence/display/wayback/Exclusions+API ---> - - <bean id="excluder-factory-oracle" class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> - <property name="oracleUrl" value="http://localhost:8180/oracle/" /> - <property name="accessGroup" value="ia_archiver" /> - </bean> - -<!-- - <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection"> - - <property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.LocalResourceStore" - init-method="init"> - - <property name="dataDir" value="/tmp/wayback/arcs/" /> - - <property name="indexThread"> - <bean class="org.archive.wayback.resourcestore.AutoIndexThread"> - <property name="queuedDir" value="/tmp/wayback/arc-indexer/queued" /> - <property name="workDir" value="/tmp/wayback/arc-indexer/work" /> - <property name="runInterval" value="10000" /> - <property name="indexClient"> - <bean class="org.archive.wayback.resourceindex.indexer.IndexClient"> - <property name="tmpDir" value="/tmp/wayback/arc-indexer/tmp" /> - <property name="target" value="/tmp/wayback/index-data/incoming" /> - </bean> - </property> - </bean> - </property> - </bean> - </property> - - <property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> - <property name="source"> - <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex" - init-method="init"> - <property name="bdbName" value="DB1" /> - <property name="bdbPath" value="/tmp/wayback/index/" /> - <property name="updater"> - <bean class="org.archive.wayback.resourceindex.bdb.BDBIndexUpdater"> - <property name="incoming" value="/tmp/wayback/index-data/incoming/" /> - <property name="failed" value="/tmp/wayback/index-data/failed/" /> - <property name="merged" value="/tmp/wayback/index-data/merged/" /> - <property name="runInterval" value="10000" /> - </bean> - </property> - 
</bean> - </property> - <property name="maxRecords" value="10000" /> - </bean> - </property> - </bean> ---> - - - -<!-- - <property name="annotater"> - <bean class="org.archive.wayback.resourceindex.filters.OracleAnnotationFilter"> - <property name="oracleUrl" value="http://localhost:8180/oracle/" /> - <property name="who" value="annotation" /> - </bean> - </property> ---> - <bean id="resourcefilelocationdb" class="org.archive.wayback.resourcestore.locationdb.BDBResourceFileLocationDB" init-method="init"> - <property name="bdbPath" value="/tmp/wayback/file-db" /> + <property name="bdbPath" value="/tmp/wayback/file-db/db/" /> <property name="bdbName" value="DB1" /> - <property name="logPath" value="/tmp/wayback/file-db.log" /> + <property name="logPath" value="/tmp/wayback/file-db/db.log" /> </bean> <bean name="8080:locationdb" class="org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBServlet"> <property name="locationDB" ref="resourcefilelocationdb" /> @@ -141,27 +22,27 @@ <bean name="8080:fileproxy" class="org.archive.wayback.resourcestore.locationdb.FileProxyServlet"> <property name="locationDB" ref="resourcefilelocationdb" /> </bean> + + <bean id="localbdbresourceindex" class="org.archive.wayback.resourceindex.LocalResourceIndex"> + <property name="source"> + <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex" + init-method="init"> + <property name="bdbName" value="DB1" /> + <property name="bdbPath" value="/tmp/wayback/index/" /> + </bean> + </property> + <property name="maxRecords" value="10000" /> + </bean> <bean id="indexqueue" class="org.archive.wayback.resourcestore.indexer.DirectoryIndexQueue"> - <property name="path" value="/tmp/wayback/indexer-queue" /> + <property name="path" value="/tmp/wayback/index-data/queue" /> </bean> - <bean id="localbdbresourceindex" class="org.archive.wayback.resourceindex.LocalResourceIndex"> - <property name="source"> - <bean class="org.archive.wayback.resourceindex.bdb.BDBIndex" - init-method="init"> 
- <property name="bdbName" value="DB1" /> - <property name="bdbPath" value="/tmp/wayback/index/" /> - </bean> - </property> - <property name="maxRecords" value="10000" /> - </bean> - <bean id="localbdbcollection" class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore"> - <bean id="localresourcestore" class="org.archive.wayback.resourcestore.LocalResourceFileResourceStore"> - <property name="db" ref="resourcefilelocationdb" /> - </bean> + <bean id="localresourcestore" class="org.archive.wayback.resourcestore.LocalResourceFileResourceStore"> + <property name="db" ref="resourcefilelocationdb" /> + </bean> </property> <property name="resourceIndex" ref="localbdbresourceindex"/> @@ -171,14 +52,18 @@ <!-- This thread notices new files appearing in your resourcefilesources --> <bean id="resourcefilesourceupdater" class="org.archive.wayback.resourcestore.resourcefile.ResourceFileSourceUpdater" init-method="init"> - <property name="target" value="/tmp/wayback/file-db-incoming" /> + <property name="target" value="/tmp/wayback/file-db/incoming" /> <property name="interval" value="100000" /> <property name="sources"> <list> - <bean id="resourcefilesource" class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> - <property name="name" value="braddir1" /> + <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> + <property name="name" value="files1" /> <property name="prefix" value="/tmp/wayback/files1/" /> </bean> + <bean class="org.archive.wayback.resourcestore.resourcefile.DirectoryResourceFileSource"> + <property name="name" value="files2" /> + <property name="prefix" value="/tmp/wayback/files2/" /> + </bean> </list> </property> </bean> @@ -188,8 +73,8 @@ init-method="init"> <property name="interval" value="100000" /> <property name="db" ref="resourcefilelocationdb" /> - <property name="incomingDir" value="/tmp/wayback/file-db-incoming" /> - <property name="stateDir" 
value="/tmp/wayback/file-db-state" /> + <property name="incomingDir" value="/tmp/wayback/file-db/incoming" /> + <property name="stateDir" value="/tmp/wayback/file-db/state" /> </bean> <!-- This thread notices new files arriving in the filelocationdb, and queues them for indexing --> @@ -198,7 +83,7 @@ <property name="db" ref="resourcefilelocationdb" /> <property name="queue" ref="indexqueue" /> <property name="interval" value="1000" /> - <property name="lastMark" value="/tmp/wayback/index-queue.mark" /> + <property name="lastMark" value="/tmp/wayback/index-data/queue.mark" /> </bean> <!-- This thread checks the to-be-indexed queue for files needing indexing, indexes them, and hands off the results for merging with the ResourceIndex --> @@ -217,14 +102,14 @@ <!-- This thread merges updates from the indexworker into the ResourceIndex --> <bean class="org.archive.wayback.resourceindex.updater.LocalResourceIndexUpdater" - init-method="init"> - - <property name="index" ref="localbdbresourceindex" /> - <property name="incoming" value="/tmp/wayback/index-data/incoming/" /> - <property name="failed" value="/tmp/wayback/index-data/failed/" /> - <property name="merged" value="/tmp/wayback/index-data/merged/" /> - <property name="runInterval" value="10000" /> - </bean> + init-method="init"> + + <property name="index" ref="localbdbresourceindex" /> + <property name="incoming" value="/tmp/wayback/index-data/incoming/" /> + <property name="failed" value="/tmp/wayback/index-data/failed/" /> + <property name="merged" value="/tmp/wayback/index-data/merged/" /> + <property name="runInterval" value="10000" /> + </bean> </list> </property> </bean> @@ -248,9 +133,9 @@ <property name="resourceIndex"> <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> <property name="source"> - <bean id="cdxsearchresultsource" class="org.archive.wayback.resourceindex.cdx.CDXIndex"> - <property name="path" value="/tmp/wayback/cdx-index/index.cdx" /> - </bean> + <bean 
id="cdxsearchresultsource" class="org.archive.wayback.resourceindex.cdx.CDXIndex"> + <property name="path" value="/tmp/wayback/cdx-index/index.cdx" /> + </bean> </property> <property name="maxRecords" value="10000" /> </bean> @@ -258,7 +143,30 @@ </bean> --> + <bean id="localcdxcollection2" class="org.archive.wayback.webapp.WaybackCollection"> + <property name="resourceStore"> + <bean class="org.archive.wayback.resourcestore.LocalResourceFileResourceStore"> + <property name="db"> + <bean class="org.archive.wayback.resourcestore.locationdb.FlatFileResourceFileLocationDB"> + <property name="path" value="/tmp/wayback/path-index.txt" /> + </bean> + </property> + </bean> + </property> + + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.LocalResourceIndex"> + <property name="source"> + <bean id="cdxsearchresultsource" class="org.archive.wayback.resourceindex.cdx.CDXIndex"> + <property name="path" value="/tmp/wayback/cdx-index/index.1" /> + </bean> + </property> + <property name="maxRecords" value="10000" /> + </bean> + </property> + </bean> + <!-- The following WaybackCollection bean template is required when using a remote ResourceIndex and ResourceStore implementation. This will also @@ -266,20 +174,21 @@ the resourceStore:urlPrefix configuration, and an additional AccessPoint on the host specified by the resourceIndex:searchUrlBase configuration. 
--> + <!-- <bean id="remotecollection" class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore"> <bean class="org.archive.wayback.resourcestore.Http11ResourceStore"> - <property name="urlPrefix" value="http://localhost:8080/arcproxy/" /> + <property name="urlPrefix" value="http://wayback.archive-it.org/fileproxy/" /> </bean> </property> <property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex" - init-method="init"> - <property name="searchUrlBase" value="http://indexhost:8080/index/xmlquery" /> - </bean> + <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex" + init-method="init"> + <property name="searchUrlBase" value="http://wayback.archive-it.org/1055/xmlquery" /> + </bean> </property> </bean> --> @@ -291,30 +200,39 @@ installation. You may also need to ensure that the maxRecords on your RequestParser is not greater than the maxRecords configured on the RemoteNutchResourceIndex. --> - +<!-- <bean id="remotenutchcollection" class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore"> <bean class="org.archive.wayback.resourcestore.Http11ResourceStore"> -<!-- - <property name="urlPrefix" value="http://crawling11.us.archive.org/arcproxy/" /> ---> - <property name="urlPrefix" value="http://webapp100.us.archive.org/arcproxy/" /> + <property name="urlPrefix" value="http://webapp101.us.archive.org/arcproxy/" /> </bean> </property> <property name="resourceIndex"> <bean class="org.archive.wayback.resourceindex.NutchResourceIndex" init-method="init"> -<!-- - <property name="searchUrlBase" value="http://webteam-ws.us.archive.org:8080/katrina/opensearch" /> - --> - <property name="searchUrlBase" value="http://192.168.1.208:9090/nutch-1.0-dev/opensearch" /> + <property name="searchUrlBase" value="http://webapp101.us.archive.org/e04/xmlquery" /> <property name="maxRecords" value="100" /> </bean> </property> </bean> +--> <!-- + The following bean is an example using 
the Access Control Oracle, thanks + Alex Osborne and NLA. Currently this is pretty undocumented, but here is a + place to get started: + + http://webteam.archive.org/confluence/display/wayback/Exclusions+API +--> +<!-- + <bean id="excluder-factory-oracle" class="org.archive.wayback.accesscontrol.oracleclient.OracleExclusionFilterFactory"> + <property name="oracleUrl" value="http://localhost:8180/oracle/" /> + <property name="accessGroup" value="ia_archiver" /> + </bean> +--> + +<!-- This is the only AccessPoint defined by default within this wayback.xml Spring configuration file, providing an ArchivalURL Replay UI to the "localbdbcollection" by providing ArchivalURL-specific implementations @@ -325,17 +243,17 @@ fully qualified hostname of the computer running Tomcat. --> <import resource="ArchivalUrlReplay.xml"/> - <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> - <!-- + <bean name="8080:wayback" class="org.archive.wayback.webapp.AccessPoint"> + <!-- <property name="exclusionFactory" ref="excluder-factory-oracle" /> --> - <property name="collection" ref="localbdbcollection" /> - <property name="configs"> - <props> - <prop key="inst">foo</prop> - <prop key="coll">supreme court</prop> - </props> - </property> + <property name="collection" ref="localbdbcollection" /> + <property name="configs"> + <props> + <prop key="inst">foo</prop> + <prop key="coll">supreme court</prop> + </props> + </property> <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> @@ -343,24 +261,25 @@ </bean> </property> - <property name="query"> - <bean class="org.archive.wayback.query.Renderer"> - <property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" /> - </bean> - </property> + <property name="query"> + <bean class="org.archive.wayback.query.Renderer"> + <property name="captureJsp" value="/WEB-INF/query/CalendarResults.jsp" /> + </bean> + </property> <property name="replay" ref="archivalurlreplay" 
/> - <property name="parser"> - <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" - init-method="init"> - <property name="maxRecords" value="1000" /> - <property name="earliestTimestamp" value="1996" /> - </bean> - </property> - </bean> + <property name="parser"> + <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" + init-method="init"> + <property name="maxRecords" value="1000" /> + <property name="earliestTimestamp" value="1996" /> + </bean> + </property> + </bean> <bean name="8080:rwayback" parent="8080:wayback"> + <property name="collection" ref="localcdxcollection2" /> <property name="parser"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlRequestParser" init-method="init"> @@ -383,19 +302,19 @@ <property name="replayURIPrefix" value="http://localhost:8080/rwayback/" /> </bean> </property> - <property name="collection" ref="remotenutchcollection"> <!-- - <bean class="org.archive.wayback.webapp.WaybackCollection"> - <property name="resourceStore" ref="fancyresourcestore" /> - <property name="resourceIndex"> - <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex" - init-method="init"> - <property name="searchUrlBase" value="http://localhost:8080/wayback/xmlquery" /> - </bean> - </property> - </bean> - --> + <property name="collection"> + <bean class="org.archive.wayback.webapp.WaybackCollection"> + <property name="resourceStore" ref="fancyresourcestore" /> + <property name="resourceIndex"> + <bean class="org.archive.wayback.resourceindex.RemoteResourceIndex" + init-method="init"> + <property name="searchUrlBase" value="http://localhost:8080/wayback/xmlquery" /> + </bean> + </property> + </bean> </property> + --> </bean> <!-- @@ -403,32 +322,32 @@ AccessPoint, but only allows access from the specified IP network. 
--> <!-- - <bean name="8080:netsecure" parent="8080:wayback"> - - <property name="authentication"> + <bean name="8080:netsecure" parent="8080:wayback"> + + <property name="authentication"> <bean class="org.archive.wayback.authenticationcontrol.AccessControlSettingOperation"> <property name="operator"> <bean class="org.archive.wayback.util.operator.NotBooleanOperator"> <property name="operand"> - <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> - <property name="allowedRanges"> - <list> - <value>192.168.1.16/24</value> - </list> - </property> - </bean> - </property> - </bean> - </property> - <property name="factory" ref="excluder-factory-robot"/> + <bean class="org.archive.wayback.authenticationcontrol.IPMatchesBooleanOperator"> + <property name="allowedRanges"> + <list> + <value>192.168.1.16/24</value> + </list> + </property> + </bean> + </property> + </bean> + </property> + <property name="factory" ref="excluder-factory-robot"/> </bean> - </property> + </property> <property name="uriConverter"> <bean class="org.archive.wayback.archivalurl.ArchivalUrlResultURIConverter"> <property name="replayURIPrefix" value="http://192.168.1.16:8080/netsecure/" /> </bean> </property> - </bean> + </bean> --> <!-- The following AccessPoint inherits all configuration from the 8080:wayback @@ -449,22 +368,22 @@ </bean> --> -<import resource="DomainPrefixReplay.xml"/> -<bean name="8081" parent="8080:wayback"> - <property name="useServerName" value="true" /> - <property name="replay" ref="domainprefixreplay" /> - <property name="uriConverter"> - <bean class="org.archive.wayback.domainprefix.DomainPrefixResultURIConverter"> - <property name="hostPort" value="localhost.archive.org:8081" /> - </bean> - </property> - <property name="parser"> - <bean class="org.archive.wayback.domainprefix.DomainPrefixCompositeRequestParser" init-method="init"> - <property name="hostPort" value="localhost.archive.org:8081" /> - <property name="maxRecords" value="1000" /> - 
</bean> - </property> -</bean> + <import resource="DomainPrefixReplay.xml"/> + <bean name="8081" parent="8080:wayback"> + <property name="useServerName" value="true" /> + <property name="replay" ref="domainprefixreplay" /> + <property name="uriConverter"> + <bean class="org.archive.wayback.domainprefix.DomainPrefixResultURIConverter"> + <property name="hostPort" value="localhost.archive.org:8081" /> + </bean> + </property> + <property name="parser"> + <bean class="org.archive.wayback.domainprefix.DomainPrefixCompositeRequestParser" init-method="init"> + <property name="hostPort" value="localhost.archive.org:8081" /> + <property name="maxRecords" value="1000" /> + </bean> + </property> + </bean> <!-- @@ -477,23 +396,23 @@ in your Tomcat's server.xml file. --> <import resource="ProxyReplay.xml"/> -<bean name="8090" parent="8080:wayback"> - <property name="useServerName" value="true" /> - <property name="replay" ref="proxyreplay" /> - <property name="uriConverter"> - <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> - <property name="redirectURI" value="http://brad.archive.org/jsp/Redirect.jsp" /> - </bean> - </property> - <property name="parser"> - <bean class="org.archive.wayback.proxy.ProxyRequestParser" init-method="init"> - <property name="localhostNames"> - <list> - <value>brad.archive.org</value> - </list> - </property> - <property name="maxRecords" value="1000" /> - </bean> - </property> -</bean> + <bean name="8090" parent="8080:wayback"> + <property name="useServerName" value="true" /> + <property name="replay" ref="proxyreplay" /> + <property name="uriConverter"> + <bean class="org.archive.wayback.proxy.RedirectResultURIConverter"> + <property name="redirectURI" value="http://brad.archive.org/jsp/Redirect.jsp" /> + </bean> + </property> + <property name="parser"> + <bean class="org.archive.wayback.proxy.ProxyRequestParser" init-method="init"> + <property name="localhostNames"> + <list> + <value>brad.archive.org</value> + </list> + 
</property> + <property name="maxRecords" value="1000" /> + </bean> + </property> + </bean> </beans> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-09 02:36:36
|
Revision: 2532 http://archive-access.svn.sourceforge.net/archive-access/?rev=2532&view=rev Author: bradtofel Date: 2008-08-09 02:36:45 +0000 (Sat, 09 Aug 2008) Log Message: ----------- DOC update for 1.4 features. Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_store.xml Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2008-08-09 01:20:56 UTC (rev 2531) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2008-08-09 02:36:45 UTC (rev 2532) @@ -53,7 +53,7 @@ <p> Once you have downloaded the .tar.gz file from sourceforge, you will need to unpack the file to access the - webapp file, <b>wayback.war</b>. + webapp file, <b>wayback-webapp-1.4.0.war</b>. </p> <p> Installation and configuration of this software involves the @@ -110,177 +110,24 @@ class="org.archive.wayback.webapp.WaybackCollection"> <property name="resourceStore" ... /> <property name="resourceIndex" ... /> + <property name="shutdownables" ... /> </bean> </pre> </p> <p> - The resourceStore property refers to a bean implementing org.archive.wayback.ResourceStore. + The resourceStore property refers to a bean implementing + <a href="resource_store.html">org.archive.wayback.ResourceStore</a>. </p> <p> - The resourceIndex property refers to a bean implementing org.archive.wayback.ResourceIndex. + The resourceIndex property refers to a bean implementing + <a href="resource_index.html">org.archive.wayback.ResourceIndex</a>. </p> - </section> + <p> + The shutdownables property refers to a list of beans implementing org.archive.wayback.Shutdownable, typically worker Threads performing automatic updates of the Collection. 
+ </p> + </section> - - - <section name="org.archive.wayback.ResourceStore implementations"> - - - <subsection name="LocalResourceStore"> - <p> - This implementation works well for small - collections, where all the ARC/WARC files can be placed in a single - directory on the same computer running the wayback application. - Using NFS or another network filesystem technology and symbolic - links can allow this implementation to deal with files in - multiple directories, or across multiple storage nodes. This - implementation also includes the capability to run a background - thread to automatically notice new ARC/WARC files appearing, index - those files, and hand off the index data for merging with - a BDBResourceIndex. When using automatic indexing, any files added to - the 'dataDir' will automatically be indexed and queued for merging - with the ResourceIndex. Please see documentation for the - BDBResourceIndex for information on configuring automatic merging of - indexed data with a BDBResourceIndex. 
- </p> - <p> - The XML configuration template for a LocalResourceStore follows: - <pre> - -<property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.LocalResourceStore" - init-method="init"> - - <property name="dataDir" value="/tmp/wayback/arcs/" /> - - <property name="indexThread"> - <bean class="org.archive.wayback.resourcestore.AutoIndexThread"> - <property name="queuedDir" value="/tmp/wayback/arc-indexer/queued" /> - <property name="workDir" value="/tmp/wayback/arc-indexer/work" /> - <property name="runInterval" value="10000" /> - <property name="indexClient"> - <bean class="org.archive.wayback.resourceindex.indexer.IndexClient"> - <property name="tmpDir" value="/tmp/wayback/arc-indexer/tmp" /> - <property name="target" value="/tmp/wayback/index-data/incoming" /> - </bean> - </property> - </bean> - </property> - </bean> -</property> - - </pre> - </p> - <p> - Required configuration: - <ul> - <li> - <b> - dataDir - </b> - is the local directory where ARC files will be - located. - </li> - </ul> - </p> - <p> - Optional configuration (only needed if the indexThread property-bean - is specified, for automatic indexing) - <ul> - <li> - <b> - queuedDir - </b> - names a local directory where the indexer will maintain state - about ARC files that have already been indexed. - </li> - <li> - <b> - workDir - </b> - names a local directory where the indexer will maintain state - about ARC files that are about to be indexed. - </li> - <li> - <b> - runInterval - </b> - indicates the number of milliseconds between polling arcDir - for newly created ARC files. Default is 10000. - </li> - <li> - <b> - tmpDir - </b> - names a local directory where index data will be stored - temporarily before handing off to <b>target</b>. - </li> - <li> - <b> - target - </b> - names: - <ol> - <li> - a local directory where an BDBIndexUpdater is configured to - look for new index data to be merged with a BDBIndex. 
- </li> - <li> - a remote http:// URL where index data should be PUT, for - merging with a remote BDBIndex. - </li> - </ol> - </li> - </ul> - </p> - <p> - <b>Note:</b> upgrading from Wayback 1.0 to 1.2 requires changing - ResourceStore implementations from <b>LocalARCResourceStore</b> to - <b>LocalResourceStore</b>. <b>LocalARCResourceStore</b> is now - deprecated. - </p> - </subsection> - - - <subsection name="Http11ResourceStore"> - <p> - This implementation allows the wayback application to access - documents in remote ARC/WARC files via HTTP 1.1, and scales to - millions of ARC/WARC files. - </p> - <p> - The XML configuration template for an Http11ResourceStore follows: - <pre> - -<property name="resourceStore"> - <bean class="org.archive.wayback.resourcestore.Http11ResourceStore"> - <property name="urlPrefix" value="http://localhost:8080/arcproxy/" /> - </bean> -</property> - - </pre> - </p> - <p> - Required configuration: - <ul> - <li> - <b> - urlPrefix - </b> - this is the http:// prefix where ARC/WARC files are exported with - an ArcProxy installation. See elsewhere in this document for - information about setting up an ArcProxy. 
- </li> - </ul> - </p> - </subsection> - - - </section> - - - <section name="org.archive.wayback.ResourceIndex implementations"> Added: trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_store.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_store.xml (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/resource_store.xml 2008-08-09 02:36:45 UTC (rev 2532) @@ -0,0 +1,143 @@ +<?xml version="1.0" encoding="ISO-8859-1"?> +<document> + <properties> + <title>Resource Store Configuration</title> + <author email="brad at archive dot org">Brad Tofel</author> + <revision>$$Id$$</revision> + </properties> + + <body> + <section name="ResourceStore configuration options"> + <subsection name="FileLocationDB"> + <p> + The Location Database provides a mapping between ARC/WARC file names + and the absolute location of those ARC/WARC files. Absolute + location, in this case, can refer to either HTTP URLs or absolute + paths to files on the local file system. + </p> + <p> + Whenever locations are added for a new filename that was not + previously present in the location database, a record (in this case a + line) is added to a log file. This log file can then be used to + determine which files have been seen by the location database. The + ResourceFileLocationDatabase interface includes methods to retrieve + the current length of this log file, and to return an iterator with + all records between two points in the log. This interface allows an + observer to poll the location database to create events when new files + are added to the underlying database. 
+ </p> + </subsection> + <subsection name="Automatic Indexing Components"> + <p> + Wayback includes 5 Thread/Worker classes to enable automatic indexing + of new content: + <img src="images/AutoIndexing.png" /> + <ul> + <li> + <b>ResourceFileSourceUpdater</b> is responsible for repeatedly + scanning one or more ResourceFileSource instances, creating + manifests of the files seen in each, and handing the manifests + off to the ResourceFileLocationDBUpdater. In the future, for + larger installations, with 100s to 1000s of machines holding + ARC/WARC files, multiple instances of this component may run in + parallel. + </li> + <li> + <b>ResourceFileLocationDBUpdater</b> is responsible for noticing + new manifests appearing in an incoming directory, and merging + the contents of those manifests with the actual location database, + which is currently implemented using a BDBJE database. + </li> + <li> + <b>IndexQueueUpdater</b> is responsible for polling the location + database log, and adding newly discovered ARC/WARC files to the + IndexQueue. + </li> + <li> + <b>IndexWorker</b> is responsible for polling the IndexQueue, and + when file names are present in the queue, creating an index of + all resources in the ARC/WARC file, and handing the results to + the LocalResourceIndexUpdater. In the future, for larger + installations, multiple instances of this component may run in + parallel on multiple hosts, or this entire component may be + replaced by a distributed Hadoop indexing implementation. + </li> + <li> + <b>LocalResourceIndexUpdater</b> is responsible for noticing new + index result files appearing in an incoming directory, and merging + those results with an existing LocalResourceIndex. Currently the + only provided LocalResourceIndex that can be updated is based on an + underlying BDBJE database, but future implementations may maintain + a set of sorted CDX files, or a combination of CDX files and a + BDBJE database. 
+ </li> + </ul> + </p> + </subsection> + </section> + + <section name="org.archive.wayback.ResourceStore implementations"> + <p> + Wayback allows for several configurations enabling diverse collection + sizes and distribution of ARC/WARC files across many local directories + or across many servers. For most configurations, the default + LocationDBResourceStore will suffice, but Wayback is distributed with + 2 additional classes, FileProxy and SimpleResourceStore, which + provide an opportunity to insert a single HTTP caching server between + the Wayback service and an ARC/WARC storage cluster. + </p> + + <subsection name="LocationDBResourceStore"> + <p> + This implementation uses a LocationDB to convert ARC/WARC filenames + into absolute paths, or HTTP URLs. The underlying LocationDB can be + managed by the automatic indexing threads as described above, or it + can be manually managed with the <i>location-client</i> command line + tool. Be sure to enable the + org.archive.wayback.resourcestore.locationdb.FileProxyServlet + if you plan to manage the LocationDB manually. + </p> + </subsection> + <subsection name="SimpleResourceStore"> + <p> + This configuration depends on all ARC/WARC files appearing within a + single HTTP 1.1 exported root directory, or within a single local + directory. ARC/WARC file names are appended to a common prefix, either + a local directory on the host running Wayback, or under a single + remote directory. + </p> + <p> + The FileProxyServlet can be used to make all ARC/WARC files accessible + within a single HTTP directory, acting as a reverse proxy to the + actual host holding the ARC/WARC files. The FileProxyServlet uses a + LocationDB to translate requested ARC/WARC filenames into the actual + location of each file. 
+ </p> + </subsection> + </section> + <section name="Telling Wayback where to look for your ARC/WARC files"> + <p> + When using the automatic indexing functionality, you need to provide a + list of ResourceFileSource objects to the ResourceFileSourceUpdater + class. Wayback currently contains 2 ResourceFileSource implementations: + <ul> + <li> + <b>DirectoryResourceFileSource</b> will recursively scan a local + directory for ARC/WARC files (ending with: .arc, .arc.gz, .warc, + or .warc.gz). The 'name' property of each + DirectoryResourceFileSource must be unique, and consist of valid + filename characters. + </li> + <li> + <b>JspUrlResourceFileSource</b> is a highly experimental + implementation which executes a local .jsp file, passing the 'url' + parameter to the .jsp. The local .jsp is expected to produce output + of the form (NAME URL) for all ARC/WARC files appearing under the + argument url prefix, presumably by parsing the directory index HTML + from the server hosting 'url'. + </li> + </ul> + </p> + </section> + </body> +</document> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-08-09 01:20:49
|
Revision: 2531 http://archive-access.svn.sourceforge.net/archive-access/?rev=2531&view=rev Author: bradtofel Date: 2008-08-09 01:20:56 +0000 (Sat, 09 Aug 2008) Log Message: ----------- RENAME: Http11ResourceStore => SimpleResourceStore (now will use prefix as URL or local path prefix) RENAME: LocalResourceFileResourceStore => LocationDBResourceStore REMOVE: old LocalResourceStore, AutoIndexThread FEATURE: added method to ResourceFactory: getResource(String,long) which guesses if the String argument is an URL or a path, and calls the correct getResource() method. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-08-08 23:35:06 UTC (rev 2530) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/AutoIndexThread.java 2008-08-09 01:20:56 UTC (rev 2531) @@ -1,219 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.net.MalformedURLException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.logging.Logger; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.resourceindex.updater.IndexClient; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.DirMaker; - -/** - * Thread that repeatedly notices new files in the LocalResourceStore, indexes - * those files, and hands them off to a ResourceIndex via an IndexClient - * - * @author brad - * @version $Date$, $Revision$ - */ -public class AutoIndexThread extends Thread { - private static final Logger LOGGER = - Logger.getLogger(AutoIndexThread.class.getName()); - - private final static int DEFAULT_RUN_INTERVAL_MS = 10000; - private LocalResourceStore store = null; - private File workDir = null; - private File queuedDir = null; - private int runInterval = DEFAULT_RUN_INTERVAL_MS; - private IndexClient indexClient = null; - - /** - * @param store - * @param runInterval - */ - public AutoIndexThread() { - super("AutoARCIndexThread"); - super.setDaemon(true); - } - - public void run() { - LOGGER.info("AutoIndexThread is alive."); - int sleepInterval = runInterval; - if(store == null) { - throw new RuntimeException("No LocalResourceStore set"); - } - while (true) { - try { - int numIndexed = indexNewArcs(); - if (numIndexed == 0) { - sleep(sleepInterval); - sleepInterval += runInterval; - } else { - sleepInterval = runInterval; - } - } catch (InterruptedException e) { - e.printStackTrace(); - return; - } - } - } - - /** - * Scan for new ARC files, and index any new files discovered. 
- * - * There are 3 main steps, which could be broken into separate threads: - * 1) detect new ARCs - * 2) create CDX files for each new ARC - * 3) upload CDX files to target (or rename to local "incoming" directory) - * - * for now these are sequential. - * - * @return number of ARC files indexed - */ - public int indexNewArcs() { - int numIndexed = 0; - try { - queueNewArcsForIndex(); - } catch (IOException e) { - e.printStackTrace(); - } - try { - numIndexed = indexArcs(10); - } catch (MalformedURLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return numIndexed; - } - /** - * Find any new ARC files and queue them for indexing. - * @throws IOException - */ - public void queueNewArcsForIndex() throws IOException { - - // build a HashMap of what has been queued already: - HashMap<String,String> queued = new HashMap<String, String>(); - String entries[] = queuedDir.list(); - if(entries != null) { - for (int i = 0; i < entries.length; i++) { - queued.put(entries[i], "i"); - } - } - // now scan thru arcDir, and make a flag file for anything that was not - // already there: - Iterator<String> files = store.fileNamesIterator(); - if(files != null) { - while(files.hasNext()) { - String fileName = files.next(); - if(!queued.containsKey(fileName)) { - File newQueuedFile = new File(queuedDir,fileName); - File newToBeIndexedFile = new File(workDir,fileName); - newToBeIndexedFile.createNewFile(); - newQueuedFile.createNewFile(); - } - } - } - } - - private String fileNameToBase(final String fileName) { - return fileName; - } - - /** - * Index up to 'max' ARC/WARC files queued for indexing, queueing the - * resulting CDX files for merging with the BDBIndex. 
- * - * @param indexer - * @param max maximum number to index in this method call, 0 for unlimited - * @return int number of ARC/WARC files indexed - * @throws MalformedURLException - * @throws IOException - */ - public int indexArcs(int max) - throws MalformedURLException, IOException { - - int numIndexed = 0; - String toBeIndexed[] = workDir.list(); - - if (toBeIndexed != null) { - for (int i = 0; i < toBeIndexed.length; i++) { - String fileName = toBeIndexed[i]; - File file = store.getLocalFile(fileName); - if(file != null) { - File workFlagFile = new File(workDir,fileName); - String cdxBase = fileNameToBase(fileName); - - try { - - LOGGER.info("Indexing " + file.getAbsolutePath()); - CloseableIterator<CaptureSearchResult> itr = store.indexFile(file); - - if(indexClient.addSearchResults(cdxBase, itr)) { - if (!workFlagFile.delete()) { - throw new IOException("Unable to delete " - + workFlagFile.getAbsolutePath()); - } - } - itr.close(); - numIndexed++; - } catch (IOException e) { - LOGGER.severe("FAILED index: " + file.getAbsolutePath() - + " cause: " + e.getLocalizedMessage()); - } - if(max > 0 && (numIndexed >= max)) { - break; - } - } - } - } - return numIndexed; - } - - - - public LocalResourceStore getStore() { - return store; - } - - public void setStore(LocalResourceStore store) { - this.store = store; - } - - public String getWorkDir() { - return workDir == null ? null : workDir.getAbsolutePath(); - } - - public void setWorkDir(String workDir) throws IOException { - this.workDir = DirMaker.ensureDir(workDir); - } - - public String getQueuedDir() { - return queuedDir == null ? 
null : queuedDir.getAbsolutePath(); - } - - public void setQueuedDir(String queuedDir) throws IOException { - this.queuedDir = DirMaker.ensureDir(queuedDir); - } - - public int getRunInterval() { - return runInterval; - } - - public void setRunInterval(int runInterval) { - this.runInterval = runInterval; - } - - public IndexClient getIndexClient() { - return indexClient; - } - - public void setIndexClient(IndexClient indexClient) { - this.indexClient = indexClient; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-08-08 23:35:06 UTC (rev 2530) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java 2008-08-09 01:20:56 UTC (rev 2531) @@ -1,101 +0,0 @@ -/* HttpARCResourceStore - * - * $Id$ - * - * Created on 5:29:56 PM Oct 12, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.IOException; -import java.net.URL; - -import org.archive.wayback.ResourceStore; -import org.archive.wayback.core.Resource; -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.exception.ResourceNotAvailableException; -import org.archive.wayback.resourcestore.resourcefile.ArcWarcFilenameFilter; -import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; - - -/** - * Implements ResourceStore where ARC/WARCs are accessed via HTTP 1.1 range - * requests. All files are assumed to be "rooted" at a particular HTTP URL, - * within a single directory, implying a file reverse-proxy to connect through - * to actual HTTP ARC/WARC locations. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class Http11ResourceStore implements ResourceStore { - - private String urlPrefix = null; - - - public Resource retrieveResource(CaptureSearchResult result) throws IOException, - ResourceNotAvailableException { - - // extract ARC filename - String fileName = result.getFile(); - if(fileName == null || fileName.length() < 1) { - throw new IOException("No ARC/WARC name in search result..."); - } - - final long offset = result.getOffset(); - if(!fileName.endsWith(ArcWarcFilenameFilter.ARC_SUFFIX) - && !fileName.endsWith(ArcWarcFilenameFilter.ARC_GZ_SUFFIX) - && !fileName.endsWith(ArcWarcFilenameFilter.WARC_SUFFIX) - && !fileName.endsWith(ArcWarcFilenameFilter.WARC_GZ_SUFFIX)) { - fileName = fileName + ArcWarcFilenameFilter.ARC_GZ_SUFFIX; - } - - String fileUrl = urlPrefix + fileName; - Resource r = null; - try { - - r = ResourceFactory.getResource(new URL(fileUrl), offset); - - } catch (IOException e) { - - e.printStackTrace(); - throw new 
ResourceNotAvailableException("Unable to retrieve", - e.getLocalizedMessage()); - } - return r; - } - - /** - * @return the urlPrefix - */ - public String getUrlPrefix() { - return urlPrefix; - } - - /** - * @param urlPrefix the urlPrefix to set - */ - public void setUrlPrefix(String urlPrefix) { - this.urlPrefix = urlPrefix; - } - - public void shutdown() throws IOException { - // no-op - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-08-08 23:35:06 UTC (rev 2530) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java 2008-08-09 01:20:56 UTC (rev 2531) @@ -1,112 +0,0 @@ -/* LocalResourceFileResourceStore - * - * $Id$ - * - * Created on 6:17:54 PM May 29, 2008. - * - * Copyright (C) 2008 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.IOException; -import java.net.URL; - -import org.archive.wayback.ResourceStore; -import org.archive.wayback.core.Resource; -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.exception.ResourceNotAvailableException; -import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; -import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; - -/** - * Simple ResourceStore implementation, which uses a ResourceFileLocationDB to - * locate ARC/WARC files, that can be remote(via http://) or local paths. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class LocalResourceFileResourceStore implements ResourceStore { - - private ResourceFileLocationDB db = null; - - /* (non-Javadoc) - * @see org.archive.wayback.ResourceStore#retrieveResource(org.archive.wayback.core.SearchResult) - */ - public Resource retrieveResource(CaptureSearchResult result) throws IOException, - ResourceNotAvailableException { - // extract ARC filename - String fileName = result.getFile(); - if(fileName == null || fileName.length() < 1) { - throw new IOException("No ARC/WARC name in search result..."); - } - - String urls[] = db.nameToUrls(fileName); - if(urls == null || urls.length == 0) { - throw new ResourceNotAvailableException("Unable to locate(" + - fileName + ")"); - } - - final long offset = result.getOffset(); - - Resource r = null; - // TODO: attempt multiple threads? 
- for(String url : urls) { - - try { - - if(url.startsWith("http://")) { - r = ResourceFactory.getResource(new URL(url), offset); - } else { - // assume local path: - r = ResourceFactory.getResource(new File(url), offset); - } - // TODO: attempt to grab the first few KB? The underlying - // InputStreams support mark(), so we could reset() after. - // wait for now, currently this will parse HTTP headers, - // which means we've already read some - - } catch (IOException e) { - e.printStackTrace(); - } - if(r != null) { - break; - } - } - if(r == null) { - throw new ResourceNotAvailableException("Unable to retrieve"); - } - return r; - } - - /* (non-Javadoc) - * @see org.archive.wayback.ResourceStore#shutdown() - */ - public void shutdown() throws IOException { - // NOOP - } - - public ResourceFileLocationDB getDb() { - return db; - } - - public void setDb(ResourceFileLocationDB db) { - this.db = db; - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-08-08 23:35:06 UTC (rev 2530) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceStore.java 2008-08-09 01:20:56 UTC (rev 2531) @@ -1,142 +0,0 @@ -package org.archive.wayback.resourcestore; - -import java.io.File; -import java.io.FilenameFilter; -import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; - -import org.archive.wayback.ResourceStore; -import org.archive.wayback.core.Resource; -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.exception.ConfigurationException; -import org.archive.wayback.exception.ResourceNotAvailableException; -import 
org.archive.wayback.resourcestore.indexer.ArcIndexer; -import org.archive.wayback.resourcestore.indexer.WarcIndexer; -import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.DirMaker; - -/** - * Class which implements a local ARC, WARC, ARC.gz, WARC.gz, ResourceStore - * including an optional automatic indexing thread - * - * @author brad - * @version $Date$, $Revision$ - */ -public class LocalResourceStore implements ResourceStore { - - private File dataDir = null; - private AutoIndexThread indexThread = null; - - private ArcIndexer arcIndexer = new ArcIndexer(); - private WarcIndexer warcIndexer = new WarcIndexer(); - public final static String ARC_EXTENSION = ".arc"; - public final static String ARC_GZ_EXTENSION = ".arc.gz"; - public final static String WARC_EXTENSION = ".warc"; - public final static String WARC_GZ_EXTENSION = ".warc.gz"; - public final static String OPEN_EXTENSION = ".open"; - private final static String[] SUFFIXES = { - "", ARC_EXTENSION, ARC_GZ_EXTENSION, WARC_EXTENSION, WARC_GZ_EXTENSION - }; - private FilenameFilter filter = new ArcWarcFilenameFilter(); - - public void init() throws ConfigurationException { - if(indexThread != null) { - indexThread.setStore(this); - indexThread.start(); - } - } - - public File getLocalFile(String fileName) { - // try adding suffixes: empty string is first in the list - File file = null; - for(String suffix : SUFFIXES) { - file = new File(dataDir,fileName + suffix); - if(file.exists() && file.canRead()) { - return file; - } - } - // this might work if the full path is in the index... - file = new File(fileName); - if(file.exists() && file.canRead()) { - return file; - } - // doh. 
- return null; - } - - public Resource retrieveResource(CaptureSearchResult result) throws IOException, - ResourceNotAvailableException { - String fileName = result.getFile(); - long offset = result.getOffset(); - File file = getLocalFile(fileName); - if (file == null) { - - // TODO: this needs to be prettied up for end user consumption.. - throw new ResourceNotAvailableException("Cannot find ARC file (" - + fileName + ")"); - } else { - - Resource r = ResourceFactory.getResource(file, offset); - return r; - } - } - - public CloseableIterator<CaptureSearchResult> indexFile(File dataFile) throws IOException { - CloseableIterator<CaptureSearchResult> itr = null; - - String name = dataFile.getName(); - if(name.endsWith(ARC_EXTENSION)) { - itr = arcIndexer.iterator(dataFile); - } else if(name.endsWith(ARC_GZ_EXTENSION)) { - itr = arcIndexer.iterator(dataFile); - } else if(name.endsWith(WARC_EXTENSION)) { - itr = warcIndexer.iterator(dataFile); - } else if(name.endsWith(WARC_GZ_EXTENSION)) { - itr = warcIndexer.iterator(dataFile); - } - return itr; - } - - public Iterator<String> fileNamesIterator() throws IOException { - if(dataDir != null) { - String[] files = dataDir.list(filter); - List<String> l = Arrays.asList(files); - return l.iterator(); - } - return null; - } - - public String getDataDir() { - return DirMaker.getAbsolutePath(dataDir); - } - - public void setDataDir(String dataDir) throws IOException { - this.dataDir = DirMaker.ensureDir(dataDir); - } - - private class ArcWarcFilenameFilter implements FilenameFilter { - public boolean accept(File dir, String name) { - File tmp = new File(dir,name); - if(tmp.isFile() && tmp.canRead()) { - return name.endsWith(ARC_EXTENSION) || - name.endsWith(ARC_GZ_EXTENSION) || - name.endsWith(WARC_GZ_EXTENSION) || - name.endsWith(WARC_EXTENSION); - } - return false; - } - } - - public AutoIndexThread getIndexThread() { - return indexThread; - } - public void setIndexThread(AutoIndexThread indexThread) { - this.indexThread = 
indexThread; - } - public void shutdown() throws IOException { - // no-op. could shut down threads - } -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java (from rev 2496, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocalResourceFileResourceStore.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java 2008-08-09 01:20:56 UTC (rev 2531) @@ -0,0 +1,112 @@ +/* LocalResourceFileResourceStore + * + * $Id$ + * + * Created on 6:17:54 PM May 29, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore; + +import java.io.File; +import java.io.IOException; +import java.net.URL; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; + +/** + * Simple ResourceStore implementation, which uses a ResourceFileLocationDB to + * locate ARC/WARC files, that can be remote(via http://) or local paths. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LocationDBResourceStore implements ResourceStore { + + private ResourceFileLocationDB db = null; + + /* (non-Javadoc) + * @see org.archive.wayback.ResourceStore#retrieveResource(org.archive.wayback.core.SearchResult) + */ + public Resource retrieveResource(CaptureSearchResult result) throws IOException, + ResourceNotAvailableException { + // extract ARC filename + String fileName = result.getFile(); + if(fileName == null || fileName.length() < 1) { + throw new IOException("No ARC/WARC name in search result..."); + } + + String urls[] = db.nameToUrls(fileName); + if(urls == null || urls.length == 0) { + throw new ResourceNotAvailableException("Unable to locate(" + + fileName + ")"); + } + + final long offset = result.getOffset(); + + Resource r = null; + // TODO: attempt multiple threads? + for(String url : urls) { + + try { + + if(url.startsWith("http://")) { + r = ResourceFactory.getResource(new URL(url), offset); + } else { + // assume local path: + r = ResourceFactory.getResource(new File(url), offset); + } + // TODO: attempt to grab the first few KB? 
The underlying + // InputStreams support mark(), so we could reset() after. + // wait for now, currently this will parse HTTP headers, + // which means we've already read some + + } catch (IOException e) { + e.printStackTrace(); + } + if(r != null) { + break; + } + } + if(r == null) { + throw new ResourceNotAvailableException("Unable to retrieve"); + } + return r; + } + + /* (non-Javadoc) + * @see org.archive.wayback.ResourceStore#shutdown() + */ + public void shutdown() throws IOException { + // NOOP + } + + public ResourceFileLocationDB getDb() { + return db; + } + + public void setDb(ResourceFileLocationDB db) { + this.db = db; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/LocationDBResourceStore.java ___________________________________________________________________ Added: svn:mergeinfo + Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java (from rev 2496, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/Http11ResourceStore.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2008-08-09 01:20:56 UTC (rev 2531) @@ -0,0 +1,100 @@ +/* HttpARCResourceStore + * + * $Id$ + * + * Created on 5:29:56 PM Oct 12, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore; + +import java.io.IOException; + +import org.archive.wayback.ResourceStore; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.resourcefile.ArcWarcFilenameFilter; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; + + +/** + * Implements ResourceStore where ARC/WARCs are accessed via HTTP 1.1 range + * requests. All files are assumed to be "rooted" at a particular HTTP URL, + * within a single directory, implying a file reverse-proxy to connect through + * to actual HTTP ARC/WARC locations. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class SimpleResourceStore implements ResourceStore { + + private String prefix = null; + + + public Resource retrieveResource(CaptureSearchResult result) throws IOException, + ResourceNotAvailableException { + + // extract ARC filename + String fileName = result.getFile(); + if(fileName == null || fileName.length() < 1) { + throw new IOException("No ARC/WARC name in search result..."); + } + + final long offset = result.getOffset(); + if(!fileName.endsWith(ArcWarcFilenameFilter.ARC_SUFFIX) + && !fileName.endsWith(ArcWarcFilenameFilter.ARC_GZ_SUFFIX) + && !fileName.endsWith(ArcWarcFilenameFilter.WARC_SUFFIX) + && !fileName.endsWith(ArcWarcFilenameFilter.WARC_GZ_SUFFIX)) { + fileName = fileName + ArcWarcFilenameFilter.ARC_GZ_SUFFIX; + } + + String fileUrl = prefix + fileName; + Resource r = null; + try { + + r = ResourceFactory.getResource(fileUrl, offset); + + } catch (IOException e) { + + e.printStackTrace(); + throw new ResourceNotAvailableException("Unable to retrieve", + e.getLocalizedMessage()); + } + return r; + } + + /** + * @return the prefix + */ + public String getPrefix() { + return prefix; + } + + /** + * @param prefix the prefix to set + */ + public void setPrefix(String prefix) { + this.prefix = prefix; + } + + public void shutdown() throws IOException { + // no-op + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:mergeinfo + Added: svn:eol-style + native Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2008-08-08 23:35:06 UTC (rev 2530) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFactory.java 2008-08-09 01:20:56 UTC (rev 2531) @@ -23,6 +23,16 @@ */ public class ResourceFactory { + public static Resource getResource(String urlOrPath, long offset) + throws IOException, ResourceNotAvailableException { + if(urlOrPath.startsWith("http://")) { + return getResource(new URL(urlOrPath), offset); + } else { + // assume local path: + return getResource(new File(urlOrPath), offset); + } + } + public static Resource getResource(File file, long offset) throws IOException, ResourceNotAvailableException { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |