From: <bra...@us...> - 2010-10-21 22:40:35
|
Revision: 3293 http://archive-access.svn.sourceforge.net/archive-access/?rev=3293&view=rev Author: bradtofel Date: 2010-10-21 22:40:29 +0000 (Thu, 21 Oct 2010) Log Message: ----------- INITIAL REV: new tool cdx-indexer, a replacement for arc-indexer and warc-indexer, as well as two new undocumented tools for dealing with specially compressed CDX indexes. We haven't figured out how to release the additional zlib C code to create these CDXs, so the tools and code are dead weight for outside institutions at the moment. Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/scripts/cdx-sample trunk/archive-access/projects/wayback/dist/src/scripts/zipline-manifest trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search Added: trunk/archive-access/projects/wayback/dist/src/scripts/cdx-sample =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/cdx-sample (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/cdx-sample 2010-10-21 22:40:29 UTC (rev 3293) @@ -0,0 +1,44 @@ +#!/usr/bin/env perl +use strict; +use warnings; + +my $v = 0; +sub USAGE { + my($msg,$code) = @_; + $msg = "" unless (defined($msg) && length($msg)); + print STDERR <<EOUSAGE; +$msg USAGE: $0 PATH NUM +Create a split file for use with Wayback hadoop indexing code on STDOUT. +Finds approximate offsets at host boundaries for file at PATH, producing +a split file with NUM parts, which indicates the number of reduce tasks. 
+EOUSAGE + exit($code); +} +my $path = shift || &USAGE("Need path to CDX argument 1\n\n",2); +if(($path eq "-h") or ($path eq "-help") or ($path eq "--help")) { + &USAGE(0); +} +my $num = shift || &USAGE("Need NUM chunk count argument 2\n\n",2); + +my $fh; +open($fh,$path) or die "FAILED open($path) ($!)"; +my $size = (-s $fh); +my $per = $size / $num; +# print first for blank: +print "\n"; +foreach my $i (1..$num-1) { + my $offset = $per * $i; + seek($fh,$offset,0) or die "failed seek($path,$offset,0) ($!)"; + # consume first line to align on next complete line: + my $line = <$fh>; + while(1) { + my $line = <$fh>; + die "bad line($line) in ($path)" unless length($line); + if($line =~ /^([^:\/]+)[:\/]/) { + print "$1\n"; + last; + } + print STDERR "Skipping wierd line($line)\n"; + } +} +close($fh) or die "FAILED close($path) ($!)"; Added: trunk/archive-access/projects/wayback/dist/src/scripts/zipline-manifest =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/zipline-manifest (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/zipline-manifest 2010-10-21 22:40:29 UTC (rev 3293) @@ -0,0 +1,82 @@ +#!/usr/bin/env sh +## +## This script allows querying and updating of a remote LocationDB from the +## command line, including syncronizing the LocationDB with an entire directory +## of ARCs files +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## WAYBACK_HOME Pointer to your wayback install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. 
+## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set WAYBACK_HOME. +if [ -z "$WAYBACK_HOME" ] +then + WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + JAVACMD=$JAVA_HOME/bin/java +fi + +# Ignore previous classpath. Build one that contains heritrix jar and content +# of the lib directory into the variable CP. +for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +# Main class. +if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.wayback.resourceindex.ziplines.ZiplinesChunkIterator' +fi + +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" + Added: trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search 2010-10-21 22:40:29 UTC (rev 3293) @@ -0,0 +1,82 @@ +#!/usr/bin/env sh +## +## This script creates a CDX file for all ARC files in a directory +## PUTs those CDX files into a remote pipeline, and informs a remote +## LocationDB of the locations of all the ARC files. 
+## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## WAYBACK_HOME Pointer to your wayback install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. +## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set WAYBACK_HOME. +if [ -z "$WAYBACK_HOME" ] +then + WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + JAVACMD=$JAVA_HOME/bin/java +fi + +# Ignore previous classpath. Build one that contains heritrix jar and content +# of the lib directory into the variable CP. +for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +# Main class. 
+if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.wayback.resourceindex.ziplines.ZiplinesSearchResultSource' +fi + +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" + Property changes on: trunk/archive-access/projects/wayback/dist/src/scripts/zl-bin-search ___________________________________________________________________ Added: svn:executable + * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |