From: <bra...@us...> - 2009-11-06 02:06:02
|
Revision: 2892 http://archive-access.svn.sourceforge.net/archive-access/?rev=2892&view=rev Author: bradtofel Date: 2009-11-06 02:05:52 +0000 (Fri, 06 Nov 2009) Log Message: ----------- INITIAL REV: code we use at IA when extracting records from existing WARC files into a new WARC - this tool builds a WARC header record for the new WARC file. Added Paths: ----------- trunk/archive-access/projects/wayback/dist/src/scripts/warc-header trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java Added: trunk/archive-access/projects/wayback/dist/src/scripts/warc-header =================================================================== --- trunk/archive-access/projects/wayback/dist/src/scripts/warc-header (rev 0) +++ trunk/archive-access/projects/wayback/dist/src/scripts/warc-header 2009-11-06 02:05:52 UTC (rev 2892) @@ -0,0 +1,78 @@ +#!/usr/bin/env sh +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## WAYBACK_HOME Pointer to your wayback install. If not present, we +## make an educated guess based on position relative to this +## script. +## +## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. +## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set WAYBACK_HOME. +if [ -z "$WAYBACK_HOME" ] +then + WAYBACK_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + JAVACMD=$JAVA_HOME/bin/java +fi + +# Ignore previous classpath. 
Build one that contains the wayback jars and content +# of the lib directory into the variable CP. +for jar in `ls $WAYBACK_HOME/lib/*.jar $WAYBACK_HOME/*.jar 2> /dev/null` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + WAYBACK_HOME=`cygpath -p -w "$WAYBACK_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +# Main WARCHeader class. +if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.wayback.util.WARCHeader' +fi + +CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN "$@" + Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java 2009-11-06 02:05:52 UTC (rev 2892) @@ -0,0 +1,63 @@ +package org.archive.wayback.util; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.archive.io.warc.WARCWriter; +import org.archive.util.anvl.ANVLRecord; + +public class WARCHeader { + private void writeHeaderRecord(File target, File fieldsSrc, String id) + throws IOException { + + WARCWriter writer = null; + + BufferedOutputStream bos = + new BufferedOutputStream(new FileOutputStream(target)); + + FileInputStream is = new FileInputStream(fieldsSrc); + ANVLRecord ar = ANVLRecord.load(is); + + List<String> metadata = new ArrayList<String>(1); + metadata.add(ar.toString()); + + writer = new WARCWriter(null, bos, target, true, null, + metadata); + // Write a warcinfo record with description about how this WARC + // was made. 
+ writer.writeWarcinfoRecord(target.getName(), "Made from " + + id + " by " + + this.getClass().getName()); + + } + + public static void main(String[] args) { + if (args.length != 3) { + System.err.println("USAGE: tgtWarc fieldsSrc id"); + System.err.println("\ttgtWarc is the path to the target WARC.gz"); + System.err.println("\tfieldsSrc is the path to the text of the record"); + System.err.println("\t\tmake sure each line is terminated by \\r\\n"); + System.err.println("\t\tand that the file ends with a blank, \\r\\n terminiated line"); + System.err.println("\tid is the XXX in:"); + System.err.println("\t\tContent-Description: Made from XXX by org.archive.wayback.util.WARCHeader"); + System.err.println("\t\tof the header record... header..."); + System.exit(1); + } + File target = new File(args[0]); + File fieldSrc = new File(args[1]); + String id = args[2]; + WARCHeader header = new WARCHeader(); + try { + header.writeHeaderRecord(target, fieldSrc, id); + } catch (IOException e) { + e.printStackTrace(); + System.exit(2); + } + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/WARCHeader.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |