From: <pau...@us...> - 2007-04-17 18:31:35
|
Revision: 5083 http://archive-crawler.svn.sourceforge.net/archive-crawler/?rev=5083&view=rev Author: paul_jack Date: 2007-04-17 11:31:32 -0700 (Tue, 17 Apr 2007) Log Message: ----------- First attempt to get maven to produce a tarball for Heritrix and dependencies. Doesn't quite work. * /heritrix/src/main/assembly/dist.xml A maven assembly descriptor for creating the tarball. * /heritrix/pom.xml Now uses maven-assembly-plugin to create a tarball. * /heritrix/src/scripts Moved to src/main/bin per maven standards. Modified Paths: -------------- branches/pjack_settings/crawler/heritrix/pom.xml Added Paths: ----------- branches/pjack_settings/crawler/heritrix/src/main/assembly/ branches/pjack_settings/crawler/heritrix/src/main/assembly/dist.xml branches/pjack_settings/crawler/heritrix/src/main/bin/ branches/pjack_settings/crawler/heritrix/src/main/bin/arcreader branches/pjack_settings/crawler/heritrix/src/main/bin/arcreader.cmd branches/pjack_settings/crawler/heritrix/src/main/bin/cmdline-jmxclient-0.10.5.jar branches/pjack_settings/crawler/heritrix/src/main/bin/dependencies.xsl branches/pjack_settings/crawler/heritrix/src/main/bin/extractor branches/pjack_settings/crawler/heritrix/src/main/bin/extractor.cmd branches/pjack_settings/crawler/heritrix/src/main/bin/foreground_heritrix branches/pjack_settings/crawler/heritrix/src/main/bin/foreground_heritrix.cmd branches/pjack_settings/crawler/heritrix/src/main/bin/heritrix branches/pjack_settings/crawler/heritrix/src/main/bin/heritrix.cmd branches/pjack_settings/crawler/heritrix/src/main/bin/hoppath.pl branches/pjack_settings/crawler/heritrix/src/main/bin/htmlextractor branches/pjack_settings/crawler/heritrix/src/main/bin/htmlextractor.cmd branches/pjack_settings/crawler/heritrix/src/main/bin/make_reports.pl branches/pjack_settings/crawler/heritrix/src/main/bin/manifest_bundle.pl branches/pjack_settings/crawler/heritrix/src/main/bin/xdocToTxt.xsl Removed Paths: ------------- 
branches/pjack_settings/crawler/heritrix/src/scripts/ Modified: branches/pjack_settings/crawler/heritrix/pom.xml =================================================================== --- branches/pjack_settings/crawler/heritrix/pom.xml 2007-04-13 22:31:12 UTC (rev 5082) +++ branches/pjack_settings/crawler/heritrix/pom.xml 2007-04-17 18:31:32 UTC (rev 5083) @@ -90,146 +90,8 @@ <version>2.0b4</version> <scope>compile</scope> </dependency> -<!-- <dependency> - <groupId>commons-lang</groupId> - <artifactId>commons-lang</artifactId> - <version>2.1</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - <version>1.0.4</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>commons-net</groupId> - <artifactId>commons-net</artifactId> - <version>1.4.1</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> - <version>1.3</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>commons-collections</groupId> - <artifactId>commons-collections</artifactId> - <version>3.1</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>commons-cli</groupId> - <artifactId>commons-cli</artifactId> - <version>1.0</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>net.htmlparser</groupId> - <artifactId>jericho-html</artifactId> - <version>2.3</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>com.sleepycat</groupId> - <artifactId>je</artifactId> - <version>3.2.13</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>org.dnsjava</groupId> - <artifactId>dnsjava</artifactId> - <version>1.6.2</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>tomcat</groupId> - <artifactId>servlet</artifactId> - <version>4.1.34</version> - <scope>compile</scope> - </dependency> 
- <dependency> - <groupId>tomcat</groupId> - <artifactId>jasper-runtime</artifactId> - <version>4.1.30</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>tomcat</groupId> - <artifactId>jasper-compiler</artifactId> - <version>4.1.30</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>poi</groupId> - <artifactId>poi</artifactId> - <version>2.5.1</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>poi</groupId> - <artifactId>poi-scratchpad</artifactId> - <version>2.5.1-final-20040804</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>itext</groupId> - <artifactId>itext</artifactId> - <version>1.3</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>ant</groupId> - <artifactId>ant</artifactId> - <version>1.6.2</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>3.8.2</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>commons-pool</groupId> - <artifactId>commons-pool</artifactId> - <version>1.3</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>fastutil</groupId> - <artifactId>fastutil</artifactId> - <version>5.0.7</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>org.gnu.inet</groupId> - <artifactId>libidn</artifactId> - <version>0.6.5</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>net.java.dev.jets3t</groupId> - <artifactId>jets3t</artifactId> - <version>0.5.0</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>it.unimi.dsi</groupId> - <artifactId>mg4j</artifactId> - <version>1.0.1</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>com.anotherbigidea</groupId> - <artifactId>javaswf</artifactId> - <version>CVS-SNAPSHOT-1</version> - <scope>compile</scope> - </dependency> --> - </dependencies> + <build> 
<plugins> <plugin> @@ -272,7 +134,7 @@ <goals> <goal>generate-html</goal> </goals> - <phase>site</phase> + <phase>prepare-package</phase> </execution> </executions> <dependencies> @@ -300,6 +162,25 @@ </configuration> </plugin> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <configuration> + <descriptors> + <descriptor> + src/main/assembly/dist.xml + </descriptor> + </descriptors> + </configuration> + <executions> + <execution> + <id>job.assembly.package</id> + <phase>package</phase> + <goals> + <goal>attached</goal> + </goals> + </execution> + </executions> + </plugin> </plugins> </build> </project> Added: branches/pjack_settings/crawler/heritrix/src/main/assembly/dist.xml =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/assembly/dist.xml (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/assembly/dist.xml 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,39 @@ +<assembly> + <id>dist</id> + <formats> + <format>tar.gz</format> + <format>zip</format> + </formats> + <includeBaseDirectory>true</includeBaseDirectory> + <dependencySets> + <dependencySet> + <outputDirectory>/lib</outputDirectory> + </dependencySet> + </dependencySets> + <fileSets> + <fileSet> + <directory>.</directory> + <outputDirectory>/</outputDirectory> + <includes> + <include>README.txt</include> + <include>LICENSE.txt</include> + </includes> + </fileSet> + <fileSet> + <directory>target</directory> + <outputDirectory>/</outputDirectory> + <includes> + <include>heritrix-*.jar</include> + </includes> + </fileSet> + <fileSet> + <directory>target/site</directory> + <outputDirectory>/docs</outputDirectory> + </fileSet> + <fileSet> + <directory>src/main/bin</directory> + <outputDirectory>/bin</outputDirectory> + <fileMode>0755</fileMode> + </fileSet> + </fileSets> +</assembly> Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/arcreader (from rev 5080, 
branches/pjack_settings/crawler/heritrix/src/scripts/arcreader) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/arcreader (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/arcreader 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,33 @@ +#!/usr/bin/env sh +## +## This script runs the arcreader main. +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## HERITRIX_HOME Pointer to your heritrix install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set HERITRIX_HOME. +if [ -z "$HERITRIX_HOME" ] +then + HERITRIX_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +FOREGROUND='true' CLASS_MAIN='org.archive.io.arc.ARCReader' JMX_OFF='off' \ + $HERITRIX_HOME/bin/heritrix $@ Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/arcreader.cmd (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/arcreader.cmd) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/arcreader.cmd (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/arcreader.cmd 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,35 @@ +:: This is the Windows version of the extractor shell script +:: Caveats, see heritrix.cmd +:: +:: This script runs the arcreader main. +:: +:: Optional environment variables +:: +:: JAVA_HOME Point at a JDK install to use. +:: +:: HERITRIX_HOME Pointer to your heritrix install. If not present, we +:: make an educated guess based of position relative to this +:: script. +:: +:: JAVA_OPTS Java runtime options. 
+@echo off + +set PRGDIR=%~p0 + +if "%PRGDIR%"=="~p0" ( + cmd /E:ON /F:ON /V:ON /c %0 %1 %2 %3 %4 %5 %6 %7 %8 %9 + goto:eof +) + +:: unset JMX_OFF afterwards if it wasn't set before +:: Won't work if script is aborted with Ctrl+C... +if not defined JMX_OFF set UNSET_JMX_OFF=true +set JMX_OFF=off +set CLASS_MAIN=org.archive.io.arc.ARCReader +call "%PRGDIR%\foreground_heritrix.cmd" %* +set CLASS_MAIN= +if not defined UNSET_JMX_OFF goto:eof +set JMX_OFF= +set UNSET_JMX_OFF= + +:eof \ No newline at end of file Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/cmdline-jmxclient-0.10.5.jar (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/cmdline-jmxclient-0.10.5.jar) =================================================================== (Binary files differ) Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/dependencies.xsl (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/dependencies.xsl) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/dependencies.xsl (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/dependencies.xsl 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,35 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!--Get the dependencies list from project.xml + + $Id$ + --> +<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> + <xsl:output method="text" version="1.0" encoding="UTF-8"/> + <xsl:param name="newline" select="'
'"/> + <xsl:param name="gt" select="'>'"/> + <xsl:param name="lt" select="'&lt;'"/> + <xsl:param name="space" select="' '"/> + <xsl:param name="quot" select="'&quot;'"/> + <xsl:template match="/"> + <xsl:apply-templates select="project/dependencies"/> + </xsl:template> + + + <xsl:template match="dependency"> + <xsl:value-of select="$newline" /> + <xsl:number count="dependency" format="1. " /> + <xsl:apply-templates/> + </xsl:template> + <xsl:template match="id"> + <xsl:apply-templates/> + <xsl:value-of select="$newline" /> + </xsl:template> + <xsl:template match="url|version|description|license"> + <xsl:value-of select="local-name()" /><xsl:text>: </xsl:text> + <xsl:apply-templates/> + <xsl:value-of select="$newline" /> + </xsl:template> + <xsl:template match="text()" > + <xsl:value-of select="normalize-space(.)" /><xsl:text /> + </xsl:template> +</xsl:stylesheet> Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/extractor (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/extractor) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/extractor (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/extractor 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,35 @@ +#!/usr/bin/env sh +## +## This script runs the org.archive.crawler.extractor.ExtractorTool main. +## Pass '--help' to get usage message. +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## HERITRIX_HOME Pointer to your heritrix install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set HERITRIX_HOME. 
+if [ -z "$HERITRIX_HOME" ] +then + HERITRIX_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +FOREGROUND='true', \ +CLASS_MAIN='org.archive.crawler.extractor.ExtractorTool' \ + $HERITRIX_HOME/bin/heritrix $@ Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/extractor.cmd (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/extractor.cmd) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/extractor.cmd (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/extractor.cmd 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,29 @@ +:: This is the Windows version of the extractor shell script +:: Caveats, see heritrix.cmd +:: +:: This script runs the org.archive.crawler.extractor.ExtractorTool main. +:: Pass '--help' to get usage message. +:: +:: Optional environment variables +:: +:: JAVA_HOME Point at a JDK install to use. +:: +:: HERITRIX_HOME Pointer to your heritrix install. If not present, we +:: make an educated guess based of position relative to this +:: script. +:: +:: JAVA_OPTS Java runtime options. 
+@echo off + +set PRGDIR=%~p0 + +if "%PRGDIR%"=="~p0" ( + cmd /E:ON /F:ON /V:ON /c %0 %1 %2 %3 %4 %5 %6 %7 %8 %9 + goto:eof +) + +set CLASS_MAIN=org.archive.crawler.extractor.ExtractorTool +call "%PRGDIR%\foreground_heritrix.cmd" %* +set CLASS_MAIN= + +:eof \ No newline at end of file Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/foreground_heritrix (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/foreground_heritrix) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/foreground_heritrix (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/foreground_heritrix 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,39 @@ +#!/usr/bin/env sh +## +## This script launches the heritrix crawler and keeps the process in foreground +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## HERITRIX_HOME Pointer to your heritrix install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## JAVA_OPTS Java runtime options. +## +## FOREGROUND Set to any value -- e.g. 'true' -- if you want to run +## heritrix in foreground (Used by build system when it runs +## selftest to see if completed successfully or not).. +## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Set HERITRIX_HOME. +if [ -z "$HERITRIX_HOME" ] +then + HERITRIX_HOME=`cd "$PRGDIR/.." 
; pwd` +fi + +FOREGROUND='true' /bin/sh $HERITRIX_HOME/bin/heritrix $@ Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/foreground_heritrix.cmd (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/foreground_heritrix.cmd) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/foreground_heritrix.cmd (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/foreground_heritrix.cmd 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,40 @@ +:: This is the windows version of the foreground_heritrix shell script +:: The only difference to an invokation with "heritrix.cmd" is that no extra +:: (minimized) console window is created... +:: Caveats, see heritrix.cmd +:: +:: This script launches the heritrix crawler and keeps the window in foreground +:: +:: Optional environment variables +:: +:: JAVA_HOME Point at a JDK install to use. +:: +:: HERITRIX_HOME Pointer to your heritrix install. If not present, we +:: make an educated guess based of position relative to this +:: script. +:: +:: JAVA_OPTS Java runtime options. +:: +:: FOREGROUND Set to any value -- e.g. 'true' -- if you want to run +:: heritrix in foreground (Used by build system when it runs +:: selftest to see if completed successfully or not).. +:: +@echo off + +set PRGDIR=%~p0 + +if "%PRGDIR%"=="~p0" ( + cmd /E:ON /F:ON /V:ON /c %0 %1 %2 %3 %4 %5 %6 %7 %8 %9 + goto:eof +) + +:: unset FOREGROUND afterwards if it wasn't set before +:: Won't work if script is aborted with Ctrl+C... 
+if not defined FOREGROUND set UNSET_FOREGROUND=true +set FOREGROUND=true +call "%PRGDIR%\heritrix.cmd" %* +if not defined UNSET_FOREGROUND goto:eof +set FOREGROUND= +set UNSET_FOREGROUND= + +:eof \ No newline at end of file Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/heritrix (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/heritrix) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/heritrix (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/heritrix 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,195 @@ +#!/usr/bin/env sh +## +## This script launches the heritrix crawler. +## +## Optional environment variables +## +## JAVA_HOME Point at a JDK install to use. +## +## HERITRIX_HOME Pointer to your heritrix install. If not present, we +## make an educated guess based of position relative to this +## script. +## +## HERITRIX_OUT Pathname to the Heritrix log file written when run in +## daemon mode. +## Default setting is $HERITRIX_HOME/heritrix_out.log +## +## JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. +## +## FOREGROUND Set to any value -- e.g. 'true' -- if you want to run +## heritrix in foreground (Used by build system when it runs +## selftest to see if completed successfully or not). +## +## JMX_OPTS Default is to startup the JVM JMX administration +## on port 8849 if the JVM is SUN JVM 1.5. This allows JMX +## administration of Heritrix. If the JVM is other than the +## SUN JDK 1.5, the arguments are ignored. If you do not want +## to start the JVM JXM administration server on the SUN JDK +## 1.5, set this variable to empty string. +## +## JMX_PORT Port you'd like the JVM JMX administration server to run +## on. Default is 8849. +## +## JMX_OFF Set to a non-empty string to disable JMX (and JMX setup of +## password file, etc.) 
+## + +# Resolve links - $0 may be a softlink +PRG="$0" +while [ -h "$PRG" ]; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`/"$link" + fi +done +PRGDIR=`dirname "$PRG"` + +# Read local heritrix properties if any. +if [ -f $HOME/.heritrixrc ] +then + . $HOME/.heritrixrc +fi + +# Set HERITRIX_HOME. +if [ -z "$HERITRIX_HOME" ] +then + HERITRIX_HOME=`cd "$PRGDIR/.." ; pwd` +fi + +# Find JAVA_HOME. +if [ -z "$JAVA_HOME" ] +then + JAVA=`which java` + if [ -z "$JAVA" ] + then + echo "Cannot find JAVA. Please set JAVA_HOME or your PATH." + exit 1 + fi + JAVA_BINDIR=`dirname $JAVA` + JAVA_HOME=$JAVA_BINDIR/.. +fi + +if [ -z "$JAVACMD" ] +then + # It may be defined in env - including flags!! + # See '[ 1482761 ] BDB Adler32 gc-lock OOME risk' for why we include the + # 'je.disable.java.adler32'. + JAVACMD="$JAVA_HOME/bin/java -Dje.disable.java.adler32=true" +fi + +# Ignore previous classpath. Build one that contains heritrix jar and content +# of the lib directory into the variable CP. +for jar in `ls $HERITRIX_HOME/lib/*.jar $HERITRIX_HOME/*.jar` +do + CP=${CP}:${jar} +done + +# cygwin path translation +if expr `uname` : 'CYGWIN*' > /dev/null; then + CP=`cygpath -p -w "$CP"` + HERITRIX_HOME=`cygpath -p -w "$HERITRIX_HOME"` +fi + +# Make sure of java opts. +if [ -z "$JAVA_OPTS" ] +then + JAVA_OPTS=" -Xmx256m" +fi + +if [ -z "${JMX_OFF}" ] +then + if [ -z "${JMX_PORT}" ] + then + JMX_PORT=8849 + fi + + if [ -z "$JMX_OPTS" ] + then + JMX_OPTS="-Dcom.sun.management.jmxremote.port=${JMX_PORT} \ + -Dcom.sun.management.jmxremote.ssl=false \ + -Dcom.sun.management.jmxremote.password.file=${HERITRIX_HOME}/jmxremote.password" + + # Copy into place a jmxremote password file that uses the heritrix password + # interpolated (First need to find the current password if one supplied on + # command-line, else use whats in heritrix.properties as default). 
+ # Need to make it so its only readable by user else jconsole won't use it. + JMX_PASSWORD=`echo "$@" |sed -n -e 's/.*--admin=[^:]*:\([^ ]*\).*/\1/p' \ + -e 's/.*-a *[^:]*:\([^ ]*\).*/\1/p'` + if [ -z "$JMX_PASSWORD" ] + then + JMX_PASSWORD=`sed -n -e 's/heritrix.cmdline.admin[ ]*=[^:]*:\(.*\)/\1/p' \ + ${HERITRIX_HOME}/conf/heritrix.properties` + fi + JMX_PWORD_FILE="${HERITRIX_HOME}/jmxremote.password" + if [ -f "${JMX_PWORD_FILE}" ] + then + rm -f "${JMX_PWORD_FILE}" + fi + sed -e "s/@PASSWORD@/${JMX_PASSWORD}/" \ + "${HERITRIX_HOME}/conf/jmxremote.password.template" > "${JMX_PWORD_FILE}" + chmod 600 "${JMX_PWORD_FILE}" + fi +fi + +# Main heritrix class. +if [ -z "$CLASS_MAIN" ] +then + CLASS_MAIN='org.archive.crawler.Heritrix' +fi + +# heritrix_dmesg.log contains startup output from the crawler main class. +# As soon as content appears in this log, this shell script prints the +# successful (or failed) startup content and moves off waiting on heritrix +# startup. This technique is done so we can show on the console startup +# messages emitted by java subsequent to the redirect of stdout and stderr. +startMessage="${HERITRIX_HOME}/heritrix_dmesg.log" + +# Remove any file that may have been left over from previous starts. +if [ -f $startMessage ] +then + rm -f $startmessage +fi +# Run heritrix as daemon. Redirect stdout and stderr to a file. +# Print start message with date, java version, java opts, ulimit, and uname. +if [ -z "$HERITRIX_OUT" ] +then + HERITRIX_OUT=${HERITRIX_HOME}/heritrix_out.log +fi +stdouterrlog=${HERITRIX_OUT} +echo "`date` Starting heritrix" >> $stdouterrlog +uname -a >> $stdouterrlog 2>&1 +${JAVACMD} ${JAVA_OPTS} -version >> $stdouterrlog 2>&1 +echo "JAVA_OPTS=${JAVA_OPTS}" >> $stdouterrlog +ulimit -a >> $stdouterrlog 2>&1 + +# If FOREGROUND is set, run heritrix in foreground. 
+if [ -n "$FOREGROUND" ] +then + CLASSPATH=${CP} $JAVACMD -Dheritrix.home=${HERITRIX_HOME} \ + -Djava.protocol.handler.pkgs=org.archive.net \ + -Dheritrix.out=${HERITRIX_OUT} ${JAVA_OPTS} ${JMX_OPTS} \ + $CLASS_MAIN $@ +else + CLASSPATH=${CP} nohup $JAVACMD -Dheritrix.home=${HERITRIX_HOME} \ + -Djava.protocol.handler.pkgs=org.archive.net \ + -Dheritrix.out=${HERITRIX_OUT} ${JAVA_OPTS} ${JMX_OPTS} \ + $CLASS_MAIN $@ >> ${stdouterrlog} 2>&1 & + + # Wait for content in the heritrix_dmesg.log file. + echo -n "`date` Starting heritrix" + while true + do + sleep 1 + if [ -s $startMessage ] + then + echo + cat $startMessage + rm -f $startMessage + break + fi + echo -n '.' + done +fi Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/heritrix.cmd (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/heritrix.cmd) =================================================================== --- branches/pjack_settings/crawler/heritrix/src/main/bin/heritrix.cmd (rev 0) +++ branches/pjack_settings/crawler/heritrix/src/main/bin/heritrix.cmd 2007-04-17 18:31:32 UTC (rev 5083) @@ -0,0 +1,296 @@ +:: This script launches the heritrix crawler on windows. While Heritrix +:: is unsupported on windows, see 2.1.1.3 in the User Manual +:: [http://crawler.archive.org/articles/user_manual.html], this script was +:: provided by Eric Jensen as a convenience to the windows-afflicted. +:: +:: It is a direct translation of the heritrix linux wrapper script -- and +:: because windows is not supported on Heritrix, it will likely lag the unix +:: start script. 
+:: +:: See also: +:: https://sourceforge.net/tracker/index.php?func=detail&aid=1514538&group_id=73833&atid=539102 +:: +:: Versions: +:: +:: 2006-07-17 Original Version by Eric Jensen +:: +:: 2006-08-04 Disclaimer added by Michael Stack +:: +:: 2006-08-28 A few fixes by Max Schöfmann: +:: - command extensions and veriable expansion are automatically +:: enabled +:: - JMX configuration fixed (not the fancy "sed" stuff however) +:: - Try to set permissions of JMX password file if Heritrix +:: fails to start and JMX is enabled +:: - a few more small improvements (java detection, fake background +:: execution...) +:: - comments changed from rem to :: and file renamed to .cmd +:: (to make clear it won't work on Win 9x...) +:: +:: +:: Optional environment variables +:: +:: JAVA_HOME Point at a JDK install to use. +:: +:: HERITRIX_HOME Pointer to your heritrix install. If not present, we +:: make an educated guess based of position relative to this +:: script. +:: +:: HERITRIX_OUT Pathname to the Heritrix log file written when run in +:: daemon mode. +:: Default setting is %HERITRIX_HOME%\heritrix_out.log +:: +:: JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. +:: +:: FOREGROUND Set to any value -- e.g. 'true' -- if you want to run +:: heritrix in foreground (Used by build system when it runs +:: selftest to see if completed successfully or not). +:: +:: JMX_OPTS Default is to startup the JVM JMX administration +:: on port 8849 if the JVM is SUN JVM 1.5. This allows JMX +:: administration of Heritrix. If the JVM is other than the +:: SUN JDK 1.5, the arguments are ignored. If you do not want +:: to start the JVM JXM administration server on the SUN JDK +:: 1.5, set this variable to empty string. +:: +:: JMX_PORT Port you'd like the JVM JMX administration server to run +:: on. Default is 8849. +:: +:: JMX_OFF Set to a non-empty string to disable JMX (and JMX setup of +:: password file, etc.) 
+:: +@echo off + +set PRG=%0 +set PRGDIR=%~p0 +:: windows doesn't have a sleep command build-in +set SLEEP=ping 127.0.0.1 -n 2 -w 1000 + +if "%1"=="RUN" goto run +if "%1"=="BGR" goto run_in_background +:: preserve original command line arguments +if "%*"=="*" ( + :: windows separates things like --digest=false into "--digest" and "false" if using %1 %2 %3... + :: But as command extensions are enabled by default, this should be no problem for most users + echo NOTICE: Try starting your console with "cmd /E:ON" if you are experiencing + echo problems passing command line arguments to Heritrix + echo. + set HERITRIX_CMDLINE=%1 %2 %3 %4 %5 %6 %7 %8 %9 +) else ( + set HERITRIX_CMDLINE=%* +) +:: Enabling command extensions and delayed variable expansion +cmd /E:ON /F:ON /V:ON /c %PRG% RUN +goto :end + +:run +:: Read local heritrix properties if any. +:: To do this on Windows, tempor. rename .heritrixrc to heritrixrc.cmd +:: This is of course only useful if .heritrixrc contains Windows style "set VAR=value" statements +set RC_PATH=%HOMEPATH% +if "%RC_PATH%"=="\" set RC_PATH=\. +if defined HOMEDRIVE set RC_PATH=%HOMEDRIVE%!RC_PATH! +if exist "!RC_PATH!\.heritrixrc" ( + ren "!RC_PATH!\.heritrixrc" heritrixrc.cmd + call "!RC_PATH!\heritrixrc.cmd" + ren "!RC_PATH!\heritrixrc.cmd" .heritrixrc +) +set RC_PATH= + +:: Set HERITRIX_HOME. +if defined HERITRIX_HOME goto find_java +set HERITRIX_HOME=%PRGDIR:~0,-4% +if "%PRGDIR:~-1%"=="\" set HERITRIX_HOME=%PRGDIR:~0,-5% + +:: Find JAVA_HOME or java if JAVACMD is not defined. 
+:find_java +if defined JAVACMD goto java_found +if defined JAVA_HOME goto set_javacmd + +:: Try to find java if neither JAVACMD nor JAVA_HOME is set: +java -version >nul 2>&1 +:: 9009 means "command not found" +if errorlevel 9009 goto no_java_home +:: something else is wrong with executing java +if errorlevel 1 goto no_java_home + +:: java seems to be in PATH +set JAVACMD=java -Dje.disable.java.adler32=true +:set_javacmd +if not defined JAVACMD set JAVACMD="%JAVA_HOME%\bin\java" -Dje.disable.java.adler32=true +:: It may be defined in env - including flags!! +:: See '[ 1482761 ] BDB Adler32 gc-lock OOME risk' for why we include the +:: 'je.disable.java.adler32'. +:java_found + +:: Ignore previous classpath. Build one that contains heritrix jar and content +:: of the lib directory into the variable CP. +set CP= +set OLD_CLASSPATH=%CLASSPATH% +for %%j in ("%HERITRIX_HOME%\lib\*.jar" "%HERITRIX_HOME%\*.jar") do set CP=!CP!;%%j +set CLASSPATH=!CP! + +:: DONT cygwin path translation +:: if expr `uname` : 'CYGWIN*' > /dev/null; then +:: CP=`cygpath -p -w "%CP"` +:: HERITRIX_HOME=`cygpath -p -w "%HERITRIX_HOME"` +:: fi + +:: Make sure of java opts. +if not defined JAVA_OPTS set JAVA_OPTS= -Xmx256m + +:: Setting environment vars in nested IFs is error prone, thus using GOTOs +if not defined JMX_OFF goto configure_jmx +goto jmx_configured + +:configure_jmx +if not defined JMX_PORT set JMX_PORT=8849 +if not defined JMX_OPTS set JMX_OPTS=-Dcom.sun.management.jmxremote.port=%JMX_PORT% -Dcom.sun.management.jmxremote.ssl=false "-Dcom.sun.management.jmxremote.password.file=%HERITRIX_HOME%\jmxremote.password" + +:: DONT Copy into place a jmxremote password file that uses the heritrix password +:: interpolated (First need to find the current password if one supplied on +:: command-line, else use whats in heritrix.properties as default). +:: Need to make it so its only readable by user else jconsole won't use it. 
+:: JMX_PASSWORD=`echo "%@" |sed -n -e 's/.*--admin=[^:]*:\([^ ]*\).*/\1/p' -e 's/.*-a *[^:]*:\([^ ]*\).*/\1/p'` +:: if [ -z "%JMX_PASSWORD" ] +:: then +:: JMX_PASSWORD=`sed -n -e 's/heritrix.cmdline.admin[ ]*=[^:]*:\(.*\)/\1/p' \ +:: %{HERITRIX_HOME}\conf\heritrix.properties` +:: fi +:: JMX_PWORD_FILE="%{HERITRIX_HOME}\jmxremote.password" +:: if [ -f "%{JMX_PWORD_FILE}" ] +:: then +:: rm -f "%{JMX_PWORD_FILE}" +:: fi +:: sed -e "s/@PASSWORD@/%{JMX_PASSWORD}/" \ +:: "%{HERITRIX_HOME}\conf\jmxremote.password.template" > "%{JMX_PWORD_FILE}" +:: chmod 600 "%{JMX_PWORD_FILE}" + +:jmx_configured + +:: Main heritrix class. +if not defined CLASS_MAIN set CLASS_MAIN=org.archive.crawler.Heritrix + +:: heritrix_dmesg.log contains startup output from the crawler main class. +:: As soon as content appears in this log, this shell script prints the +:: successful (or failed) startup content and moves off waiting on heritrix +:: startup. This technique is done so we can show on the console startup +:: messages emitted by java subsequent to the redirect of stdout and stderr. +set startMessage=%HERITRIX_HOME%\heritrix_dmesg.log + +:: Remove any file that may have been left over from previous starts. +if exist "%startMessage%" del "%startmessage%" +if exist "%HERITRIX_HOME%\jmx_permissions_broken" del "%HERITRIX_HOME%\jmx_permissions_broken" + +:: Run heritrix as daemon. Redirect stdout and stderr to a file. +:: Print start message with date, java version, java opts, ulimit, and uname. +if not defined HERITRIX_OUT set HERITRIX_OUT=%HERITRIX_HOME%\heritrix_out.log + +set stdouterrlog=%HERITRIX_OUT% +echo %DATE% %TIME% Starting heritrix >>"%stdouterrlog%" +:: uname -a >> %stdouterrlog% +%JAVACMD% %JAVA_OPTS% -version >>"%stdouterrlog%" 2>&1 +echo JAVA_OPTS=%JAVA_OPTS% >>"%stdouterrlog%" +:: ulimit -a >> %stdouterrlog 2>&1 + +:: DONT If FOREGROUND is set, run heritrix in foreground. 
+:: if defined FOREGROUND +:start_heritrix +if not defined FOREGROUND goto run_in_background +%JAVACMD% "-Dheritrix.home=%HERITRIX_HOME%" -Djava.protocol.handler.pkgs=org.archive.net "-Dheritrix.out=%HERITRIX_OUT%" %JAVA_OPTS% %JMX_OPTS% %CLASS_MAIN% %HERITRIX_CMDLINE% +:: errorlevel 130 if aborted with Ctrl+c (at least my sun jvm 1.5_07...) +if errorlevel 130 goto :end +if errorlevel 1 goto fix_jmx_permissions +goto :end + +:run_in_background +if not "%1"=="BGR" ( + start /MIN cmd /E:ON /F:ON /V:ON /c %PRG% BGR + goto wait_for_log_file +) else ( + title Heritrix + :: adding ">>%stdouterrlog% 2>&1" causes an access denied error as heritrix writes also to this file + %JAVACMD% "-Dheritrix.home=%HERITRIX_HOME%" -Djava.protocol.handler.pkgs=org.archive.net "-Dheritrix.out=%HERITRIX_OUT%" %JAVA_OPTS% %JMX_OPTS% %CLASS_MAIN% %HERITRIX_CMDLINE% + if errorlevel 130 goto :end + if errorlevel 1 echo.!ERRORLEVEL! >"%HERITRIX_HOME%\jmx_permissions_broken" + pause + ) +goto :end + +:wait_for_log_file +SET HERITRIX_COUNTER= +echo WARNING: It's currently not possible to run Heritrix in background +echo on Windows. It was just started minimized in a new Window +echo and will be shut down as soon as you log off. +echo. +echo %DATE% %TIME% Starting heritrix +:print_logfile +%SLEEP%>nul +if exist "%HERITRIX_HOME%\jmx_permissions_broken" ( + del "%HERITRIX_HOME%\jmx_permissions_broken" + goto fix_jmx_permissions +) +if exist "%startMessage%" ( + %SLEEP%>nul + type "%startMessage%" + :: can happen when heritrix writes to the file at the same time + if errorlevel 1 goto print_logfile + goto delete_logfile +) +:: keep trying for 30 more seconds +if "!HERITRIX_COUNTER!"==".............................." goto start_may_failed +set HERITRIX_COUNTER=.!HERITRIX_COUNTER! +echo . 
+goto print_logfile + +:delete_logfile +set HERITRIX_COUNTER= +%SLEEP%>nul +%SLEEP%>nul +del "%startMessage%" >nul 2>&1 +:: del doesn't set the ERRORLEVEL var if unsuccessful, so we can't try again +goto :end + +:fix_jmx_permissions +if not "%CLASS_MAIN%"=="org.archive.crawler.Heritrix" goto :start_may_failed +if defined PERMISSIONS_FIXED goto fix_jmx_permission_failed +echo. +echo Heritrix failed to start properly. Possible causes: +echo. +echo - another programm uses the port for the web inferface (8080 default) +echo (e.g. another Heritrix instance) +if defined JMX_OFF goto :end +echo - permissions problem with the JMX password file. +echo. +set /P FIXIT=Do you want to try to fix the permissions (Y/N)? +if /I "%FIXIT:~0,1%"=="n" goto :end +cacls "%HERITRIX_HOME%\jmxremote.password" /P %USERNAME%:R +if errorlevel 1 goto fix_jmx_permission_failed +set PERMISSIONS_FIXED=true +set /P RESTART=Restart Heritrix (Y/N)? +if /I "%RESTART:~0,1%"=="y" goto start_heritrix +goto :end + +:fix_jmx_permission_failed +set PERMISSIONS_FIXED= +echo Either fixing the permissions failed or there was another problem +goto :end + +:start_may_failed +set HERITRIX_COUNTER= +echo Starting Heritrix seems to have failed +goto :end + +:no_java_home +echo Please define either JAVA_HOME or JAVACMD or make sure java.exe is in PATH +goto :end + +:: needed if initially called without command extensions +:end +:: do some cleanup +set HERITRIX_CMDLINE= +if defined OLD_CLASSPATH set CLASSPATH=%OLD_CLASSPATH% +set CP= +set SLEEP= +set PRGDIR= +set PRG= Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/hoppath.pl (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/hoppath.pl) =================================================================== (Binary files differ) Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/htmlextractor (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/htmlextractor) 
===================================================================
--- branches/pjack_settings/crawler/heritrix/src/main/bin/htmlextractor (rev 0)
+++ branches/pjack_settings/crawler/heritrix/src/main/bin/htmlextractor 2007-04-17 18:31:32 UTC (rev 5083)
@@ -0,0 +1,41 @@
+#!/usr/bin/env sh
+##
+## This script runs the org.archive.crawler.extractor.ExtractorHTMLTest main.
+##
+## Optional environment variables
+##
+## JAVA_HOME        Point at a JDK install to use.
+##
+## HERITRIX_HOME    Pointer to your heritrix install.  If not present, we
+##                  make an educated guess based on position relative to this
+##                  script.
+##
+## JAVA_OPTS        Java runtime options.
+
+# Resolve symlinks so that PRG names the real location of this script.
+PRG="$0"
+while [ -h "$PRG" ]; do
+  ls=`ls -ld "$PRG"`
+  link=`expr "$ls" : '.*-> \(.*\)$'`
+  # Only an absolute target replaces PRG outright; a relative target (even
+  # one containing a slash) is resolved against the directory of the link.
+  if expr "$link" : '/.*' > /dev/null; then
+    PRG="$link"
+  else
+    PRG=`dirname "$PRG"`/"$link"
+  fi
+done
+PRGDIR=`dirname "$PRG"`
+
+# Set HERITRIX_HOME.
+if [ -z "$HERITRIX_HOME" ]
+then
+    HERITRIX_HOME=`cd "$PRGDIR/.." ; pwd`
+fi
+
+# Run bin/heritrix with FOREGROUND and CLASS_MAIN set in its environment.
+# The launcher path and the forwarded arguments are quoted so that values
+# containing whitespace are passed through intact.
+FOREGROUND='true' \
+CLASS_MAIN='org.archive.crawler.extractor.ExtractorHTMLTest' \
+    "$HERITRIX_HOME/bin/heritrix" "$@"

Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/htmlextractor.cmd (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/htmlextractor.cmd)
===================================================================
--- branches/pjack_settings/crawler/heritrix/src/main/bin/htmlextractor.cmd (rev 0)
+++ branches/pjack_settings/crawler/heritrix/src/main/bin/htmlextractor.cmd 2007-04-17 18:31:32 UTC (rev 5083)
@@ -0,0 +1,29 @@
+:: This is the Windows version of the extractor shell script
+:: Caveats, see heritrix.cmd
+::
+:: This script runs the org.archive.crawler.extractor.ExtractorHTMLTest main.
+::
+:: Optional environment variables
+::
+:: JAVA_HOME Point at a JDK install to use.
+::
+:: HERITRIX_HOME Pointer to your heritrix install. If not present, we
+:: make an educated guess based of position relative to this
+:: script.
+::
+:: JAVA_OPTS Java runtime options.
+@echo off
+
+set PRGDIR=%~p0
+
+if "%PRGDIR%"=="~p0" (
+    cmd /E:ON /F:ON /V:ON /c %0 %1 %2 %3 %4 %5 %6 %7 %8 %9
+    goto:eof
+)
+
+
+set CLASS_MAIN=org.archive.crawler.extractor.ExtractorHTMLTest
+call "%PRGDIR%\foreground_heritrix.cmd" %*
+set CLASS_MAIN=
+
+:eof

Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/make_reports.pl (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/make_reports.pl)
===================================================================
(Binary files differ)

Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/manifest_bundle.pl (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/manifest_bundle.pl)
===================================================================
(Binary files differ)

Copied: branches/pjack_settings/crawler/heritrix/src/main/bin/xdocToTxt.xsl (from rev 5080, branches/pjack_settings/crawler/heritrix/src/scripts/xdocToTxt.xsl)
===================================================================
--- branches/pjack_settings/crawler/heritrix/src/main/bin/xdocToTxt.xsl (rev 0)
+++ branches/pjack_settings/crawler/heritrix/src/main/bin/xdocToTxt.xsl 2007-04-17 18:31:32 UTC (rev 5083)
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--Transform xdoc files to text.
+
+    Characters that are special in XML, and the line feed, are written below
+    as entity or character references: a raw '<' or '"' inside these
+    attribute values is not well-formed, and a raw line break in an attribute
+    value is normalized to a space by the parser.
+
+ $Id$
+ -->
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+    <xsl:output method="text" version="1.0" encoding="UTF-8"/>
+    <!-- Literal strings emitted around the transformed content. -->
+    <xsl:param name="newline" select="'&#xA;'"/>
+    <xsl:param name="gt" select="'&gt;'"/>
+    <xsl:param name="lt" select="'&lt;'"/>
+    <xsl:param name="space" select="' '"/>
+    <xsl:param name="quot" select="'&quot;'"/>
+    <xsl:template match="/">
+        <xsl:apply-templates/>
+    </xsl:template>
+    <!-- section: numbered heading built from @name, then the content. -->
+    <xsl:template match="section">
+        <xsl:value-of select="$newline" /><xsl:text />
+        <xsl:number count="section" level="single" format="1.0. "/>
+        <xsl:value-of select="@name"/><xsl:text />
+        <xsl:value-of select="$newline" /><xsl:text />
+        <xsl:apply-templates/>
+        <xsl:value-of select="$newline" /><xsl:text />
+    </xsl:template>
+    <!-- subsection: like section, but numbered across both levels. -->
+    <xsl:template match="subsection">
+        <xsl:value-of select="$newline" /><xsl:text />
+        <xsl:number count="section|subsection" level="multiple" format="1.1. "/>
+        <xsl:value-of select="@name"/><xsl:text />
+        <xsl:value-of select="$newline" /><xsl:text />
+        <xsl:apply-templates/>
+        <xsl:value-of select="$newline" /><xsl:text />
+    </xsl:template>
+    <!-- release: @version and @date on one line, then the content. -->
+    <xsl:template match="release">
+        <xsl:value-of select="$newline" /><xsl:text />
+        <xsl:value-of select="@version"/><xsl:text />
+        <xsl:value-of select="$space" /><xsl:text />
+        <xsl:value-of select="@date"/><xsl:text />
+        <xsl:value-of select="$newline" /><xsl:text />
+        <xsl:apply-templates/>
+        <xsl:value-of select="$newline" /><xsl:text />
+    </xsl:template>
+    <!-- action: quoted content followed by @type and @dev. -->
+    <xsl:template match="action">
+        <xsl:value-of select="$newline" /><xsl:text />
+        <xsl:value-of select="$quot" /><xsl:text />
+        <xsl:apply-templates/>
+        <xsl:value-of select="$quot" /><xsl:text />
+        <xsl:value-of select="$space" /><xsl:text />
+        <xsl:value-of select="@type"/><xsl:text />
+        <xsl:value-of select="$space" /><xsl:text />
+        <xsl:value-of select="@dev"/><xsl:text />
+        <xsl:value-of select="$space" /><xsl:text />
+        <xsl:value-of select="$newline" /><xsl:text />
+    </xsl:template>
+    <!-- a: normalized link text, then the @href in angle brackets. -->
+    <xsl:template match="a">
+        <xsl:value-of select="normalize-space(.)"/><xsl:text />
+        <xsl:value-of select="$space" /><xsl:text />
+        <xsl:value-of select="$lt" /><xsl:text />
+        <xsl:value-of select="@href"/><xsl:text />
+        <xsl:value-of select="$gt" /><xsl:text />
+    </xsl:template>
+    <!-- p: content followed by a single space. -->
+    <xsl:template match="p">
+        <xsl:apply-templates />
+        <xsl:value-of select="$space" /><xsl:text />
+    </xsl:template>
+    <!-- img: the @src in angle brackets, then any content. -->
+    <xsl:template match="img"> &lt;<xsl:value-of select="@src"/>&gt;
+        <xsl:apply-templates/>
+    </xsl:template>
+    <!-- Text nodes are emitted with their whitespace normalized. -->
+    <xsl:template match="text()" >
+        <xsl:value-of select="normalize-space(.)" /><xsl:text />
+    </xsl:template>
+</xsl:stylesheet>

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |