From: <sta...@us...> - 2007-01-22 21:20:26
|
Revision: 1450 http://archive-access.svn.sourceforge.net/archive-access/?rev=1450&view=rev Author: stack-sf Date: 2007-01-22 13:15:40 -0800 (Mon, 22 Jan 2007) Log Message: ----------- M projects/nutch/project.xml _M projects/nutch/src/images/nutchwax.jpg _M projects/nutch/src/images/nwa.jpg _M projects/nutch/src/images/iipc.gif D projects/nutch/bin/indexArcsLogReporter.py M projects/wayback/project.xml M projects/waxtoolbar/xdocs/downloads.xml M projects/waxtoolbar/project.xml M projects/nutch-trec/xdocs/index.xml M projects/nutch-trec/project.xml M projects/nutch-trec/README.txt M projects/wera/xdocs/downloads.xml M projects/wera/project.xml Purge more of the cvs references. Modified Paths: -------------- trunk/archive-access/projects/nutch/project.xml trunk/archive-access/projects/nutch-trec/README.txt trunk/archive-access/projects/nutch-trec/project.xml trunk/archive-access/projects/nutch-trec/xdocs/index.xml trunk/archive-access/projects/waxtoolbar/project.xml trunk/archive-access/projects/waxtoolbar/xdocs/downloads.xml trunk/archive-access/projects/wayback/project.xml trunk/archive-access/projects/wera/project.xml trunk/archive-access/projects/wera/xdocs/downloads.xml Removed Paths: ------------- trunk/archive-access/projects/nutch/bin/indexArcsLogReporter.py Property Changed: ---------------- trunk/archive-access/projects/nutch/src/images/iipc.gif trunk/archive-access/projects/nutch/src/images/nutchwax.jpg trunk/archive-access/projects/nutch/src/images/nwa.jpg Deleted: trunk/archive-access/projects/nutch/bin/indexArcsLogReporter.py =================================================================== --- trunk/archive-access/projects/nutch/bin/indexArcsLogReporter.py 2007-01-22 21:02:35 UTC (rev 1449) +++ trunk/archive-access/projects/nutch/bin/indexArcsLogReporter.py 2007-01-22 21:15:40 UTC (rev 1450) @@ -1,181 +0,0 @@ -#!/usr/bin/env python -# -# Feed indexArcs output to this script and it will summarize the content. 
-# -# $Id$ -# -'''Usage: %s [--help] [--debug] FILE1 FILE2... -Version: %s -Options: - -h, --help Print this usage message. - -d, --debug Enable debugging. - -c, --csv Output reports as cvs. -Run this script against output of indexArcs script. -''' -__author__ = "Michael Stack <stack at archive dot org>" -__date__ = "Wed Jan 25 11:34:16 PST 2006" -__version__ = "0.1.0" - - -import sys -import string -import time -import logging -import util - - -# Setup logging. -logging.basicConfig() -logger = logging.getLogger(sys.argv[0]) -logger.setLevel(logging.INFO) - - -class IndexArcsParser: - def __init__(self, fd): - self.fd = fd - - def readline(self): - line = self.fd.readline() - logger.debug(line) - return line - - def findStartTime(self, str2Find): - startTime = None - while 1: - line = self.readline() - if not line: - raise IOError, "Failed find of start ('%s') line" % str2Find - if line.find(str2Find) > 0: - startTime = makeTime(line.split()[0], line.split()[1]) - logger.debug('startTime %d' % int(startTime)) - break - return startTime - - def findEndTime(self, str2Find): - startReduceTime = None - while 1: - line = self.fd.readline() - if not line: - raise IOError, "Failed find of end ('%s') line" % str2Find - if not startReduceTime: - if line.find(' reduce ') > 0: - startReduceTime = makeTime(line.split()[0], line.split()[1]) - logger.debug('startReduceTime %d' % int(startReduceTime)) - continue - if line.find(str2Find) > 0: - logger.debug("Found in : " + line) - endTime = makeTime(line.split()[0], line.split()[1]) - logger.debug('endTime %d' % int(endTime)) - break - return startReduceTime, endTime - -def processLog(fd): - '''Expect particular format. 
Fail if its otherwise.''' - parser = IndexArcsParser(fd) - startTime, x = processImportArcs(parser) - updateCrawlDbEndTime = processUpdateCrawlDb(parser) - processInvertLinks(parser) - processIndexer(parser) - processDedup(parser) - x, endTime = processMerge(parser) - if globals().has_key("csv"): - logger.info("Total, %d" % int(endTime - startTime)) - else: - logger.info("Total process took %d" % int(endTime - startTime)) - -def makeTime(d, t): - return time.mktime(time.strptime("%s%s" % (d, t), "%y%m%d%H%M%S")) - -def processImportArcs(parser): - startTime = parser.findStartTime('importing arcs') - firstReduceTime, endTime = parser.findEndTime(" ImportArcs: done") - writeReport("ImportArcs", startTime, endTime, firstReduceTime) - return startTime, endTime - -def processUpdateCrawlDb(parser): - startTime = parser.findStartTime('updating crawldb') - firstReduceTime, endTime = parser.findEndTime("CrawlDb update: done") - writeReport("UpdateCrawlDb", startTime, endTime, firstReduceTime) - return startTime, endTime - -def processInvertLinks(parser): - startTime = parser.findStartTime(' inverting links ') - firstReduceTime, endTime = parser.findEndTime("LinkDb: done") - writeReport("InvertLinks", startTime, endTime, firstReduceTime) - return startTime, endTime - -def processIndexer(parser): - startTime = parser.findStartTime(' Indexer: linkdb: ') - firstReduceTime, endTime = parser.findEndTime("Indexer: done") - writeReport("Indexer", startTime, endTime, 0) - return startTime, endTime - -def processDedup(parser): - startTime = parser.findStartTime(' Dedup: starting') - firstReduceTime, endTime = parser.findEndTime("Dedup: done") - writeReport("Dedup", startTime, endTime, firstReduceTime) - return startTime, endTime - -def processMerge(parser): - startTime = parser.findStartTime(' index merge ') - firstReduceTime, endTime = parser.findEndTime(" Nutchwax finished") - writeReport("Merge", startTime, endTime, firstReduceTime) - return startTime, endTime - -def 
writeReport(task, startTime, endTime, firstReduceTime): - '''Write out report on import arcs mapreduce task.''' - if firstReduceTime: - mapTime = int(firstReduceTime - startTime) - reduceTime = int(endTime - firstReduceTime) - else: - mapTime = 0 - reduceTime = 0 - totalTime = int(endTime - startTime) - if globals().has_key("csv"): - logger.info("%s, %d, %d, %d" % - (task, totalTime, mapTime, reduceTime)) - else: - logger.info("%s took %d seconds, map %d %d%%, reduce %d %d%%" % - (task, totalTime, mapTime, util.formatPercent(mapTime, totalTime), - reduceTime, util.formatPercent(reduceTime, totalTime))) - -def usage(exitCode = 0, msg = None): - '''Print usage.''' - if msg: - print msg - print __doc__ % (sys.argv[0], __version__) - sys.exit(exitCode) - -def main(args): - '''Main entry point.''' - if len(args) == 1: - processLog(sys.stdin) - else: - # Do opt processing. - import getopt - try: - opts, args = getopt.getopt(sys.argv[1:], "hdc", - ["help", "debug", "csv"]) - except getopt.GetoptError, e: - usage(1, e) - for key, value in opts: - if key in ('-d', '--debug'): - logger.setLevel(logging.DEBUG) - continue - if key in ('-h', '--help'): - usage(0) - continue - if key in ('-c', '--csv'): - globals()["csv"] = 1 - continue - else: - raise ValueError, "Unexpected option %s", key - for name in args: - fd = open(name) - try: - processLog(fd) - finally: - fd.close() - -if __name__ == "__main__": - main(sys.argv) Modified: trunk/archive-access/projects/nutch/project.xml =================================================================== --- trunk/archive-access/projects/nutch/project.xml 2007-01-22 21:02:35 UTC (rev 1449) +++ trunk/archive-access/projects/nutch/project.xml 2007-01-22 21:15:40 UTC (rev 1450) @@ -53,8 +53,8 @@ the connection element has the form: scm:<system>:<system specific connection string> --> <repository> - <connection>scm:svn:https://archive-access.svn.sourceforge.net/svnroot/archive-access/</connection> - 
<url>https://archive-access.svn.sourceforge.net/svnroot/archive-access/</url> + <connection>scm:svn:https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/nutch</connection> + <url>https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/nutch</url> </repository> <!-- any mailing lists for the project --> @@ -72,7 +72,7 @@ </archive> </mailingList> <mailingList> - <name>SVN Commits</name> + <name>Commits</name> <subscribe> http://lists.sourceforge.net/lists/listinfo/archive-access-cvs </subscribe> Property changes on: trunk/archive-access/projects/nutch/src/images/iipc.gif ___________________________________________________________________ Name: svn:keywords - Author Date Id Revision Name: svn:eol-style - native Property changes on: trunk/archive-access/projects/nutch/src/images/nutchwax.jpg ___________________________________________________________________ Name: svn:keywords - Author Date Id Revision Name: svn:eol-style - native Property changes on: trunk/archive-access/projects/nutch/src/images/nwa.jpg ___________________________________________________________________ Name: svn:mime-type - application/octet-stream Modified: trunk/archive-access/projects/nutch-trec/README.txt =================================================================== --- trunk/archive-access/projects/nutch-trec/README.txt 2007-01-22 21:02:35 UTC (rev 1449) +++ trunk/archive-access/projects/nutch-trec/README.txt 2007-01-22 21:15:40 UTC (rev 1450) @@ -9,7 +9,7 @@ assumes the nutch sources are in nutch/. 
So if you have a checkout of the nutch subversion repository: - $ cd ${ARCHIVE_ACCESS_CVS}/projects/nutch-trec + $ cd ${ARCHIVE_ACCESS}/projects/nutch-trec $ ln -s ${NUTCH_SVN}/trunk nutch $ ant @@ -17,7 +17,7 @@ source .java files from the .jj javacc file you need a copy of JavaCC in JavaCC/, eg: - $ cd ${ARCHIVE_ACCESS_CVS}/projects/nutch-trec + $ cd ${ARCHIVE_ACCESS}/projects/nutch-trec $ ln -s ${JAVACC_HOME} JavaCC $ ant javacc Modified: trunk/archive-access/projects/nutch-trec/project.xml =================================================================== --- trunk/archive-access/projects/nutch-trec/project.xml 2007-01-22 21:02:35 UTC (rev 1449) +++ trunk/archive-access/projects/nutch-trec/project.xml 2007-01-22 21:15:40 UTC (rev 1450) @@ -63,11 +63,11 @@ <!-- the version control repository and http url for online access the connection element has the form: scm:<system>:<system specific connection string> --> + <repository> - <connection>scm:cvs:pserver:ano...@ar...:/cvsroot/archive-access:archive-access/projects/nutch-trec</connection> - <url>http://archive-access.cvs.sourceforge.net/archive-access/archive-access/projects/nutch-trec/</url> + <connection>scm:svn:https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/nutch-trec</connection> + <url>https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/nutch-trec</url> </repository> - <!-- any mailing lists for the project --> <mailingLists> <mailingList> @@ -83,7 +83,7 @@ </archive> </mailingList> <mailingList> - <name>CVS Commits</name> + <name>Commits</name> <subscribe> http://lists.sourceforge.net/lists/listinfo/archive-access-cvs </subscribe> Modified: trunk/archive-access/projects/nutch-trec/xdocs/index.xml =================================================================== --- trunk/archive-access/projects/nutch-trec/xdocs/index.xml 2007-01-22 21:02:35 UTC (rev 1449) +++ 
trunk/archive-access/projects/nutch-trec/xdocs/index.xml 2007-01-22 21:15:40 UTC (rev 1450) @@ -17,7 +17,7 @@ assumes the nutch sources are in the subdirectory <code>nutch/</code>. So if you have a checkout of the nutch subversion repository: <pre> - $ cd ${ARCHIVE_ACCESS_CVS}/projects/nutch-trec + $ cd ${ARCHIVE_ACCESS}/projects/nutch-trec $ ln -s ${NUTCH_SVN}/trunk nutch $ ant</pre> </p> @@ -27,7 +27,7 @@ If you wish to rebuild the JavaCC generated sources from the .jj javacc file you need a copy of JavaCC in JavaCC/, eg: <pre> - $ cd ${ARCHIVE_ACCESS_CVS}/projects/nutch-trec + $ cd ${ARCHIVE_ACCESS}/projects/nutch-trec $ ln -s ${JAVACC_HOME} JavaCC $ ant javacc</pre> </p> Modified: trunk/archive-access/projects/waxtoolbar/project.xml =================================================================== --- trunk/archive-access/projects/waxtoolbar/project.xml 2007-01-22 21:02:35 UTC (rev 1449) +++ trunk/archive-access/projects/waxtoolbar/project.xml 2007-01-22 21:15:40 UTC (rev 1450) @@ -45,9 +45,10 @@ <distributionSite>http://shell.sourceforge.net</distributionSite> <distributionDirectory>/home/users/s/st/${maven.username} </distributionDirectory> + <repository> - <connection>scm:cvs:pserver:ano...@cv...:/cvsroot/archive-access:archive-access</connection> - <url>http://cvs.sourceforge.net/viewcvs.py/archive-access/archive-access/projects/waxtoolbar</url> + <connection>scm:svn:https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/waxtoolbar</connection> + <url>https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/waxtoolbar</url> </repository> <mailingLists> <mailingList> @@ -63,7 +64,7 @@ </archive> </mailingList> <mailingList> - <name>CVS Commits</name> + <name>Commits</name> <subscribe> http://lists.sourceforge.net/lists/listinfo/archive-access-cvs </subscribe> Modified: trunk/archive-access/projects/waxtoolbar/xdocs/downloads.xml 
=================================================================== --- trunk/archive-access/projects/waxtoolbar/xdocs/downloads.xml 2007-01-22 21:02:35 UTC (rev 1449) +++ trunk/archive-access/projects/waxtoolbar/xdocs/downloads.xml 2007-01-22 21:15:40 UTC (rev 1450) @@ -19,7 +19,7 @@ <p>Here is a <a href="http://builds.archive.org:8080/cruisecontrol/buildresults/HEAD-archive-access">pointer</a> to our continuous build box. The latest builds can be found under the 'Build Artifacts' link. Be aware that - this distribution has been made from CVS HEAD and CVS HEAD builds are + this distribution has been made from HEAD and HEAD builds are not guaranteed stable. </p> </subsection> Modified: trunk/archive-access/projects/wayback/project.xml =================================================================== --- trunk/archive-access/projects/wayback/project.xml 2007-01-22 21:02:35 UTC (rev 1449) +++ trunk/archive-access/projects/wayback/project.xml 2007-01-22 21:15:40 UTC (rev 1450) @@ -79,8 +79,8 @@ scm:<system>:<system specific connection string> --> <repository> - <connection>scm:cvs:pserver:ano...@ar...:/cvsroot/archive-access:archive-access</connection> - <url>http://archive-access.cvs.sourceforge.net/archive-access/archive-access/projects/wayback/</url> + <connection>scm:svn:https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback</connection> + <url>https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wayback/</url> </repository> <versions /> @@ -100,7 +100,7 @@ </archive> </mailingList> <mailingList> - <name>CVS Commits</name> + <name>SVN Commits</name> <subscribe> http://lists.sourceforge.net/lists/listinfo/archive-access-cvs </subscribe> Modified: trunk/archive-access/projects/wera/project.xml =================================================================== --- trunk/archive-access/projects/wera/project.xml 2007-01-22 21:02:35 UTC (rev 1449) +++ 
trunk/archive-access/projects/wera/project.xml 2007-01-22 21:15:40 UTC (rev 1450) @@ -65,8 +65,8 @@ the connection element has the form: scm:<system>:<system specific connection string> --> <repository> - <connection>scm:cvs:pserver:ano...@cv...:/cvsroot/archive-access:archive-access</connection> - <url>http://cvs.sourceforge.net/viewcvs.py/archive-access/archive-access/projects/wera</url> + <connection>scm:svn:https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wera</connection> + <url>https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/wera</url> </repository> <!-- any mailing lists for the project --> @@ -84,7 +84,7 @@ </archive> </mailingList> <mailingList> - <name>CVS Commits</name> + <name>Commits</name> <subscribe> http://lists.sourceforge.net/lists/listinfo/archive-access-cvs </subscribe> Modified: trunk/archive-access/projects/wera/xdocs/downloads.xml =================================================================== --- trunk/archive-access/projects/wera/xdocs/downloads.xml 2007-01-22 21:02:35 UTC (rev 1449) +++ trunk/archive-access/projects/wera/xdocs/downloads.xml 2007-01-22 21:15:40 UTC (rev 1450) @@ -18,7 +18,7 @@ <p>Here is a <a href="http://builds.archive.org:8080/cruisecontrol/buildresults/HEAD-archive-access">pointer</a> to our continuous build box. The latest builds can be found under the 'Build Artifacts' link. Be aware that - this distribution has been made from CVS HEAD and CVS HEAD builds are + this distribution has been made from HEAD and HEAD builds are not guaranteed stable. </p> </subsection> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-01-22 21:40:15
|
Revision: 1452 http://archive-access.svn.sourceforge.net/archive-access/?rev=1452&view=rev Author: stack-sf Date: 2007-01-22 13:40:12 -0800 (Mon, 22 Jan 2007) Log Message: ----------- A projects/wax A projects/wax/xdocs A projects/wax/.classpath A projects/wax/LICENSE.txt A projects/wax/project.properties A projects/wax/.project A projects/wax/conf A projects/wax/maven.xml A projects/wax/project.xml A projects/wax/lib A projects/wax/src A projects/wax/bin D projects/wax/bin/indexArcsLogReporter.py A projects/wax/README.txt A projects/wax/.cvsignore A projects/wax/build.xml D projects/nutch Rename of nutch subproject as wax. Added Paths: ----------- trunk/archive-access/projects/wax/ trunk/archive-access/projects/wax/.classpath trunk/archive-access/projects/wax/.cvsignore trunk/archive-access/projects/wax/.project trunk/archive-access/projects/wax/LICENSE.txt trunk/archive-access/projects/wax/README.txt trunk/archive-access/projects/wax/bin/ trunk/archive-access/projects/wax/build.xml trunk/archive-access/projects/wax/conf/ trunk/archive-access/projects/wax/lib/ trunk/archive-access/projects/wax/maven.xml trunk/archive-access/projects/wax/project.properties trunk/archive-access/projects/wax/project.xml trunk/archive-access/projects/wax/src/ trunk/archive-access/projects/wax/xdocs/ Removed Paths: ------------- trunk/archive-access/projects/nutch/ trunk/archive-access/projects/wax/.classpath trunk/archive-access/projects/wax/.cvsignore trunk/archive-access/projects/wax/.project trunk/archive-access/projects/wax/LICENSE.txt trunk/archive-access/projects/wax/README.txt trunk/archive-access/projects/wax/bin/ trunk/archive-access/projects/wax/build.xml trunk/archive-access/projects/wax/conf/ trunk/archive-access/projects/wax/lib/ trunk/archive-access/projects/wax/maven.xml trunk/archive-access/projects/wax/project.properties trunk/archive-access/projects/wax/project.xml trunk/archive-access/projects/wax/src/ trunk/archive-access/projects/wax/xdocs/ Copied: 
trunk/archive-access/projects/wax (from rev 1448, trunk/archive-access/projects/nutch) Deleted: trunk/archive-access/projects/wax/.classpath =================================================================== --- trunk/archive-access/projects/nutch/.classpath 2007-01-22 20:07:06 UTC (rev 1448) +++ trunk/archive-access/projects/wax/.classpath 2007-01-22 21:40:12 UTC (rev 1452) @@ -1,20 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<classpath> - <classpathentry kind="src" path="src/java"/> - <classpathentry kind="var" path="JRE_LIB" sourcepath="JRE_SRC"/> - <classpathentry kind="lib" path="lib/commons-codec-1.3.jar"/> - <classpathentry kind="lib" path="lib/commons-httpclient-3.0-rc3.jar"/> - <classpathentry kind="lib" path="lib/dsi.unimi.it-1.2.0.jar"/> - <classpathentry kind="lib" path="lib/s3-20061030.jar"/> - <classpathentry kind="lib" path="conf"/> - <classpathentry kind="lib" path="build"/> - <classpathentry combineaccessrules="false" kind="src" path="/heritrix"/> - <classpathentry combineaccessrules="false" kind="src" path="/nutch"/> - <classpathentry combineaccessrules="false" kind="src" path="/Hadoop"/> - <classpathentry kind="lib" path="/nutch/lib/servlet-api.jar"/> - <classpathentry kind="lib" path="/nutch/lib/commons-logging-1.0.4.jar"/> - <classpathentry kind="lib" path="/nutch/lib/junit-3.8.1.jar"/> - <classpathentry kind="lib" path="/nutch/conf"/> - <classpathentry kind="lib" path="nutch/build"/> - <classpathentry kind="output" path="target"/> -</classpath> Copied: trunk/archive-access/projects/wax/.classpath (from rev 1451, trunk/archive-access/projects/nutch/.classpath) =================================================================== --- trunk/archive-access/projects/wax/.classpath (rev 0) +++ trunk/archive-access/projects/wax/.classpath 2007-01-22 21:40:12 UTC (rev 1452) @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<classpath> + <classpathentry kind="src" path="src/java"/> + <classpathentry kind="var" path="JRE_LIB" 
sourcepath="JRE_SRC"/> + <classpathentry kind="lib" path="lib/commons-codec-1.3.jar"/> + <classpathentry kind="lib" path="lib/commons-httpclient-3.0-rc3.jar"/> + <classpathentry kind="lib" path="lib/dsi.unimi.it-1.2.0.jar"/> + <classpathentry kind="lib" path="lib/s3-20061030.jar"/> + <classpathentry kind="lib" path="conf"/> + <classpathentry kind="lib" path="build"/> + <classpathentry combineaccessrules="false" kind="src" path="/heritrix"/> + <classpathentry combineaccessrules="false" kind="src" path="/nutch"/> + <classpathentry combineaccessrules="false" kind="src" path="/Hadoop"/> + <classpathentry kind="lib" path="/nutch/lib/servlet-api.jar"/> + <classpathentry kind="lib" path="/nutch/lib/commons-logging-1.0.4.jar"/> + <classpathentry kind="lib" path="/nutch/lib/junit-3.8.1.jar"/> + <classpathentry kind="lib" path="/nutch/conf"/> + <classpathentry kind="lib" path="nutch/build"/> + <classpathentry kind="output" path="target"/> +</classpath> Deleted: trunk/archive-access/projects/wax/.cvsignore =================================================================== --- trunk/archive-access/projects/nutch/.cvsignore 2007-01-22 20:07:06 UTC (rev 1448) +++ trunk/archive-access/projects/wax/.cvsignore 2007-01-22 21:40:12 UTC (rev 1452) @@ -1,3 +0,0 @@ -segments -nutch -build Copied: trunk/archive-access/projects/wax/.cvsignore (from rev 1451, trunk/archive-access/projects/nutch/.cvsignore) =================================================================== --- trunk/archive-access/projects/wax/.cvsignore (rev 0) +++ trunk/archive-access/projects/wax/.cvsignore 2007-01-22 21:40:12 UTC (rev 1452) @@ -0,0 +1,3 @@ +segments +nutch +build Deleted: trunk/archive-access/projects/wax/.project =================================================================== --- trunk/archive-access/projects/nutch/.project 2007-01-22 20:07:06 UTC (rev 1448) +++ trunk/archive-access/projects/wax/.project 2007-01-22 21:40:12 UTC (rev 1452) @@ -1,24 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> 
-<projectDescription> - <name>nutchwax</name> - <comment></comment> - <projects> - </projects> - <buildSpec> - <buildCommand> - <name>org.eclipse.jdt.core.javabuilder</name> - <arguments> - </arguments> - </buildCommand> - </buildSpec> - <natures> - <nature>org.eclipse.jdt.core.javanature</nature> - </natures> - <linkedResources> - <link> - <name>java</name> - <type>2</type> - <locationURI>src/java</locationURI> - </link> - </linkedResources> -</projectDescription> Copied: trunk/archive-access/projects/wax/.project (from rev 1451, trunk/archive-access/projects/nutch/.project) =================================================================== --- trunk/archive-access/projects/wax/.project (rev 0) +++ trunk/archive-access/projects/wax/.project 2007-01-22 21:40:12 UTC (rev 1452) @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>nutchwax</name> + <comment></comment> + <projects> + </projects> + <buildSpec> + <buildCommand> + <name>org.eclipse.jdt.core.javabuilder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.eclipse.jdt.core.javanature</nature> + </natures> + <linkedResources> + <link> + <name>java</name> + <type>2</type> + <locationURI>src/java</locationURI> + </link> + </linkedResources> +</projectDescription> Deleted: trunk/archive-access/projects/wax/LICENSE.txt =================================================================== --- trunk/archive-access/projects/nutch/LICENSE.txt 2007-01-22 20:07:06 UTC (rev 1448) +++ trunk/archive-access/projects/wax/LICENSE.txt 2007-01-22 21:40:12 UTC (rev 1452) @@ -1,510 +0,0 @@ -The nutchwax distribution is in the main nutch. Nutch is licensed -under the Apache 2.0 License available here: -http://www.apache.org/licenses/LICENSE-2.0.txt - -The nutchwax particular extensions to nutch are free software; you can -redistribute them and/or modify them under the terms of the GNU Lesser -Public license (LGPL) reproduced below. 
- - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. 
- - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. 
- - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. 
- - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. 
The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. 
- - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. 
- - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. 
-Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. 
Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. 
However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. 
However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. 
- -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. 
If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. 
- - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - <one line to give the library's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. 
- - <signature of Ty Coon>, 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! Copied: trunk/archive-access/projects/wax/LICENSE.txt (from rev 1451, trunk/archive-access/projects/nutch/LICENSE.txt) =================================================================== --- trunk/archive-access/projects/wax/LICENSE.txt (rev 0) +++ trunk/archive-access/projects/wax/LICENSE.txt 2007-01-22 21:40:12 UTC (rev 1452) @@ -0,0 +1,510 @@ +The nutchwax distribution is in the main nutch. Nutch is licensed +under the Apache 2.0 License available here: +http://www.apache.org/licenses/LICENSE-2.0.txt + +The nutchwax particular extensions to nutch are free software; you can +redistribute them and/or modify them under the terms of the GNU Lesser +General Public License (LGPL) reproduced below. + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below.
+ + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. 
We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. 
In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. 
A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. 
You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + EN... [truncated message content] |
From: <sta...@us...> - 2007-01-23 00:29:15
|
Revision: 1455 http://archive-access.svn.sourceforge.net/archive-access/?rev=1455&view=rev Author: stack-sf Date: 2007-01-22 16:29:15 -0800 (Mon, 22 Jan 2007) Log Message: ----------- M projects/wax/project.properties M projects/wayback/project.properties M projects/waxtoolbar/project.properties M projects/infiniteurl/project.properties M projects/nutch-trec/project.properties M projects/wera/project.properties Point at maven 1.0.x repo. Modified Paths: -------------- trunk/archive-access/projects/infiniteurl/project.properties trunk/archive-access/projects/nutch-trec/project.properties trunk/archive-access/projects/wax/project.properties trunk/archive-access/projects/waxtoolbar/project.properties trunk/archive-access/projects/wayback/project.properties trunk/archive-access/projects/wera/project.properties Modified: trunk/archive-access/projects/infiniteurl/project.properties =================================================================== --- trunk/archive-access/projects/infiniteurl/project.properties 2007-01-23 00:21:51 UTC (rev 1454) +++ trunk/archive-access/projects/infiniteurl/project.properties 2007-01-23 00:29:15 UTC (rev 1455) @@ -4,3 +4,4 @@ # Mapping of jars. 
maven.jar.override = on maven.jar.servlet = ${basedir}/lib/servlet-tomcat-4.1.30.jar +maven.repo.remote=http://repo1.maven.org/maven Modified: trunk/archive-access/projects/nutch-trec/project.properties =================================================================== --- trunk/archive-access/projects/nutch-trec/project.properties 2007-01-23 00:21:51 UTC (rev 1454) +++ trunk/archive-access/projects/nutch-trec/project.properties 2007-01-23 00:29:15 UTC (rev 1455) @@ -8,3 +8,4 @@ maven.compile.fork = true maven.license.licenseFile = LICENSE.txt +maven.repo.remote=http://repo1.maven.org/maven Modified: trunk/archive-access/projects/wax/project.properties =================================================================== --- trunk/archive-access/projects/wax/project.properties 2007-01-23 00:21:51 UTC (rev 1454) +++ trunk/archive-access/projects/wax/project.properties 2007-01-23 00:29:15 UTC (rev 1455) @@ -34,3 +34,6 @@ maven.sdocbook.resources.include = **/*.gif,**/*.png,**/*.css maven.sdocbook.html.params = -PARAM generate.id.attributes 1 -PARAM section.autolabel 1 -PARAM part.autolabel 1 -PARAM chapter.autolabel 1 -PARAM generate.meta.abstract 1 -PARAM html.stylesheet docbook.css -PARAM css.decoration 1 maven.sdocbook.fo.params = -PARAM generate.id.attributes 1 -PARAM section.autolabel 1 -PARAM part.autolabel 1 -PARAM chapter.autolabel 1 -PARAM generate.meta.abstract 1 + + +maven.repo.remote=http://repo1.maven.org/maven Modified: trunk/archive-access/projects/waxtoolbar/project.properties =================================================================== --- trunk/archive-access/projects/waxtoolbar/project.properties 2007-01-23 00:21:51 UTC (rev 1454) +++ trunk/archive-access/projects/waxtoolbar/project.properties 2007-01-23 00:29:15 UTC (rev 1455) @@ -27,3 +27,5 @@ maven.compile.target = 1.4 maven.javadoc.source = 1.4 maven.test.source = 1.4 + +maven.repo.remote=http://repo1.maven.org/maven Modified: trunk/archive-access/projects/wayback/project.properties 
=================================================================== --- trunk/archive-access/projects/wayback/project.properties 2007-01-23 00:21:51 UTC (rev 1454) +++ trunk/archive-access/projects/wayback/project.properties 2007-01-23 00:29:15 UTC (rev 1455) @@ -68,3 +68,5 @@ maven.javadoc.version = true maven.javadoc.useexternalfile = no maven.javadoc.windowtitle = ${pom.name} ${pom.currentVersion} + +maven.repo.remote=http://repo1.maven.org/maven Modified: trunk/archive-access/projects/wera/project.properties =================================================================== --- trunk/archive-access/projects/wera/project.properties 2007-01-23 00:21:51 UTC (rev 1454) +++ trunk/archive-access/projects/wera/project.properties 2007-01-23 00:29:15 UTC (rev 1455) @@ -33,3 +33,6 @@ # Properties for building the ArcRetriever WAR. maven.war.src = ${maven.src.dir}/webapps/arcretriever maven.war.final.name = arcretriever.war + + +maven.repo.remote=http://repo1.maven.org/maven This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-01-23 05:08:46
|
Revision: 1460 http://archive-access.svn.sourceforge.net/archive-access/?rev=1460&view=rev Author: stack-sf Date: 2007-01-22 21:08:43 -0800 (Mon, 22 Jan 2007) Log Message: ----------- D projects/wax A projects/nutchwax M projects/nutchwax/maven.xml M projects/nutchwax/project.xml Changed my name. Name the nutchwax project nutchwax, not wax. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/maven.xml trunk/archive-access/projects/nutchwax/project.xml Added Paths: ----------- trunk/archive-access/projects/nutchwax/ Removed Paths: ------------- trunk/archive-access/projects/wax/ Copied: trunk/archive-access/projects/nutchwax (from rev 1459, trunk/archive-access/projects/wax) Modified: trunk/archive-access/projects/nutchwax/maven.xml =================================================================== --- trunk/archive-access/projects/wax/maven.xml 2007-01-23 04:13:59 UTC (rev 1459) +++ trunk/archive-access/projects/nutchwax/maven.xml 2007-01-23 05:08:43 UTC (rev 1460) @@ -15,7 +15,7 @@ <arg value="--rsh=ssh" /> <arg value="${maven.build.dir}/docs/"/> - <arg value="${maven.username}@archive-access.sf.net:/home/groups/a/ar/archive-access/htdocs/projects/wax/" /> + <arg value="${maven.username}@archive-access.sf.net:/home/groups/a/ar/archive-access/htdocs/projects/nutchwax/" /> </exec> </goal> Modified: trunk/archive-access/projects/nutchwax/project.xml =================================================================== --- trunk/archive-access/projects/wax/project.xml 2007-01-23 04:13:59 UTC (rev 1459) +++ trunk/archive-access/projects/nutchwax/project.xml 2007-01-23 05:08:43 UTC (rev 1460) @@ -4,7 +4,7 @@ <pomVersion>3</pomVersion> <!-- a unique name for this project --> - <id>wax</id> + <id>nutchwax</id> <!-- a short but descriptive name for the project --> <name>nutchwax</name> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-02-22 02:08:54
|
Revision: 1510 http://archive-access.svn.sourceforge.net/archive-access/?rev=1510&view=rev Author: stack-sf Date: 2007-02-21 18:08:52 -0800 (Wed, 21 Feb 2007) Log Message: ----------- First time add of commons project. Uses maven2 to build an archive-mapred.jar. Added Paths: ----------- trunk/archive-access/projects/commons/ trunk/archive-access/projects/commons/pom.xml trunk/archive-access/projects/commons/src/ trunk/archive-access/projects/commons/src/main/ trunk/archive-access/projects/commons/src/main/java/ trunk/archive-access/projects/commons/src/main/java/org/ trunk/archive-access/projects/commons/src/main/java/org/archive/ trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCMapRunner.java trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCRecordMapper.java trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCReporter.java trunk/archive-access/projects/commons/src/test/ trunk/archive-access/projects/commons/src/test/java/ trunk/archive-access/projects/commons/src/test/java/org/ trunk/archive-access/projects/commons/src/test/java/org/archive/ Added: trunk/archive-access/projects/commons/pom.xml =================================================================== --- trunk/archive-access/projects/commons/pom.xml (rev 0) +++ trunk/archive-access/projects/commons/pom.xml 2007-02-22 02:08:52 UTC (rev 1510) @@ -0,0 +1,135 @@ +<?xml version="1.0"?> +<!-- + POM reference: http://maven.apache.org/pom.html + + List of the better articles on maven: + + http://www.javaworld.com/javaworld/jw-05-2006/jw-0529-maven.html + http://www.javaworld.com/javaworld/jw-02-2006/jw-0227-maven_p.html + + URLs on converting from 1.0 to 2.0 maven (not much good generally): + + http://wiki.osafoundation.org/bin/view/Journal/Maven2Upgrade + http://maven.apache.org/guides/mini/guide-m1-m2.html + --> +<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <groupId>org.archive</groupId> + <artifactId>archive-mapred</artifactId> + <packaging>jar</packaging> + <version>1.0-SNAPSHOT</version> + <name>Archive Commons</name> + <description>Common code used across all Archive projects. + </description> + <url>http://archive-access.sourceforge.net/projects/commons/</url> + <inceptionYear>2004</inceptionYear> + + <licenses> + <license> + <name>GNU LESSER GENERAL PUBLIC LICENSE</name> + <url>http://www.gnu.org/licenses/lgpl.txt</url> + <distribution>repo</distribution> + </license> + </licenses> + <organization> + <name>Internet Archive</name> + <url>http://www.archive.org/</url> + </organization> + <issueManagement> + <system>SourceForge</system> + <url>http://sourceforge.net/tracker/?group_id=118427</url> + </issueManagement> + <ciManagement> + <system>cruisecontrol</system> + <url>http://builds.archive.org:8080/cruisecontrol/</url> + </ciManagement> + <mailingLists> + <mailingList> + <name>Archive Access ARC Tools Discussion List</name> + <subscribe> + http://lists.sourceforge.net/lists/listinfo/archive-access-discuss + </subscribe> + <unsubscribe> + http://lists.sourceforge.net/lists/listinfo/archive-access-discuss + </unsubscribe> + <post>archive-access-discuss</post> + <archive> + http://sourceforge.net/mailarchive/forum.php?forum_id=45842 + </archive> + </mailingList> + <mailingList> + <name>Archive Access ARC Tools Commits</name> + <subscribe> + https://lists.sourceforge.net/lists/listinfo/archive-access-cvs + </subscribe> + <unsubscribe> + https://lists.sourceforge.net/lists/listinfo/archive-access-cvs + </unsubscribe> + <post>archive-access-cvs</post> + <archive> + http://sourceforge.net/mailarchive/forum.php?forum=archive-access-cvs + </archive> + </mailingList> + </mailingLists> + <scm> + 
<connection>scm:svn:https://archive-access.svn.sourceforge.net/svnroot/archive-access/</connection> + <tag>HEAD</tag> + <url>https://archive-access.svn.sourceforge.net/svnroot/archive-access/</url> + </scm> + <prerequisites> + <maven>2.0.4</maven> + </prerequisites> + + <build> + <plugins /> + </build> + + <repositories> + <repository> + <releases> + <enabled>true</enabled> + <updatePolicy>always</updatePolicy> + <checksumPolicy>warn</checksumPolicy> + </releases> + <snapshots> + <enabled>true</enabled> + <updatePolicy>never</updatePolicy> + <checksumPolicy>fail</checksumPolicy> + </snapshots> + <id>builds.archive.org</id> + <name></name> + <url>http://builds.archive.org:8080/maven2</url> + <layout>default</layout> + </repository> + </repositories> + + <dependencies> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>3.8.1</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>1.0.4</version> + </dependency> + + <dependency> + <groupId>org.apache</groupId> + <artifactId>hadoop</artifactId> + <version>0.10.1-core</version> + </dependency> + <dependency> + <groupId>org.archive</groupId> + <artifactId>archive-commons</artifactId> + <version>1.11.0-200702211853</version> + </dependency> + + </dependencies> + +</project> Added: trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCMapRunner.java =================================================================== --- trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCMapRunner.java (rev 0) +++ trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCMapRunner.java 2007-02-22 02:08:52 UTC (rev 1510) @@ -0,0 +1,263 @@ +/* + * $Id: ImportArcs.java 1494 2007-02-15 17:47:58Z stack-sf $ + * + * Copyright (C) 2007 Internet Archive. 
+ * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.access.nutch.mapred; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.ObjectWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapRunnable; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.ReflectionUtils; +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.arc.ARCConstants; +import org.archive.io.arc.ARCRecord; + +/** + * MapRunner that passes an ARCRecord to configured mapper. + * Configured mapper must be an implementation of {@link ARCRecordMapper}. 
+ * @author stack + */ +public class ARCMapRunner implements MapRunnable { + public final Log LOG = LogFactory.getLog(this.getClass().getName()); + private ARCRecordMapper mapper; + + /** + * How long to spend indexing. + */ + private long maxtime; + + + public void configure(JobConf job) { + this.mapper = (ARCRecordMapper)ReflectionUtils. + newInstance(job.getMapperClass(), job); + // Value is in minutes. + this.maxtime = job.getLong("wax.index.timeout", 60) * 60 * 1000; + } + + public void run(RecordReader input, OutputCollector output, + Reporter reporter) + throws IOException { + try { + WritableComparable key = input.createKey(); // Unused. + Writable value = input.createValue(); + while (input.next(key, value)) { + doArc(value.toString(), output, new ARCReporter(reporter)); + } + } finally { + this.mapper.close(); + } + } + + protected void doArc(final String arcurl, final OutputCollector output, + final ARCReporter reporter) + throws IOException { + if ((arcurl == null) || arcurl.endsWith("work")) { + reporter.setStatus("skipping " + arcurl, true); + return; + } + + // Set off indexing in a thread so I can cover it with a timer. + final Thread t = new IndexingThread(arcurl, output, reporter); + t.setDaemon(true); + t.start(); + final long start = System.currentTimeMillis(); + try { + for (long period = this.maxtime; t.isAlive() && (period > 0); + period = this.maxtime - (System.currentTimeMillis() - start)) { + try { + t.join(period); + } catch (final InterruptedException e) { + e.printStackTrace(); + } + } + } finally { + cleanup(t, reporter); + } + } + + protected void cleanup(final Thread t, final ARCReporter reporter) + throws IOException { + if (!t.isAlive()) { + return; + } + reporter.setStatus("Killing indexing thread " + t.getName(), true); + t.interrupt(); + try { + // Give it some time to die. 
+ t.join(1000); + } catch (final InterruptedException e) { + e.printStackTrace(); + } + if (t.isAlive()) { + LOG.info(t.getName() + " will not die"); + } + } + + private class IndexingThread extends Thread { + private final String arcLocation; + private final OutputCollector output; + private final ARCReporter reporter; + + public IndexingThread(final String arcloc, final OutputCollector o, + final ARCReporter r) { + // Name this thread same as ARC location. + super(arcloc); + this.arcLocation = arcloc; + this.output = o; + this.reporter = r; + } + + /** + * @return Null if fails download. + */ + protected ArchiveReader getArchiveReader() { + ArchiveReader arc = null; + // Need a thread that will keep updating TaskTracker during long + // downloads else tasktracker will kill us. + Thread reportingDuringDownload = null; + try { + this.reporter.setStatus("opening " + this.arcLocation, true); + reportingDuringDownload = new Thread("reportDuringDownload") { + public void run() { + while (!this.isInterrupted()) { + try { + synchronized (this) { + sleep(1000 * 60); // Sleep a minute. + } + reporter.setStatus("downloading " + + arcLocation); + } catch (final IOException e) { + e.printStackTrace(); + // No point hanging around if we're failing + // status. + break; + } catch (final InterruptedException e) { + // Interrupt flag is cleared. Just fall out. 
+ break; + } + } + } + }; + reportingDuringDownload.setDaemon(true); + reportingDuringDownload.start(); + arc = ArchiveReaderFactory.get(this.arcLocation); + } catch (final Throwable e) { + try { + final String msg = "Error opening " + this.arcLocation + + ": " + e.toString(); + this.reporter.setStatus(msg, true); + LOG.info(msg); + } catch (final IOException ioe) { + LOG.warn(this.arcLocation, ioe); + } + } finally { + if ((reportingDuringDownload != null) + && reportingDuringDownload.isAlive()) { + reportingDuringDownload.interrupt(); + } + } + return arc; + } + + public void run() { + if (this.arcLocation == null || this.arcLocation.length() <= 0) { + return; + } + ArchiveReader arc = getArchiveReader(); + if (arc == null) { + return; + } + + try { + ARCMapRunner.this.mapper.onARCOpen(); + + // Iterate over each ARCRecord. + for (final Iterator i = arc.iterator(); + i.hasNext() && !currentThread().isInterrupted();) { + final ARCRecord rec = (ARCRecord)i.next(); + + + try { + ARCMapRunner.this.mapper.map( + new Text(rec.getMetaData().getUrl()), + new ObjectWritable(rec), this.output, + this.reporter); + + final long b = rec.getMetaData().getContentBegin(); + final long l = rec.getMetaData().getLength(); + final long recordLength = (l > b)? (l - b): l; + if (recordLength > + ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE) { + // Now, if the content length is larger than a + // standard ARC, then it is most likely the last + // record in the ARC because ARC is closed after we + // exceed 100MB (DEFAULT_MAX_ARC...). Calling + // hasNext above will make us read through the + // whole record, even if its a 1.7G video. On a + // loaded machine, this might cause us timeout with + // tasktracker -- so, just skip out here. + this.reporter.setStatus("skipping " + + this.arcLocation + " -- very long record " + + rec.getMetaData()); + break; + } + } catch (final Throwable e) { + // Failed parse of record. Keep going. 
+ LOG.warn("Error processing " + rec.getMetaData(), e); + } + } + if (currentThread().isInterrupted()) { + LOG.info(currentThread().getName() + " interrupted"); + } + this.reporter.setStatus("closing " + this.arcLocation, true); + } catch (final Throwable e) { + // Problem parsing arc file. + final String msg = "Error parsing " + this.arcLocation; + try { + this.reporter.setStatus(msg, true); + } catch (final IOException ioe) { + ioe.printStackTrace(); + } + LOG.warn(msg, e); + } finally { + try { + arc.close(); + ARCMapRunner.this.mapper.onARCClose(); + } catch (final IOException e) { + e.printStackTrace(); + } + } + } + } + +} \ No newline at end of file Added: trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCRecordMapper.java =================================================================== --- trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCRecordMapper.java (rev 0) +++ trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCRecordMapper.java 2007-02-22 02:08:52 UTC (rev 1510) @@ -0,0 +1,49 @@ +/* + * $Id: ImportArcs.java 1494 2007-02-15 17:47:58Z stack-sf $ + * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.access.nutch.mapred; + +import java.io.IOException; + +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.archive.io.arc.ARCRecord; + +/** + * Like {@link Mapper} but adds signaling of ARC open and close. + * @author stack + */ +public interface ARCRecordMapper extends Mapper { + /** + * Called after ARC open but before we call + * {@link #map(String, ARCRecord, OutputCollector, Reporter)} + * @throws IOException + */ + public void onARCOpen() throws IOException; + + /** + * Called on ARC close. + * @throws IOException + */ + public void onARCClose() throws IOException; +} Added: trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCReporter.java =================================================================== --- trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCReporter.java (rev 0) +++ trunk/archive-access/projects/commons/src/main/java/org/archive/mapred/ARCReporter.java 2007-02-22 02:08:52 UTC (rev 1510) @@ -0,0 +1,80 @@ +/* + * $Id: ImportArcs.java 1494 2007-02-15 17:47:58Z stack-sf $ + * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. 
+ * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.access.nutch.mapred; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.mapred.Reporter; + +/** + * Reporter that logs all status passed; a combined Reporter and logger. Only + * reports home every so often. + * @author stack + */ +public class ARCReporter implements Reporter { + public final Log LOG = LogFactory.getLog(this.getClass().getName()); + private final Reporter wrappedReporter; + private long nextUpdate = 0; + private long time = System.currentTimeMillis(); + + private static final long FIVE_MINUTES = 1000 * 60 * 5; + + public ARCReporter(final Reporter r) { + this.wrappedReporter = r; + } + + public void setStatus(final String msg) throws IOException { + setStatus(msg, false); + } + + public void setStatus(final String msg, final boolean writeThrough) + throws IOException { + LOG.info(msg); + // Only update tasktracker every second -- not for every record. + long now = System.currentTimeMillis(); + if (writeThrough || now > this.nextUpdate) { + this.wrappedReporter.setStatus(msg); + this.nextUpdate = now + 1000; + this.time = now; + } + } + + /** + * Update reporter if its a long time since last log only. + * @param msg Message to report IF we haven't reported in a long time. 
+ * @throws IOException + */ + public void setStatusIfElapse(final String msg) + throws IOException { + long now = System.currentTimeMillis(); + if ((now - this.time) > FIVE_MINUTES) { + setStatus(msg); + } + } + + public void progress() throws IOException { + this.wrappedReporter.progress(); + } +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-02-26 21:44:06
|
Revision: 1514 http://archive-access.svn.sourceforge.net/archive-access/?rev=1514&view=rev Author: stack-sf Date: 2007-02-26 13:44:05 -0800 (Mon, 26 Feb 2007) Log Message: ----------- D commons A mapred M mapred/pom.xml Rename 'commons' project as 'mapred' project. Modified Paths: -------------- trunk/archive-access/projects/mapred/pom.xml Added Paths: ----------- trunk/archive-access/projects/mapred/ Removed Paths: ------------- trunk/archive-access/projects/commons/ Copied: trunk/archive-access/projects/mapred (from rev 1513, trunk/archive-access/projects/commons) Modified: trunk/archive-access/projects/mapred/pom.xml =================================================================== --- trunk/archive-access/projects/commons/pom.xml 2007-02-23 00:38:13 UTC (rev 1513) +++ trunk/archive-access/projects/mapred/pom.xml 2007-02-26 21:44:05 UTC (rev 1514) @@ -21,7 +21,7 @@ <packaging>jar</packaging> <version>0.1.0-SNAPSHOT</version> <name>Archive Commons</name> - <description>Common code used across all Archive projects. + <description>Common mapreduce code. </description> <url>http://archive-access.sourceforge.net/projects/commons/</url> <inceptionYear>2004</inceptionYear> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <sta...@us...> - 2007-03-22 19:24:13
|
Revision: 1627 http://archive-access.svn.sourceforge.net/archive-access/?rev=1627&view=rev Author: stack-sf Date: 2007-03-22 12:24:11 -0700 (Thu, 22 Mar 2007) Log Message: ----------- * projects/nutchwax/pom.xml Fixing javadoc generation. * projects/wayback/pom.xml Use newer version of archive-commons. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/pom.xml trunk/archive-access/projects/wayback/pom.xml Modified: trunk/archive-access/projects/nutchwax/pom.xml =================================================================== --- trunk/archive-access/projects/nutchwax/pom.xml 2007-03-22 00:07:39 UTC (rev 1626) +++ trunk/archive-access/projects/nutchwax/pom.xml 2007-03-22 19:24:11 UTC (rev 1627) @@ -255,12 +255,20 @@ <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-javadoc-plugin</artifactId> <configuration> - <javadocDirectory> + <source> ${basedir}/src/java - </javadocDirectory> + </source> <overview> ${basedir}/src/java/overview.html </overview> + <aggregate> + true + </aggregate> + <!-- + <additionalparam> + -classpath nutchwax-thirdparty/build/classes + </additionalparam> + --> </configuration> </plugin> <plugin> Modified: trunk/archive-access/projects/wayback/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2007-03-22 00:07:39 UTC (rev 1626) +++ trunk/archive-access/projects/wayback/pom.xml 2007-03-22 19:24:11 UTC (rev 1627) @@ -162,7 +162,7 @@ -Dfile=/tmp/archive-commons-1.11.0-SNAPSHOT.jar -Durl=file:/0/maven2-repository/ \ -DgroupId=org.archive -DartifactId=archive-commons -Dpackaging=jar -Dversion=1.11.0-SNAPSHOT --> - <version>1.11.0-SNAPSHOT</version> + <version>1.12.0</version> </dependency> </dependencies> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-05-14 00:20:16
|
Revision: 2265 http://archive-access.svn.sourceforge.net/archive-access/?rev=2265&view=rev Author: binzino Date: 2008-05-13 17:20:24 -0700 (Tue, 13 May 2008) Log Message: ----------- Initial checkin of NutchWAX 0.12, a.k.a Nutch Archive Tools (NAT). Added Paths: ----------- trunk/archive-access/projects/nat/ trunk/archive-access/projects/nat/archive/ trunk/archive-access/projects/nat/archive/INSTALL.txt trunk/archive-access/projects/nat/archive/README.txt trunk/archive-access/projects/nat/archive/bin/ trunk/archive-access/projects/nat/archive/bin/nutchwax trunk/archive-access/projects/nat/archive/build.xml trunk/archive-access/projects/nat/archive/conf/ trunk/archive-access/projects/nat/archive/conf/nutch-site.xml trunk/archive-access/projects/nat/archive/conf/search-servers.txt trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml trunk/archive-access/projects/nat/archive/lib/ trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar trunk/archive-access/projects/nat/archive/src/ trunk/archive-access/projects/nat/archive/src/java/ trunk/archive-access/projects/nat/archive/src/java/org/ trunk/archive-access/projects/nat/archive/src/java/org/archive/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/NutchWax.java trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/tools/ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/tools/DumpIndex.java trunk/archive-access/projects/nat/archive/src/plugin/ trunk/archive-access/projects/nat/archive/src/plugin/build-plugin.xml 
trunk/archive-access/projects/nat/archive/src/plugin/build.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/build.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/plugin.xml trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ trunk/archive-access/projects/nat/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/build.xml trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/plugin.xml trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/ConfigurableQueryFilter.java trunk/archive-access/projects/nat/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java Added: 
trunk/archive-access/projects/nat/archive/INSTALL.txt =================================================================== --- trunk/archive-access/projects/nat/archive/INSTALL.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/INSTALL.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,236 @@ + +INSTALL.txt +2008-05-06 +Aaron Binns + + +The NutchWAX 0.12 build and installation is as an "add-on" to an +existing Nutch 1.0-dev installation. + +NutchWAX 0.12 uses a simple 'ant' build script. The script compiles +the NutchWAX sources, using the libraries in the installed +Nutch-1.0-dev. + +We strongly recommend having *two* Nutch-1.0-dev installation +directories: one that you build NutchWAX against, and another into +which NutchWAX is deployed. + +NutchWAX is deployed by un-tar'ing the nutchwax-0.12.tar.gz file +*into* an existing Nutch-1.0-dev installation. Think of NutchWAX as +an add-on. We over-write a few Nutch config files, but the rest is +simply added to the existing Nutch-1.0-dev installation. + + +Nutch-1.0-dev +------------- + +As mentioned above, NutchWAX 0.12 is built against Nutch-1.0-dev. Now +Nutch doesn't have a 1.0 release package yet, so we have to use the +Nutch SVN trunk. The specific SVN revision that NutchWAX 0.12 is +built against is: + + 650739 + +To checkout this revision of Nutch, use: + + $ mkdir nutch + $ cd nutch + $ svn checkout -r 650739 http://svn.apache.org/repos/asf/lucene/nutch/trunk + +To build the nutch-1.0-dev.tar.gz package, use 'ant' + + $ cd trunk + $ ant tar + +This produces + + build/nutch-1.0-dev.tar.gz + +Which we then install *twice* + + $ mkdir -p ~/nutchwax-0.12/nutch-1.0-dev + $ tar xfz -C ~/nutchwax-0.12/nutch-1.0-dev build/nutch-1.0-dev.tar.gz + $ mkdir -p /opt/nutch-1.0-dev + $ tar xfz -C /opt/nutch-1.0-dev build/nutch-1.0-dev.tar.gz + +The idea is that we keep /opt/nutch-1.0-dev as our pristine copy which +we compile against, then, when we want to test NutchWAX, we deploy it +into ~/nutchwax-0.12/nutch-1.0-dev. 
+ +Why can't we just use one installation of Nutch? Mainly to avoid +weirdness where we are compiling NutchWAX source against the same set +of libraries where we would be installing NutchWAX. Consider, when we +deploy NutchWAX, we copy the nutchwax.jar into the Nutch 'lib' +directory. If we use that same 'lib' directory for dependencies when +compiling the source, 'ant'/'javac' will likely get confused when +calculating dependencies. + +It's possible that you could successfully go through the +build/test/release cycle using one Nutch-1.0-dev directory, but these +instructions assume you will have two. + + +Build and install +----------------- + + 1. Install two Nutch-1.0-dev packages per the instructions above. + + 2. Edit build.xml to point to the "pristine" installation of Nutch-1.0-dev + + <!-- NOTE: Point this to your Nutch 1.0-dev directory --> + <property name="nutch.dir" value="/opt/nutch-1.0-dev" /> + + 3. Build NutchWAX-0.12 + + $ ant + + The default build rule is "package" which will compile all the source + and build an installation tarball: nutchwax-0.12.tar.gz + + The "build.xml" file is pretty straightforward and just grepping + for the targets should be pretty obvious: compile, clean, etc. + + 4. Install NutchWAX into the build/test Nutch installation + + $ tar xfz -C ~/nutchwax-0.12/nutch-1.0-dev nutchwax-0.12.tar.gz + +That's it! + +All we do is add our libraries (nutchwax.jar and dependencies), the +'nutchwax' helper script, plugins for indexing and querying, and a few +config files. + +Except for the config files, no files in the Nutch-1.0-dev +installation are over-written, only added. The "nutch-site.xml" file +is over-written, but that file is empty in a vanilla Nutch +installation, so there's small risk of over-writing something. + + +HOWTO run and test +------------------ + +The 'nutchwax' helper script is installed in the Nutch-1.0-dev 'bin' +directory next to the 'nutch' helper script. 
+ +The 'nutchwax' script is used to run the NutchWAX-specific tools, use +the regular 'nutch' script for regular Nutch activities. + +The 'nutchwax' script runs two tools + + "import" Import a set of .arc/.warc files from a manifest, creating + a Nutch segment. + + "dumpindex" Debug tool that dumps a Lucene index, such as the ones + created by Nutch's "index" tool. + +The idea is that the NutchWAX "import" tool supplants the Nutch +generate and fetch cycle. Rather than generating and fetching +segments, we import the .arc/.warc files directly into a newly created +segment. Then, we process that segment just as we normally would with +Nutch. + +For example, + + $ cd nutch-test + $ cat > manifest + http://someserver/foo-bar-baz.arc mycollection + ^D + $ nutch-1.0-dev/bin/nutchwax import manifest + +This will import the arc file listed in the manifest into a newly +created segment. The segment is created by default in a directory +hierarchy of the form: + + segments/[date-timestamp] + +This mirrors the way segments are created in vanilla Nutch by the +"generate" command. + +You can explicitly name the segment if you want, e.g. + + nutchwax import manifest mysegment + +Once the segment is created by the importing of ARC files with +NutchWAX, you can use Nutch to perform the rest of the steps. For +example: + + $ nutch-1.0-dev/bin/nutchwax import manifest + $ nutch-1.0-dev/bin/nutch updatedb crawldb -dir segments + $ nutch-1.0-dev/bin/nutch invertlinks linkdb -dir segments + $ nutch-1.0-dev/bin/nutch index indexes crawldb linkdb segments/* + $ nutch-1.0-dev/bin/nutch merge index indexes + +This is pretty much the minimal set of steps to import and index a set +of ARC files. The crawldb update and link inversion steps are pro +forma and don't have anything to do with NutchWAX specifically, but +are a part of regular Nutch processing. + +Now you have a Nutch "index" directory and are ready to search! + +Searching is done as in vanilla Nutch. 
Either launch the Nutch webapp +or use the command-line interface to NutchBean to run some test +searches. Nothing NutchWAX-specific here. + + +Miscellaneous notes +------------------- + +1. Plugins + +There are two plugins bundled with NutchWAX: + + index-nutchwax + query-nutchwax + +See the "plugin.includes" property in nutch-site.xml to see where +these plugins are added to the filter chain. + +The index-nutchwax plugin ensures that WAX-specific metadata is +transferred from the Nutch Content object to the Lucene Document +object, which is placed in the Lucene index. + +The query-nutchwax plugin is used to process query requests against +those same meta-data fields. It also expands the capabilities of +searching the basic Nutch fields as well. + +2. URL filters + +Nutch's URL filter by default filters-out many common URL oddities +that would normally trip-up Nutch's crawler. However, when importing +content from ARC files, filtering out content probably doesn't make +sense. That is, whatever content made it into the ARC file should be +imported, no matter what the URL looks like. + +To change the URL filter, edit the Nutch file 'conf/regex-urlfilter.txt'. +To pass all content through the filter, remove all filter rules except +for the last one: + + # accept anything else + +. + +3. conf/tika-mimetypes.xml + +NutchWAX comes with a fixed copy of tika-mimetypes.xml. The version +in Nutch revision 650739 has a few bugs in it which cause parsing to +fail for many document types. The bugs are: + + o Move + + <mime-type type="application/xml"> + <alias type="text/xml" /> + <glob pattern="*.xml" /> + </mime-type> + + definition higher up in the file, before the reference to it. + + o Remove + + <mime-type type="application/x-ms-dos-executable"> + <alias type="application/x-dosexec;exe" /> + </mime-type> + + as the ';' character is illegal according to the comments in the + Nutch code. + +The copy of "conf/tika-mimetypes.xml" bundled with NutchWAX fixes +these two bugs. 
Added: trunk/archive-access/projects/nat/archive/README.txt =================================================================== --- trunk/archive-access/projects/nat/archive/README.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/README.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,105 @@ + +README.txt +2008-05-06 +Aaron Binns + + +This is the NutchWAX-0.12 source that John Lee handed-off to me. It +is a work-in-progress. + +Compared to NutchWAX-0.10 (and earlier) it is *much* simpler. The +main WAX-specific code is in just a few files really: + +src/java/org/archive/nutchwax/ArcsToSegment.java + + This is the meat of the WAX logic for processing .arc files and + generating Nutch segments. Once we use this to generate a set of + segments for the .arc files, we can use the rest of vanilla + Nutch-1.0-dev to invert links and index the content with Lucene. + + This conversion code is heavily edited from: + + nutch-1.0-dev/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java + + taken from the Nutch SVN head (a.k.a the "1.0-dev" in-development). + + Ours differs in a few important ways: + + o Rather than taking a directory with .arc files as input, we take + a manifest file with URLs to .arc files. This way, the manifest + is split up among the distributed Hadoop jobs and the .arc files + are processed in whole by each worker. + + In the Nutch-1.0-dev, the ArcSegmentCreator.java expects the + input directory to contain the .arc files and (AFAICT) splits + them up and distributes them across the Hadoop workers. This + seems really inefficient to me, I think our approach is much + better -- at least for us. + + o Related to the way input files are split and processed, we use + the standard Archive ARCReader class just like Heritrix and + Wayback. + + The ArcSegmentCreator.java in Nutch-1.0-dev doesn't use our + ARCReader because of licensing incompatibility. Ours is under + GPL and Nutch-1.0-dev forbids the use of GPL code. 
+ + We are in the process of re-licensing or dual-licensing with + Apache License, but until then, our ARCReader code won't be included + in mainline Nutch. + + This isn't a problem per se, but worth noting in case anyone + looks at the Nutch-1.0-dev code and wonders why they built their + own (horribly inefficient) .arc reader. + + o We add metadata fields to the processed document for WAX-specific + purposes: + + content.getMetadata().set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); + content.getMetadata().set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ) ; + content.getMetadata().set( NutchWax.COLLECTION_KEY, collection); + content.getMetadata().set( NutchWax.ARCHIVE_DATE_KEY, meta.getDate() ); + + The addition of the arcname and collection key is pretty + obvious. I don't know why the content-type isn't added in the + vanilla Nutch-1.0-dev. + + Also, we should review the use of the ARCHIVE_DATE_KEY in that + John Lee mentioned to me that there was possibly duplicate date + fields put in the index: one that is a plain old Java date, and + one that is a 14-digit date string for use with Wayback. + +src/java/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/NutchWaxIndexingFilter.java +src/java/plugin/index-nutchwax/plugin.xml + + This filter is pretty straightforward. All it does is take the + metadata fields that were added to the document (as described above) + and placed in the Lucene index so that we can make use of them at + search-time. + +src/java/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/MultipleFieldQueryFilter.java +src/java/plugin/query-nutchwax/plugin.xml + + This is a single query filter that can be used for querying single + fields from a single implementation. It does *not* allow for + querying multiple fields as you can already do that via Nutch. + + What this filter does is allows one to more-or-less create query + filters in a data-driven manner rather than having to code-up a new + class for each field. 
That is, before one would have to create a + CollectionQueryFilter class to filter on the "collection" field. + With the MultipleFieldQueryFilter class, you can specify that the + "collection" field is to be filterable via the plugin.xml file and + "nutchwax.filter.query" configuration property. + +src/java/org/archive/nutchwax/NutchWax.java + + Just a simple enum used by the above two classes for the metadata + keys. + +src/java/org/archive/nutchwax/tools/DumpIndex.java + + A simple command-line utility to dump the contents of a Lucene + index. Used for debugging. + + Added: trunk/archive-access/projects/nat/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nat/archive/bin/nutchwax (rev 0) +++ trunk/archive-access/projects/nat/archive/bin/nutchwax 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + + +# The following is cribbed from the 'nutch' script to ascertain the +# location of Nutch so we can call its scripts. 
+# +# resolve links - $0 may be a softlink +THIS="$0" +while [ -h "$THIS" ]; do + ls=`ls -ld "$THIS"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '.*/.*' > /dev/null; then + THIS="$link" + else + THIS=`dirname "$THIS"`/"$link" + fi +done + +THIS_DIR=`dirname "$THIS"` +NUTCH_HOME=`cd "$THIS_DIR/.." ; pwd` + +# Now that we have NUTCH_HOME, process the command-line. + +case "$1" in + import) + shift + if [ $# -eq 0 ]; then + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArcsToSegment + exit 1 + fi + if [ -z "$2" ]; then + segment=`date +"%Y%m%d%H%M%S"` + segment="segments/${segment}" + else + segment="$2" + fi + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArcsToSegment "$1" "${segment}" + ;; + dumpindex) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpIndex $@ + ;; + *) + echo "" + echo "Usage: nutchwax COMMAND" + echo "where COMMAND is one of:" + echo " import Import ARCs into a new Nutch segment" + echo " dumpindex Dump an index to the screen" + echo "" + exit 1 + ;; +esac + +exit 0 Property changes on: trunk/archive-access/projects/nat/archive/bin/nutchwax ___________________________________________________________________ Name: svn:executable + * Added: trunk/archive-access/projects/nat/archive/build.xml =================================================================== --- trunk/archive-access/projects/nat/archive/build.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/build.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,138 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="nutchwax" default="job"> + + <property name="nutch.dir" value="../../" /> + + <property name="src.dir" value="src" /> + <property name="lib.dir" value="lib" /> + <property name="build.dir" value="${nutch.dir}/build" /> + <!-- HACK: Need to import default.properties like Nutch does --> + <property name="dist.dir" value="${build.dir}/nutch-1.0-dev" /> + + <target name="nutch-compile-core"> + <ant dir="${nutch.dir}" target="compile-core" inheritAll="false" /> + </target> + + <target name="nutch-compile-plugins"> + <ant dir="${nutch.dir}" target="compile-plugins" inheritAll="false" /> + </target> + + <target name="compile-core" depends="nutch-compile-core"> + <javac + destdir="${build.dir}/classes" + debug="true" + verbose="false" + source="1.5" + target="1.5" + encoding="UTF-8" + fork="true" + nowarn="true" + deprecation="false"> + <src path="${src.dir}/java" /> + <include name="**/*.java" /> + <classpath> + <pathelement location="${build.dir}/classes" /> + <fileset dir="${lib.dir}"> + <include name="*.jar"/> + </fileset> + <fileset dir="${nutch.dir}/lib"> + <include name="*.jar"/> + </fileset> + </classpath> + </javac> + </target> + + <target name="compile-plugins"> + <ant dir="src/plugin" target="deploy" inheritAll="false" /> + </target> + + <!-- + These targets all call down to the corresponding target in the + Nutch build.xml file. This way all of the 'ant' build commands + can be executed from this directory and everything should get + built as expected. 
+ --> + <target name="compile" depends="compile-core, compile-plugins, nutch-compile-plugins"> + </target> + + <target name="jar" depends="compile-core"> + <ant dir="${nutch.dir}" target="jar" inheritAll="false" /> + </target> + + <target name="job" depends="compile"> + <ant dir="${nutch.dir}" target="job" inheritAll="false" /> + </target> + + <target name="war" depends="compile"> + <ant dir="${nutch.dir}" target="war" inheritAll="false" /> + </target> + + <target name="javadoc" depends="compile"> + <ant dir="${nutch.dir}" target="javadoc" inheritAll="false" /> + </target> + + <target name="tar" depends="package"> + <ant dir="${nutch.dir}" target="tar" inheritAll="false" /> + </target> + + <target name="clean"> + <ant dir="${nutch.dir}" target="clean" inheritAll="false" /> + </target> + + <!-- This one does a little more after calling down to the relevant + Nutch target. After Nutch has copied everything into the + distribution directory, we add our script, libraries, etc. + + Rather than over-write the standard Nutch configuration files, + we place ours in a newly created directory + + contrib/archive/conf + + and let the individual user decide whether or not to + incorporate our modifications. 
+ --> + <target name="package" depends="jar, job, war, javadoc"> + <ant dir="${nutch.dir}" target="package" inheritAll="false" /> + + <copy todir="${dist.dir}/lib" includeEmptyDirs="false"> + <fileset dir="lib"/> + </copy> + + <copy todir="${dist.dir}/bin"> + <fileset dir="bin"/> + </copy> + + <chmod perm="ugo+x" type="file"> + <fileset dir="${dist.dir}/bin"/> + </chmod> + + <mkdir dir="${dist.dir}/contrib/archive/conf"/> + <copy todir="${dist.dir}/contrib/archive/conf"> + <fileset dir="conf" /> + </copy> + + <copy todir="${dist.dir}/contrib/archive"> + <fileset dir="."> + <include name="*.txt" /> + </fileset> + </copy> + + </target> + +</project> Added: trunk/archive-access/projects/nat/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nat/archive/conf/nutch-site.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/nutch-site.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,65 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<!-- Put site-specific property overrides in this file. --> + +<configuration> + +<property> + <name>plugin.includes</name> + <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> + <!-- Also, add 'parse-pdf' --> + <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> + <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic</value> +</property> + +<property> + <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the ArcsToSegment are mapped to the Lucene documents during indexing. 
+ The specifications here are of the form "src-key:lowercase:store:tokenize:dest-key" + Where the only required part is the "src-key", the rest will assume the following defaults: + lowercase = true + store = true + tokenize = false + dest-key = src-key + --> + <name>nutchwax.filter.index</name> + <value> + arcname:false + collection + date + type + </value> +</property> + +<property> + <!-- Configure the 'query-nutchwax' plugin. Specify which fields to make searchable via "field:[term|phrase]" query syntax, and whether they are "raw" fields or not. + The specification format is "raw:name:lowercase:boost" or "field:name:boost". Default values are + lowercase = true + boost = 1.0f + There is no "lowercase" property for "field" specification because the Nutch FieldQueryFilter doesn't expose the option, unlike the RawFieldQueryFilter. + AFAICT, the order isn't important. --> + <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> + <name>nutchwax.filter.query</name> + <value> + raw:arcname:false + raw:collection + raw:type + field:anchor + field:content + field:host + field:title + </value> +</property> + +<!-- Over-ride setting in Nutch "nutch-default.xml" file. We do *not* want Content-Type detection via magic resolution because the implementation + in Nutch reads in the entire content body (which could be a 1GB MPG movie), then converts it to a String before examining the first dozen or + so bytes/characters for magic matching. Since we archive large files, this is bad, and OOMs occur. So, we disable this feature and keep + the Content-Type that is already in the (W)ARC file. --> +<property> + <name>mime.type.magic</name> + <value>false</value> + <description>Defines if the mime content type detector uses magic resolution. 
+ </description> +</property> + +</configuration> Added: trunk/archive-access/projects/nat/archive/conf/search-servers.txt =================================================================== --- trunk/archive-access/projects/nat/archive/conf/search-servers.txt (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/search-servers.txt 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1 @@ +localhost 9000 Added: trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml =================================================================== --- trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml (rev 0) +++ trunk/archive-access/projects/nat/archive/conf/tika-mimetypes.xml 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,364 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Description: This xml file defines the valid mime types used by Tika. + The mime types within this file are based on the types in the mime-types.xml + file available in Apache Nutch. 
+--> + +<mime-info> + + <mime-type type="text/plain"> + <magic priority="50"> + <match value="This is TeX," type="string" offset="0" /> + <match value="This is METAFONT," type="string" offset="0" /> + </magic> + <glob pattern="*.txt" /> + <glob pattern="*.asc" /> + </mime-type> + + <mime-type type="text/html"> + <magic priority="50"> + <match value="<!DOCTYPE HTML" type="string" + offset="0:64" /> + <match value="<!doctype html" type="string" + offset="0:64" /> + <match value="<HEAD" type="string" offset="0:64" /> + <match value="<head" type="string" offset="0:64" /> + <match value="<TITLE" type="string" offset="0:64" /> + <match value="<title" type="string" offset="0:64" /> + <match value="<html" type="string" offset="0:64" /> + <match value="<HTML" type="string" offset="0:64" /> + <match value="<BODY" type="string" offset="0" /> + <match value="<body" type="string" offset="0" /> + <match value="<TITLE" type="string" offset="0" /> + <match value="<title" type="string" offset="0" /> + <match value="<!--" type="string" offset="0" /> + <match value="<h1" type="string" offset="0" /> + <match value="<H1" type="string" offset="0" /> + <match value="<!doctype HTML" type="string" offset="0" /> + <match value="<!DOCTYPE html" type="string" offset="0" /> + </magic> + <glob pattern="*.html" /> + <glob pattern="*.htm" /> + </mime-type> + + <mime-type type="application/xml"> + <alias type="text/xml" /> + <glob pattern="*.xml" /> + </mime-type> + + <mime-type type="application/xhtml+xml"> + <sub-class-of type="text/xml" /> + <glob pattern="*.xhtml" /> + <root-XML namespaceURI='http://www.w3.org/1999/xhtml' + localName='html' /> + </mime-type> + + <mime-type type="application/vnd.ms-powerpoint"> + <glob pattern="*.ppz" /> + <glob pattern="*.ppt" /> + <glob pattern="*.pps" /> + <glob pattern="*.pot" /> + <magic priority="50"> + <match value="0xcfd0e011" type="little32" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/vnd.ms-excel"> + <magic priority="50"> 
+ <match value="Microsoft Excel 5.0 Worksheet" type="string" + offset="2080" /> + </magic> + <glob pattern="*.xls" /> + <glob pattern="*.xlc" /> + <glob pattern="*.xll" /> + <glob pattern="*.xlm" /> + <glob pattern="*.xlw" /> + <glob pattern="*.xla" /> + <glob pattern="*.xlt" /> + <glob pattern="*.xld" /> + <alias type="application/msexcel" /> + </mime-type> + + <mime-type type="application/vnd.oasis.opendocument.text"> + <glob pattern="*.odt" /> + </mime-type> + + + <mime-type type="application/zip"> + <alias type="application/x-zip-compressed" /> + <magic priority="40"> + <match value="PK\003\004" type="string" offset="0" /> + </magic> + <glob pattern="*.zip" /> + </mime-type> + + <mime-type type="application/vnd.oasis.opendocument.text"> + <glob pattern="*.oth" /> + </mime-type> + + <mime-type type="application/msword"> + <magic priority="50"> + <match value="\x31\xbe\x00\x00" type="string" offset="0" /> + <match value="PO^Q`" type="string" offset="0" /> + <match value="\376\067\0\043" type="string" offset="0" /> + <match value="\333\245-\0\0\0" type="string" offset="0" /> + <match value="Microsoft Word 6.0 Document" type="string" + offset="2080" /> + <match value="Microsoft Word document data" type="string" + offset="2112" /> + </magic> + <glob pattern="*.doc" /> + <alias type="application/vnd.ms-word" /> + </mime-type> + + <mime-type type="application/octet-stream"> + <magic priority="50"> + <match value="\037\036" type="string" offset="0" /> + <match value="017437" type="host16" offset="0" /> + <match value="0x1fff" type="host16" offset="0" /> + <match value="\377\037" type="string" offset="0" /> + <match value="0145405" type="host16" offset="0" /> + </magic> + <glob pattern="*.bin" /> + </mime-type> + + <mime-type type="application/pdf"> + <magic priority="50"> + <match value="%PDF-" type="string" offset="0" /> + </magic> + <glob pattern="*.pdf" /> + <alias type="application/x-pdf" /> + </mime-type> + + <mime-type type="application/atom+xml"> + <root-XML 
localName="feed" + namespaceURI="http://purl.org/atom/ns#" /> + </mime-type> + + <mime-type type="application/mac-binhex40"> + <glob pattern="*.hqx" /> + </mime-type> + + <mime-type type="application/mac-compactpro"> + <glob pattern="*.cpt" /> + </mime-type> + + <mime-type type="application/rtf"> + <glob pattern="*.rtf"/> + <alias type="text/rtf" /> + </mime-type> + + <mime-type type="application/rss+xml"> + <alias type="text/rss" /> + <root-XML localName="rss" /> + <root-XML namespaceURI="http://purl.org/rss/1.0/" /> + <glob pattern="*.rss" /> + </mime-type> + + <!-- added in by mattmann --> + <mime-type type="application/x-mif"> + <alias type="application/vnd.mif" /> + </mime-type> + + <mime-type type="application/vnd.wap.wbxml"> + <glob pattern="*.wbxml" /> + </mime-type> + + <mime-type type="application/vnd.wap.wmlc"> + <_comment>Compiled WML Document</_comment> + <glob pattern="*.wmlc" /> + </mime-type> + + <mime-type type="application/vnd.wap.wmlscriptc"> + <_comment>Compiled WML Script</_comment> + <glob pattern="*.wmlsc" /> + </mime-type> + + <mime-type type="text/vnd.wap.wmlscript"> + <_comment>WML Script</_comment> + <glob pattern="*.wmls" /> + </mime-type> + + <mime-type type="application/x-bzip"> + <alias type="application/x-bzip2" /> + </mime-type> + + <mime-type type="application/x-bzip-compressed-tar"> + <glob pattern="*.tbz" /> + <glob pattern="*.tbz2" /> + </mime-type> + + <mime-type type="application/x-cdlink"> + <_comment>Virtual CD-ROM CD Image File</_comment> + <glob pattern="*.vcd" /> + </mime-type> + + <mime-type type="application/x-director"> + <_comment>Shockwave Movie</_comment> + <glob pattern="*.dcr" /> + <glob pattern="*.dir" /> + <glob pattern="*.dxr" /> + </mime-type> + + <mime-type type="application/x-futuresplash"> + <_comment>Macromedia FutureSplash File</_comment> + <glob pattern="*.spl" /> + </mime-type> + + <mime-type type="application/x-java"> + <alias type="application/java" /> + </mime-type> + + <mime-type 
type="application/x-koan"> + <_comment>SSEYO Koan File</_comment> + <glob pattern="*.skp" /> + <glob pattern="*.skd" /> + <glob pattern="*.skt" /> + <glob pattern="*.skm" /> + </mime-type> + + <mime-type type="application/x-latex"> + <_comment>LaTeX Source Document</_comment> + <glob pattern="*.latex" /> + </mime-type> + + <!-- JC CHANGED + <mime-type type="application/x-mif"> + <_comment>FrameMaker MIF document</_comment> + <glob pattern="*.mif"/> + </mime-type> --> + + <mime-type type="application/ogg"> + <alias type="application/x-ogg" /> + </mime-type> + + <mime-type type="application/x-rar"> + <alias type="application/x-rar-compressed" /> + </mime-type> + + <mime-type type="application/x-shellscript"> + <alias type="application/x-sh" /> + </mime-type> + + <mime-type type="application/xhtml+xml"> + <glob pattern="*.xht" /> + </mime-type> + + <mime-type type="audio/midi"> + <glob pattern="*.kar" /> + </mime-type> + + <mime-type type="audio/x-pn-realaudio"> + <alias type="audio/x-realaudio" /> + </mime-type> + + <mime-type type="image/tiff"> + <magic priority="50"> + <match value="0x4d4d2a00" type="string" offset="0" /> + <match value="0x49492a00" type="string" offset="0" /> + </magic> + </mime-type> + + <mime-type type="message/rfc822"> + <magic priority="50"> + <match type="string" value="Relay-Version:" offset="0" /> + <match type="string" value="#! rnews" offset="0" /> + <match type="string" value="N#! 
rnews" offset="0" /> + <match type="string" value="Forward to" offset="0" /> + <match type="string" value="Pipe to" offset="0" /> + <match type="string" value="Return-Path:" offset="0" /> + <match type="string" value="From:" offset="0" /> + <match type="string" value="Message-ID:" offset="0" /> + <match type="string" value="Date:" offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/x-javascript"> + <glob pattern="*.js" /> + </mime-type> + + + <mime-type type="image/vnd.wap.wbmp"> + <_comment>Wireless Bitmap File Format</_comment> + <glob pattern="*.wbmp" /> + </mime-type> + + <mime-type type="image/x-psd"> + <alias type="image/photoshop" /> + </mime-type> + + <mime-type type="image/x-xcf"> + <alias type="image/xcf" /> + <magic priority="50"> + <match type="string" value="gimp xcf " offset="0" /> + </magic> + </mime-type> + + <mime-type type="application/x-shockwave-flash"> + <glob pattern="*.swf"/> + <magic priority="50"> + <match type="string" value="FWS" offset="0"/> + <match type="string" value="CWS" offset="0"/> + </magic> + </mime-type> + + <mime-type type="model/iges"> + <_comment> + Initial Graphics Exchange Specification Format + </_comment> + <glob pattern="*.igs" /> + <glob pattern="*.iges" /> + </mime-type> + + <mime-type type="model/mesh"> + <glob pattern="*.msh" /> + <glob pattern="*.mesh" /> + <glob pattern="*.silo" /> + </mime-type> + + <mime-type type="model/vrml"> + <glob pattern="*.vrml" /> + </mime-type> + + <mime-type type="text/x-tcl"> + <alias type="application/x-tcl" /> + </mime-type> + + <mime-type type="text/x-tex"> + <alias type="application/x-tex" /> + </mime-type> + + <mime-type type="text/x-texinfo"> + <alias type="application/x-texinfo" /> + </mime-type> + + <mime-type type="text/x-troff-me"> + <alias type="application/x-troff-me" /> + </mime-type> + + <mime-type type="video/vnd.mpegurl"> + <glob pattern="*.mxu" /> + </mime-type> + + <mime-type type="x-conference/x-cooltalk"> + <_comment>Cooltalk Audio</_comment> + 
<glob pattern="*.ice" /> + </mime-type> + +</mime-info> Added: trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/commons-2.0.1-SNAPSHOT.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/commons-httpclient-3.0.1.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nat/archive/lib/fastutil-5.0.3.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java =================================================================== --- trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java (rev 0) +++ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcReader.java 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,273 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.nutchwax; + +import java.util.Iterator; +import java.util.Map; +import java.util.HashMap; +import java.io.IOException; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.ArchiveRecord; +import org.archive.io.ArchiveRecordHeader; + +import org.archive.io.arc.ARCConstants; +import org.archive.io.arc.ARCReader; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; +import org.archive.io.warc.WARCConstants; +import org.archive.io.warc.WARCRecord; + +import org.apache.commons.httpclient.Header; + + +/** + * <p> + * Reader of both ARC and WARC format archive files. This is not a + * general-purpose archive file reader, but is written specifically + * for NutchWAX. It's possible that this could become a + * general-purpose archive file reader, but for now, consider it + * custom-tailored to the needs of NutchWAX. + * </p> + * <p> + * <code>ArcReader</code> is a wrapper around the underlying + * <code>ArchiveReader</code> implementation + * (<code>ARCReader</code>/<code>WARCReader</code>) which converts + * <code>WARCRecord</code>s to <code>ARCRecord</code>s on the fly. + * </p> + * <p> + * If an <code>ARCReader</code> is being wrapped, then the + * underlying <code>ARCRecord</code>s are read and passed-through + * unmolested. + * </p> + * <p> + * If a <code>WARCReader</code> is being wrapped, then the + * <code>WARCRecord</code>s are converted to <code>ARCRecord</code>s + * on the fly. 
+ * </p> + * <p> + * <strong>WARNING:</strong> We only convert WARC + * <code>response</code> records. All other WARC record types are + * returned as <code>null</code> by the iterator's + * <code>next()</code> method. So, when using the iterator, don't + * forget to check for a <code>null</code> value returned by + * <code>next()</code>. + * </p> + */ +public class ArcReader implements Iterable<ARCRecord> +{ + private ArchiveReader reader; + + /** + * Construct an <code>ArcReader</code> wrapping an + * <code>ArchiveReader</code> instance. + * + * @param reader the ArchiveReader instance to wrap + */ + public ArcReader( ArchiveReader reader ) + { + this.reader = reader; + } + + /** + * Returns an iterator over <code>ARCRecord</code>s in the wrapped + * <code>ArchiveReader</code>, converting <code>WARCRecords</code> + * to <code>ARCRecords</code> on-the-fly. + * + * @return an iterator + */ + public Iterator<ARCRecord> iterator( ) + { + return new ArcIterator( ); + } + + /** + * + */ + private class ArcIterator implements Iterator<ARCRecord> + { + private Iterator<ArchiveRecord> i; + + /** + * Construct an <code>ArcIterator</code>, skipping the header + * record if the wrapped reader is an <code>ARCReader</code>. + */ + public ArcIterator( ) + { + this.i = ArcReader.this.reader.iterator( ); + + if ( ArcReader.this.reader instanceof ARCReader ) + { + // Skip the first record, which is a "filedesc://" + // record describing the ARC file. + if ( this.i.hasNext( ) ) this.i.next( ); + } + } + + /** + * Returns <code>true</code> if the iteration has more elements. + * Will return <code>true</code> even if the value returned by the + * next call to <code>next()</code> returns <code>null</code>. + * + * @return <code>true</code> if the iterator has more elements. + */ + public boolean hasNext( ) + { + return this.i.hasNext( ); + } + + /** + * Returns the next element in the iteration. 
Calling this method + * repeatedly until the <code>hasNext()</code> method returns + * <code>false</code> will return each element in the underlying + * collection exactly once. + * + * @return the next element in the iteration, which can be <code>null</code> + */ + public ARCRecord next( ) + { + try + { + ArchiveRecord record = this.i.next( ); + + if ( record instanceof ARCRecord ) + { + // Just return the ARCRecord as-is. + ARCRecord arc = (ARCRecord) record; + + return arc; + } + + if ( record instanceof WARCRecord ) + { + WARCRecord warc = (WARCRecord) record; + + ARCRecord arc = convert( warc ); + + return arc; + } + + // If we get here then the record we read in was neither an ARC + // nor a WARC record. What is a good exception to throw? + throw new RuntimeException( "Record neither ARC nor WARC: " + record.getClass( ) ); + } + catch ( IOException ioe ) + { + throw new RuntimeException( ioe ); + } + } + + /** + * Unsupported optional operation. + * + * @throws UnsupportedOperationException + */ + public void remove( ) + { + throw new UnsupportedOperationException( ); + } + + /** + * Convert a WARCRecord to an ARCRecord. Only "response" + * WARCRecords are converted to meaningful ARCRecords. All other + * WARCRecord types are converted to <code>null</code>. + * + * @param warc the WARCRecord to convert + * @return the corresponding ARCRecord, <code>null</code> if WARCRecord not a "response" record + */ + private ARCRecord convert( WARCRecord warc ) + throws IOException + { + ArchiveRecordHeader header = warc.getHeader( ); + + // We only care about "response" WARC records. + if ( ! WARCConstants.RESPONSE.equals( header.getHeaderValue( WARCConstants.HEADER_KEY_TYPE ) ) ) + { + return null; + } + + // Construct an ARCRecordMetaData object based on the info in + // the ArchiveRecordHeader. 
+ Map arcMetadataFields = new HashMap( ); + arcMetadataFields.put( ARCConstants.URL_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_URI ) ); + arcMetadataFields.put( ARCConstants.IP_HEADER_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_IP ) ); + arcMetadataFields.put( ARCConstants.DATE_FIELD_KEY, header.getHeaderValue( WARCConstants.HEADER_KEY_DATE ) ); + arcMetadataFields.put( ARCConstants.MIMETYPE_FIELD_KEY, header.getHeaderValue( null ) ); // We don't know the MIME type of the *payload* in a WARC (yet) + arcMetadataFields.put( ARCConstants.LENGTH_FIELD_KEY, header.getHeaderValue( WARCConstants.CONTENT_LENGTH ) ); + arcMetadataFields.put( ARCConstants.VERSION_FIELD_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these? + arcMetadataFields.put( ARCConstants.ABSOLUTE_OFFSET_KEY, header.getHeaderValue( null ) ); // FIXME: Do we need actual values for these? + + ARCRecordMetaData metadata = new ARCRecordMetaData( header.getReaderIdentifier( ), arcMetadataFields ); + + // Then, create an ARCRecord using the WARCRecord and the + // ARCRecordMetaData object we just created. + ARCRecord arc = new ARCRecord( warc, + metadata, + 0, // offset + ArcReader.this.reader.isDigest( ), + ArcReader.this.reader.isStrict( ), + true // parse HTTP headers + ); + + // Now that we've created the ARCRecord, we get the HTTP headers + // from it. From these HTTP headers, we obtain the Content-Type + // of the ARCRecord's payload, then set value as the MIME-type + // of the ARCRecord itself. + + // If the response is something other than HTTP + // (like DNS) there are no HTTP headers. 
+ if ( arc.getHttpHeaders( ) != null ) + { + for ( Header h : arc.getHttpHeaders( ) ) + { + if ( h.getName( ).equals( "Content-Type" ) ) + { + arc.getMetaData( ).getHeaderFields( ).put( ARCConstants.MIMETYPE_FIELD_KEY, h.getValue( ) ); + } + } + } + + return arc; + } + + } + + /** + * Simple test/debug driver to read an archive file and print out + * the header for each record. + */ + public static void main( String args[] ) throws Exception + { + if ( args.length != 1 ) + { + System.out.println( "ReaderTest <(w)arc file>" ); + System.exit( 1 ); + } + + String arcName = args[0]; + + ArchiveReader r = ArchiveReaderFactory.get( arcName ); + + ArcReader reader = new ArcReader( r ); + + for ( ARCRecord rec : reader ) + { + if ( rec != null ) System.out.println( rec.getHeader( ) ); + } + } +} Added: trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java =================================================================== --- trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java (rev 0) +++ trunk/archive-access/projects/nat/archive/src/java/org/archive/nutchwax/ArcsToSegment.java 2008-05-14 00:20:24 UTC (rev 2265) @@ -0,0 +1,553 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.nutchwax; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Map.Entry; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.crawl.SignatureFactory; +import org.apache.nutch.fetcher.FetcherOutputFormat; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLFilterException; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import 
org.apache.nutch.util.StringUtil; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; + + +/** + * Convert archive files (.arc/.warc) to a Nutch segment. This + * is sometimes called "importing" other times "converting", the terms + * are equivalent. + * + * <code>ArcsToSegment</code> is coded as a Hadoop job and is intended + * to be run within the Hadoop framework, or at least started by the + * Hadoop launcher incorporated into Nutch. Although there is a + * <code>main</code> driver, the Nutch launcher script is strongly + * recommended. + * + * This class was initially adapted from the Nutch + * <code>Fetcher</code> class. The premise is since the Nutch + * fetching process acquires external content and places it in a Nutch + * segment, we can perform a similar activity by taking content from + * the ARC files and placing that content in a Nutch segment in a + * similar fashion. Ideally, once the <code>ArcsToSegment</code> is + * used to import a set of ARCs into a Nutch segment, the resulting + * segment should be more-or-less the same as one created by Nutch's + * own Fetcher. + * + * Since we are mimicking the Nutch Fetcher, we have to be careful + * about some implementation details that might not seem relevant + * to the importing of ARC files. I've noted those details with + * comments prefaced with "?:". + */ +public class ArcsToSegment extends Configured implements Tool, Mapper +{ + + public static final Log LOG = LogFactory.getLog( ArcsToSegment.class ); + + private JobConf jobConf; + private URLFilters urlFilters; + private ScoringFilters scfilters; + private ParseUtil parseUtil; + private URLNormalizers normalizers; + private int interval; + + private long numSkipped; + private long numImported; + private long bytesSkipped; + private long bytesImported; + + /** + * ?: Is this necessary? 
+ */ + public ArcsToSegment() + { + + } + + /** + * <p>Constructor that sets the job configuration.</p> + * + * @param conf + */ + public ArcsToSegment( Configuration conf ) + { + setConf( conf ); + } + + /** + * <p>Configures the job. Sets the url filters, scoring filters, url normalizers + * and other relevant data.</p> + * + * @param job The job configuration. + */ + public void configure( JobConf job ) + { + // set the url filters, scoring filters, the parse util and the url + // normalizers + this.jobConf = job; + this.urlFilters = new URLFilters ( jobConf ); + this.scfilters = new ScoringFilters( jobConf ); + this.parseUtil = new ParseUtil ( jobConf ); + this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER ); + this.interval = jobConf.getInt( "db.fetch.interval.default", 2592000 ); + } + + /** + * In Mapper interface. + * @inherit + */ + public void close() + { + + } + + /** + * <p>Runs the Map job to translate an arc file into output for Nutch + * segments.</p> + * + * @param key Line number in manifest corresponding to the <code>value</code> + * @param value A line from the manifest + * @param output The output collector. + * @param reporter The progress reporter. 
+ */ + public void map( WritableComparable key, + Writable value, + OutputCollector output, + Reporter reporter ) + throws IOException + { + String arcUrl = ""; + String collection = ""; + String segmentName = getConf().get( Nutch.SEGMENT_NAME_KEY ); + + // Each line of the manifest is "<url> <collection>" where <collection> is optional + String[] line = value.toString().split( " " ); + arcUrl = line[0]; + + if ( line.length > 1 ) + { + collection = line[1]; + } + + if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl ); + + ArchiveReader r = ArchiveReaderFactory.get( arcUrl ); + + ArcReader reader = new ArcReader( r ); + + try + { + for ( ARCRecord record : reader ) + { + // When reading WARC files, records of type other than + // "response" are returned as 'null' by the Iterator, so + // we skip them. + if ( record == null ) continue ; + + importRecord( record, segmentName, collection, output ); + + // FIXME: What does this do exactly? + reporter.progress(); + } + } + finally + { + r.close(); + + if ( LOG.isInfoEnabled() ) + { + LOG.info( "Completed ARC: " + arcUrl ); + LOG.info( "URLs skipped : " + this.numSkipped ); + LOG.info( "URLs imported: " + this.numImported ); + LOG.info( "URLs total : " + ( this.numSkipped + this.numImported ) ); + } + } + + } + + /** + * Import an ARCRecord. + * + * @param record + * @param segmentName + * @param collectionName + * @param output + * @return whether record was imported or not (i.e. filtered out due to URL filtering rules, etc.) + */ + private boolean importRecord( ARCRecord record, String segmentName, String collectionName, OutputCollector output ) + { + ARCRecordMetaData meta = record.getMetaData(); + + if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ")" ); + + /* ?: On second thought, DON'T do this. Even if we don't have a + parser registered for a content-type, we still want to index + its URL and possibly other meta-data. 
+ */ + /* + // First, check to see if we have a parser registered for the + // URL's Content-Type, so we don't read in some huge video file + // only to discover we don't have a parser for it. + if ( ! this.hasRegisteredParser( meta.getMimetype() ) ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "No parser registered for: " + meta.getMimetype() ); + + this.numSkipped++; + this.bytesSkipped += meta.getLength(); + + return false ; + } + */ + + // ?: Arguably, we shouldn't be normalizing nor filtering based + // on the URL. If the document made it into the (W)ARC file, then + // it should be indexed. But then again, the normalizers and + // filters can be disabled in the Nutch configuration files. + String url = this.normalizeAndFilterUrl( meta.getUrl() ); + + if ( url == null ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() ); + + this.numSkipped++; + this.bytesSkipped += meta.getLength(); + + return false; + } + + // URL is good, let's import the content. + if ( LOG.isInfoEnabled() ) LOG.info( "Import URL: " + meta.getUrl() ); + this.numImported++; + this.bytesImported += meta.getLength(); + + try + { + ... [truncated message content] |
From: <bi...@us...> - 2008-06-02 18:55:10
|
Revision: 2272 http://archive-access.svn.sourceforge.net/archive-access/?rev=2272&view=rev Author: binzino Date: 2008-06-02 11:53:46 -0700 (Mon, 02 Jun 2008) Log Message: ----------- Move NutchWAX 0.12 from 'projects/nat' to 'projects/nutchwax'. Added Paths: ----------- trunk/archive-access/projects/nutchwax/ Removed Paths: ------------- trunk/archive-access/projects/nat/ Copied: trunk/archive-access/projects/nutchwax (from rev 2271, trunk/archive-access/projects/nat) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |