You can subscribe to this list here.
2005 | Jan | Feb | Mar | Apr | May | Jun | Jul (1) | Aug (10) | Sep (36) | Oct (339) | Nov (103) | Dec (152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 | Jan (141) | Feb (102) | Mar (125) | Apr (203) | May (57) | Jun (30) | Jul (139) | Aug (46) | Sep (64) | Oct (105) | Nov (34) | Dec (162) |
2007 | Jan (81) | Feb (57) | Mar (141) | Apr (72) | May (9) | Jun (1) | Jul (144) | Aug (88) | Sep (40) | Oct (43) | Nov (34) | Dec (20) |
2008 | Jan (44) | Feb (45) | Mar (16) | Apr (36) | May (8) | Jun (77) | Jul (177) | Aug (66) | Sep (8) | Oct (33) | Nov (13) | Dec (37) |
2009 | Jan (2) | Feb (5) | Mar (8) | Apr | May (36) | Jun (19) | Jul (46) | Aug (8) | Sep (1) | Oct (66) | Nov (61) | Dec (10) |
2010 | Jan (13) | Feb (16) | Mar (38) | Apr (76) | May (47) | Jun (32) | Jul (35) | Aug (45) | Sep (20) | Oct (61) | Nov (24) | Dec (16) |
2011 | Jan (22) | Feb (34) | Mar (11) | Apr (8) | May (24) | Jun (23) | Jul (11) | Aug (42) | Sep (81) | Oct (48) | Nov (21) | Dec (20) |
2012 | Jan (30) | Feb (25) | Mar (4) | Apr (6) | May (1) | Jun (5) | Jul (5) | Aug (8) | Sep (6) | Oct (6) | Nov | Dec |
From: Brad <bra...@us...> - 2005-12-01 01:48:30
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv6166/src/java/org/archive/wayback Modified Files: QueryRenderer.java Log Message: Comments Index: QueryRenderer.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/QueryRenderer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** QueryRenderer.java 16 Nov 2005 03:11:29 -0000 1.1 --- QueryRenderer.java 1 Dec 2005 01:48:22 -0000 1.2 *************** *** 43,55 **** public interface QueryRenderer extends PropertyConfigurable { public void renderException(HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, WaybackException exception) throws ServletException, IOException; public void renderUrlResults(HttpServletRequest httpRequest, ! HttpServletResponse response, WaybackRequest wbRequest, SearchResults results, ReplayResultURIConverter uriConverter) throws ServletException, IOException; public void renderUrlPrefixResults(HttpServletRequest httpRequest, HttpServletResponse response, WaybackRequest wbRequest, --- 43,88 ---- public interface QueryRenderer extends PropertyConfigurable { + /** Show an output page indicating that a request to the Wayback Machine + * failed for some reason, as determined by the WaybackException argument. 
+ * + * @param httpRequest the HttpServletRequest + * @param httpResponse the HttpServletResponse + * @param wbRequest the WaybackRequest that caused the exception + * @param exception the WaybackException thrown while handling + * @throws ServletException + * @throws IOException + */ public void renderException(HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, WaybackException exception) throws ServletException, IOException; + /** Show the SearchResults of the request for this particular URL + * + * @param httpRequest the HttpServletRequest + * @param httpResponse the HttpServletResponse + * @param wbRequest the WaybackRequest that returned the results + * @param results the SearchResults that the WaybackRequest matched + * @param uriConverter the URI converter to use to translate matching + * results into replayable URLs + * @throws ServletException + * @throws IOException + */ public void renderUrlResults(HttpServletRequest httpRequest, ! HttpServletResponse httpResponse, WaybackRequest wbRequest, SearchResults results, ReplayResultURIConverter uriConverter) throws ServletException, IOException; + /** Show the SearchResults of the request which may have resulted in + * multiple matching URLs. + * + * @param httpRequest the HttpServletRequest + * @param response the HttpServletResponse + * @param wbRequest the WaybackRequest that returned the results + * @param results the SearchResults that the WaybackRequest matched + * @param uriConverter the URI converter to use to translate matching + * results into replayable URLs + * @throws ServletException + * @throws IOException + */ public void renderUrlPrefixResults(HttpServletRequest httpRequest, HttpServletResponse response, WaybackRequest wbRequest, |
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20026/src/java/org/archive/access/nutch Modified Files: ImportArcs.java IndexArcs.java Removed Files: CollectionDeleteDuplicates.java NutchwaxIndexMerger.java NutchwaxSegmentMergeTool.java Log Message: Added some comments. --- CollectionDeleteDuplicates.java DELETED --- Index: ImportArcs.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/ImportArcs.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** ImportArcs.java 29 Nov 2005 21:43:43 -0000 1.2 --- ImportArcs.java 30 Nov 2005 03:52:48 -0000 1.3 *************** *** 31,62 **** import java.util.logging.Level; import java.util.logging.Logger; - import java.net.URI; import org.apache.commons.httpclient.Header; ! import org.apache.nutch.io.Writable; import org.apache.nutch.io.WritableComparable; - import org.apache.nutch.io.UTF8; - import org.apache.nutch.io.MD5Hash; - import org.apache.nutch.protocol.Content; - import org.apache.nutch.util.NutchConf; - import org.apache.nutch.util.NutchConfigured; - import org.apache.nutch.util.mime.MimeType; - import org.apache.nutch.util.mime.MimeTypes; - import org.apache.nutch.util.mime.MimeTypeException; - import org.apache.nutch.mapred.JobConf; import org.apache.nutch.mapred.JobClient; import org.apache.nutch.mapred.Mapper; import org.apache.nutch.mapred.OutputCollector; import org.apache.nutch.mapred.Reporter; - import org.apache.nutch.crawl.CrawlDatum; - import org.apache.nutch.crawl.Fetcher; - import org.apache.nutch.crawl.FetcherOutput; - import org.apache.nutch.crawl.FetcherOutputFormat; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.ParseUtil; ! import org.apache.nutch.parse.ParseImpl; ! 
import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCReaderFactory; --- 31,59 ---- import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.httpclient.Header; ! import org.apache.nutch.crawl.CrawlDatum; ! import org.apache.nutch.crawl.Fetcher; ! import org.apache.nutch.crawl.FetcherOutput; ! import org.apache.nutch.crawl.FetcherOutputFormat; ! import org.apache.nutch.io.MD5Hash; ! import org.apache.nutch.io.UTF8; import org.apache.nutch.io.Writable; import org.apache.nutch.io.WritableComparable; import org.apache.nutch.mapred.JobClient; + import org.apache.nutch.mapred.JobConf; import org.apache.nutch.mapred.Mapper; import org.apache.nutch.mapred.OutputCollector; import org.apache.nutch.mapred.Reporter; import org.apache.nutch.parse.Parse; + import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.ParseUtil; ! import org.apache.nutch.protocol.Content; ! import org.apache.nutch.util.NutchConf; ! import org.apache.nutch.util.NutchConfigured; ! import org.apache.nutch.util.mime.MimeType; ! import org.apache.nutch.util.mime.MimeTypeException; ! import org.apache.nutch.util.mime.MimeTypes; import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCReaderFactory; *************** *** 96,100 **** this.collectionName = job.get("archive.collection", "web"); this.segmentName = job.get(Fetcher.SEGMENT_NAME_KEY); ! if (job.getBoolean("importarcs.verbose", false)) { LOG.setLevel(Level.FINE); --- 93,97 ---- this.collectionName = job.get("archive.collection", "web"); this.segmentName = job.get(Fetcher.SEGMENT_NAME_KEY); ! 
if (job.getBoolean("importarcs.verbose", false)) { LOG.setLevel(Level.FINE); --- NutchwaxSegmentMergeTool.java DELETED --- --- NutchwaxIndexMerger.java DELETED --- Index: IndexArcs.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/IndexArcs.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** IndexArcs.java 29 Nov 2005 21:43:43 -0000 1.2 --- IndexArcs.java 30 Nov 2005 03:52:48 -0000 1.3 *************** *** 17,35 **** package org.archive.access.nutch; ! import java.io.*; ! import java.net.*; ! import java.util.*; ! import java.text.*; ! import java.util.logging.*; ! import org.apache.nutch.io.*; ! import org.apache.nutch.fs.*; ! import org.apache.nutch.util.*; ! import org.apache.nutch.mapred.*; ! import org.apache.nutch.crawl.*; import org.apache.nutch.indexer.IndexMerger; ! import org.apache.nutch.util.mime.MimeType; ! import org.apache.nutch.util.mime.MimeTypeException; ! import org.apache.nutch.util.mime.MimeTypes; public class IndexArcs { --- 17,34 ---- package org.archive.access.nutch; ! import java.io.File; ! import java.text.SimpleDateFormat; ! import java.util.Date; ! import java.util.logging.Logger; ! import org.apache.nutch.crawl.CrawlDb; ! import org.apache.nutch.crawl.DeleteDuplicates; ! import org.apache.nutch.crawl.Indexer; ! import org.apache.nutch.crawl.LinkDb; ! import org.apache.nutch.fs.NutchFileSystem; import org.apache.nutch.indexer.IndexMerger; ! import org.apache.nutch.mapred.JobConf; ! import org.apache.nutch.util.LogFormatter; ! import org.apache.nutch.util.NutchConf; public class IndexArcs { *************** *** 45,50 **** public static void main(String args[]) throws Exception { if (args.length < 2) { ! System.out.println("Usage: IndexArcs <arcsDir> <crawlDir> " + ! 
"[-noimport] [-noinvert] [-noindex]"); return; } --- 44,57 ---- public static void main(String args[]) throws Exception { if (args.length < 2) { ! System.out.println("Usage: IndexArcs <arcDir> <crawlDir> " + ! "[-noimport] [-noupdate] [-noinvert] [-noindex]"); ! System.out.println("Args:"); ! System.out.println(" arcDir Directory of ARCs to index."); ! System.out.println(" crawlDir Directory for indexing product."); ! System.out.println("Options:"); ! System.out.println(" noimport Do not run the ARC import step."); ! System.out.println(" noupdate Do not run the crawl db update step."); ! System.out.println(" noinvert Do not run the invert links step."); ! System.out.println(" noindex Do not run the indexing step."); return; } *************** *** 109,113 **** } - LOG.info("IndexArcs finished: " + crawlDir); } --- 116,119 ---- |
From: Michael S. <sta...@us...> - 2005-11-30 03:24:22
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/bin In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15535/bin Removed Files: arcs2segs.sh indexarcs.sh nutch Log Message: * bin/arcs2segs.sh * bin/indexarcs.sh * bin/nutch Remove vestiges of pre-mapreduce nutchwax. --- arcs2segs.sh DELETED --- --- indexarcs.sh DELETED --- --- nutch DELETED --- |
From: Gordon M. <go...@us...> - 2005-11-30 02:36:59
|
Update of /cvsroot/archive-access/archive-access/projects/wayback In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv8448 Added Files: plan.txt Log Message: * plan.txt project roadmap/todo/brainstorming notes; initial commit. extend/revise at will --- NEW FILE: plan.txt --- == NGWM NEXT STEPS == 0.2 (week of Nov 28 - Dec 2) - fix windows file-pipeline bug (move pipeline state into bdb?) - better install/admin instructions (in file and in UI) - bundle small default dataset (sample ARC) for immediate use - remove extraneous files from build distro bundle - polish UI to clarify local operation/collection/etc. - release via SF 'file release' mechanism, announce to lists - demo for team next Tuesday (Dec 6) for UI/feature feedback 0.4 (by first week of January) - verified performance at typical scale of contract collections - retrieve/cache/respect freshest robots.txt - handle multiple named collections - nice, flexible results-list UI (classic calendar or other) - admin tasks password-protected - nice to have: manual exclusions - nice to have: floating in-page indicator of WM date/status - nice to have: clean nutch integration 0.6 (as necessary) - TBD == BRAINSTORMING == Error-handling: - when URL not available - if auto-fetch, make sure an in-page floaty announces that it's a fresh retrieve; log to side for QA purposes - if not auto-fetch, offer link to broader search (all local collections) or remote collections (such as public wayback machine) Entry pages: - when scanning ARCs, offer option to add /root pages to 'entry pages' list - also allow admin to add entry pages - display recommended entry pages below main search box (or on separate page) Search/Admin UI: - make host, collection very clear in UI (unless suppressed) - highlight especially 'localhost' connections (different color background?) - ape google/search-engine style to maximum extent practical Replay UI: - in-page presence - collapsable uri/date/collection indicator - mouseover indicators? 
- n/a graphic for n/a images - stayback firefox extension - would look for special indicator inside page that it's a wayback session - if found, would refuse to load inline resources or follow clicks without patching URI back to wayback machine (assuming intercept/callback is possible) Heritrix integration ideas: - bundled? just installed alongside? - simultaneous update during crawl? - 'not here yet but scheduled' error/placeholder IMGs? - schedule-when-requested-via-WM? - linkify all logs/reports to WM? |
From: Michael S. <sta...@us...> - 2005-11-29 21:43:54
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/web In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30555/src/web Modified Files: search.jsp Log Message: Merge 'mapred' branch into HEAD. * .classpath * project.properties Update to point at new 0.8 nutch. * build.xml Merge in 'mapred'. Add job target. * conf/nutch-site.xml Cleanup. Removed unused properties or properties that have same values as nutch-default.xml (Except 'searcher.dir' -- keeping that here because we'll usually want to change it). Reordered so archive properties are towards the end. Brought forward descriptions from nutch-default where missing. * conf/nutch-site.xml.template Copy of nutch-site.xml but with the nutchwax defaults turned on. * src/plugin/build.xml Commented out parse-default. * src/plugin/parse-ext/plugin.xml Changed path to parse-pdf.sh. * src/web/search.jsp 'mapred' update. * bin/indexArcs.sh * conf/ia-parse-plugins.xml * lib/commons-codec-1.3.jar * src/java/org/archive/access/nutch/ImportArcs.java * src/java/org/archive/access/nutch/IndexArcs.java Added. * bin/arc2seg.sh * src/java/org/archive/access/nutch/Arc2Segment.java Removed. Index: search.jsp =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/web/search.jsp,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** search.jsp 24 Nov 2005 03:06:26 -0000 1.27 --- search.jsp 29 Nov 2005 21:43:43 -0000 1.28 *************** *** 16,19 **** --- 16,20 ---- import="org.apache.nutch.util.NutchConf" import="org.archive.access.nutch.NutchwaxQuery" + import="org.archive.util.ArchiveUtils" %><%! *************** *** 186,193 **** String summary = summaries[i]; String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); ! ! String archiveDate = FORMAT.format(new Date(bean.getFetchDate(detail))); ! String archiveDisplayDate = ! 
DISPLAY_FORMAT.format(new Date(bean.getFetchDate(detail))); String archiveCollection = detail.getValue("collection"); String url = detail.getValue("url"); --- 187,194 ---- String summary = summaries[i]; String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); ! ! Date date = new Date(Long.valueOf(detail.getValue("date"))*1000); ! String archiveDate = FORMAT.format(date); ! String archiveDisplayDate = DISPLAY_FORMAT.format(date); String archiveCollection = detail.getValue("collection"); String url = detail.getValue("url"); |
From: Michael S. <sta...@us...> - 2005-11-29 21:43:54
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/plugin In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30555/src/plugin Modified Files: build.xml Log Message: Merge 'mapred' branch into HEAD. * .classpath * project.properties Update to point at new 0.8 nutch. * build.xml Merge in 'mapred'. Add job target. * conf/nutch-site.xml Cleanup. Removed unused properties or properties that have same values as nutch-default.xml (Except 'searcher.dir' -- keeping that here because we'll usually want to change it). Reordered so archive properties are towards the end. Brought forward descriptions from nutch-default where missing. * conf/nutch-site.xml.template Copy of nutch-site.xml but with the nutchwax defaults turned on. * src/plugin/build.xml Commented out parse-default. * src/plugin/parse-ext/plugin.xml Changed path to parse-pdf.sh. * src/web/search.jsp 'mapred' update. * bin/indexArcs.sh * conf/ia-parse-plugins.xml * lib/commons-codec-1.3.jar * src/java/org/archive/access/nutch/ImportArcs.java * src/java/org/archive/access/nutch/IndexArcs.java Added. * bin/arc2seg.sh * src/java/org/archive/access/nutch/Arc2Segment.java Removed. Index: build.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/plugin/build.xml,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** build.xml 25 Jul 2005 20:35:00 -0000 1.5 --- build.xml 29 Nov 2005 21:43:43 -0000 1.6 *************** *** 9,14 **** <ant dir="index-ia" target="deploy"/> <ant dir="query-ia" target="deploy"/> ! <ant dir="parse-default" target="deploy"/> ! </target> <!-- ====================================================== --> --- 9,14 ---- <ant dir="index-ia" target="deploy"/> <ant dir="query-ia" target="deploy"/> ! <!-- <ant dir="parse-default" target="deploy"/> --> ! 
</target> <!-- ====================================================== --> *************** *** 18,22 **** <ant dir="index-ia" target="test"/> <ant dir="query-ia" target="test"/> ! <ant dir="parse-default" target="test"/> </target> --- 18,22 ---- <ant dir="index-ia" target="test"/> <ant dir="query-ia" target="test"/> ! <!-- <ant dir="parse-default" target="test"/> --> </target> *************** *** 27,31 **** <ant dir="index-ia" target="clean"/> <ant dir="query-ia" target="clean"/> ! <ant dir="parse-default" target="clean"/> </target> --- 27,31 ---- <ant dir="index-ia" target="clean"/> <ant dir="query-ia" target="clean"/> ! <!-- <ant dir="parse-default" target="clean"/> --> </target> |
From: Michael S. <sta...@us...> - 2005-11-29 21:43:54
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/plugin/parse-ext In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30555/src/plugin/parse-ext Modified Files: plugin.xml Log Message: Merge 'mapred' branch into HEAD. * .classpath * project.properties Update to point at new 0.8 nutch. * build.xml Merge in 'mapred'. Add job target. * conf/nutch-site.xml Cleanup. Removed unused properties or properties that have same values as nutch-default.xml (Except 'searcher.dir' -- keeping that here because we'll usually want to change it). Reordered so archive properties are towards the end. Brought forward descriptions from nutch-default where missing. * conf/nutch-site.xml.template Copy of nutch-site.xml but with the nutchwax defaults turned on. * src/plugin/build.xml Commented out parse-default. * src/plugin/parse-ext/plugin.xml Changed path to parse-pdf.sh. * src/web/search.jsp 'mapred' update. * bin/indexArcs.sh * conf/ia-parse-plugins.xml * lib/commons-codec-1.3.jar * src/java/org/archive/access/nutch/ImportArcs.java * src/java/org/archive/access/nutch/IndexArcs.java Added. * bin/arc2seg.sh * src/java/org/archive/access/nutch/Arc2Segment.java Removed. Index: plugin.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/plugin/parse-ext/plugin.xml,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** plugin.xml 4 Jun 2005 01:01:32 -0000 1.3 --- plugin.xml 29 Nov 2005 21:43:43 -0000 1.4 *************** *** 24,28 **** contentType="application/pdf" pathSuffix="pdf" ! command="@PWD@/bin/parse-pdf.sh" timeout="30"/> --- 24,28 ---- contentType="application/pdf" pathSuffix="pdf" ! command="bin/parse-pdf.sh" timeout="30"/> |
From: Michael S. <sta...@us...> - 2005-11-29 21:43:54
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30555/src/java/org/archive/access/nutch Added Files: ImportArcs.java IndexArcs.java Removed Files: Arc2Segment.java Log Message: Merge 'mapred' branch into HEAD. * .classpath * project.properties Update to point at new 0.8 nutch. * build.xml Merge in 'mapred'. Add job target. * conf/nutch-site.xml Cleanup. Removed unused properties or properties that have same values as nutch-default.xml (Except 'searcher.dir' -- keeping that here because we'll usually want to change it). Reordered so archive properties are towards the end. Brought forward descriptions from nutch-default where missing. * conf/nutch-site.xml.template Copy of nutch-site.xml but with the nutchwax defaults turned on. * src/plugin/build.xml Commented out parse-default. * src/plugin/parse-ext/plugin.xml Changed path to parse-pdf.sh. * src/web/search.jsp 'mapred' update. * bin/indexArcs.sh * conf/ia-parse-plugins.xml * lib/commons-codec-1.3.jar * src/java/org/archive/access/nutch/ImportArcs.java * src/java/org/archive/access/nutch/IndexArcs.java Added. * bin/arc2seg.sh * src/java/org/archive/access/nutch/Arc2Segment.java Removed. --- NEW FILE: ImportArcs.java --- /* * $Id: ImportArcs.java,v 1.2 2005/11/29 21:43:43 stack-sf Exp $ * * Copyright (C) 2003 Internet Archive. * * This file is part of the archive-access tools project * (http://sourceforge.net/projects/archive-access). * * The archive-access tools are free software; you can redistribute them and/or * modify them under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or any * later version. * * The archive-access tools are distributed in the hope that they will be * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser * Public License for more details. * * You should have received a copy of the GNU Lesser Public License along with * the archive-access tools; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.access.nutch; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.Properties; import java.util.logging.Level; import java.util.logging.Logger; import java.net.URI; import org.apache.commons.httpclient.Header; import org.apache.nutch.io.Writable; import org.apache.nutch.io.WritableComparable; import org.apache.nutch.io.UTF8; import org.apache.nutch.io.MD5Hash; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConf; import org.apache.nutch.util.NutchConfigured; import org.apache.nutch.util.mime.MimeType; import org.apache.nutch.util.mime.MimeTypes; import org.apache.nutch.util.mime.MimeTypeException; import org.apache.nutch.mapred.JobConf; import org.apache.nutch.mapred.JobClient; import org.apache.nutch.mapred.Mapper; import org.apache.nutch.mapred.OutputCollector; import org.apache.nutch.mapred.Reporter; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Fetcher; import org.apache.nutch.crawl.FetcherOutput; import org.apache.nutch.crawl.FetcherOutputFormat; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseImpl; import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCReaderFactory; import org.archive.io.arc.ARCRecord; import org.archive.io.arc.ARCRecordMetaData; import org.archive.util.ArchiveUtils; import org.archive.util.TextUtils; public class ImportArcs extends NutchConfigured implements Mapper { private static final Logger LOG = Logger.getLogger(ImportArcs.class.getName()); private static final String WHITESPACE = "\\s+"; public 
static final String ARCFILENAME_KEY = "arcname"; public static final String ARCFILEOFFSET_KEY = "arcoffset"; public static final String ARCCOLLECTION_KEY = "collection"; private static final String CONTENT_TYPE_KEY = "content-type"; private static final String TEXT_TYPE = "text/"; private static final String APPLICATION_TYPE = "application/"; private boolean indexAll; private int contentLimit; private MimeTypes mimeTypes; private String collectionName; private String segmentName; public ImportArcs() { super(null); } public ImportArcs(NutchConf conf) { super(conf); } public void configure(JobConf job) { setConf(job); this.indexAll = job.getBoolean("archive.index.all", false); this.contentLimit = job.getInt("http.content.limit", 100000); this.mimeTypes = MimeTypes.get(job.get("mime.types.file")); this.collectionName = job.get("archive.collection", "web"); this.segmentName = job.get(Fetcher.SEGMENT_NAME_KEY); if (job.getBoolean("importarcs.verbose", false)) { LOG.setLevel(Level.FINE); } System.setProperty("java.protocol.handler.pkgs", "org.archive.net"); } public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { String arcLocation = ((UTF8)value).toString(); LOG.info("opening "+arcLocation); ARCReader arc = null; String arcName = null; try { arc = ARCReaderFactory.get(arcLocation); } catch (Throwable e) { LOG.log(Level.WARNING, "Error opening: " + arcLocation, e); return; } // Don't run the digester. Digest is unused and it costs CPU. 
arc.setDigest(false); try { for (Iterator i = arc.iterator(); i.hasNext();) { ARCRecord rec = (ARCRecord) i.next(); if (arcName == null) { // first entry has arc name arcName = rec.getMetaData().getUrl(); reporter.setStatus(arcName); } if (rec.getStatusCode() != 200) continue; try { processRecord(arcName, rec, output); } catch (Throwable e) { LOG.log(Level.WARNING, "Error processing: " + arcLocation, e); } } } catch (Throwable e) { // problem parsing arc file LOG.log(Level.WARNING, "Error parsing: " + arcLocation, e); } finally { arc.close(); } } private void processRecord(final String arcName, final ARCRecord rec, OutputCollector output) throws IOException { ARCRecordMetaData arcData = rec.getMetaData(); String url = arcData.getUrl(); // Look at ARCRecord meta data line mimetype. It can be empty. String mimetype = arcData.getMimetype(); if (mimetype != null && mimetype.length() > 0) { mimetype = mimetype.toLowerCase(); } else { MimeType mt = mimeTypes.getMimeType(url); if (mt != null) { mimetype = mt.getName(); } } mimetype = checkMimetype(mimetype); if (skip(mimetype)) { return; } // Copy http headers to nutch metadata. Properties metaData = new Properties(); Header[] headers = rec.getHttpHeaders(); for (int j = 0; j < headers.length; j++) { Header header = headers[j]; if (mimetype == null) { // Special handling. If mimetype is null, try getting it from // the http header. I've seen arc record lines with empty // content-type and a MIME unparseable file ending; i.e. .MID. if (header.getName() != null && header.getName().toLowerCase().equals(CONTENT_TYPE_KEY)) { mimetype = checkMimetype(header.getValue().toLowerCase()); if (skip(mimetype)) { return; } } } metaData.put(header.getName(), header.getValue()); } String noSpacesMimetype = TextUtils.replaceAll(WHITESPACE, mimetype, "-"); LOG.info("adding " + Long.toString(arcData.getLength()) + " bytes of mimetype " + noSpacesMimetype + " " + url); // Add the collection name, the arcfile name, and the offset. 
// Also add mimetype. Needed by the ia indexers. metaData.put(ARCCOLLECTION_KEY, this.collectionName); metaData.put(ARCFILENAME_KEY, arcName); metaData.put(ARCFILEOFFSET_KEY, Long.toString(arcData.getOffset())); metaData.put(CONTENT_TYPE_KEY, mimetype); // Collect content bytes // TODO: Skip if unindexable type. rec.skipHttpHeader(); ByteArrayOutputStream contentBuffer = new ByteArrayOutputStream(); byte[] buf = new byte[1024 * 4]; int total = 0; int len = rec.read(buf, 0, buf.length); while (len != -1 && total < this.contentLimit) { total += len; contentBuffer.write(buf, 0, len); len = rec.read(buf, 0, buf.length); } // System.out.println("--------------"); // System.out.write(contentBuffer.toByteArray()); // System.out.println("--------------"); byte[] contentBytes = contentBuffer.toByteArray(); Content content = new Content(url, url, contentBytes, mimetype, metaData); CrawlDatum datum = new CrawlDatum(); datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); metaData.put(Fetcher.DIGEST_KEY, MD5Hash.digest(contentBytes).toString()); metaData.put(Fetcher.SEGMENT_NAME_KEY, segmentName); metaData.put(Fetcher.SCORE_KEY, Float.toString(datum.getScore())); long date = 0; try { date = ArchiveUtils.parse14DigitDate(arcData.getDate()).getTime(); } catch (java.text.ParseException e) { LOG.severe("Failed parse of date: " + arcData.getDate()); } datum.setFetchTime(date); Parse parse = null; ParseStatus parseStatus; try { parse = ParseUtil.parse(content); parseStatus = parse.getData().getStatus(); } catch (Exception e) { parseStatus = new ParseStatus(e); } if (!parseStatus.isSuccess()) { LOG.warning("Error parsing: "+url+": "+parseStatus); parse = null; } output.collect(new UTF8(url), new FetcherOutput(datum, null, parse!=null ? new ParseImpl(parse):null)); } protected boolean skip(final String mimetype) { boolean decision = false; // Are we to index all content? 
if (!indexAll) { if ((mimetype == null) || (!mimetype.startsWith(TEXT_TYPE) && !mimetype.startsWith(APPLICATION_TYPE))) { // Skip any but basic types. decision = true; } } return decision; } protected static String checkMimetype(String mimetype) { // Test the mimetype makes sense. If not, clear it. try { new MimeType(mimetype); } catch (MimeTypeException e) { mimetype = null; } return mimetype; } public void importArcs(File arcUrlsDir, File segment) throws IOException { LOG.info("ImportArcs: starting"); LOG.info("ImportArcs: arcUrlsDir: " + arcUrlsDir); LOG.info("ImportArcs: segment: " + segment); JobConf job = new JobConf(getConf()); job.setJar("build/nutchwax.job.jar"); job.set(Fetcher.SEGMENT_NAME_KEY, segment.getName()); job.setInputDir(arcUrlsDir); job.setMapperClass(ImportArcs.class); job.setOutputDir(segment); job.setOutputFormat(FetcherOutputFormat.class); job.setOutputKeyClass(UTF8.class); job.setOutputValueClass(FetcherOutput.class); JobClient.runJob(job); LOG.info("ImportArcs: done"); } public static void main(String[] args) throws Exception { // parse command line options String usage = "Usage: ImportArcs arcUrlsDir segmentDir"; if (args.length != 2) { System.err.println(usage); System.exit(-1); } File arcUrlsDir = new File(args[0]); File segmentDir = new File(args[1]); new ImportArcs(NutchConf.get()).importArcs(arcUrlsDir, segmentDir); } } --- NEW FILE: IndexArcs.java --- /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.access.nutch; import java.io.*; import java.net.*; import java.util.*; import java.text.*; import java.util.logging.*; import org.apache.nutch.io.*; import org.apache.nutch.fs.*; import org.apache.nutch.util.*; import org.apache.nutch.mapred.*; import org.apache.nutch.crawl.*; import org.apache.nutch.indexer.IndexMerger; import org.apache.nutch.util.mime.MimeType; import org.apache.nutch.util.mime.MimeTypeException; import org.apache.nutch.util.mime.MimeTypes; public class IndexArcs { public static final Logger LOG = LogFormatter.getLogger(IndexArcs.class.getName()); private static String getDate() { return new SimpleDateFormat("yyyyMMddHHmmss").format (new Date(System.currentTimeMillis())); } /* Import and index a set of arc files. */ public static void main(String args[]) throws Exception { if (args.length < 2) { System.out.println("Usage: IndexArcs <arcsDir> <crawlDir> " + "[-noimport] [-noinvert] [-noindex]"); return; } JobConf conf = new JobConf(NutchConf.get()); File arcsDir = new File(args[0]); File crawlDir = new File(args[1]); boolean noImport = false; boolean noUpdate = false; boolean noInvert = false; boolean noIndex = false; for (int i = 2; i < args.length; i++) { if ("-noimport".equals(args[i])) { noImport = true; } else if ("-noupdate".equals(args[i])) { noUpdate = true; } else if ("-noinvert".equals(args[i])) { noInvert = true; } else if ("-noindex".equals(args[i])) { noIndex = true; } } NutchFileSystem fs = NutchFileSystem.get(conf); LOG.info("IndexArcs started in: " + crawlDir); LOG.info("arcsDir = " + arcsDir); File crawlDb = new File(crawlDir + "/crawldb"); File linkDb = new File(crawlDir + "/linkdb"); File segments = new File(crawlDir + "/segments"); File indexes = new File(crawlDir + "/indexes"); File index = new File(crawlDir + "/index"); File tmpDir = conf.getLocalFile("crawl", getDate()); if (!noImport) { // import 
arcs File segment = new File(segments, getDate()); LOG.info("importing arcs in " + arcsDir + " to " + segment); new ImportArcs(conf).importArcs(arcsDir, segment); } if (!noUpdate) { // update crawldb LOG.info("updating crawldb in " + crawlDb); File[] segmentFiles = fs.listFiles(segments); new CrawlDb(conf).update(crawlDb, segmentFiles[segmentFiles.length-1]); } if (!noInvert) { // invert links LOG.info("inverting links in " + segments); new LinkDb(conf).invert(linkDb, segments); } if (!noIndex) { // index LOG.info("indexing " + crawlDir); new Indexer(conf).index(indexes,crawlDb,linkDb,fs.listFiles(segments)); new DeleteDuplicates(conf).dedup(new File[] { indexes }); new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge(); } LOG.info("IndexArcs finished: " + crawlDir); } } --- Arc2Segment.java DELETED --- |
From: Michael S. <sta...@us...> - 2005-11-29 21:43:53
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/lib In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30555/lib Added Files: commons-codec-1.3.jar Log Message: Merge 'mapred' branch into HEAD. * .classpath * project.properties Update to point at new 0.8 nutch. * build.xml Merge in 'mapred'. Add job target. * conf/nutch-site.xml Cleanup. Removed unused properties or properties that have same values as nutch-default.xml (Except 'searcher.dir' -- keeping that here because we'll usually want to change it). Reordered so archive properties are towards the end. Brought forward descriptions from nutch-default where missing. * conf/nutch-site.xml.template Copy of nutch-site.xml but with the nutchwax defaults turned on. * src/plugin/build.xml Commented out parse-default. * src/plugin/parse-ext/plugin.xml Changed path to parse-pdf.sh. * src/web/search.jsp 'mapred' update. * bin/indexArcs.sh * conf/ia-parse-plugins.xml * lib/commons-codec-1.3.jar * src/java/org/archive/access/nutch/ImportArcs.java * src/java/org/archive/access/nutch/IndexArcs.java Added. * bin/arc2seg.sh * src/java/org/archive/access/nutch/Arc2Segment.java Removed. --- NEW FILE: commons-codec-1.3.jar --- (This appears to be a binary file; contents omitted.) |
From: Michael S. <sta...@us...> - 2005-11-29 21:43:53
|
Update of /cvsroot/archive-access/archive-access/projects/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30555 Modified Files: .classpath build.xml project.properties Log Message: Merge 'mapred' branch into HEAD. * .classpath * project.properties Update to point at new 0.8 nutch. * build.xml Merge in 'mapred'. Add job target. * conf/nutch-site.xml Cleanup. Removed unused properties or properties that have same values as nutch-default.xml (Except 'searcher.dir' -- keeping that here because we'll usually want to change it). Reordered so archive properties are towards the end. Brought forward descriptions from nutch-default where missing. * conf/nutch-site.xml.template Copy of nutch-site.xml but with the nutchwax defaults turned on. * src/plugin/build.xml Commented out parse-default. * src/plugin/parse-ext/plugin.xml Changed path to parse-pdf.sh. * src/web/search.jsp 'mapred' update. * bin/indexArcs.sh * conf/ia-parse-plugins.xml * lib/commons-codec-1.3.jar * src/java/org/archive/access/nutch/ImportArcs.java * src/java/org/archive/access/nutch/IndexArcs.java Added. * bin/arc2seg.sh * src/java/org/archive/access/nutch/Arc2Segment.java Removed. Index: .classpath =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/.classpath,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** .classpath 2 Sep 2005 01:08:18 -0000 1.11 --- .classpath 29 Nov 2005 21:43:42 -0000 1.12 *************** *** 7,11 **** <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="lib" path="nutch/lib/lucene-1.9-rc1-dev.jar"/> ! 
<classpathentry kind="lib" path="nutch/build/nutch-0.7.jar"/> <classpathentry kind="lib" path="lib/arc-1.5.1-200508191341.jar"/> <classpathentry kind="lib" path="lib/commons-httpclient-3.0-alpha2.jar"/> --- 7,11 ---- <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="lib" path="nutch/lib/lucene-1.9-rc1-dev.jar"/> ! <classpathentry kind="lib" path="nutch/build/nutch-0.8-dev.jar"/> <classpathentry kind="lib" path="lib/arc-1.5.1-200508191341.jar"/> <classpathentry kind="lib" path="lib/commons-httpclient-3.0-alpha2.jar"/> Index: project.properties =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/project.properties,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** project.properties 22 Oct 2005 02:43:23 -0000 1.16 --- project.properties 29 Nov 2005 21:43:42 -0000 1.17 *************** *** 18,23 **** # Local jars to add to classpath. maven.jar.override = on - maven.jar.corenutch = ${basedir}/nutch/build/nutch-0.7.jar maven.jar.lucene = ${basedir}/nutch/lib/lucene-1.9-rc1-dev.jar maven.jar.arc = ${basedir}/lib/arc-1.5.1-200508191341.jar maven.jar.servlet-api = ${basedir}/nutch/lib/servlet-api.jar --- 18,23 ---- # Local jars to add to classpath. 
maven.jar.override = on maven.jar.lucene = ${basedir}/nutch/lib/lucene-1.9-rc1-dev.jar + maven.jar.corenutch = ${basedir}/nutch/build/nutch-0.8-dev.jar maven.jar.arc = ${basedir}/lib/arc-1.5.1-200508191341.jar maven.jar.servlet-api = ${basedir}/nutch/lib/servlet-api.jar Index: build.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/build.xml,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** build.xml 28 Nov 2005 22:12:31 -0000 1.13 --- build.xml 29 Nov 2005 21:43:42 -0000 1.14 *************** *** 66,74 **** <copy file="${root}/src/plugin/parse-ext/plugin.xml" todir="${nutch.root}/build/plugins/parse-ext/" ! overwrite="true" ! filtering="true" > ! <filterset> ! <filter token="PWD" value="${root}"/> ! </filterset> </copy> --- 66,70 ---- <copy file="${root}/src/plugin/parse-ext/plugin.xml" todir="${nutch.root}/build/plugins/parse-ext/" ! overwrite="true"> </copy> *************** *** 113,116 **** --- 109,127 ---- <!-- ================================================================== --> + <!-- Make job jar --> + <!-- ================================================================== --> + <!-- --> + <!-- ================================================================== --> + <target name="job" depends="compile"> + <zip destfile="${build.dir}/${name}.job.jar"> + <zipfileset prefix="classes" file="${conf.dir}/ia-parse-plugins.xml"/> + <zipfileset prefix="bin" file="bin/parse-pdf.sh" filemode="755"/> + <zipfileset prefix="classes" dir="${build.classes}"/> + <zipfileset refid="lib.jars"/> + </zip> + </target> + + + <!-- ================================================================== --> <!-- Compile test code --> <!-- ================================================================== --> |
From: Michael S. <sta...@us...> - 2005-11-29 21:43:53
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/conf In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30555/conf Modified Files: nutch-site.xml nutch-site.xml.template Added Files: ia-parse-plugins.xml Log Message: Merge 'mapred' branch into HEAD. * .classpath * project.properties Update to point at new 0.8 nutch. * build.xml Merge in 'mapred'. Add job target. * conf/nutch-site.xml Cleanup. Removed unused properties or properties that have same values as nutch-default.xml (Except 'searcher.dir' -- keeping that here because we'll usually want to change it). Reordered so archive properties are towards the end. Brought forward descriptions from nutch-default where missing. * conf/nutch-site.xml.template Copy of nutch-site.xml but with the nutchwax defaults turned on. * src/plugin/build.xml Commented out parse-default. * src/plugin/parse-ext/plugin.xml Changed path to parse-pdf.sh. * src/web/search.jsp 'mapred' update. * bin/indexArcs.sh * conf/ia-parse-plugins.xml * lib/commons-codec-1.3.jar * src/java/org/archive/access/nutch/ImportArcs.java * src/java/org/archive/access/nutch/IndexArcs.java Added. * bin/arc2seg.sh * src/java/org/archive/access/nutch/Arc2Segment.java Removed. Index: nutch-site.xml.template =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/conf/nutch-site.xml.template,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** nutch-site.xml.template 28 Nov 2005 21:23:22 -0000 1.5 --- nutch-site.xml.template 29 Nov 2005 21:43:42 -0000 1.6 *************** *** 1,17 **** <?xml version="1.0"?> ! <!--Internet Archive Nutch configuration. This config. is what gets built into ! nutchwax. Overrides a few Nutch defaults and adds nutchwax specific ! config (Such config. options have an 'archive' prefix). ! --> ! <nutch-conf> ! <!-- Enable parse-ext (parse-ext is a parser that calls the 'ext'ernal program ! xpdf to parse pdf files). 
Also enable parse-default and the ia plugins. ! --> <property> <name>plugin.includes</name> ! <value>urlfilter-regex|parse-(text|html|ext|default)|index-(basic|ia)|query-(basic|site|url|ia)</value> </property> --- 1,24 ---- <?xml version="1.0"?> ! <!--Internet Archive Nutch(WAX) configuration. Bulk of below is overrides ! for nutch-default.xml but on the end we add a few new properties with ! 'archive' prefix. ! This conf file is picked up and built into nutchwax distribution. Is ! mostly same as the nutch-site.xml but with nutchwax specific configurations ! added: i.e. we index all content rather than just a subset. ! --> ! <nutch-conf> <property> <name>plugin.includes</name> ! <value>urlfilter-regex|parse-(text|html|js|ext)|index-(basic|ia)|query-(basic|site|url|ia)</value> ! <description>Regular expression naming plugin directory names to ! include. Any plugin not matching this expression is excluded. ! In any case you need to at least include the nutch-extensionpoints plugin. ! ! Override Nutch defaults to add nutchwax/ia (Internet Archive) plugins. ! The parse-ext is used to call the native pdftotext when parsing application/pdf. ! </description> </property> *************** *** 27,40 **** <property> ! <name>indexer.boost.by.link.count</name> ! <value>true</value> ! <description>Use in-degree as poor-man's link analysis.</description> </property> <property> ! <name>indexer.max.tokens</name> ! <value>100000</value> ! <description>Don't truncate documents as much as by default. ! </description> </property> --- 34,62 ---- <property> ! <name>indexer.max.tokens</name> ! <value>100000</value> ! <description> ! The maximum number of tokens that will be indexed for a single field ! in a document. This limits the amount of memory required for ! indexing, so that collections with very large files will not crash ! the indexing process by running out of memory. ! ! Note that this effectively truncates large documents, excluding !
from the index tokens that occur further in the document. If you ! know your source documents are large, be sure to set this value ! high enough to accomodate the expected size. If you set it to ! Integer.MAX_VALUE, then the only limit is your memory, but you ! should anticipate an OutOfMemoryError. ! ! Make it ten times default. ! </description> </property> <property> ! <name>parse.plugin.file</name> ! <value>ia-parse-plugins.xml</value> ! <description>The name of the file that defines the associations between ! content-types and parsers. ! </description> </property> *************** *** 42,45 **** --- 64,74 ---- <name>http.content.limit</name> <value>10000000</value> + + <description>The length limit for downloaded content, in bytes. + If this value is nonnegative (>=0), content longer than it will be truncated; + otherwise, no truncation at all. + + Used limiting amount of an ARC Record indexing during ARC ingest time. + </description> </property> *************** *** 47,54 **** <name>io.map.index.skip</name> <value>7</value> ! <description>Use less RAM. Index files get read into memory. This config. ! says read 1/7th only in at a time. Random access is slower but use more ! memory. ! </description> </property> --- 76,83 ---- <name>io.map.index.skip</name> <value>7</value> ! <description>Use less RAM. ! Index files get read into memory. This config. says read 1/7th only in ! at a time. Random access is slower but use more memory. ! </description> </property> *************** *** 60,80 **** more memory but make searches somewhat faster. Larger values use less memory but make searches somewhat slower. ! For lucene indexes, normally. The default is 128. ! Write every 1024 entries rather than every 128, the default. </description> </property> <property> ! <name>indexer.maxMergeDocs</name> ! <value>2147483647</value> ! <description>This number determines the maximum number of Lucene ! Documents to be merged into a new Lucene segment. Larger values ! 
increase indexing speed and reduce the number of Lucene segments, ! which reduces the number of open file handles; however, this also ! increases RAM usage during indexing. ! Doug says: "There was a bogus value for indexer.maxMergeDocs in ! nutch-default.xml which made indexing really slow. The correct ! value is something really big (like Integer.MAX_VALUE)." </description> </property> --- 89,108 ---- more memory but make searches somewhat faster. Larger values use less memory but make searches somewhat slower. ! ! Default is 128. </description> </property> <property> ! <name>indexer.mergeFactor</name> ! <value>30</value> ! <description>The factor that determines the frequency of Lucene segment ! merges. This must not be less than 2, higher values increase indexing ! speed but lead to increased RAM usage, and increase the number of ! open file handles (which may lead to "Too many open files" errors). ! NOTE: the "segments" here have nothing to do with Nutch segments, they ! are a low-level data unit used by Lucene. ! Default is 50. </description> </property> *************** *** 86,90 **** The number of context terms to display preceding and following matching terms in a hit summary. ! Make summaries a little longer than the default. </description> </property> --- 114,119 ---- The number of context terms to display preceding and following matching terms in a hit summary. ! ! Make summaries a little longer than the default. </description> </property> *************** *** 95,98 **** --- 124,143 ---- <description> The total number of terms to display in a hit summary. + + Make summaries a little longer than the default. + </description> + </property> + + <property> + <name>searcher.dir</name> + <value>crawl</value> + <description> + Path to root of crawl. 
This directory is searched (in + order) for either the file search-servers.txt, containing a list of + distributed search servers, or the directory "index" containing + merged indexes, or the directory "segments" containing segment + indexes. + + Included here for convenience. </description> </property> *************** *** 101,140 **** <name>collections.host</name> <value>collections.example.org</value> ! <description>The name of the server hosting collections. ! </description> </property> - <!-- The name of this archive collection. - DEPRECATED. Now search.jsp uses the 'collection' returned by the search - result drawing up the wayback URL and at index time, use the - command-line 'collection' option. - <property> <name>archive.collection</name> ! <value>be05</value> ! </property> ! --> ! <!-- ! <property> ! <name>searcher.dir</name> ! <value>/home/stack/workspace/nutch-datadir</value> ! <description>Optionally, hardcode the nutch datadir location rather ! than rely on tomcat startup location. ! </description> </property> - --> <property> <name>archive.index.all</name> <value>true</value> - <description>If set to true, all contenttypes are indexed. - Otherwise we only index text/* and application/* - </description> </property> <property> <name>archive.skip.big.html</name> ! <value>-1</value> <description>If text/html is larger than value, just skip it completely. Use this setting to bypass problematic massive text/html (We were seeing --- 146,175 ---- <name>collections.host</name> <value>collections.example.org</value> ! <description>The name of the server hosting collections. ! Used by the webapp conjuring URLs that point to page renderor (e.g. wayback). ! </description> </property> <property> <name>archive.collection</name> ! <value>CHANGEME</value> ! <description>Name of collection being searched. Used at ARC ingest time to ! add a 'collection' field to the indexed document. ! Set this before starting an indexing. ! 
</description> </property> + <!--If set to true, all contenttypes are indexed. Otherwise we only + index text/* and application/* + --> <property> <name>archive.index.all</name> <value>true</value> </property> <property> <name>archive.skip.big.html</name> ! <value>10000000</value> <description>If text/html is larger than value, just skip it completely. Use this setting to bypass problematic massive text/html (We were seeing *************** *** 142,152 **** value is -1 which says don't skip text/html docs.</description> </property> <property> <name>archive.index.redirects</name> ! <value>-false</value> <description>If true, we index redirects (status code 30x). </description> </property> - </nutch-conf> --- 177,187 ---- value is -1 which says don't skip text/html docs.</description> </property> + <property> <name>archive.index.redirects</name> ! <value>-true</value> <description>If true, we index redirects (status code 30x). </description> </property> </nutch-conf> --- NEW FILE: ia-parse-plugins.xml --- <?xml version="1.0" encoding="UTF-8"?> <!-- Copyright 2005 The Apache Software Foundation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Author : mattmann Description: This xml file represents a natural ordering for which parsing plugin should get called for a particular mimeType. 
--> <parse-plugins> <!-- by default if the mimeType is set to *, or can't be determined, use parse-text --> <mimeType name="*"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/java"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/msword"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/pdf"> <plugin id="parse-ext" /> </mimeType> <mimeType name="application/rss+xml"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/vnd.ms-excel"> <plugin id="parse-msexcel" /> </mimeType> <mimeType name="application/vnd.ms-powerpoint"> <plugin id="parse-mspowerpoint" /> </mimeType> <mimeType name="application/vnd.wap.wbxml"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/vnd.wap.wmlc"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/vnd.wap.wmlscriptc"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/xhtml+xml"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-bzip2"> <!-- try and parse it with the zip parser --> <plugin id="parse-zip" /> </mimeType> <mimeType name="application/x-csh"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-gzip"> <!-- try and parse it with the zip parser --> <plugin id="parse-zip" /> </mimeType> <mimeType name="application/x-javascript"> <plugin id="parse-js" /> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-kword"> <!-- try and parse it with the word parser --> <plugin id="parse-msword" /> </mimeType> <mimeType name="application/x-kspread"> <!-- try and parse it with the msexcel parser --> <plugin id="parse-msexcel" /> </mimeType> <mimeType name="application/x-latex"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-netcdf"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-sh"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-tcl"> <plugin id="parse-text" /> </mimeType> <mimeType 
name="application/x-tex"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-texinfo"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-troff"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-troff-man"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-troff-me"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-troff-ms"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/zip"> <plugin id="parse-zip" /> </mimeType> <mimeType name="message/news"> <plugin id="parse-text" /> </mimeType> <mimeType name="message/rfc822"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/css"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/html"> <plugin id="parse-html" /> </mimeType> <mimeType name="text/plain"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/richtext"> <plugin id="parse-rtf" /> <plugin id="parse-msword" /> </mimeType> <mimeType name="text/rtf"> <plugin id="parse-rtf" /> <plugin id="parse-msword" /> </mimeType> <mimeType name="text/sgml"> <plugin id="parse-html" /> <plugin id="parse-text" /> </mimeType> <mimeType name="text/tab-separated-values"> <plugin id="parse-msexcel" /> <plugin id="parse-text" /> </mimeType> <mimeType name="text/vnd.wap.wml"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/vnd.wap.wmlscript"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/xml"> <plugin id="parse-text" /> <plugin id="parse-html" /> <plugin id="parse-rss" /> </mimeType> <mimeType name="text/x-setext"> <plugin id="parse-text" /> </mimeType> <!-- Types for parse-ext plugin: required for unit tests to pass. 
--> <mimeType name="application/vnd.nutch.example.cat"> <plugin id="parse-ext" /> </mimeType> <mimeType name="application/vnd.nutch.example.md5sum"> <plugin id="parse-ext" /> </mimeType> </parse-plugins> Index: nutch-site.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/conf/nutch-site.xml,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** nutch-site.xml 30 Sep 2005 21:07:07 -0000 1.27 --- nutch-site.xml 29 Nov 2005 21:43:42 -0000 1.28 *************** *** 1,40 **** <?xml version="1.0"?> ! <!-- Internet Archive Nutch configuration --> <nutch-conf> - - <!-- Override a few Nutch defaults --> - - - <!-- Enable parse-ext (parse-ext is a parser that calls the 'ext'ernal program - xpdf to parse pdf files. Also enable parse-default and the ia plugins. - --> <property> <name>plugin.includes</name> ! <value>urlfilter-regex|parse-(text|html|ext|default)|index-(basic|ia)|query-(basic|site|url|ia)</value> </property> - <!-- keep all links, not just inter-host --> - <!-- db updates will be FASTER if set to true. - Downside is that link text from same site won't be included. - (More valuable to take anchor text from other hosts). Use this - if wide variety of sites to index. - --> <property> <name>db.ignore.internal.links</name> <value>false</value> </property> - <!-- use in-degree as poor-man's link analysis --> <property> ! <name>indexer.boost.by.link.count</name> ! <value>true</value> </property> - <!-- don't truncate documents as much as by default --> <property> ! <name>indexer.max.tokens</name> ! <value>100000</value> </property> --- 1,58 ---- <?xml version="1.0"?> ! <!--Internet Archive Nutch(WAX) configuration. Bulk of below is overrides ! for nutch-default.xml but on the end we add a few new properties with ! 'archive' prefix. ! --> <nutch-conf> <property> <name>plugin.includes</name> ! 
<value>urlfilter-regex|parse-(text|html|js|ext)|index-(basic|ia)|query-(basic|site|url|ia)</value> ! <description>Regular expression naming plugin directory names to ! include. Any plugin not matching this expression is excluded. ! In any case you need at least include the nutch-extensionpoints plugin. ! ! Override Nutch defaults to add nutchwax/ia (Internet Archive) plugins. ! The parse-ext is used to call the native pdftotext parsing application/pdf. ! </description> </property> <property> <name>db.ignore.internal.links</name> <value>false</value> + <description>Keep all links, not just inter-host. db updates will be + FASTER if set to true. Downside is that link text from same site won't + be included (More valuable to take anchor text from other hosts). Use + this if wide variety of sites to index. + </description> </property> <property> ! <name>indexer.max.tokens</name> ! <value>100000</value> ! <description> ! The maximum number of tokens that will be indexed for a single field ! in a document. This limits the amount of memory required for ! indexing, so that collections with very large files will not crash ! the indexing process by running out of memory. ! ! Note that this effectively truncates large documents, excluding ! from the index tokens that occur further in the document. If you ! know your source documents are large, be sure to set this value ! high enough to accomodate the expected size. If you set it to ! Integer.MAX_VALUE, then the only limit is your memory, but you ! should anticipate an OutOfMemoryError. ! ! Make it ten times default. ! </description> </property> <property> ! <name>parse.plugin.file</name> ! <value>ia-parse-plugins.xml</value> ! <description>The name of the file that defines the associations between ! content-types and parsers. ! </description> </property> *************** *** 42,60 **** <name>http.content.limit</name> <value>10000000</value> </property> - <!-- use less RAM --> - <!-- Index files get read into memory. 
This config. says read 1/7th only in - at a time. Random access is slower but use more memory. --> <property> <name>io.map.index.skip</name> <value>7</value> </property> - - - <!-- For lucene indexes, normally. The default is 128. - Write every 1024 entries rather than every 128, the default. - --> <property> <name>indexer.termIndexInterval</name> --- 60,81 ---- <name>http.content.limit</name> <value>10000000</value> + + <description>The length limit for downloaded content, in bytes. + If this value is nonnegative (>=0), content longer than it will be truncated; + otherwise, no truncation at all. + + Used limiting amount of an ARC Record indexing during ARC ingest time. + </description> </property> <property> <name>io.map.index.skip</name> <value>7</value> + <description>Use less RAM. + Index files get read into memory. This config. says read 1/7th only in + at a time. Random access is slower but use more memory. + </description> </property> <property> <name>indexer.termIndexInterval</name> *************** *** 64,87 **** more memory but make searches somewhat faster. Larger values use less memory but make searches somewhat slower. </description> </property> <property> ! <name>indexer.maxMergeDocs</name> ! <value>2147483647</value> ! <description>This number determines the maximum number of Lucene ! Documents to be merged into a new Lucene segment. Larger values ! increase indexing speed and reduce the number of Lucene segments, ! which reduces the number of open file handles; however, this also ! increases RAM usage during indexing. ! Doug says: "There was a bogus value for indexer.maxMergeDocs in ! nutch-default.xml which made indexing really slow. The correct ! value is something really big (like Integer.MAX_VALUE)." </description> </property> - - <!-- make summaries a little longer than the default --> <property> <name>searcher.summary.context</name> --- 85,107 ---- more memory but make searches somewhat faster. 
Larger values use less memory but make searches somewhat slower. + + Default is 128. </description> </property> <property> ! <name>indexer.mergeFactor</name> ! <value>30</value> ! <description>The factor that determines the frequency of Lucene segment ! merges. This must not be less than 2, higher values increase indexing ! speed but lead to increased RAM usage, and increase the number of ! open file handles (which may lead to "Too many open files" errors). ! NOTE: the "segments" here have nothing to do with Nutch segments, they ! are a low-level data unit used by Lucene. ! Default is 50. </description> </property> <property> <name>searcher.summary.context</name> *************** *** 90,93 **** --- 110,115 ---- The number of context terms to display preceding and following matching terms in a hit summary. + + Make summaries a little longer than the default. </description> </property> *************** *** 98,128 **** <description> The total number of terms to display in a hit summary. </description> </property> - <!-- the name of the server hosting collections.--> <property> <name>collections.host</name> <value>collections.example.org</value> </property> - <!-- The name of this archive collection. - DEPRECATED. Now search.jsp uses the 'collection' returned by the search - result drawing up the wayback URL and at index time, use the - command-line 'collection' option. - <property> <name>archive.collection</name> ! <value>be05</value> ! </property> ! --> ! <!--Optionally, hardcode the nutch datadir location rather ! than rely on tomcat startup location. ! <property> ! <name>searcher.dir</name> ! <value>/home/stack/workspace/nutch-datadir</value> </property> - --> <!--If set to true, all contenttypes are indexed. Otherwise we only --- 120,159 ---- <description> The total number of terms to display in a hit summary. + + Make summaries a little longer than the default. 
+ </description> + </property> + + <property> + <name>searcher.dir</name> + <value>crawl</value> + <description> + Path to root of crawl. This directory is searched (in + order) for either the file search-servers.txt, containing a list of + distributed search servers, or the directory "index" containing + merged indexes, or the directory "segments" containing segment + indexes. + + Included here for convenience. </description> </property> <property> <name>collections.host</name> <value>collections.example.org</value> + <description>The name of the server hosting collections. + Used by the webapp conjuring URLs that point to page renderor (e.g. wayback). + </description> </property> <property> <name>archive.collection</name> ! <value>CHANGEME</value> ! <description>Name of collection being searched. Used at ARC ingest time to ! add a 'collection' field to the indexed document. ! Set this before starting an indexing. ! </description> </property> <!--If set to true, all contenttypes are indexed. Otherwise we only *************** *** 131,135 **** <property> <name>archive.index.all</name> ! <value>true</value> </property> </nutch-conf> --- 162,183 ---- <property> <name>archive.index.all</name> ! <value>false</value> ! </property> ! ! <property> ! <name>archive.skip.big.html</name> ! <value>-1</value> ! <description>If text/html is larger than value, just skip it completely. ! Use this setting to bypass problematic massive text/html (We were seeing ! the text/html parser hang for hours in bad, big html docs). Default ! value is -1 which says don't skip text/html docs.</description> ! </property> ! ! <property> ! <name>archive.index.redirects</name> ! <value>-false</value> ! <description>If true, we index redirects (status code 30x). ! </description> </property> + </nutch-conf> |
From: Michael S. <sta...@us...> - 2005-11-29 21:43:52
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/bin In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30555/bin Added Files: indexArcs.sh Removed Files: arc2seg.sh Log Message: Merge 'mapred' branch into HEAD. * .classpath * project.properties Update to point at new 0.8 nutch. * build.xml Merge in 'mapred'. Add job target. * conf/nutch-site.xml Cleanup. Removed unused properties or properties that have same values as nutch-default.xml (Except 'searcher.dir' -- keeping that here because we'll usually want to change it). Reordered so archive properties are towards the end. Brought forward descriptions from nutch-default where missing. * conf/nutch-site.xml.template Copy of nutch-site.xml but with the nutchwax defaults turned on. * src/plugin/build.xml Commented out parse-default. * src/plugin/parse-ext/plugin.xml Changed path to parse-pdf.sh. * src/web/search.jsp 'mapred' update. * bin/indexArcs.sh * conf/ia-parse-plugins.xml * lib/commons-codec-1.3.jar * src/java/org/archive/access/nutch/ImportArcs.java * src/java/org/archive/access/nutch/IndexArcs.java Added. * bin/arc2seg.sh * src/java/org/archive/access/nutch/Arc2Segment.java Removed. --- NEW FILE: indexArcs.sh --- #!/bin/sh # resolve links - $0 may be a softlink THIS="$0" while [ -h "$THIS" ]; do ls=`ls -ld "$THIS"` link=`expr "$ls" : '.*-> \(.*\)$'` if expr "$link" : '.*/.*' > /dev/null; then THIS="$link" else THIS=`dirname "$THIS"`/"$link" fi done # some directories THIS_DIR=`dirname "$THIS"` PROJECT_HOME=`cd "$THIS_DIR/.." ; pwd` # If no 'nutch' directory, assume the binaries-only layout (All scripts are # in a single 'bin' directory and NUTCH_HOME=PROJECT_HOME). NUTCH_HOME="${PROJECT_HOME}/nutch" if [ ! -d "${NUTCH_HOME}" ] then NUTCH_HOME="${PROJECT_HOME}" fi if [ "$JAVA_HOME" = "" ]; then echo "Error: JAVA_HOME is not set." 
exit 1 fi JAVA=$JAVA_HOME/bin/java if [ -z "$JAVA_OPTS" ] then JAVA_OPTS=(-Xmx400m -server) fi # CLASSPATH initially contains conf dirs CLASSPATH=${PROJECT_HOME}/conf:${NUTCH_HOME}/conf # for developers, add classes to CLASSPATH if [ -d "$PROJECT_HOME/build/classes" ]; then CLASSPATH=${CLASSPATH}:$PROJECT_HOME/build/classes fi # for developers, add Nutch classes to CLASSPATH if [ -d "$NUTCH_HOME/build/classes" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes fi if [ -d "$NUTCH_HOME/build/plugins" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build fi if [ -d "$NUTCH_HOME/build/test/classes" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes fi # so that filenames w/ spaces are handled correctly in loops below IFS= # for releases, add Nutch jar to CLASSPATH for f in $NUTCH_HOME/nutch-*.jar; do CLASSPATH=${CLASSPATH}:$f; done # add plugins to classpath if [ -d "$NUTCH_HOME/plugins" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME fi # Add our libs to CLASSPATH but take care to make heritrix jar come # before the httpclient jar (heritrix overlays a couple of httpclient # classes). httpclient_jar= for f in ${PROJECT_HOME}/lib/*.jar; do case `basename $f` in commons-httpclient*.jar) httpclient_jar=$f ;; *) CLASSPATH=${CLASSPATH}:$f ;; esac done CLASSPATH=${CLASSPATH}:${httpclient_jar} # Add Nutch libs to CLASSPATH for f in $NUTCH_HOME/lib/*.jar; do CLASSPATH=${CLASSPATH}:$f; done # restore ordinary behaviour unset IFS CLASS=org.archive.access.nutch.IndexArcs # cygwin path translation if expr match `uname` 'CYGWIN*' &> /dev/null; then CLASSPATH=`cygpath -p -w "$CLASSPATH"` fi # Run it. Add in to java.net.URL the heritrix rsync handler. exec $JAVA ${JAVA_OPTS[@]} \ -Djava.protocol.handler.pkgs=org.archive.net \ -classpath "$CLASSPATH" $CLASS "$@" --- arc2seg.sh DELETED --- |
From: Michael S. <sta...@us...> - 2005-11-29 18:58:11
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26889/xdocs Modified Files: gettingstarted.xml index.xml operation.xml requirements.xml srcbuild.xml Log Message: * xdocs/gettingstarted.xml * xdocs/index.xml * xdocs/operation.xml * xdocs/requirements.xml * xdocs/srcbuild.xml Add labels denoting current instruction as for 0.4.2 NutchWAX or earlier. Index: index.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/index.xml,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** index.xml 29 Nov 2005 00:22:44 -0000 1.17 --- index.xml 29 Nov 2005 18:58:02 -0000 1.18 *************** *** 25,29 **** <subsection name="Release 0.4.2 - 11/28/2005"> <p>Minor fixes. Built for 1.4.x Java and added Google-like paging. Last ! release against Nutch-0.7 and move to mapreduce nutch platform.</p> </subsection> <subsection name="Release 0.4.1 - 11/03/2005"> --- 25,29 ---- <subsection name="Release 0.4.2 - 11/28/2005"> <p>Minor fixes. Built for 1.4.x Java and added Google-like paging. Last ! release against Nutch-0.7 before move to mapreduce nutch platform.</p> </subsection> <subsection name="Release 0.4.1 - 11/03/2005"> Index: operation.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/operation.xml,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** operation.xml 29 Jul 2005 22:12:23 -0000 1.5 --- operation.xml 29 Nov 2005 18:58:02 -0000 1.6 *************** *** 12,16 **** <p>Below we detail indexing and setting up search of a Web Archive Collection (WAC) of ARCS using ! NutchWAX. Our setup is intentionally simple. This makes it easier to explain. 
Assumption that only one server is indexing and that only one searcher will be running against a single index: In other words, --- 12,17 ---- <p>Below we detail indexing and setting up search of a Web Archive Collection (WAC) of ARCS using ! NutchWAX 0.4.2 or earlier (pre-mapreduce-based NutchWAX). ! Our setup is intentionally simple. This makes it easier to explain. Assumption that only one server is indexing and that only one searcher will be running against a single index: In other words, Index: gettingstarted.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/gettingstarted.xml,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** gettingstarted.xml 10 Nov 2005 01:30:42 -0000 1.12 --- gettingstarted.xml 29 Nov 2005 18:58:02 -0000 1.13 *************** *** 20,24 **** <a href="srcbuild.html">Building from src</a> page).</p> ! <subsection name="Indexing and searching a small collection"> <p>Try out NutchWAX on a small collection of ARCs first. <pre>% cd ${NUTCHWAX} --- 20,24 ---- <a href="srcbuild.html">Building from src</a> page).</p> ! <subsection name="Indexing and searching a small collection - Using NutchWAX 0.4.2 or earlier (i.e pre-mapreduce NutchWAX)"> <p>Try out NutchWAX on a small collection of ARCs first. <pre>% cd ${NUTCHWAX} Index: requirements.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/requirements.xml,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** requirements.xml 1 Sep 2005 21:22:09 -0000 1.5 --- requirements.xml 29 Nov 2005 18:58:02 -0000 1.6 *************** *** 16,20 **** <subsection name="Tomcat"> ! <p>Tested working with version 5.0.28. </p> </subsection> --- 16,20 ---- <subsection name="Tomcat"> ! <p>Tested working with version 5.0.28 and 5.5.12. 
</p> </subsection> Index: srcbuild.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/srcbuild.xml,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** srcbuild.xml 11 Nov 2005 00:52:17 -0000 1.10 --- srcbuild.xml 29 Nov 2005 18:58:02 -0000 1.11 *************** *** 13,17 **** </section> ! <section name="Building NutchWAX from src"> <p>Do an anonymous cvs checkout of the archive-access/projects/nutch project. See http://archive-access.sourceforge.net/projects/nutch/cvs-usage.html --- 13,17 ---- </section> ! <section name="Building NutchWAX from src - Pre-mapreduce (i.e. Nutch-0.7 and NutchWAX 0.4.2 or earlier)"> <p>Do an anonymous cvs checkout of the archive-access/projects/nutch project. See http://archive-access.sourceforge.net/projects/nutch/cvs-usage.html |
From: Michael S. <sta...@us...> - 2005-11-29 05:40:36
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/conf In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22127/conf Modified Files: nutch-site.xml.template Log Message: Part of '[ 1312200 ] [nutchwax+wera] Pages at end of redirects not found.' * conf/nutch-site.xml.template Make default redirect be off by default. * src/java/org/archive/access/nutch/Arc2Segment.java Fix logic around redirect (Was skipping 30xs). Log state of indirect flag. (isIndexRedirects): Added. Index: nutch-site.xml.template =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/conf/nutch-site.xml.template,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** nutch-site.xml.template 23 Nov 2005 23:56:16 -0000 1.4 --- nutch-site.xml.template 28 Nov 2005 21:23:22 -0000 1.5 *************** *** 144,148 **** <property> <name>archive.index.redirects</name> ! <value>-true</value> <description>If true, we index redirects (status code 30x). </description> --- 144,148 ---- <property> <name>archive.index.redirects</name> ! <value>-false</value> <description>If true, we index redirects (status code 30x). </description> |
From: Michael S. <sta...@us...> - 2005-11-29 05:35:11
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22127/src/java/org/archive/access/nutch Modified Files: Arc2Segment.java Log Message: Part of '[ 1312200 ] [nutchwax+wera] Pages at end of redirects not found.' * conf/nutch-site.xml.template Make default redirect be off by default. * src/java/org/archive/access/nutch/Arc2Segment.java Fix logic around redirect (Was skipping 30xs). Log state of indirect flag. (isIndexRedirects): Added. Index: Arc2Segment.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Arc2Segment.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** Arc2Segment.java 23 Nov 2005 23:56:16 -0000 1.32 --- Arc2Segment.java 28 Nov 2005 21:23:22 -0000 1.33 *************** *** 144,147 **** --- 144,151 ---- } + public boolean isIndexRedirects() { + return Arc2Segment.indexRedirects; + } + public void addArc(String arcFile) throws IOException { File f = new File(arcFile); *************** *** 158,166 **** for (Iterator i = arc.iterator(); i.hasNext();) { ARCRecord rec = (ARCRecord)i.next(); ! if (rec.getStatusCode() != 200 || ! (this.indexRedirects && ! rec.getStatusCode() >= 300 && ! rec.getStatusCode() < 400)) { ! continue; } try { --- 162,171 ---- for (Iterator i = arc.iterator(); i.hasNext();) { ARCRecord rec = (ARCRecord)i.next(); ! if (rec.getStatusCode() != 200) { ! if (!(this.indexRedirects && ! (rec.getStatusCode() >= 300 && ! rec.getStatusCode() < 400))) { ! continue; ! } } try { *************** *** 225,230 **** if (arcData.getLength() >= bigHtmlMax) { LOG.info("skipping big html " + ! Long.toString(arcData.getLength()) + " bytes of mimetype " + ! noSpacesMimetype + " " + url); return; } --- 230,235 ---- if (arcData.getLength() >= bigHtmlMax) { LOG.info("skipping big html " + ! 
Long.toString(arcData.getLength()) + ! " bytes of mimetype " + noSpacesMimetype + " " + url); return; } *************** *** 244,248 **** // Collect content bytes - // TODO: Skip if unindexable type. rec.skipHttpHeader(); ByteArrayOutputStream contentBuffer = new ByteArrayOutputStream(); --- 249,252 ---- *************** *** 356,359 **** --- 360,364 ---- LOG.info("Index all mimetypes: " + arc2Segment.isIndexAll()); LOG.info("skipBigHtml " + skipBigHtml + ", cutoff size " + bigHtmlMax); + LOG.info("Index redirects " + arc2Segment.isIndexRedirects()); try { |
From: Michael S. <sta...@us...> - 2005-11-29 00:22:51
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32652/xdocs Modified Files: index.xml Log Message: * xdocs/index.xml Edit. Index: index.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/index.xml,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** index.xml 28 Nov 2005 23:25:38 -0000 1.16 --- index.xml 29 Nov 2005 00:22:44 -0000 1.17 *************** *** 25,29 **** <subsection name="Release 0.4.2 - 11/28/2005"> <p>Minor fixes. Built for 1.4.x Java and added Google-like paging. Last ! release against Nutch-0.7 and move to mapreduce.</p> </subsection> <subsection name="Release 0.4.1 - 11/03/2005"> --- 25,29 ---- <subsection name="Release 0.4.2 - 11/28/2005"> <p>Minor fixes. Built for 1.4.x Java and added Google-like paging. Last ! release against Nutch-0.7 and move to mapreduce nutch platform.</p> </subsection> <subsection name="Release 0.4.1 - 11/03/2005"> |
From: Michael S. <sta...@us...> - 2005-11-29 00:14:08
|
Update of /cvsroot/archive-access/archive-access/projects/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31224 Modified Files: project.xml Log Message: * project.xml Move past 0.4.2 minor release. Index: project.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/project.xml,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** project.xml 28 Nov 2005 22:12:31 -0000 1.26 --- project.xml 29 Nov 2005 00:14:01 -0000 1.27 *************** *** 12,16 **** <!-- The version of the project under development, e.g. 1.1, 1.2, 2.0-SNAPSHOT --> ! <currentVersion>0.4.2${version.build.suffix}</currentVersion> <!-- details about the organization that 'owns' the project --> --- 12,16 ---- <!-- The version of the project under development, e.g. 1.1, 1.2, 2.0-SNAPSHOT --> ! <currentVersion>0.5.0${version.build.suffix}</currentVersion> <!-- details about the organization that 'owns' the project --> |
From: Michael S. <sta...@us...> - 2005-11-29 00:10:43
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/articles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30202/src/articles Modified Files: releasenotes.xml Log Message: * src/articles/releasenotes.xml Notes on release 0.4.2. Index: releasenotes.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/articles/releasenotes.xml,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** releasenotes.xml 4 Nov 2005 08:28:38 -0000 1.5 --- releasenotes.xml 29 Nov 2005 00:10:35 -0000 1.6 *************** *** 12,15 **** --- 12,25 ---- </authorgroup> </articleinfo> + <sect1 id="0_4_2"> + <title>Release 0.4.2 - 11/28/05</title> + <abstract> + <para>Last release before move to mapreduce</para> + </abstract> + <para>Minor fixes: + Added Google-like results paging and built for a 1.4.x Java target. + </para> + </sect1> + <sect1 id="0_4_1"> <title>Release 0.4.1 - 11/04/05</title> |
From: Michael S. <sta...@us...> - 2005-11-28 23:25:46
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv17059/xdocs Modified Files: index.xml Log Message: * xdocs/index.xml Remove double entry for 0.4.0 Index: index.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/index.xml,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** index.xml 28 Nov 2005 22:35:59 -0000 1.15 --- index.xml 28 Nov 2005 23:25:38 -0000 1.16 *************** *** 27,38 **** release against Nutch-0.7 and move to mapreduce.</p> </subsection> - <subsection name="Release 0.4.0 - 10/21/2005"> - <p>NutchWAX 0.4.0 is built against Nutch-0.7. Lots of Bug Fixes - (See <a href="articles/releasenotes.html">Release Notes</a>). - This release has been coordinated with a new release of - <a href="http://archive-access.sourceforge.net/projects/wera/">WERA</a>, - a web archive collection viewer application. - </p> - </subsection> <subsection name="Release 0.4.1 - 11/03/2005"> <p>Bug fix for double encoding issue in NutchWAX 0.4.0.</p> --- 27,30 ---- |
From: Michael S. <sta...@us...> - 2005-11-28 22:36:08
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3967/xdocs Modified Files: index.xml Log Message: * xdocs/index.xml Note on 0.4.2 release. Index: index.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/index.xml,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** index.xml 4 Nov 2005 08:30:01 -0000 1.14 --- index.xml 28 Nov 2005 22:35:59 -0000 1.15 *************** *** 23,26 **** --- 23,38 ---- </section> <section name="News"> + <subsection name="Release 0.4.2 - 11/28/2005"> + <p>Minor fixes. Built for 1.4.x Java and added Google-like paging. Last + release against Nutch-0.7 and move to mapreduce.</p> + </subsection> + <subsection name="Release 0.4.0 - 10/21/2005"> + <p>NutchWAX 0.4.0 is built against Nutch-0.7. Lots of Bug Fixes + (See <a href="articles/releasenotes.html">Release Notes</a>). + This release has been coordinated with a new release of + <a href="http://archive-access.sourceforge.net/projects/wera/">WERA</a>, + a web archive collection viewer application. + </p> + </subsection> <subsection name="Release 0.4.1 - 11/03/2005"> <p>Bug fix for double encoding issue in NutchWAX 0.4.0.</p> |
From: Michael S. <sta...@us...> - 2005-11-28 22:12:48
|
Update of /cvsroot/archive-access/archive-access/projects/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31811 Modified Files: build.xml project.xml Log Message: * build.xml Make source and target 1.4 java. * project.xml Up the version number for release. Index: build.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/build.xml,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** build.xml 10 Nov 2005 02:03:16 -0000 1.12 --- build.xml 28 Nov 2005 22:12:31 -0000 1.13 *************** *** 85,88 **** --- 85,90 ---- destdir="${build.classes}" debug="${javac.debug}" + target="1.4" + source="1.4" deprecation="${javac.deprecation}"> <classpath refid="classpath"/> Index: project.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/project.xml,v retrieving revision 1.25 retrieving revision 1.26 diff -C2 -d -r1.25 -r1.26 *** project.xml 4 Nov 2005 19:30:14 -0000 1.25 --- project.xml 28 Nov 2005 22:12:31 -0000 1.26 *************** *** 12,16 **** <!-- The version of the project under development, e.g. 1.1, 1.2, 2.0-SNAPSHOT --> ! <currentVersion>0.4.1${version.build.suffix}</currentVersion> <!-- details about the organization that 'owns' the project --> --- 12,16 ---- <!-- The version of the project under development, e.g. 1.1, 1.2, 2.0-SNAPSHOT --> ! <currentVersion>0.4.2${version.build.suffix}</currentVersion> <!-- details about the organization that 'owns' the project --> |
From: Michael S. <sta...@us...> - 2005-11-24 03:06:34
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/web In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv6069/src/web Modified Files: search.jsp Log Message: * src/web/search.jsp Fix bug where moving forward two pages at a time instead of one on 'next'. Index: search.jsp =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/web/search.jsp,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** search.jsp 23 Nov 2005 23:56:16 -0000 1.26 --- search.jsp 24 Nov 2005 03:06:26 -0000 1.27 *************** *** 129,133 **** <input name="query" size=44 value="<%=htmlQueryString%>"> <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>"> ! <input type="hidden" name="collection" value="<%=collection%>"> <input type="submit" value="<i18n:message key="search"/>"> <% if (sort != null) { %> --- 129,135 ---- <input name="query" size=44 value="<%=htmlQueryString%>"> <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>"> ! <% if (collection != null) { %> ! <input type="hidden" name="collection" value="<%=collection%>"> ! <% } %> <input type="submit" value="<i18n:message key="search"/>"> <% if (sort != null) { %> *************** *** 281,285 **** for (long pageIndex = displayMin; pageIndex <= displayMax; pageIndex++) { long pageStart = (pageIndex - 1) * hitsPerPage; ! String pageUrl = request.getContextPath() + "/search.jsp?" + "query=" + htmlQueryString + "&start=" + pageStart + --- 283,287 ---- for (long pageIndex = displayMin; pageIndex <= displayMax; pageIndex++) { long pageStart = (pageIndex - 1) * hitsPerPage; ! String pageUrl = "search.jsp?" + "query=" + htmlQueryString + "&start=" + pageStart + *************** *** 295,299 **** %> <a href="<%=pageUrl%>"><%=pageIndex%></a> ! else { %> <b><%=pageIndex%></b> --- 297,303 ---- %> <a href="<%=pageUrl%>"><%=pageIndex%></a> ! <% ! } ! 
else { %> <b><%=pageIndex%></b> *************** *** 305,309 **** <% if (currentPage < pagesAvailable) { ! long nextPageStart = (currentPage + 1) * hitsPerPage; String nextPageUrl = request.getContextPath() + "/search.jsp?" + "query=" + htmlQueryString + --- 309,313 ---- <% if (currentPage < pagesAvailable) { ! long nextPageStart = currentPage * hitsPerPage; String nextPageUrl = request.getContextPath() + "/search.jsp?" + "query=" + htmlQueryString + *************** *** 347,351 **** } %> ! <table bgcolor="3333ff" align="right"> <tr><td bgcolor="ff9900"><a href="<%=rss%>"><font color="ffffff"><b>RSS</b> --- 351,355 ---- } %> ! <p /> <table bgcolor="3333ff" align="right"> <tr><td bgcolor="ff9900"><a href="<%=rss%>"><font color="ffffff"><b>RSS</b> |
From: Michael S. <sta...@us...> - 2005-11-23 23:56:24
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/web In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32095/src/web Modified Files: search.jsp Removed Files: search.jsp.archiveit Log Message: Committing last changes for nutchwax point release. Below are untested. Committing so I can test later tonight from home. * conf/nutch-site.xml.template Option to enable indexing of redirects. * src/java/org/archive/access/nutch/Arc2Segment.java If enabled, index redirects too. * src/web/search.jsp Add google-like paging (Listing of page numbers). Doesn't recover like google's when we go past actual count of hits but for most usage, it's fine. Can improve upon it later. --- search.jsp.archiveit DELETED --- Index: search.jsp =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/web/search.jsp,v retrieving revision 1.25 retrieving revision 1.26 diff -C2 -d -r1.25 -r1.26 *** search.jsp 12 Nov 2005 03:08:36 -0000 1.25 --- search.jsp 23 Nov 2005 23:56:16 -0000 1.26 *************** *** 228,246 **** if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))) { %> ! <form name="search" action="search.jsp" method="get"> ! <input type="hidden" name="query" value="<%=htmlQueryString%>"> ! <input type="hidden" name="start" value="<%=end%>"> ! <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>"> ! <input type="hidden" name="hitsPerDup" value="<%=hitsPerDup%>"> ! <input type="hidden" name="dedupField" value="<%=dedupField%>"> ! <input type="submit" value="<i18n:message key="next"/>"> ! <% if (sort != null) { %> ! <input type="hidden" name="sort" value="<%=sort%>"> ! <input type="hidden" name="reverse" value="<%=reverse%>"> ! <% } %> ! 
</form> <% } if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { --- 228,332 ---- if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))) { + + long pagesAvailable = (long) (hits.getTotal() / hitsPerPage) + 1 ; + long currentPage = (long) ((start + 1) / hitsPerPage + 1) ; + int maxPagesToShow = 20; + long displayMin = (long) (currentPage - (0.5 * maxPagesToShow) ); + + if (displayMin < 1) { + displayMin = 1; + } + + long displayMax = displayMin + maxPagesToShow - 1 ; + if (displayMax > pagesAvailable) { + displayMax = pagesAvailable; + } + %> ! <!-- ! Debugging info ! <table border="1"> ! <tr> ! <td>pagesAvailable:<%=pagesAvailable%></td> ! <td>currentPage:<%=currentPage%></td> ! <td>displayMin:<%=displayMin%></td> ! <td>displayMax:<%=displayMax%></td> ! </tr> ! </table> ! --> ! <center> ! <% ! if (currentPage > 1) { ! long previousPageStart = (currentPage - 1) * hitsPerPage; ! String previousPageUrl = request.getContextPath() + "/search.jsp?" + ! "query=" + htmlQueryString + ! "&start=" + previousPageStart + ! "&hitsPerPage=" + hitsPerPage + ! "&hitsPerDup=" + hitsPerDup + ! "&dedupField=" + dedupField; ! if (sort != null) { ! previousPageUrl = previousPageUrl + ! "&sort=" + sort + ! "&reverse=" + reverse; ! } ! %> ! ! <a href="<%=previousPageUrl%>"><b>Previous</b></a>  ! ! <% ! } ! %> ! <% + for (long pageIndex = displayMin; pageIndex <= displayMax; pageIndex++) { + long pageStart = (pageIndex - 1) * hitsPerPage; + String pageUrl = request.getContextPath() + "/search.jsp?" 
+ + "query=" + htmlQueryString + + "&start=" + pageStart + + "&hitsPerPage=" + hitsPerPage + + "&hitsPerDup=" + hitsPerDup + + "&dedupField=" + dedupField; + if (sort != null) { + pageUrl = pageUrl + + "&sort=" + sort + + "&reverse=" + reverse; + } + if (pageIndex != currentPage) { + %> + <a href="<%=pageUrl%>"><%=pageIndex%></a> + else { + %> + <b><%=pageIndex%></b> + <% + } + } + %> + + <% + if (currentPage < pagesAvailable) { + long nextPageStart = (currentPage + 1) * hitsPerPage; + String nextPageUrl = request.getContextPath() + "/search.jsp?" + + "query=" + htmlQueryString + + "&start=" + nextPageStart + + "&hitsPerPage=" + hitsPerPage + + "&hitsPerDup=" + hitsPerDup + + "&dedupField=" + dedupField; + if (sort != null) { + nextPageUrl = nextPageUrl + + "&sort=" + sort + + "&reverse=" + reverse; } + %> + + <a href="<%=nextPageUrl%>"><b>Next</b></a>  + + <% + } + %> + + </center> + + <% + } if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { *************** *** 248,254 **** <form name="search" action="search.jsp" method="get"> <input type="hidden" name="query" value="<%=htmlQueryString%>"> <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>"> ! <input type="hidden" name="hitsPerDup" value="0"> ! <input type="submit" value="<i18n:message key="showAllHits"/>"> <% if (sort != null) { %> <input type="hidden" name="sort" value="<%=sort%>"> --- 334,342 ---- <form name="search" action="search.jsp" method="get"> <input type="hidden" name="query" value="<%=htmlQueryString%>"> + <input type="hidden" name="start" value="<%=end%>"> <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>"> ! <input type="hidden" name="hitsPerDup" value="<%=hitsPerDup%>"> ! <input type="hidden" name="dedupField" value="<%=dedupField%>"> ! <input type="submit" value="<i18n:message key="next"/>"> <% if (sort != null) { %> <input type="hidden" name="sort" value="<%=sort%>"> |
From: Michael S. <sta...@us...> - 2005-11-23 23:56:24
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32095/src/java/org/archive/access/nutch Modified Files: Arc2Segment.java Log Message: Committing last changes for nutchwax point release. Below are untested. Committing so I can test later tonight from home. * conf/nutch-site.xml.template Option to enable indexing of redirects. * src/java/org/archive/access/nutch/Arc2Segment.java If enabled, index redirects too. * src/web/search.jsp Add google-like paging (Listing of page numbers). Doesn't recover like google's when we go past actual count of hits but for most usage, it's fine. Can improve upon it later. Index: Arc2Segment.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Arc2Segment.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** Arc2Segment.java 21 Oct 2005 01:29:29 -0000 1.31 --- Arc2Segment.java 23 Nov 2005 23:56:16 -0000 1.32 *************** *** 98,101 **** --- 98,108 ---- } } + private static boolean indexRedirects = false; + static { + String tmp = NutchConf.get().get("archive.index.redirects"); + if (tmp != null && tmp.toLowerCase().equals("true")) { + indexRedirects = true; + } + } /** Get the MimeTypes resolver instance. */ *************** *** 151,155 **** for (Iterator i = arc.iterator(); i.hasNext();) { ARCRecord rec = (ARCRecord)i.next(); ! if (rec.getStatusCode() != 200) { continue; } --- 158,165 ---- for (Iterator i = arc.iterator(); i.hasNext();) { ARCRecord rec = (ARCRecord)i.next(); ! if (rec.getStatusCode() != 200 || ! (this.indexRedirects && ! rec.getStatusCode() >= 300 && ! rec.getStatusCode() < 400)) { continue; } |
From: Michael S. <sta...@us...> - 2005-11-23 23:56:23
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/conf In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32095/conf Modified Files: nutch-site.xml.template Log Message: Committing last changes for nutchwax point release. Below are untested. Committing so I can test later tonight from home. * conf/nutch-site.xml.template Option to enable indexing of redirects. * src/java/org/archive/access/nutch/Arc2Segment.java If enabled, index redirects too. * src/web/search.jsp Add google-like paging (Listing of page numbers). Doesn't recover like google's when we go past actual count of hits but for most usage, it's fine. Can improve upon it later. Index: nutch-site.xml.template =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/conf/nutch-site.xml.template,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** nutch-site.xml.template 4 Nov 2005 19:23:59 -0000 1.3 --- nutch-site.xml.template 23 Nov 2005 23:56:16 -0000 1.4 *************** *** 142,144 **** --- 142,152 ---- value is -1 which says don't skip text/html docs.</description> </property> + <property> + <name>archive.index.redirects</name> + <value>-true</value> + <description>If true, we index redirects (status code 30x). + </description> + </property> + + </nutch-conf> |