You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: Michael S. <sta...@us...> - 2005-11-23 00:41:54
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27769/src/java/org/archive/access/nutch Modified Files: Tag: mapred IndexArcs.java Log Message: * src/java/org/archive/access/nutch/IndexArcs.java Fix name of logger (Was 'acces' instead of 'access'). Index: IndexArcs.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Attic/IndexArcs.java,v retrieving revision 1.1.2.4 retrieving revision 1.1.2.5 diff -C2 -d -r1.1.2.4 -r1.1.2.5 *** IndexArcs.java 22 Nov 2005 23:11:33 -0000 1.1.2.4 --- IndexArcs.java 23 Nov 2005 00:41:46 -0000 1.1.2.5 *************** *** 29,36 **** import org.apache.nutch.crawl.*; import org.apache.nutch.indexer.IndexMerger; public class IndexArcs { public static final Logger LOG = ! LogFormatter.getLogger("org.archive.acces.nutch.IndexArcs"); private static String getDate() { --- 29,39 ---- import org.apache.nutch.crawl.*; import org.apache.nutch.indexer.IndexMerger; + import org.apache.nutch.util.mime.MimeType; + import org.apache.nutch.util.mime.MimeTypeException; + import org.apache.nutch.util.mime.MimeTypes; public class IndexArcs { public static final Logger LOG = ! LogFormatter.getLogger(IndexArcs.class.getName()); private static String getDate() { *************** *** 42,46 **** public static void main(String args[]) throws Exception { if (args.length < 2) { ! System.out.println("Usage: IndexArcs <arcsDir> <crawlDir> [-noimport] [-noinvert] [-noindex]"); return; } --- 45,50 ---- public static void main(String args[]) throws Exception { if (args.length < 2) { ! System.out.println("Usage: IndexArcs <arcsDir> <crawlDir> " + ! "[-noimport] [-noinvert] [-noindex]"); return; } |
From: Michael S. <sta...@us...> - 2005-11-23 00:39:06
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27156/src/java/org/archive/access/nutch Modified Files: Tag: mapred ImportArcs.java Log Message: * src/java/org/archive/access/nutch/ImportArcs.java Backport of more robust mimetype handling. Added close of ARC when done (should clean up tmp files if any made). Also added logging of each record imported. Good for reporting against. (skip, checkMimetype): Added. Index: ImportArcs.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Attic/ImportArcs.java,v retrieving revision 1.1.2.2 retrieving revision 1.1.2.3 diff -C2 -d -r1.1.2.2 -r1.1.2.3 *** ImportArcs.java 20 Oct 2005 23:30:49 -0000 1.1.2.2 --- ImportArcs.java 23 Nov 2005 00:38:58 -0000 1.1.2.3 *************** *** 44,47 **** --- 44,48 ---- import org.apache.nutch.util.mime.MimeType; import org.apache.nutch.util.mime.MimeTypes; + import org.apache.nutch.util.mime.MimeTypeException; import org.apache.nutch.mapred.JobConf; import org.apache.nutch.mapred.JobClient; *************** *** 69,73 **** Logger.getLogger(ImportArcs.class.getName()); ! private static final String WHITESPACE = "\\s+"; public static final String ARCFILENAME_KEY = "arcname"; --- 70,74 ---- Logger.getLogger(ImportArcs.class.getName()); ! private static final String WHITESPACE = "\\s+"; public static final String ARCFILENAME_KEY = "arcname"; *************** *** 96,100 **** this.segmentName = job.get(Fetcher.SEGMENT_NAME_KEY); ! if (job.getBoolean("arc2segment.verbose", false)) { LOG.setLevel(Level.FINE); } --- 97,101 ---- this.segmentName = job.get(Fetcher.SEGMENT_NAME_KEY); ! 
if (job.getBoolean("importarcs.verbose", false)) { LOG.setLevel(Level.FINE); } *************** *** 140,143 **** --- 141,146 ---- } catch (Throwable e) { // problem parsing arc file LOG.log(Level.WARNING, "Error parsing: " + arcLocation, e); + } finally { + arc.close(); } } *************** *** 150,153 **** --- 153,157 ---- String url = arcData.getUrl(); + // Look at ARCRecord meta data line mimetype. It can be empty. String mimetype = arcData.getMimetype(); if (mimetype != null && mimetype.length() > 0) { *************** *** 159,182 **** } } ! if (!indexAll) { ! if ((mimetype == null) || ! (!mimetype.startsWith(TEXT_TYPE) && ! !mimetype.startsWith(APPLICATION_TYPE))) { ! // Skip any but basic types. ! return; ! } } ! String noSpacesMimetype = ! TextUtils.replaceAll(WHITESPACE, mimetype, "-"); ! // LOG.info("adding " + Long.toString(arcData.getLength()) ! // + " bytes of mimetype " + noSpacesMimetype + " " + url); ! ! // copy http headers to nutch metadata Properties metaData = new Properties(); Header[] headers = rec.getHttpHeaders(); for (int j = 0; j < headers.length; j++) { ! Header header = headers[j]; ! metaData.put(header.getName(), header.getValue()); } // Add the collection name, the arcfile name, and the offset. // Also add mimetype. Needed by the ia indexers. --- 163,196 ---- } } ! mimetype = checkMimetype(mimetype); ! if (skip(mimetype)) { ! return; } ! // Copy http headers to nutch metadata. Properties metaData = new Properties(); Header[] headers = rec.getHttpHeaders(); for (int j = 0; j < headers.length; j++) { ! Header header = headers[j]; ! if (mimetype == null) { ! // Special handling. If mimetype is null, try getting it from ! // the http header. I've seen arc record lines with empty ! // content-type and a MIME unparseable file ending; i.e. .MID. ! if (header.getName() != null && ! header.getName().toLowerCase().equals(CONTENT_TYPE_KEY)) { ! mimetype = checkMimetype(header.getValue().toLowerCase()); ! if (skip(mimetype)) { ! return; ! } ! } ! } ! 
metaData.put(header.getName(), header.getValue()); } + + String noSpacesMimetype = + TextUtils.replaceAll(WHITESPACE, mimetype, "-"); + + LOG.info("adding " + Long.toString(arcData.getLength()) + + " bytes of mimetype " + noSpacesMimetype + " " + url); + // Add the collection name, the arcfile name, and the offset. // Also add mimetype. Needed by the ia indexers. *************** *** 239,242 **** --- 253,280 ---- } + protected boolean skip(final String mimetype) { + boolean decision = false; + // Are we to index all content? + if (!indexAll) { + if ((mimetype == null) || + (!mimetype.startsWith(TEXT_TYPE) && + !mimetype.startsWith(APPLICATION_TYPE))) { + // Skip any but basic types. + decision = true; + } + } + return decision; + } + + protected static String checkMimetype(String mimetype) { + // Test the mimetype makes sense. If not, clear it. + try { + new MimeType(mimetype); + } catch (MimeTypeException e) { + mimetype = null; + } + return mimetype; + } + public void importArcs(File arcUrlsDir, File segment) throws IOException { |
From: Doug C. <cu...@us...> - 2005-11-22 23:11:40
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32146/src/java/org/archive/access/nutch Modified Files: Tag: mapred IndexArcs.java Log Message: Fix to get correct segment name. Index: IndexArcs.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/Attic/IndexArcs.java,v retrieving revision 1.1.2.3 retrieving revision 1.1.2.4 diff -C2 -d -r1.1.2.3 -r1.1.2.4 *** IndexArcs.java 20 Oct 2005 23:30:49 -0000 1.1.2.3 --- IndexArcs.java 22 Nov 2005 23:11:33 -0000 1.1.2.4 *************** *** 76,80 **** File linkDb = new File(crawlDir + "/linkdb"); File segments = new File(crawlDir + "/segments"); - File segment = new File(segments, getDate()); File indexes = new File(crawlDir + "/indexes"); File index = new File(crawlDir + "/index"); --- 76,79 ---- *************** *** 83,86 **** --- 82,86 ---- if (!noImport) { // import arcs + File segment = new File(segments, getDate()); LOG.info("importing arcs in " + arcsDir + " to " + segment); new ImportArcs(conf).importArcs(arcsDir, segment); *************** *** 89,93 **** if (!noUpdate) { // update crawldb LOG.info("updating crawldb in " + crawlDb); ! new CrawlDb(conf).update(crawlDb, segment); } --- 89,94 ---- if (!noUpdate) { // update crawldb LOG.info("updating crawldb in " + crawlDb); ! File[] segmentFiles = fs.listFiles(segments); ! 
new CrawlDb(conf).update(crawlDb, segmentFiles[segmentFiles.length-1]); } *************** *** 100,107 **** LOG.info("indexing " + crawlDir); new Indexer(conf).index(indexes,crawlDb,linkDb,fs.listFiles(segments)); } - new DeleteDuplicates(conf).dedup(new File[] { indexes }); - new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge(); LOG.info("IndexArcs finished: " + crawlDir); --- 101,108 ---- LOG.info("indexing " + crawlDir); new Indexer(conf).index(indexes,crawlDb,linkDb,fs.listFiles(segments)); + new DeleteDuplicates(conf).dedup(new File[] { indexes }); + new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge(); } LOG.info("IndexArcs finished: " + crawlDir); |
From: Doug C. <cu...@us...> - 2005-11-22 22:50:14
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/conf In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25361 Modified Files: Tag: mapred nutch-site.xml Log Message: Don't configure NDFS & mapred by default. Index: nutch-site.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/conf/nutch-site.xml,v retrieving revision 1.24.2.5 retrieving revision 1.24.2.6 diff -C2 -d -r1.24.2.5 -r1.24.2.6 *** nutch-site.xml 21 Oct 2005 20:53:57 -0000 1.24.2.5 --- nutch-site.xml 22 Nov 2005 22:50:05 -0000 1.24.2.6 *************** *** 5,22 **** <nutch-conf> - <!-- NDFS --> - - <property> - <name>fs.default.name</name> - <value>ia109102.archive.org:8009</value> - </property> - - <!-- MapReduce --> - - <property> - <name>mapred.job.tracker</name> - <value>ia109102.archive.org:8010</value> - </property> - <!-- Override a few Nutch defaults --> --- 5,8 ---- *************** *** 76,79 **** --- 62,70 ---- </property> + <property> + <name>indexer.mergeFactor</name> + <value>30</value> + </property> + <!-- make summaries a little longer than the default --> <property> |
From: Michael S. <sta...@us...> - 2005-11-21 21:30:02
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15920/xdocs Modified Files: faq.fml Log Message: * xdocs/faq.fml Point to nutch FAQ. Index: faq.fml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/faq.fml,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** faq.fml 19 Nov 2005 01:21:58 -0000 1.16 --- faq.fml 21 Nov 2005 21:29:54 -0000 1.17 *************** *** 266,316 **** nutch/nutchwax (Or 'explain' the <code>explain</code> page)?</question> <answer> ! <p>Nutch is built on Lucene. To understand Nutch scoring, study ! how Lucene does it. The formula Lucene uses scoring can be found ! at the head of the Lucene Similarity class in the ! <a href="http://lucene.apache.org/java/docs/api/org/apache/lucene/search/Similarity.html">Lucene Similarity Javadoc</a>. ! Rougly, the score for a particular document in a set of query results, ! <code>score(q,d)</code>, is the sum of the score for each term of a ! query (<code>t in q</code>). A terms' score in a document is itself the ! sum of the term run against each field that comprises a document (e.g. ! <code>title</code> is one field, <code>url</code> is another. A 'document' ! is a set of 'fields'). Per field, the terms' score is the product of ! the following factors: Its <code>td</code> (term ! freqency in the document), a score factor <code>idf</code> usually a factor ! made up of frequency of term relative to amount of docs in index, an ! index-time boost, ! a normalization of count of terms found relative to size of document ! (<code>lengthNorm</code>), a similar normalization is done for the term in ! the query itself (<code>queryNorm</code>), and finally, a factor that ! has a weight for how many instances of the total amount of terms a ! particular document contains. Study the lucene javadoc to get more ! 
detail on each of the equation components and how they effect ! overall score.</p> ! <p>The nutch <code>explain.jsp</code> page can be interpreted with the ! Lucene scoring equation in mind. First, notice how we move right as ! we move from score total, to score per term, to score per field (Nothing ! is shown if a term was not found in a particular field). ! Next, studying a particular field scoring, it comprises a ! query component and then a field component (Score is product of ! these two components). The query component includes ! query time -- as opposed to index time -- boost, an idf (that is same ! for the query and field components), and then a queryNorm. Similar for ! the field component (fieldNorm is an aggregation of certain of the ! Lucene equation components).</p> ! ! <p>The easiest way to influence scoring is to change query time boost ! (will require edit of nutch-site.xml and redeploy of the nutchwax.war ! file). Query-time boost by default looks like this: ! <pre>query.url.boost, 4.0f ! query.anchor.boost, 2.0f ! query.title.boost, 1.5f ! query.host.boost, 2.0f ! query.phrase.boost, 1.0f</pre></p> ! <p>From the list above, you can see that terms found in a document URL get ! the highest boost with anchor text next, etc.</p> ! <p>Anchor text makes a large contribution to a document ranking score. ! You can see the anchor text for a page by browsing to the 'explain' then ! editing the URL to put in place 'anchors.jsp' instead of 'explain.jsp'. ! </p> </answer> </faq> --- 266,272 ---- nutch/nutchwax (Or 'explain' the <code>explain</code> page)?</question> <answer> ! <p>See <i>How is scoring done in Nutch? (Or, explain the ! "explain" page?)</i> and <i>How can I influence Nutch scoring?</i> over on ! the <a href="http://wiki.apache.org/nutch/FAQ">Nutch FAQ</a> page.</p> </answer> </faq> |
From: Michael S. <sta...@us...> - 2005-11-19 01:22:08
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25216/xdocs Modified Files: faq.fml Log Message: * xdocs/faq.fml More on nutch scoring. Index: faq.fml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/xdocs/faq.fml,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** faq.fml 1 Nov 2005 19:17:09 -0000 1.15 --- faq.fml 19 Nov 2005 01:21:58 -0000 1.16 *************** *** 264,270 **** <faq id="scoring"> <question>Tell me more about how scoring is done in ! nutch/nutchwax.</question> <answer> ! <p>By default, at query time, the following fields are boosted as follows: <pre>query.url.boost, 4.0f query.anchor.boost, 2.0f --- 264,305 ---- <faq id="scoring"> <question>Tell me more about how scoring is done in ! nutch/nutchwax (Or 'explain' the <code>explain</code> page)?</question> <answer> ! <p>Nutch is built on Lucene. To understand Nutch scoring, study ! how Lucene does it. The formula Lucene uses scoring can be found ! at the head of the Lucene Similarity class in the ! <a href="http://lucene.apache.org/java/docs/api/org/apache/lucene/search/Similarity.html">Lucene Similarity Javadoc</a>. ! Rougly, the score for a particular document in a set of query results, ! <code>score(q,d)</code>, is the sum of the score for each term of a ! query (<code>t in q</code>). A terms' score in a document is itself the ! sum of the term run against each field that comprises a document (e.g. ! <code>title</code> is one field, <code>url</code> is another. A 'document' ! is a set of 'fields'). Per field, the terms' score is the product of ! the following factors: Its <code>td</code> (term ! freqency in the document), a score factor <code>idf</code> usually a factor ! made up of frequency of term relative to amount of docs in index, an ! index-time boost, ! 
a normalization of count of terms found relative to size of document ! (<code>lengthNorm</code>), a similar normalization is done for the term in ! the query itself (<code>queryNorm</code>), and finally, a factor that ! has a weight for how many instances of the total amount of terms a ! particular document contains. Study the lucene javadoc to get more ! detail on each of the equation components and how they effect ! overall score.</p> ! <p>The nutch <code>explain.jsp</code> page can be interpreted with the ! Lucene scoring equation in mind. First, notice how we move right as ! we move from score total, to score per term, to score per field (Nothing ! is shown if a term was not found in a particular field). ! Next, studying a particular field scoring, it comprises a ! query component and then a field component (Score is product of ! these two components). The query component includes ! query time -- as opposed to index time -- boost, an idf (that is same ! for the query and field components), and then a queryNorm. Similar for ! the field component (fieldNorm is an aggregation of certain of the ! Lucene equation components).</p> ! ! <p>The easiest way to influence scoring is to change query time boost ! (will require edit of nutch-site.xml and redeploy of the nutchwax.war ! file). Query-time boost by default looks like this: <pre>query.url.boost, 4.0f query.anchor.boost, 2.0f *************** *** 273,278 **** query.phrase.boost, 1.0f</pre></p> <p>From the list above, you can see that terms found in a document URL get ! the highest boost with anchor text next, etc. ! You can change the above boosts by editing your nutch-site.xml</p> <p>Anchor text makes a large contribution to a document ranking score. You can see the anchor text for a page by browsing to the 'explain' then --- 308,312 ---- query.phrase.boost, 1.0f</pre></p> <p>From the list above, you can see that terms found in a document URL get ! 
the highest boost with anchor text next, etc.</p> <p>Anchor text makes a large contribution to a document ranking score. You can see the anchor text for a page by browsing to the 'explain' then |
From: Brad <bra...@us...> - 2005-11-19 01:08:19
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21779/src/java/org/archive/wayback Modified Files: WaybackConstants.java Log Message: TWEAK: moved HTTP and DNS URL prefix constants here. Index: WaybackConstants.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/WaybackConstants.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** WaybackConstants.java 16 Nov 2005 03:11:29 -0000 1.1 --- WaybackConstants.java 19 Nov 2005 01:08:09 -0000 1.2 *************** *** 34,37 **** --- 34,48 ---- /** + * prefixes of HTTP protocol URL.. + */ + public static final String HTTP_URL_PREFIX = "http://"; + + /** + * prefixes of DNS Record URLs.. + */ + public static final String DNS_URL_PREFIX = "dns:"; + + + /** * Request: filter results before this 14-digit timestamp */ |
From: Brad <bra...@us...> - 2005-11-19 01:07:33
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21377/src/java/org/archive/wayback/cdx Modified Files: BDBResourceIndex.java Log Message: TWEAK: added constant of BDB file max size to 256MB -- this should be configurable.. Index: BDBResourceIndex.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/BDBResourceIndex.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** BDBResourceIndex.java 16 Nov 2005 03:11:30 -0000 1.1 --- BDBResourceIndex.java 19 Nov 2005 01:07:19 -0000 1.2 *************** *** 49,52 **** --- 49,53 ---- */ public class BDBResourceIndex { + private final static String JE_LOG_FILEMAX = "256000000"; private String path; *************** *** 80,83 **** --- 81,85 ---- environmentConfig.setAllowCreate(true); environmentConfig.setTransactional(false); + environmentConfig.setConfigParam("je.log.fileMax",JE_LOG_FILEMAX); File file = new File(path); env = new Environment(file, environmentConfig); *************** *** 86,89 **** --- 88,92 ---- databaseConfig.setTransactional(false); // perform other database configurations + db = env.openDatabase(null, dbName, databaseConfig); } |
From: Brad <bra...@us...> - 2005-11-19 01:06:13
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20969/src/java/org/archive/wayback/cdx Modified Files: CDXRecord.java Log Message: TWEAK: moved CDX Header constant into this class Index: CDXRecord.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/CDXRecord.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** CDXRecord.java 16 Nov 2005 03:11:30 -0000 1.1 --- CDXRecord.java 19 Nov 2005 01:06:05 -0000 1.2 *************** *** 36,40 **** */ public class CDXRecord { ! public String url; public String captureDate; --- 36,42 ---- */ public class CDXRecord { ! public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; ! ! public String url; public String captureDate; *************** *** 58,61 **** --- 60,64 ---- } + /** * Attempt to deserialize state from a single text line, fields delimited by |
From: Brad <bra...@us...> - 2005-11-19 01:05:31
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20740/src/java/org/archive/wayback/cdx Modified Files: LocalBDBResourceIndex.java Log Message: BUGFIX: switched System.out to use Logger Index: LocalBDBResourceIndex.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/LocalBDBResourceIndex.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** LocalBDBResourceIndex.java 17 Nov 2005 02:46:05 -0000 1.2 --- LocalBDBResourceIndex.java 19 Nov 2005 01:05:23 -0000 1.3 *************** *** 26,29 **** --- 26,30 ---- import java.text.ParseException; import java.util.Properties; + import java.util.logging.Logger; import org.apache.commons.httpclient.URIException; *************** *** 50,53 **** --- 51,57 ---- */ public class LocalBDBResourceIndex implements ResourceIndex { + private static final Logger LOGGER = + Logger.getLogger(LocalBDBResourceIndex.class.getName()); + private final static String INDEX_PATH = "resourceindex.indexpath"; *************** *** 69,73 **** public void init(Properties p) throws ConfigurationException { ! System.out.println("initializing LocalDBDResourceIndex..."); String dbPath = (String) p.get(INDEX_PATH); if (dbPath == null || (dbPath.length() <= 0)) { --- 73,77 ---- public void init(Properties p) throws ConfigurationException { ! LOGGER.info("initializing LocalDBDResourceIndex..."); String dbPath = (String) p.get(INDEX_PATH); if (dbPath == null || (dbPath.length() <= 0)) { |
From: Brad <bra...@us...> - 2005-11-19 01:04:52
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20464/src/java/org/archive/wayback/cdx/indexer Modified Files: ArcIndexer.java Log Message: BUGFIX: switched System.out to Logger BUGFIX: was not closing ARCReader after indexing TWEAK: moved constants into WaybackConstants Index: ArcIndexer.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/ArcIndexer.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** ArcIndexer.java 17 Nov 2005 02:49:08 -0000 1.2 --- ArcIndexer.java 19 Nov 2005 01:04:44 -0000 1.3 *************** *** 24,32 **** --- 24,35 ---- package org.archive.wayback.cdx.indexer; + import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; + import java.io.PrintWriter; import java.text.ParseException; import java.util.Iterator; + import java.util.logging.Logger; import org.archive.io.arc.ARCReader; *************** *** 51,57 **** */ public class ArcIndexer { private final static String LOCATION_HTTP_HEADER = "Location"; - private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; - private final static String DNS_URL_PREFIX = "dns:"; /** --- 54,61 ---- */ public class ArcIndexer { + private static final Logger LOGGER = + Logger.getLogger(ArcIndexer.class.getName()); + private final static String LOCATION_HTTP_HEADER = "Location"; /** *************** *** 72,94 **** SearchResults results = new SearchResults(); ARCReader arcReader = ARCReaderFactory.get(arc); ! arcReader.setParseHttpHeaders(true); ! // doh. this does not generate quite the columns we need: ! // arcReader.createCDXIndexFile(arcPath); ! Iterator itr = arcReader.iterator(); ! while (itr.hasNext()) { ! ARCRecord rec = (ARCRecord) itr.next(); ! 
SearchResult result; ! try { ! result = arcRecordToSearchResult(rec, arc); ! } catch (NullPointerException e) { ! e.printStackTrace(); ! continue; ! } catch (ParseException e) { ! e.printStackTrace(); ! continue; ! } ! if(result != null) { ! results.addSearchResult(result); } } return results; --- 76,102 ---- SearchResults results = new SearchResults(); ARCReader arcReader = ARCReaderFactory.get(arc); ! try { ! arcReader.setParseHttpHeaders(true); ! // doh. this does not generate quite the columns we need: ! // arcReader.createCDXIndexFile(arcPath); ! Iterator itr = arcReader.iterator(); ! while (itr.hasNext()) { ! ARCRecord rec = (ARCRecord) itr.next(); ! SearchResult result; ! try { ! result = arcRecordToSearchResult(rec, arc); ! } catch (NullPointerException e) { ! e.printStackTrace(); ! continue; ! } catch (ParseException e) { ! e.printStackTrace(); ! continue; ! } ! if(result != null) { ! results.addSearchResult(result); ! } } + } finally { + arcReader.close(); } return results; *************** *** 116,128 **** return null; } ! if(uriStr.startsWith(DNS_URL_PREFIX)) { // skip dns records... return null; } ! UURI uri = new UURI(uriStr, false); String uriHost = uri.getHost(); if(uriHost == null) { ! System.out.println("No host in " + uriStr + " in " + arc.getAbsolutePath()); return null; --- 124,136 ---- return null; } ! if(uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) { // skip dns records... return null; } ! UURI uri = UURIFactory.getInstance(uriStr); String uriHost = uri.getHost(); if(uriHost == null) { ! LOGGER.info("No host in " + uriStr + " in " + arc.getAbsolutePath()); return null; *************** *** 144,152 **** // right now, we're ignoring "Content-Location" try { ! UURI uriRedirect = UURIFactory.getInstance(uri,locationStr); redirectUrl = uriRedirect.getEscapedURI(); } catch (URIException e) { ! 
System.out.println("Bad Location: " + locationStr + " for " + uriStr + " in " + arc.getAbsolutePath() + " Skipped"); --- 152,161 ---- // right now, we're ignoring "Content-Location" try { ! UURI uriRedirect = UURIFactory.getInstance(uri, ! locationStr); redirectUrl = uriRedirect.getEscapedURI(); } catch (URIException e) { ! LOGGER.info("Bad Location: " + locationStr + " for " + uriStr + " in " + arc.getAbsolutePath() + " Skipped"); *************** *** 178,190 **** File target) throws IOException { ! // TODO will this automatically close when it falls out of scope? ! FileOutputStream output = new FileOutputStream(target); ! output.write((CDX_HEADER_STRING + "\n").getBytes()); ! CDXRecord cdxRecord = new CDXRecord(); ! Iterator itr = results.iterator(); ! while (itr.hasNext()) { ! SearchResult result = (SearchResult) itr.next(); ! cdxRecord.fromSearchResult(result); ! output.write((cdxRecord.toValue() + "\n").getBytes()); } } --- 187,204 ---- File target) throws IOException { ! FileOutputStream os = new FileOutputStream(target); ! BufferedOutputStream bos = new BufferedOutputStream(os); ! PrintWriter pw = new PrintWriter(bos); ! try { ! pw.println(CDXRecord.CDX_HEADER_MAGIC); ! CDXRecord cdxRecord = new CDXRecord(); ! Iterator itr = results.iterator(); ! while (itr.hasNext()) { ! SearchResult result = (SearchResult) itr.next(); ! cdxRecord.fromSearchResult(result); ! pw.println(cdxRecord.toValue()); ! } ! } finally { ! pw.close(); } } |
From: Brad <bra...@us...> - 2005-11-18 23:11:46
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21988/src/java/org/archive/wayback/cdx/indexer Modified Files: BDBResourceIndexWriter.java Log Message: BUGFIX: was not closing CDX file after inserting -- now does so using try...finally. Thanks stack. TWEAK: whitespace changes Index: BDBResourceIndexWriter.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/BDBResourceIndexWriter.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** BDBResourceIndexWriter.java 17 Nov 2005 02:49:59 -0000 1.2 --- BDBResourceIndexWriter.java 18 Nov 2005 23:11:38 -0000 1.3 *************** *** 79,98 **** private SearchResults readFile(File indexFile) throws Exception { - RandomAccessFile raFile = new RandomAccessFile(indexFile, "r"); SearchResults results = new SearchResults(); ! int lineNumber = 0; ! CDXRecord cdxRecord = new CDXRecord(); ! while (true) { ! String line = raFile.readLine(); ! if (line == null) { ! break; ! } ! lineNumber++; ! if ((lineNumber == 1) && (-1 != line.indexOf(CDX_HEADER_MAGIC))) { ! continue; } ! cdxRecord.parseLine(line, lineNumber); ! SearchResult result = cdxRecord.toSearchResult(); ! results.addSearchResult(result); } return results; --- 79,104 ---- private SearchResults readFile(File indexFile) throws Exception { SearchResults results = new SearchResults(); ! RandomAccessFile raFile = new RandomAccessFile(indexFile, "r"); ! try { ! int lineNumber = 0; ! CDXRecord cdxRecord = new CDXRecord(); ! while (true) { ! String line = raFile.readLine(); ! if (line == null) { ! break; ! } ! lineNumber++; ! if ((lineNumber == 1) && ! (line.indexOf(CDX_HEADER_MAGIC) != -1)) { ! ! continue; ! } ! cdxRecord.parseLine(line, lineNumber); ! SearchResult result = cdxRecord.toSearchResult(); ! 
results.addSearchResult(result); } ! } finally { ! raFile.close(); } return results; *************** *** 112,117 **** e.printStackTrace(); } - } - } --- 118,121 ---- |
From: Brad <bra...@us...> - 2005-11-18 23:10:31
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21558/src/java/org/archive/wayback/cdx/indexer Modified Files: IndexPipeline.java Log Message: TWEAK: now uses Logger instead of System.out TWEAK: now moves completed CDX files into merged directory instead of unlinking -- temporary to assist with index debugging. Index: IndexPipeline.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/IndexPipeline.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** IndexPipeline.java 17 Nov 2005 02:52:39 -0000 1.2 --- IndexPipeline.java 18 Nov 2005 23:10:22 -0000 1.3 *************** *** 31,34 **** --- 31,35 ---- import java.util.Iterator; import java.util.Properties; + import java.util.logging.Logger; import org.archive.wayback.PropertyConfigurable; *************** *** 54,57 **** --- 55,61 ---- */ public class IndexPipeline implements PropertyConfigurable{ + private static final Logger LOGGER = + Logger.getLogger(IndexPipeline.class.getName()); + private final static String RUN_PIPELINE = "indexpipeline.runpipeline"; *************** *** 71,74 **** --- 75,80 ---- private final static String TO_BE_MERGED_DIR = "toBeMerged"; + + private final static String MERGED_DIR = "merged"; *************** *** 85,88 **** --- 91,96 ---- private File toBeMergedDir = null; + private File mergedDir = null; + private BDBResourceIndex db = null; *************** *** 142,145 **** --- 150,154 ---- indexingDir = new File(workDir,INDEXING_DIR); toBeMergedDir = new File(workDir,TO_BE_MERGED_DIR); + mergedDir = new File(workDir,MERGED_DIR); try { *************** *** 149,152 **** --- 158,162 ---- ensureDir(indexingDir); ensureDir(toBeMergedDir); + ensureDir(mergedDir); File dbFile = new File(dbPath); ensureDir(dbFile); *************** *** 
166,172 **** if ((runPipeline != null) && (runPipeline.equals("1"))) { ! // TODO: Logger! ! System.out.println("LocalDBDResourceIndex starting pipeline " + ! "thread..."); if (indexUpdateThread == null) { startIndexPipelineThread(db); --- 176,180 ---- if ((runPipeline != null) && (runPipeline.equals("1"))) { ! LOGGER.info("LocalDBDResourceIndex starting pipeline thread..."); if (indexUpdateThread == null) { startIndexPipelineThread(db); *************** *** 287,290 **** --- 295,299 ---- * Add any new CDX files in toBeMergedDir to the BDB, deleting the CDX * files as they are merged + * For now, moving merged to "merged" for debugging.. * @param dbWriter */ *************** *** 294,305 **** while(toBeMerged.hasNext()) { ! File indexFile = new File(toBeMergedDir,(String) toBeMerged.next()); ! try { dbWriter.importFile(indexFile); ! if (!indexFile.delete()) { ! throw new IOException("Unable to unlink " ! + indexFile.getAbsolutePath()); } numMerged++; } catch (Exception e) { --- 303,322 ---- while(toBeMerged.hasNext()) { ! String base = (String) toBeMerged.next(); ! File indexFile = new File(toBeMergedDir,base); ! File mergedFile = new File(mergedDir,base); try { dbWriter.importFile(indexFile); ! ! // move to "merged" for debugging ! if (!indexFile.renameTo(mergedFile)) { ! throw new IOException("Unable to move " ! + indexFile.getAbsolutePath() + " to " ! + mergedFile.getAbsolutePath()); } + // if (!indexFile.delete()) { + // throw new IOException("Unable to unlink " + // + indexFile.getAbsolutePath()); + // } numMerged++; } catch (Exception e) { *************** *** 308,312 **** } if (numMerged > 0) { ! System.out.println("Merged " + numMerged + " files."); } return numMerged; --- 325,329 ---- } if (numMerged > 0) { ! LOGGER.info("Merged " + numMerged + " files."); } return numMerged; *************** *** 364,368 **** merger.init(bdb); this.pipeline = pipeline; ! 
System.out.print("Pipeline Thread is ALIVE!"); } --- 381,385 ---- merger.init(bdb); this.pipeline = pipeline; ! LOGGER.info("Pipeline Thread is ALIVE!"); } |
From: Brad <bra...@us...> - 2005-11-18 23:07:58
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/proxy In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20518/src/java/org/archive/wayback/proxy Modified Files: ResultURIConverter.java Log Message: TWEAK: Fixed Comment to match classname -- was copy + paste TWEAK: made "http://" into WaybackConstant Index: ResultURIConverter.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/proxy/ResultURIConverter.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ResultURIConverter.java 16 Nov 2005 03:11:29 -0000 1.1 --- ResultURIConverter.java 18 Nov 2005 23:07:50 -0000 1.2 *************** *** 1,3 **** ! /* ProxyResultURIConverter * * $Id$ --- 1,3 ---- ! /* ResultURIConverter * * $Id$ *************** *** 54,59 **** public String makeReplayURI(SearchResult result) { String finalUrl = result.get(WaybackConstants.RESULT_URL); ! if(!finalUrl.startsWith("http://")) { ! finalUrl = "http://" + finalUrl; } return finalUrl; --- 54,59 ---- public String makeReplayURI(SearchResult result) { String finalUrl = result.get(WaybackConstants.RESULT_URL); ! if(!finalUrl.startsWith(WaybackConstants.HTTP_URL_PREFIX)) { ! finalUrl = WaybackConstants.HTTP_URL_PREFIX + finalUrl; } return finalUrl; *************** *** 85,90 **** e.printStackTrace(); } ! if(!finalUrl.startsWith("http://")) { ! finalUrl = "http://" + finalUrl; } return finalUrl; --- 85,90 ---- e.printStackTrace(); } ! if(!finalUrl.startsWith(WaybackConstants.HTTP_URL_PREFIX)) { ! finalUrl = WaybackConstants.HTTP_URL_PREFIX + finalUrl; } return finalUrl; |
From: Brad <bra...@us...> - 2005-11-18 23:06:55
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/replay In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20289/src/java/org/archive/wayback/replay Modified Files: ReplayServlet.java Log Message: Now uses Logging to report ResourceNotInArchive exceptions Index: ReplayServlet.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/replay/ReplayServlet.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ReplayServlet.java 16 Nov 2005 03:11:30 -0000 1.1 --- ReplayServlet.java 18 Nov 2005 23:06:42 -0000 1.2 *************** *** 31,34 **** --- 31,35 ---- import java.util.Properties; import java.util.Set; + import java.util.logging.Logger; import javax.servlet.ServletConfig; *************** *** 51,54 **** --- 52,56 ---- import org.archive.wayback.core.WaybackLogic; import org.archive.wayback.exception.BadQueryException; + import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.WaybackException; *************** *** 60,63 **** --- 62,68 ---- */ public class ReplayServlet extends HttpServlet { + private static final Logger LOGGER = Logger.getLogger(ReplayServlet.class + .getName()); + private static final String WMREQUEST_ATTRIBUTE = "wmrequest.attribute"; *************** *** 66,70 **** private WaybackLogic wayback = new WaybackLogic(); - /** * Constructor --- 71,74 ---- *************** *** 93,96 **** --- 97,101 ---- } } + private String getMapParam(Map queryMap, String field) { String arr[] = (String[]) queryMap.get(field); *************** *** 102,114 **** public WaybackRequest parseCGIRequest(HttpServletRequest httpRequest) ! throws BadQueryException { WaybackRequest wbRequest = new WaybackRequest(); Map queryMap = httpRequest.getParameterMap(); Set keys = queryMap.keySet(); Iterator itr = keys.iterator(); ! 
while(itr.hasNext()) { String key = (String) itr.next(); ! String val = getMapParam(queryMap,key); ! wbRequest.put(key,val); } String referer = httpRequest.getHeader("REFERER"); --- 107,119 ---- public WaybackRequest parseCGIRequest(HttpServletRequest httpRequest) ! throws BadQueryException { WaybackRequest wbRequest = new WaybackRequest(); Map queryMap = httpRequest.getParameterMap(); Set keys = queryMap.keySet(); Iterator itr = keys.iterator(); ! while (itr.hasNext()) { String key = (String) itr.next(); ! String val = getMapParam(queryMap, key); ! wbRequest.put(key, val); } String referer = httpRequest.getHeader("REFERER"); *************** *** 116,154 **** referer = null; } ! wbRequest.put(WaybackConstants.REQUEST_REFERER_URL,referer); return wbRequest; } ! private SearchResult getClosest(SearchResults results, WaybackRequest wbRequest) throws ParseException { ! SearchResult closest = null; ! long closestDistance = 0; ! SearchResult cur = null; ! Timestamp wantTimestamp; ! wantTimestamp = Timestamp.parseBefore(wbRequest. ! get(WaybackConstants.REQUEST_EXACT_DATE)); ! ! Iterator itr = results.iterator(); ! while (itr.hasNext()) { ! cur = (SearchResult) itr.next(); ! long curDistance; ! try { ! Timestamp curTimestamp = Timestamp.parseBefore(cur. ! get(WaybackConstants.RESULT_CAPTURE_DATE)); ! curDistance = curTimestamp.absDistanceFromTimestamp( ! wantTimestamp); ! } catch (ParseException e) { ! continue; ! } ! if ((closest == null) || (curDistance < closestDistance)) { ! closest = cur; ! closestDistance = curDistance; ! } ! } ! return closest; } ! public void doGet(HttpServletRequest httpRequest, HttpServletResponse httpResponse) throws IOException, --- 121,159 ---- referer = null; } ! wbRequest.put(WaybackConstants.REQUEST_REFERER_URL, referer); return wbRequest; } ! private SearchResult getClosest(SearchResults results, WaybackRequest wbRequest) throws ParseException { ! SearchResult closest = null; ! long closestDistance = 0; ! SearchResult cur = null; ! 
Timestamp wantTimestamp; ! wantTimestamp = Timestamp.parseBefore(wbRequest ! .get(WaybackConstants.REQUEST_EXACT_DATE)); ! ! Iterator itr = results.iterator(); ! while (itr.hasNext()) { ! cur = (SearchResult) itr.next(); ! long curDistance; ! try { ! Timestamp curTimestamp = Timestamp.parseBefore(cur ! .get(WaybackConstants.RESULT_CAPTURE_DATE)); ! curDistance = curTimestamp ! .absDistanceFromTimestamp(wantTimestamp); ! } catch (ParseException e) { ! continue; ! } ! if ((closest == null) || (curDistance < closestDistance)) { ! closest = cur; ! closestDistance = curDistance; ! } ! } ! return closest; } ! public void doGet(HttpServletRequest httpRequest, HttpServletResponse httpResponse) throws IOException, *************** *** 171,175 **** SearchResults results = idx.query(wbRequest); ! SearchResult closest = getClosest(results,wbRequest); // TODO loop here looking for closest online/available version? --- 176,180 ---- SearchResults results = idx.query(wbRequest); ! SearchResult closest = getClosest(results, wbRequest); // TODO loop here looking for closest online/available version? *************** *** 179,183 **** renderer.renderResource(httpRequest, httpResponse, wbRequest, ! closest, resource,uriConverter); } catch (WaybackException wbe) { --- 184,194 ---- renderer.renderResource(httpRequest, httpResponse, wbRequest, ! closest, resource, uriConverter); ! ! } catch (ResourceNotInArchiveException nia) { ! ! LOGGER.info("NotInArchive\t" ! + wbRequest.get(WaybackConstants.REQUEST_URL)); ! renderer.renderException(httpRequest, httpResponse, wbRequest, nia); } catch (WaybackException wbe) { |
From: Michael S. <sta...@us...> - 2005-11-17 21:59:38
|
Update of /cvsroot/archive-access/archive-access/projects/wb In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24223/projects/wb Modified Files: project.xml Log Message: * xdocs/index.xml * xdocs/navigation.xml Remove reference to wb project. Superseded by wayback (or soon to be). * projects/wb/project.xml Add note that the project is superseded by wayback. Index: project.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wb/project.xml,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** project.xml 20 Oct 2005 20:38:59 -0000 1.7 --- project.xml 17 Nov 2005 21:59:30 -0000 1.8 *************** *** 48,52 **** <a href="http://web.archive.org/collections/web.html">IA wayback</a>. Default lookup goes against nutch. To try out other lookup techniques, implement the org.archive.access.wb.Lookup interface and pass name of ! class in web.xml.</description> <shortDescription>Prototyping open source wayback.</shortDescription> --- 48,53 ---- <a href="http://web.archive.org/collections/web.html">IA wayback</a>. Default lookup goes against nutch. To try out other lookup techniques, implement the org.archive.access.wb.Lookup interface and pass name of ! class in web.xml. This project is now dead, superceded ! <a href="http://archive-access.sf.net/projects/wayback/">wayback</a> project.</description> <shortDescription>Prototyping open source wayback.</shortDescription> |
From: Michael S. <sta...@us...> - 2005-11-17 21:57:59
|
Update of /cvsroot/archive-access/archive-access/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv23704/xdocs Modified Files: index.xml navigation.xml Log Message: * xdocs/index.xml * xdocs/navigation.xml Remove reference to wb project. Superseded by wayback (or soon to be). Index: index.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/xdocs/index.xml,v retrieving revision 1.23 retrieving revision 1.24 diff -C2 -d -r1.23 -r1.24 *** index.xml 20 Oct 2005 02:06:00 -0000 1.23 --- index.xml 17 Nov 2005 21:57:51 -0000 1.24 *************** *** 32,38 **** <a href="http://nutch.org">Nutch</a>. </li> - <li><a href="/projects/wb">wb</a> is - a project to host Wayback Machine prototyping experiments. - </li> <li><a href="/projects/infiniteurl">infiniteurl</a> is an infinite source of pages used testing crawlers. --- 32,35 ---- Index: navigation.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/xdocs/navigation.xml,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** navigation.xml 20 Oct 2005 02:06:00 -0000 1.16 --- navigation.xml 17 Nov 2005 21:57:51 -0000 1.17 *************** *** 15,19 **** <item name="libarc" href="http://cvs.sourceforge.net/viewcvs.py/archive-access/archive-access/projects/libarc/LOCAL_README.txt?rev=1"/> <item name="Nutchwax" href="/projects/nutch" /> - <item name="wb" href="/projects/wb" /> <item name="infiniteurl" href="/projects/infiniteurl" /> <item name="Hedaern" href="http://cvs.sourceforge.net/viewcvs.py/archive-access/archive-access/projects/hedaern/"/> --- 15,18 ---- |
From: stack <st...@ar...> - 2005-11-17 18:46:43
|
Brad wrote: >Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/proxy >In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30992/src/java/org/archive/wayback/proxy > >Added Files: > ReplayFilter.java ResultURIConverter.java > RawReplayRenderer.java >Log Message: >Massive overhaul decomposing into three main categories of changes: > > 1) All internal datatypes are now extensible (currently Properties, but should be Maps) including: > a) WaybackRequest(was WBRequest) > b) SearchResults (was ResourceResults) > c) SearchResult (was ResourceResult) > d) Resource > > so that there is no longer an assumption of Archival URL queries, or "CDX-style" index results. This will put more responsiblility on the UI components to interrogate SearchResults to decide how to render, but should enable extension to data returned from Indexes, as well as allow far more flexibility in queries, predominantly geared towards free-text searching. This is still somewhat clunky, as there are no convenience accessor methods, so all users refer to constants when interacting with them. > > 2) Major cleanup of servlet and filter interaction with servlet container. ReplayUI and QueryUI are now just plain old servlets, and filters can be optionally added to allow non-CGI argument requests to be coerced into standard WaybackRequest objects. > > 3) Alternate "Proxy" Replay mode is now functional, and some work has been done towards an alternate Nutch ResourceIndex. Currently the web.xml contains example configurations for both Proxy and Archival Url replay modes, but the Proxy related configurations are commented out. Proxy mode *requires* changing the servlet context to ROOT. ArchivalUrl replay mode works as ROOT context and as any (I think) other context. There are some cosmetic double-slashe issues to work out. > > >--- NEW FILE: ResultURIConverter.java --- >/* ProxyResultURIConverter > * > > This name above doesn't match the file name. 
> * $Id: ResultURIConverter.java,v 1.1 2005/11/16 03:11:29 bradtofel Exp $ > * > * Created on 4:19:21 PM Nov 15, 2005. > * > * Copyright (C) 2005 Internet Archive. > * > * This file is part of wayback. > * > * wayback is free software; you can redistribute it and/or modify > * it under the terms of the GNU Lesser Public License as published by > * the Free Software Foundation; either version 2.1 of the License, or > * any later version. > * > * wayback is distributed in the hope that it will be useful, > * but WITHOUT ANY WARRANTY; without even the implied warranty of > * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > * GNU Lesser Public License for more details. > * > * You should have received a copy of the GNU Lesser Public License > * along with wayback; if not, write to the Free Software > * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > */ >package org.archive.wayback.proxy; > >import java.util.Properties; > >import org.apache.commons.httpclient.URIException; >import org.archive.net.UURI; >import org.archive.net.UURIFactory; >import org.archive.wayback.ReplayResultURIConverter; >import org.archive.wayback.WaybackConstants; >import org.archive.wayback.core.SearchResult; >import org.archive.wayback.exception.ConfigurationException; > >/** > * > * > * @author brad > * @version $Date: 2005/11/16 03:11:29 $, $Revision: 1.1 $ > */ > > A class comment describing what it is this class does would help. 
>public class ResultURIConverter implements ReplayResultURIConverter { > /* (non-Javadoc) > * @see org.archive.wayback.ReplayResultURIConverter#init(java.util.Properties) > */ > public void init(Properties p) throws ConfigurationException { > } > > /* (non-Javadoc) > * @see org.archive.wayback.ReplayResultURIConverter#makeReplayURI(org.archive.wayback.core.ResourceResult) > */ > > public String makeReplayURI(SearchResult result) { > String finalUrl = result.get(WaybackConstants.RESULT_URL); > if(!finalUrl.startsWith("http://")) { > finalUrl = "http://" + finalUrl; > } > > You might move 'http://' string to wayback constants since its created here twice (And later in this same class). Do you need to worry about https? Lower/upper case? St.Ack > return finalUrl; > } > > /** > * @return Returns the replayUriPrefix. > */ > public String getReplayUriPrefix() { > return ""; > } > > /* (non-Javadoc) > * @see org.archive.wayback.ReplayResultURIConverter#makeRedirectReplayURI(org.archive.wayback.core.SearchResult, java.lang.String) > */ > public String makeRedirectReplayURI(SearchResult result, String url) { > String finalUrl = url; > try { > > UURI origURI = UURIFactory.getInstance(url); > if(!origURI.isAbsoluteURI()) { > String resultUrl = result.get(WaybackConstants.RESULT_URL); > UURI absResultURI = UURIFactory.getInstance(resultUrl); > UURI finalURI = absResultURI.resolve(url); > finalUrl = finalURI.getEscapedURI(); > } > } catch (URIException e) { > // TODO Auto-generated catch block > e.printStackTrace(); > } > if(!finalUrl.startsWith("http://")) { > finalUrl = "http://" + finalUrl; > } > return finalUrl; > } >} > >--- NEW FILE: RawReplayRenderer.java --- >/* ReplayRenderer > * > * $Id: RawReplayRenderer.java,v 1.1 2005/11/16 03:11:29 bradtofel Exp $ > * > * Created on 5:50:38 PM Oct 31, 2005. > * > * Copyright (C) 2005 Internet Archive. > * > * This file is part of wayback. 
> * > * wayback is free software; you can redistribute it and/or modify > * it under the terms of the GNU Lesser Public License as published by > * the Free Software Foundation; either version 2.1 of the License, or > * any later version. > * > * wayback is distributed in the hope that it will be useful, > * but WITHOUT ANY WARRANTY; without even the implied warranty of > * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > * GNU Lesser Public License for more details. > * > * You should have received a copy of the GNU Lesser Public License > * along with wayback; if not, write to the Free Software > * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > */ >package org.archive.wayback.proxy; > >import java.io.IOException; >import java.io.InputStream; >import java.io.OutputStream; >import java.util.Enumeration; >import java.util.Properties; >import java.util.regex.Matcher; >import java.util.regex.Pattern; > >import javax.servlet.RequestDispatcher; >import javax.servlet.ServletException; >import javax.servlet.http.HttpServletRequest; >import javax.servlet.http.HttpServletResponse; > >import org.archive.wayback.ReplayRenderer; >import org.archive.wayback.ReplayResultURIConverter; >import org.archive.wayback.WaybackConstants; >import org.archive.wayback.core.Resource; >import org.archive.wayback.core.SearchResult; >import org.archive.wayback.core.WaybackRequest; >import org.archive.wayback.exception.ConfigurationException; >import org.archive.wayback.exception.WaybackException; > >/** > * > * > * @author brad > * @version $Date: 2005/11/16 03:11:29 $, $Revision: 1.1 $ > */ >public class RawReplayRenderer implements ReplayRenderer { > private final static String JSP_PATH = "replayui.jsppath"; > private final static String HTTP_LENGTH_HEADER= "Content-Length"; > private final static String HTTP_LOCATION_HEADER = "Location"; > > protected final Pattern IMAGE_REGEX = Pattern > .compile(".*\\.(jpg|jpeg|gif|png|bmp|tiff|tif)$"); > > > 
private String jspPath; > > private final String ERROR_JSP = "ErrorResult.jsp"; > private final String ERROR_JAVASCRIPT = "ErrorJavascript.jsp"; > private final String ERROR_IMAGE = "error_image.gif"; > > public void init(Properties p) throws ConfigurationException { > this.jspPath = (String) p.get(JSP_PATH); > if (this.jspPath == null || this.jspPath.length() <= 0) { > throw new IllegalArgumentException("Failed to find " + JSP_PATH); > } > } > > private boolean requestIsEmbedded(HttpServletRequest httpRequest, > WaybackRequest wbRequest) { > > String referer = wbRequest.get(WaybackConstants.REQUEST_REFERER_URL); > return (referer != null && referer.length() > 0); > } > > private boolean requestIsImage (HttpServletRequest httpRequest, > WaybackRequest wbRequest) { > String requestUrl = wbRequest.get(WaybackConstants.REQUEST_URL); > Matcher matcher = IMAGE_REGEX.matcher(requestUrl); > return (matcher != null && matcher.matches()); > } > > private boolean requestIsJavascript (HttpServletRequest httpRequest, > WaybackRequest wbRequest) { > > String requestUrl = wbRequest.get(WaybackConstants.REQUEST_URL); > return requestUrl.endsWith(".js"); > } > > // TODO special handling for Javascript and Images: send empty image > // or empty text file to avoid client errors > public void renderException(HttpServletRequest httpRequest, > HttpServletResponse httpResponse, WaybackRequest wbRequest, > WaybackException exception) throws ServletException, IOException { > > String finalJspPath = jspPath + "/" + ERROR_JSP; > > // is this object embedded? 
> if(requestIsEmbedded(httpRequest,wbRequest)) { > if(requestIsJavascript(httpRequest,wbRequest)) { > > finalJspPath = jspPath + "/" + ERROR_JAVASCRIPT; > > } else if(requestIsImage(httpRequest,wbRequest)) { > > finalJspPath = jspPath + "/" + ERROR_IMAGE; > > } > } > > httpRequest.setAttribute("exception", exception); > > RequestDispatcher dispatcher = httpRequest > .getRequestDispatcher(finalJspPath); > > dispatcher.forward(httpRequest, httpResponse); > } > > public void renderResource(HttpServletRequest httpRequest, > HttpServletResponse httpResponse, WaybackRequest wbRequest, > SearchResult result, Resource resource, > ReplayResultURIConverter uriConverter) throws ServletException, > IOException { > > resource.parseHeaders(); > copyRecordHttpHeader(httpResponse, resource, uriConverter, result, > false); > copy(resource, httpResponse.getOutputStream()); > } > > protected void copyRecordHttpHeader(HttpServletResponse response, > Resource resource, ReplayResultURIConverter uriConverter, > SearchResult result, boolean noLength) throws IOException { > Properties headers = resource.getHttpHeaders(); > int code = resource.getStatusCode(); > // Only return legit status codes -- don't return any minus > // codes, etc. 
> if (code <= HttpServletResponse.SC_CONTINUE) { > String identifier = ""; > response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, > "Bad status code " + code + " (" + identifier + ")."); > return; > } > response.setStatus(code); > if (headers != null) { > // Copy all headers to the response -- even date and > // server, but don't copy Content-Length if arguments indicate > for (Enumeration e = headers.keys(); e.hasMoreElements();) { > String key = (String) e.nextElement(); > String value = (String) headers.get(key); > if (noLength) { > if (-1 != key.indexOf(HTTP_LENGTH_HEADER)) { > continue; > } > } > if(0 == key.indexOf(HTTP_LOCATION_HEADER)) { > value = uriConverter.makeRedirectReplayURI(result,value); > } > response.setHeader(key, (value == null) ? "" : value); > } > } > } > > protected void copy(InputStream is, OutputStream os) throws IOException { > // TODO: Don't allocate everytime. > byte[] buffer = new byte[4 * 1024]; > for (int r = -1; (r = is.read(buffer, 0, buffer.length)) != -1;) { > os.write(buffer, 0, r); > } > } > >} > >--- NEW FILE: ReplayFilter.java --- >/* ProxyReplayFilter > * > * $Id: ReplayFilter.java,v 1.1 2005/11/16 03:11:29 bradtofel Exp $ > * > * Created on 6:08:59 PM Nov 14, 2005. > * > * Copyright (C) 2005 Internet Archive. > * > * This file is part of wayback. > * > * wayback is free software; you can redistribute it and/or modify > * it under the terms of the GNU Lesser Public License as published by > * the Free Software Foundation; either version 2.1 of the License, or > * any later version. > * > * wayback is distributed in the hope that it will be useful, > * but WITHOUT ANY WARRANTY; without even the implied warranty of > * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > * GNU Lesser Public License for more details. 
> * > * You should have received a copy of the GNU Lesser Public License > * along with wayback; if not, write to the Free Software > * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > */ >package org.archive.wayback.proxy; > >import java.text.ParseException; >import java.util.List; > >import javax.servlet.FilterConfig; >import javax.servlet.ServletException; >import javax.servlet.http.HttpServletRequest; > >import org.archive.util.InetAddressUtil; >import org.archive.wayback.WaybackConstants; >import org.archive.wayback.core.Timestamp; >import org.archive.wayback.core.RequestFilter; >import org.archive.wayback.core.WaybackRequest; > >/** > * > * > * @author brad > * @version $Date: 2005/11/16 03:11:29 $, $Revision: 1.1 $ > */ >public class ReplayFilter extends RequestFilter { > private List localhostNames = null; > > public ReplayFilter() { > super(); > } > public void init(final FilterConfig c) throws ServletException { > this.localhostNames = InetAddressUtil.getAllLocalHostNames(); > super.init(c); > } > /* (non-Javadoc) > * @see org.archive.wayback.core.RequestFilter#parseRequest(javax.servlet.http.HttpServletRequest) > */ > @Override > protected WaybackRequest parseRequest(HttpServletRequest httpRequest) { > WaybackRequest wbRequest = null; > if(isLocalRequest(httpRequest)) { > return wbRequest; > } > String requestServer = httpRequest.getServerName(); > String requestPath = httpRequest.getRequestURI(); > //int port = httpRequest.getServerPort(); > String requestQuery = httpRequest.getQueryString(); > String requestScheme = httpRequest.getScheme(); > if (requestQuery != null) { > requestPath = requestPath + "?" 
+ requestQuery; > } > > String requestUrl = requestScheme + "://" + requestServer + requestPath; > > wbRequest = new WaybackRequest(); > wbRequest.put(WaybackConstants.REQUEST_URL,requestUrl); > wbRequest.put(WaybackConstants.REQUEST_TYPE, > WaybackConstants.REQUEST_REPLAY_QUERY); > > String referer = httpRequest.getHeader("REFERER"); > if (referer == null) { > referer = ""; > } > wbRequest.put(WaybackConstants.REQUEST_REFERER_URL,referer); > > try { > wbRequest.put(WaybackConstants.REQUEST_EXACT_DATE, > Timestamp.currentTimestamp().getDateStr()); > } catch (ParseException e) { > // Shouldn't happen... > e.printStackTrace(); > } > > > > return wbRequest; > } > protected boolean isLocalRequest(HttpServletRequest httpRequest) { > return this.localhostNames.contains(httpRequest.getServerName()); > } > >} > > > >------------------------------------------------------- >This SF.Net email is sponsored by the JBoss Inc. Get Certified Today >Register for a JBoss Training Course. Free Certification Exam >for All Training Attendees Through End of 2005. For more info visit: >http://ads.osdn.com/?ad_id=7628&alloc_id=16845&op=click >_______________________________________________ >Archive-access-cvs mailing list >Arc...@li... >https://lists.sourceforge.net/lists/listinfo/archive-access-cvs > > |
From: stack <st...@ar...> - 2005-11-17 17:35:58
|
Brad wrote: >Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer >In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11862/src/java/org/archive/wayback/cdx/indexer > >Modified Files: > ArcIndexer.java >Log Message: >BUGFIX: was not using CDXRecord to "serialize" SearchResult objects >BUGFIX: was not escaping Location: HTTP Header URLs >FEATURE: added skipping of dns: records in ARC files >FEATURE: added fixing/resolving of relative Location: HTTP headers, altho this is against spec. > >Index: ArcIndexer.java >=================================================================== >RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/ArcIndexer.java,v >retrieving revision 1.1 >retrieving revision 1.2 >diff -C2 -d -r1.1 -r1.2 >*** ArcIndexer.java 16 Nov 2005 03:11:29 -0000 1.1 >--- ArcIndexer.java 17 Nov 2005 02:49:08 -0000 1.2 >*************** >*** 35,42 **** >--- 35,45 ---- > import org.archive.io.arc.ARCRecordMetaData; > import org.archive.net.UURI; >+ import org.archive.net.UURIFactory; > import org.archive.wayback.WaybackConstants; >+ import org.archive.wayback.cdx.CDXRecord; > import org.archive.wayback.core.SearchResult; > import org.archive.wayback.core.SearchResults; > import org.apache.commons.httpclient.Header; >+ import org.apache.commons.httpclient.URIException; > > /** >*************** >*** 50,54 **** > private final static String LOCATION_HTTP_HEADER = "Location"; > private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; >! > > /** >--- 53,57 ---- > private final static String LOCATION_HTTP_HEADER = "Location"; > private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; >! private final static String DNS_URL_PREFIX = "dns:"; > > /** >*************** >*** 113,118 **** > return null; > } > UURI uri = new UURI(uriStr, false); >! 
result.put(WaybackConstants.RESULT_ORIG_HOST,uri.getHost()); > > String redirectUrl = "-"; >--- 116,132 ---- > return null; > } >+ if(uriStr.startsWith(DNS_URL_PREFIX)) { >+ // skip dns records... >+ return null; >+ } >+ > UURI uri = new UURI(uriStr, false); > > You don't want to use UURIFactory making your UURIs? It takes care of fixup and proper escaping (The UURI constructors used to be shutdown so you had to go via UURIFactory to get your UURIs). You might check it out. >! String uriHost = uri.getHost(); >! if(uriHost == null) { >! System.out.println("No host in " + uriStr + " in " + >! arc.getAbsolutePath()); > > You don't want to use a logger here? (java.util.logging?). >! return null; >! } >! result.put(WaybackConstants.RESULT_ORIG_HOST,uriHost); > > String redirectUrl = "-"; >*************** >*** 121,125 **** > for (int i = 0; i < headers.length; i++) { > if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { >! redirectUrl = headers[i].getValue(); > break; > } >--- 135,155 ---- > for (int i = 0; i < headers.length; i++) { > if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { >! String locationStr = headers[i].getValue(); >! // TODO: "Location" is supposed to be absolute: >! // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) >! // (section 14.30) but Content-Location can be relative. >! // is it correct to resolve a relative Location, as we are? >! // it's also possible to have both in the HTTP headers... >! // should we prefer one over the other? >! // right now, we're ignoring "Content-Location" >! try { >! UURI uriRedirect = UURIFactory.getInstance(uri,locationStr); >! redirectUrl = uriRedirect.getEscapedURI(); >! >! } catch (URIException e) { >! System.out.println("Bad Location: " + locationStr + >! " for " + uriStr + " in " + >! arc.getAbsolutePath() + " Skipped"); >! } > break; > } >*************** >*** 151,159 **** > FileOutputStream output = new FileOutputStream(target); > output.write((CDX_HEADER_STRING + "\n").getBytes()); >! 
> Do you think encoding will ever be an issue? Does CDX only ever have ASCII? Any chance of multibyte chars? (Maybe you want to do getBytes("UTF-8")?) > > Iterator itr = results.iterator(); > while (itr.hasNext()) { > > Instead of the above, you might do: for (final Iterator itr = results.iterator(); itr.hasNext();) { etc. > SearchResult result = (SearchResult) itr.next(); >! output.write((result.toString() + "\n").getBytes()); > > Any reason you don't want to use something more sophisticated than a byte writer? If you used a PrintWriter you could just pass the String and you could call println. You might also want to wrap your FileOutputStream in a BufferedOutputStream as in: OutputStream output = new BufferedOutputStream(new FileOutputStream(....)); St.Ack > } > } >--- 181,190 ---- > FileOutputStream output = new FileOutputStream(target); > output.write((CDX_HEADER_STRING + "\n").getBytes()); >! CDXRecord cdxRecord = new CDXRecord(); > Iterator itr = results.iterator(); > while (itr.hasNext()) { > SearchResult result = (SearchResult) itr.next(); >! cdxRecord.fromSearchResult(result); >! output.write((cdxRecord.toValue() + "\n").getBytes()); > } > } > > > >------------------------------------------------------- >This SF.Net email is sponsored by the JBoss Inc. Get Certified Today >Register for a JBoss Training Course. Free Certification Exam >for All Training Attendees Through End of 2005. For more info visit: >http://ads.osdn.com/?ad_id=7628&alloc_id=16845&op=click >_______________________________________________ >Archive-access-cvs mailing list >Arc...@li... >https://lists.sourceforge.net/lists/listinfo/archive-access-cvs > > |
From: stack <st...@ar...> - 2005-11-17 17:19:23
|
Brad wrote: >Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer >In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11989/src/java/org/archive/wayback/cdx/indexer > >Modified Files: > BDBResourceIndexWriter.java >Log Message: >FEATURE: changed String.contains to indexOf for backwards compat > >Index: BDBResourceIndexWriter.java >=================================================================== >RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/BDBResourceIndexWriter.java,v >retrieving revision 1.1 >retrieving revision 1.2 >diff -C2 -d -r1.1 -r1.2 >*** BDBResourceIndexWriter.java 16 Nov 2005 03:11:29 -0000 1.1 >--- BDBResourceIndexWriter.java 17 Nov 2005 02:49:59 -0000 1.2 >*************** >*** 89,93 **** > } > lineNumber++; >! if ((lineNumber == 1) && (line.contains(CDX_HEADER_MAGIC))) { > continue; > } >--- 89,93 ---- > } > lineNumber++; >! if ((lineNumber == 1) && (-1 != line.indexOf(CDX_HEADER_MAGIC))) { > continue; > > To be consistent -- e.g. consistent with the just-previous clause -- I'd suggest putting the '-1' on the RHS of the '!=' rather than the LHS. St.Ack > } > > > >------------------------------------------------------- >This SF.Net email is sponsored by the JBoss Inc. Get Certified Today >Register for a JBoss Training Course. Free Certification Exam >for All Training Attendees Through End of 2005. For more info visit: >http://ads.osdn.com/?ad_id=7628&alloc_id=16845&op=click >_______________________________________________ >Archive-access-cvs mailing list >Arc...@li... >https://lists.sourceforge.net/lists/listinfo/archive-access-cvs > > |
From: Brad <bra...@us...> - 2005-11-17 02:52:47
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12585/src/java/org/archive/wayback/cdx/indexer Modified Files: IndexPipeline.java Log Message: FEATURE: removed dependancy on StringToStringTable, now uses HashMap. FEATURE: now indexes N ARCs at a time before merging, instead of indexing everything, then merging everything FEATURE: now only sleeps if nothing was merged, and sleeps for increasing number of seconds when nothing new has appeared. Index: IndexPipeline.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/IndexPipeline.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** IndexPipeline.java 16 Nov 2005 03:11:29 -0000 1.1 --- IndexPipeline.java 17 Nov 2005 02:52:39 -0000 1.2 *************** *** 28,31 **** --- 28,32 ---- import java.net.MalformedURLException; import java.util.ArrayList; + import java.util.HashMap; import java.util.Iterator; import java.util.Properties; *************** *** 37,41 **** import com.sleepycat.je.DatabaseException; - import com.sun.org.apache.xml.internal.utils.StringToStringTable; /** --- 38,41 ---- *************** *** 184,189 **** } ! private StringToStringTable getQueuedFiles() { ! StringToStringTable hash = new StringToStringTable(); String entries[] = queuedDir.list(); for (int i = 0; i < entries.length; i++) { --- 184,189 ---- } ! private HashMap getQueuedFiles() { ! HashMap hash = new HashMap(); String entries[] = queuedDir.list(); for (int i = 0; i < entries.length; i++) { *************** *** 209,213 **** // this should be a method call into ResourceStore... private Iterator getNewArcs() { ! StringToStringTable queued = getQueuedFiles(); ArrayList newArcs = new ArrayList(); --- 209,213 ---- // this should be a method call into ResourceStore... 
private Iterator getNewArcs() { ! HashMap queued = getQueuedFiles(); ArrayList newArcs = new ArrayList(); *************** *** 217,221 **** File arc = new File(arcDir,arcs[i]); if(arc.isFile() && arcs[i].endsWith(".arc.gz")) { ! if (!queued.contains(arcs[i])) { newArcs.add(arcs[i]); } --- 217,222 ---- File arc = new File(arcDir,arcs[i]); if(arc.isFile() && arcs[i].endsWith(".arc.gz")) { ! ! if (!queued.containsKey(arcs[i])) { newArcs.add(arcs[i]); } *************** *** 253,259 **** * @throws IOException */ ! public void indexArcs(ArcIndexer indexer) throws MalformedURLException, ! IOException { Iterator toBeIndexed = getDirFilesIterator(toBeIndexedDir); while(toBeIndexed.hasNext()) { String base = (String) toBeIndexed.next(); --- 254,261 ---- * @throws IOException */ ! public void indexArcs(ArcIndexer indexer, int max) ! throws MalformedURLException, IOException { Iterator toBeIndexed = getDirFilesIterator(toBeIndexedDir); + int numIndexed = 0; while(toBeIndexed.hasNext()) { String base = (String) toBeIndexed.next(); *************** *** 275,278 **** --- 277,284 ---- + toBeIndexedFlagFile.getAbsolutePath()); } + numIndexed++; + if(max > 0 && (numIndexed >= max)) { + break; + } } } *************** *** 283,287 **** * @param dbWriter */ ! public void mergeIndex(BDBResourceIndexWriter dbWriter) { int numMerged = 0; Iterator toBeMerged = getDirFilesIterator(toBeMergedDir); --- 289,293 ---- * @param dbWriter */ ! public int mergeIndex(BDBResourceIndexWriter dbWriter) { int numMerged = 0; Iterator toBeMerged = getDirFilesIterator(toBeMergedDir); *************** *** 304,307 **** --- 310,314 ---- System.out.println("Merged " + numMerged + " files."); } + return numMerged; } *************** *** 337,341 **** private class IndexPipelineThread extends Thread { private final static int SLEEP_MILLISECONDS = 10000; ! 
private BDBResourceIndexWriter merger = null; private ArcIndexer indexer = new ArcIndexer(); --- 344,348 ---- private class IndexPipelineThread extends Thread { private final static int SLEEP_MILLISECONDS = 10000; ! private final static int MAX_TO_MERGE = 10; private BDBResourceIndexWriter merger = null; private ArcIndexer indexer = new ArcIndexer(); *************** *** 362,371 **** public void run() { while (true) { try { pipeline.queueNewArcsForIndex(); ! pipeline.indexArcs(indexer); ! pipeline.mergeIndex(merger); ! sleep(SLEEP_MILLISECONDS); } catch (InterruptedException e) { e.printStackTrace(); --- 369,384 ---- public void run() { + int sleepInterval = SLEEP_MILLISECONDS; while (true) { try { pipeline.queueNewArcsForIndex(); ! pipeline.indexArcs(indexer,MAX_TO_MERGE); ! int numMerged = pipeline.mergeIndex(merger); ! if(numMerged == 0) { ! sleep(sleepInterval); ! sleepInterval += SLEEP_MILLISECONDS; ! } else { ! sleepInterval = SLEEP_MILLISECONDS; ! } } catch (InterruptedException e) { e.printStackTrace(); |
From: Brad <bra...@us...> - 2005-11-17 02:50:07
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11989/src/java/org/archive/wayback/cdx/indexer Modified Files: BDBResourceIndexWriter.java Log Message: FEATURE: changed String.contains to indexOf for backwards compat Index: BDBResourceIndexWriter.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/BDBResourceIndexWriter.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** BDBResourceIndexWriter.java 16 Nov 2005 03:11:29 -0000 1.1 --- BDBResourceIndexWriter.java 17 Nov 2005 02:49:59 -0000 1.2 *************** *** 89,93 **** } lineNumber++; ! if ((lineNumber == 1) && (line.contains(CDX_HEADER_MAGIC))) { continue; } --- 89,93 ---- } lineNumber++; ! if ((lineNumber == 1) && (-1 != line.indexOf(CDX_HEADER_MAGIC))) { continue; } |
From: Brad <bra...@us...> - 2005-11-17 02:49:18
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11862/src/java/org/archive/wayback/cdx/indexer Modified Files: ArcIndexer.java Log Message: BUGFIX: was not using CDXRecord to "serialize" SearchResult objects BUGFIX: was not escaping Location: HTTP Header URLs FEATURE: added skipping of dns: records in ARC files FEATURE: added fixing/resolving of relative Location: HTTP headers, altho this is against spec. Index: ArcIndexer.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/indexer/ArcIndexer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ArcIndexer.java 16 Nov 2005 03:11:29 -0000 1.1 --- ArcIndexer.java 17 Nov 2005 02:49:08 -0000 1.2 *************** *** 35,42 **** --- 35,45 ---- import org.archive.io.arc.ARCRecordMetaData; import org.archive.net.UURI; + import org.archive.net.UURIFactory; import org.archive.wayback.WaybackConstants; + import org.archive.wayback.cdx.CDXRecord; import org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; import org.apache.commons.httpclient.Header; + import org.apache.commons.httpclient.URIException; /** *************** *** 50,54 **** private final static String LOCATION_HTTP_HEADER = "Location"; private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; ! /** --- 53,57 ---- private final static String LOCATION_HTTP_HEADER = "Location"; private final static String CDX_HEADER_STRING = " CDX N b h m s k r V g"; ! private final static String DNS_URL_PREFIX = "dns:"; /** *************** *** 113,118 **** return null; } UURI uri = new UURI(uriStr, false); ! 
result.put(WaybackConstants.RESULT_ORIG_HOST,uri.getHost()); String redirectUrl = "-"; --- 116,132 ---- return null; } + if(uriStr.startsWith(DNS_URL_PREFIX)) { + // skip dns records... + return null; + } + UURI uri = new UURI(uriStr, false); ! String uriHost = uri.getHost(); ! if(uriHost == null) { ! System.out.println("No host in " + uriStr + " in " + ! arc.getAbsolutePath()); ! return null; ! } ! result.put(WaybackConstants.RESULT_ORIG_HOST,uriHost); String redirectUrl = "-"; *************** *** 121,125 **** for (int i = 0; i < headers.length; i++) { if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { ! redirectUrl = headers[i].getValue(); break; } --- 135,155 ---- for (int i = 0; i < headers.length; i++) { if (headers[i].getName().equals(LOCATION_HTTP_HEADER)) { ! String locationStr = headers[i].getValue(); ! // TODO: "Location" is supposed to be absolute: ! // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html) ! // (section 14.30) but Content-Location can be relative. ! // is it correct to resolve a relative Location, as we are? ! // it's also possible to have both in the HTTP headers... ! // should we prefer one over the other? ! // right now, we're ignoring "Content-Location" ! try { ! UURI uriRedirect = UURIFactory.getInstance(uri,locationStr); ! redirectUrl = uriRedirect.getEscapedURI(); ! ! } catch (URIException e) { ! System.out.println("Bad Location: " + locationStr + ! " for " + uriStr + " in " + ! arc.getAbsolutePath() + " Skipped"); ! } break; } *************** *** 151,159 **** FileOutputStream output = new FileOutputStream(target); output.write((CDX_HEADER_STRING + "\n").getBytes()); ! Iterator itr = results.iterator(); while (itr.hasNext()) { SearchResult result = (SearchResult) itr.next(); ! output.write((result.toString() + "\n").getBytes()); } } --- 181,190 ---- FileOutputStream output = new FileOutputStream(target); output.write((CDX_HEADER_STRING + "\n").getBytes()); ! 
CDXRecord cdxRecord = new CDXRecord(); Iterator itr = results.iterator(); while (itr.hasNext()) { SearchResult result = (SearchResult) itr.next(); ! cdxRecord.fromSearchResult(result); ! output.write((cdxRecord.toValue() + "\n").getBytes()); } } |
From: Brad <bra...@us...> - 2005-11-17 02:46:14
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11314/src/java/org/archive/wayback/cdx Modified Files: LocalBDBResourceIndex.java Log Message: Changed String.contains to use indexOf for backwards compat. Index: LocalBDBResourceIndex.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/cdx/LocalBDBResourceIndex.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** LocalBDBResourceIndex.java 16 Nov 2005 03:11:30 -0000 1.1 --- LocalBDBResourceIndex.java 17 Nov 2005 02:46:05 -0000 1.2 *************** *** 152,156 **** } } else { ! if (!searchUrl.contains("/")) { searchUrl = searchUrl + "/"; } --- 152,156 ---- } } else { ! if (-1 == searchUrl.indexOf("/")) { searchUrl = searchUrl + "/"; } |
From: Michael S. <sta...@us...> - 2005-11-16 16:15:16
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/proxy In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13501/src/java/org/archive/wayback/proxy Modified Files: ReplayFilter.java Log Message: * src/java/org/archive/wayback/proxy/ReplayFilter.java Removed 1.5 ism (Was breaking the build). Index: ReplayFilter.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/proxy/ReplayFilter.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ReplayFilter.java 16 Nov 2005 03:11:29 -0000 1.1 --- ReplayFilter.java 16 Nov 2005 16:15:08 -0000 1.2 *************** *** 57,61 **** * @see org.archive.wayback.core.RequestFilter#parseRequest(javax.servlet.http.HttpServletRequest) */ - @Override protected WaybackRequest parseRequest(HttpServletRequest httpRequest) { WaybackRequest wbRequest = null; --- 57,60 ---- |