You can subscribe to this list here.
| 2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
| 2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
| 2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
| 2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
| 2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
| 2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
| 2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
|
From: <bi...@us...> - 2010-03-24 01:09:00
|
Revision: 3005
http://archive-access.svn.sourceforge.net/archive-access/?rev=3005&view=rev
Author: binzino
Date: 2010-03-24 01:08:53 +0000 (Wed, 24 Mar 2010)
Log Message:
-----------
Various hacks for ARI-2260.
Modified Paths:
--------------
tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java
tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java
Added Paths:
-----------
tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java
tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java
tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java
Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java (rev 0)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java 2010-03-24 01:08:53 UTC (rev 3005)
@@ -0,0 +1,246 @@
+package org.apache.nutch.searcher;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.lucene.search.TopDocCollector;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+
+public class CollapsingHitCollector extends TopDocCollector
+{
+ /** Orders hits by ascending score; hits whose scores are equal compare as equal. */
+ public static final Comparator<Hit> SCORE_COMPARATOR = new Comparator<Hit>( )
+   {
+     public int compare( Hit left, Hit right )
+     {
+       if ( left.score > right.score ) return 1;
+
+       // Not greater: either strictly smaller, or treated as equal.
+       return ( left.score < right.score ) ? -1 : 0;
+     }
+   };
+
+ /**
+  * Orders hits by site name only, ignoring case.  Scores are not
+  * consulted, so all hits from one site compare as equal.
+  */
+ public static final Comparator<Hit> SITE_COMPARATOR_PARTIAL = new Comparator<Hit>( )
+   {
+     public int compare( Hit left, Hit right )
+     {
+       final String leftSite = left.site;
+       final String rightSite = right.site;
+       return String.CASE_INSENSITIVE_ORDER.compare( leftSite, rightSite );
+     }
+   };
+
+ public static final Comparator<Hit> SITE_COMPARATOR_TOTAL = new Comparator<Hit>( )
+ {
+ public int compare( Hit h1, Hit h2 )
+ {
+ return String.CASE_INSENSITIVE_ORDER.compare( h1.site, h2.site );
+ }
+ };
+
+ /**
+  * One slot of the collector: a document id, its score and the site
+  * the document belongs to.  The fields are plain mutable values.
+  */
+ public static class Hit
+ {
+   int id;
+   float score;
+   String site;
+
+   public Hit( int id, float score, String site )
+   {
+     this.site = site;
+     this.score = score;
+     this.id = id;
+   }
+
+   /**
+    * Compares by score first and by document id second.
+    *
+    * @return a negative value, zero or a positive value when this hit
+    *         sorts before, equal to or after <code>that</code>
+    */
+   public int compareTo( Hit that )
+   {
+     if ( this.score < that.score ) return -1;
+     if ( this.score > that.score ) return 1;
+
+     // Scores did not decide it; fall back to the document id.
+     return ( this.id < that.id ) ? -1 : ( ( this.id > that.id ) ? 1 : 0 );
+   }
+ }
+
+ public org.apache.lucene.search.Searcher searcher;
+ public int numHits;
+ public int hitsPerSite;
+
+ final Hit[] sortedByScore;
+ final Hit[] sortedBySite;
+ final Hit candidate;
+
+ int numUncollapsedHits = 0;
+
+ /**
+  * Creates a collector with room for <code>numHits</code> hits.
+  *
+  * @param searcher    searcher used to load a document's "url" field
+  * @param numHits     number of slots to allocate
+  * @param hitsPerSite maximum hits kept per site; 0 turns collapsing off
+  */
+ public CollapsingHitCollector( org.apache.lucene.search.Searcher searcher, int numHits, int hitsPerSite )
+ {
+   super( numHits );
+
+   this.searcher = searcher;
+   this.numHits = numHits;
+   this.hitsPerSite = hitsPerSite;
+
+   // Scratch object that is refilled for every collected document.
+   this.candidate = new Hit( -1, Float.NEGATIVE_INFINITY, "" );
+
+   // Both arrays reference the very same placeholder objects; they
+   // differ only in the order they are later sorted into.
+   this.sortedByScore = new Hit[numHits];
+   this.sortedBySite = new Hit[numHits];
+
+   int slot = 0;
+   while ( slot < numHits )
+   {
+     Hit placeholder = new Hit( -1, Float.NEGATIVE_INFINITY, "" );
+     this.sortedByScore[slot] = placeholder;
+     this.sortedBySite[slot] = placeholder;
+     slot++;
+   }
+ }
+
+ /**
+  * Offers one matching document to the collector.
+  *
+  * Every call is counted as an uncollapsed hit.  The document is then
+  * ignored unless it scores higher than the current lowest-scoring
+  * slot.  Otherwise its site is derived from the host part of its
+  * stored "url" field (the empty string when the field is missing or
+  * is not a valid URL) and the document either takes over the overall
+  * lowest-scoring slot or replaces a hit of the same site.
+  *
+  * @param doc   document id
+  * @param score score of the document
+  * @throws RuntimeException wrapping the IOException raised when the
+  *         stored document cannot be loaded
+  */
+ public void collect( int doc, float score )
+ {
+   this.numUncollapsedHits++;
+
+   // With no slots at all nothing can be kept; bail out before
+   // touching sortedByScore[0].
+   if ( this.sortedByScore.length == 0 )
+   {
+     return ;
+   }
+
+   this.candidate.id = doc;
+   this.candidate.score = score;
+
+   // The candidate object is reused for every call.  Clear the site so
+   // that the host of a previously collected document cannot leak into
+   // this one when the URL below is missing or malformed.
+   this.candidate.site = "";
+
+   if ( this.candidate.score <= this.sortedByScore[0].score )
+   {
+     return ;
+   }
+
+   try
+   {
+     String url = this.searcher.doc( this.candidate.id ).get( "url");
+     if ( url != null )
+     {
+       try
+       {
+         java.net.URL u = new java.net.URL( url );
+
+         this.candidate.site = u.getHost();
+       }
+       catch ( java.net.MalformedURLException e )
+       {
+         // Not a parseable URL: keep the empty site set above.
+       }
+     }
+   }
+   catch ( IOException ioe ) { throw new RuntimeException( ioe ); }
+
+   // Use "" rather than null to keep searching and sorting simple.
+   if ( this.candidate.site == null ) this.candidate.site = "";
+
+   int sitePos = findReplacementPosition( candidate );
+
+   // No existing hit to be replaced, so we replace the overall
+   // lowest-scoring one, which is always in position 0 in the
+   // sortedByScore list.
+   if ( sitePos < 0 )
+   {
+     this.sortedByScore[0].id = candidate.id;
+     this.sortedByScore[0].score = candidate.score;
+     this.sortedByScore[0].site = candidate.site;
+
+     // Since we just added a new site, re-sort them.
+     Arrays.sort( this.sortedByScore, SCORE_COMPARATOR );
+
+     // No need to re-sort the sites if not collapsing.
+     if ( this.hitsPerSite != 0 )
+     {
+       Arrays.sort( this.sortedBySite, SITE_COMPARATOR_TOTAL );
+     }
+
+     // Done!
+     return ;
+   }
+
+   // We have an existing Hit from the same site which can be
+   // replaced *if* the candidate's score is better.
+   if ( candidate.score > this.sortedBySite[sitePos].score )
+   {
+     this.sortedBySite[sitePos].id = this.candidate.id;
+     this.sortedBySite[sitePos].score = this.candidate.score;
+
+     // We have to re-sort by scores.
+     Arrays.sort( this.sortedByScore, SCORE_COMPARATOR );
+
+     // If our hitsPerSite > 1, then we have to re-sort by site to
+     // ensure that the hit we just inserted is put into the proper
+     // sorted position within the site group. If hitsPerSite==1,
+     // then the group size == 1 and therefore no need to re-sort.
+     if ( this.hitsPerSite > 1 )
+     {
+       Arrays.sort( this.sortedBySite, SITE_COMPARATOR_TOTAL );
+     }
+   }
+ }
+
+ /**
+  * Decides which slot of sortedBySite, if any, the candidate competes
+  * with.
+  *
+  * @param candidate hit whose site is looked up
+  * @return a negative value when the candidate should take over the
+  *         overall lowest-scoring slot instead (collapsing is off, the
+  *         site is not present yet, or the site still has fewer than
+  *         hitsPerSite hits); otherwise an index into sortedBySite of a
+  *         hit from the same site
+  */
+ private int findReplacementPosition( Hit candidate )
+ {
+ // hitsPerSite == 0 means no collapsing: never compete with a site group.
+ if ( this.hitsPerSite == 0 ) return -1;
+
+ // Look up any hit of the same site; relies on sortedBySite being
+ // sorted by site.  A negative result means the site is not present.
+ int pos = Arrays.binarySearch( this.sortedBySite, candidate, SITE_COMPARATOR_PARTIAL );
+
+ // Site not present, or at most one hit per site so the match found is
+ // the only hit of that site.
+ if ( pos < 0 || this.hitsPerSite == 1 ) return pos;
+
+ int i = pos, j = pos;
+
+ final int mini = 0, maxj = this.sortedBySite.length - 1;
+
+ // Walk i down to the first element of the run of same-site hits.
+ for ( ; i > mini && SITE_COMPARATOR_PARTIAL.compare( this.sortedBySite[i], this.sortedBySite[i-1] ) == 0; i-- )
+ ;
+
+ // Walk j up to the last element of the same run.
+ for ( ; j < maxj && SITE_COMPARATOR_PARTIAL.compare( this.sortedBySite[i], this.sortedBySite[j+1] ) == 0; j++ )
+ ;
+
+ // The number of hits from this site is (j-i+1), so if we are less
+ // than the max number of hits per site, then we return -1 to
+ // indicate there is still room for more Hits from the candidate
+ // site.
+ if ( (j - i + 1) < this.hitsPerSite ) return -1;
+
+ // Otherwise, the Hit to be potentially replaced is the lowest
+ // scoring hit, which is the one at position i.
+ // NOTE(review): this holds only if sortedBySite is ordered by
+ // ascending score inside each site group -- confirm that the
+ // comparator used to sort sortedBySite breaks ties on score.
+ return i;
+ }
+
+ /**
+  * Returns the collected hits, highest score first.
+  *
+  * @return a new array of getNumHits() elements taken from the top end
+  *         of the score-sorted slots
+  */
+ public Hit[] getHits()
+ {
+   final int count = this.getNumHits( );
+   final Hit[] result = new Hit[count];
+
+   // sortedByScore is kept in ascending order, so read it backwards
+   // starting at its last element.
+   int src = this.sortedByScore.length - 1;
+   for ( int dst = 0; dst < count; dst++, src-- )
+   {
+     result[dst] = this.sortedByScore[src];
+   }
+
+   return result;
+ }
+
+ public int getNumHits( )
+ {
+ for ( int i = this.sortedByScore.length - this.numHits ; i < this.sortedByScore.length ; i++ )
+ {
+ if ( this.sortedByScore[i].score != Float.NEGATIVE_INFINITY )
+ {
+ return this.sortedByScore.length - i;
+ }
+ }
+ return 0;
+ }
+
+ /**
+  * Returns the total hit count to report.
+  *
+  * @return when collapsing is on and fewer than numHits slots are
+  *         filled, the number of filled slots; in every other case the
+  *         number of documents passed to collect
+  */
+ public int getTotalHits()
+ {
+   if ( this.hitsPerSite != 0 )
+   {
+     int kept = getNumHits( );
+     if ( kept < this.numHits ) return kept;
+   }
+   return this.numUncollapsedHits;
+ }
+
+ /**
+  * Packages the collected hits as Lucene TopDocs.
+  *
+  * @return the hits in descending score order together with the total
+  *         hit count and the maximum score; when nothing was collected
+  *         the result has no ScoreDocs and a maximum score of negative
+  *         infinity
+  */
+ public TopDocs topDocs()
+ {
+   Hit[] hits = this.getHits( );
+   ScoreDoc[] sd = new ScoreDoc[hits.length];
+   for ( int i = 0 ; i < hits.length ; i++ )
+   {
+     sd[i] = new ScoreDoc( hits[i].id, hits[i].score );
+   }
+
+   // getHits() returns the best hit first, so the maximum score is at
+   // index 0.  Guard the empty case, which has no element to read.
+   float maxScore = ( hits.length == 0 ) ? Float.NEGATIVE_INFINITY : hits[0].score;
+
+   return new TopDocs( getTotalHits(), sd, maxScore );
+ }
+}
\ No newline at end of file
Modified: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java 2010-03-24 01:01:04 UTC (rev 3004)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java 2010-03-24 01:08:53 UTC (rev 3005)
@@ -254,7 +254,9 @@
}
public Hits search(final Query query, final int numHits,
- final String dedupField, final String sortField,
+ final int maxHitsPerDup,
+ final String dedupField,
+ final String sortField,
final boolean reverse) throws IOException {
// Get the list of live servers. It would be nice to build this
// list in updateSegments(), but that would create concurrency issues.
@@ -282,8 +284,9 @@
params[i][0] = query;
params[i][1] = new Integer(numHits);
params[i][2] = dedupField;
- params[i][3] = sortField;
- params[i][4] = Boolean.valueOf(reverse);
+ params[i][3] = maxHitsPerDup;
+ params[i][4] = sortField;
+ params[i][5] = Boolean.valueOf(reverse);
}
Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses, this.conf);
@@ -439,7 +442,7 @@
Client client = new Client(addresses, NutchConfiguration.create());
//client.setTimeout(Integer.MAX_VALUE);
- Hits hits = client.search(query, 10, null, null, false);
+ Hits hits = client.search(query, 10, 0, null, null, false);
System.out.println("Total hits: " + hits.getTotal());
for (int i = 0; i < hits.getLength(); i++) {
System.out.println(" "+i+" "+ client.getDetails(hits.getHit(i)));
Modified: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2010-03-24 01:01:04 UTC (rev 3004)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2010-03-24 01:08:53 UTC (rev 3005)
@@ -90,7 +90,9 @@
}
public Hits search(Query query, int numHits,
- String dedupField, String sortField, boolean reverse)
+ int maxHitsPerDup,
+ String dedupField,
+ String sortField, boolean reverse)
throws IOException {
org.apache.lucene.search.BooleanQuery luceneQuery =
@@ -100,7 +102,7 @@
System.out.println( "Lucene query: " + luceneQuery );
return translateHits
- (optimizer.optimize(luceneQuery, luceneSearcher, numHits,
+ (optimizer.optimize(luceneQuery, luceneSearcher, numHits, maxHitsPerDup,
sortField, reverse),
dedupField, sortField);
}
Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (rev 0)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 2010-03-24 01:08:53 UTC (rev 3005)
@@ -0,0 +1,278 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.QueryFilter;
+import org.apache.lucene.search.*;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.misc.ChainedFilter;
+
+import org.apache.hadoop.conf.Configuration;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.ArrayList;
+
+import java.io.IOException;
+
+/** Utility which converts certain query clauses into {@link QueryFilter}s and
+ * caches these. Only required clauses whose boost is zero are converted to
+ * cached filters. Range queries are converted to range filters. This
+ * accelerates query constraints like date, language, document format, etc.,
+ * which do not affect ranking but might otherwise slow search considerably. */
+class LuceneQueryOptimizer {
+
+ // This thread provides a pseudo-clock service to all searching
+ // threads, so that they can count elapsed time with less overhead than
+ // repeatedly calling System.currentTimeMillis.
+ private TimerThread timerThread = null;
+
+ private static class TimerThread extends Thread {
+   // Sleep time between two increments of the counter, as passed to
+   // Thread.sleep.
+   private int tick;
+
+   // Tick counter read by the searching threads.  No explicit
+   // synchronization is used, for these reasons:
+   // * updates to 32-bit-sized variables are atomic
+   // * within this class only run() modifies the value
+   // * volatile keeps the value out of registers, so changes become
+   //   visible to other threads
+   // * visibility of changes does not need to be instantaneous; losing
+   //   a tick or two is acceptable.
+   //
+   // See section 17 of the Java Language Specification for details.
+   public volatile int timeCounter = 0;
+
+   // Loop condition of run().
+   boolean running = true;
+
+   public TimerThread(int tick) {
+     super("LQO timer thread");
+     this.setDaemon(true);
+     this.tick = tick;
+   }
+
+   public void run() {
+     while (running) {
+       timeCounter = timeCounter + 1;
+       try {
+         Thread.sleep(tick);
+       } catch (InterruptedException ignored) {
+         // An interrupt only ends the current sleep early; keep ticking.
+       }
+     }
+   }
+ }
+
+ private void initTimerThread(int p) {
+ if (timerThread == null || !timerThread.isAlive()) {
+ timerThread = new TimerThread(p);
+ timerThread.start();
+ }
+ }
+
+
+ private static class TimeExceeded extends RuntimeException {
+ public long maxTime;
+ private int maxDoc;
+ public TimeExceeded(long maxTime, int maxDoc) {
+ super("Exceeded search time: " + maxTime + " ms.");
+ this.maxTime = maxTime;
+ this.maxDoc = maxDoc;
+ }
+ }
+
+ private static class LimitedCollector extends TopDocCollector {
+ private int maxHits;
+ private int maxTicks;
+ private int startTicks;
+ private TimerThread timer;
+ private int curTicks;
+
+ public LimitedCollector(int numHits, int maxHits, int maxTicks,
+ TimerThread timer) {
+ super(numHits);
+ this.maxHits = maxHits;
+ this.maxTicks = maxTicks;
+ if (timer != null) {
+ this.timer = timer;
+ this.startTicks = timer.timeCounter;
+ }
+ }
+
+ public void collect(int doc, float score) {
+ if (maxHits > 0 && getTotalHits() >= maxHits) {
+ throw new LimitExceeded(doc);
+ }
+ if (timer != null) {
+ curTicks = timer.timeCounter;
+ // overflow check
+ if (curTicks < startTicks) curTicks += Integer.MAX_VALUE;
+ if (curTicks - startTicks > maxTicks) {
+ throw new TimeExceeded(timer.tick * (curTicks - startTicks), doc);
+ }
+ }
+ super.collect(doc, score);
+ }
+ }
+
+ private static class LimitExceeded extends RuntimeException {
+ private int maxDoc;
+ public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; }
+ }
+
+ private LinkedHashMap<BooleanQuery, Filter> cache; // an LRU cache of QueryFilter
+
+ private float threshold;
+
+ private int searcherMaxHits;
+
+ private int tickLength;
+
+ private int maxTickCount;
+
+ /**
+ * Construct an optimizer that caches and uses filters for required clauses
+ * whose boost is zero.
+ *
+ * All settings are read from the configuration:
+ * <ul>
+ * <li><code>searcher.filter.cache.size</code> (default 16): the number of
+ * filters kept in the LRU cache</li>
+ * <li><code>searcher.filter.cache.threshold</code> (default 0.05): the
+ * fraction of documents which must contain a term</li>
+ * <li><code>searcher.max.hits</code> (default -1): hit-count limit</li>
+ * <li><code>searcher.max.time.tick_length</code> (default 200): tick
+ * length handed to the timer thread</li>
+ * <li><code>searcher.max.time.tick_count</code> (default -1): tick limit;
+ * the timer thread is started only when this is greater than zero</li>
+ * </ul>
+ *
+ * @param conf
+ * the configuration the settings above are read from
+ */
+ public LuceneQueryOptimizer(Configuration conf) {
+ final int cacheSize = conf.getInt("searcher.filter.cache.size", 16);
+ this.threshold = conf.getFloat("searcher.filter.cache.threshold",
+ 0.05f);
+ this.searcherMaxHits = conf.getInt("searcher.max.hits", -1);
+ // access-ordered map (third argument true) that evicts its eldest entry
+ // once it grows beyond cacheSize
+ this.cache = new LinkedHashMap<BooleanQuery, Filter>(cacheSize, 0.75f, true) {
+ protected boolean removeEldestEntry(Map.Entry eldest) {
+ return size() > cacheSize; // limit size of cache
+ }
+ };
+ this.tickLength = conf.getInt("searcher.max.time.tick_length", 200);
+ this.maxTickCount = conf.getInt("searcher.max.time.tick_count", -1);
+ if (this.maxTickCount > 0) {
+ initTimerThread(this.tickLength);
+ }
+ }
+
+ /**
+ * Runs a query, turning its required zero-boost clauses into cached filters.
+ *
+ * Clauses of <code>original</code> are split three ways: required
+ * zero-boost term clauses whose document frequency is beneath the threshold
+ * and all other ordinary clauses stay in the query; required zero-boost
+ * range clauses become range filters; all remaining required zero-boost
+ * clauses become one query filter. The combined filter is looked up in, and
+ * stored into, the filter cache.
+ *
+ * When no sort field is given and <code>reverse</code> is false, hits are
+ * collected either by a CollapsingHitCollector (no hit-count limit and no
+ * timer thread) or by a LimitedCollector, in which case the total hit count
+ * is estimated if a limit was exceeded. Otherwise a sorted search is run.
+ *
+ * @param original the query to run
+ * @param searcher the Lucene searcher to run it against
+ * @param numHits the number of hits to collect
+ * @param maxHitsPerDup passed to the CollapsingHitCollector as its
+ * hits-per-site value; not used on the other paths
+ * @param sortField field to sort on, or null
+ * @param reverse whether the sort is reversed
+ * @return the top documents
+ * @throws IOException if the underlying search fails
+ */
+ public TopDocs optimize(BooleanQuery original,
+ Searcher searcher, int numHits, int maxHitsPerDup,
+ String sortField, boolean reverse)
+ throws IOException {
+
+ BooleanQuery query = new BooleanQuery();
+ BooleanQuery cacheQuery = new BooleanQuery();
+ BooleanQuery filterQuery = new BooleanQuery();
+ ArrayList<Filter> filters = new ArrayList<Filter>();
+
+ BooleanClause[] clauses = original.getClauses();
+ for (int i = 0; i < clauses.length; i++) {
+ BooleanClause c = clauses[i];
+ if (c.isRequired() // required
+ && c.getQuery().getBoost() == 0.0f) { // boost is zero
+
+ if (c.getQuery() instanceof TermQuery // TermQuery
+ && (searcher.docFreq(((TermQuery)c.getQuery()).getTerm())
+ / (float)searcher.maxDoc()) < threshold) { // beneath threshold
+ query.add(c); // don't filterize
+ continue;
+ }
+
+ if (c.getQuery() instanceof RangeQuery) { // RangeQuery
+ RangeQuery range = (RangeQuery)c.getQuery();
+ boolean inclusive = range.isInclusive();// convert to RangeFilter
+ Term lower = range.getLowerTerm();
+ Term upper = range.getUpperTerm();
+ filters.add(new RangeFilter(lower!=null?lower.field():upper.field(),
+ lower != null ? lower.text() : null,
+ upper != null ? upper.text() : null,
+ inclusive, inclusive));
+ cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it
+ continue;
+ }
+
+ // all other query types
+ filterQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // filter it
+ cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it
+ continue;
+ }
+
+ query.add(c); // query it
+ }
+
+ Filter filter = null;
+ if (cacheQuery.getClauses().length != 0) {
+ synchronized (cache) { // check cache
+ filter = cache.get(cacheQuery);
+ }
+ if (filter == null) { // miss
+
+ if (filterQuery.getClauses().length != 0) // add filterQuery to filters
+ filters.add(new CachingWrapperFilter(new QueryWrapperFilter(filterQuery)));
+
+ if (filters.size() == 1) { // convert filters to filter
+ filter = (Filter)filters.get(0);
+ } else {
+ filter = new ChainedFilter((Filter[])filters.toArray
+ (new Filter[filters.size()]),
+ ChainedFilter.AND);
+ }
+ if (!(filter instanceof CachingWrapperFilter)) // make sure bits are cached
+ filter = new CachingWrapperFilter(filter);
+
+ synchronized (cache) {
+ cache.put(cacheQuery, filter); // cache the filter
+ }
+ }
+ }
+ if (sortField == null && !reverse) {
+
+ // no hit limit
+ if (this.searcherMaxHits <= 0 && timerThread == null) {
+ // Collapse hits per site while collecting; maxHitsPerDup is handed
+ // to the collector as its hits-per-site value.
+ TopDocCollector c = new CollapsingHitCollector( searcher, numHits, maxHitsPerDup );
+ searcher.search(query, filter, c );
+ return c.topDocs( );
+ }
+
+ // hits limited in time or in count -- use a LimitedCollector
+ LimitedCollector collector = new LimitedCollector(numHits, searcherMaxHits,
+ maxTickCount, timerThread);
+ LimitExceeded exceeded = null;
+ TimeExceeded timeExceeded = null;
+ try {
+ searcher.search(query, filter, collector);
+ } catch (LimitExceeded le) {
+ exceeded = le;
+ } catch (TimeExceeded te) {
+ timeExceeded = te;
+ }
+ TopDocs results = collector.topDocs();
+ if (exceeded != null) { // limit was exceeded
+ results.totalHits = (int) // must estimate totalHits
+ (results.totalHits*(searcher.maxDoc()/(float)exceeded.maxDoc));
+ } else if (timeExceeded != null) {
+ // Estimate total hits.
+ results.totalHits = (int)(results.totalHits * (searcher.maxDoc()/(float)timeExceeded.maxDoc));
+ }
+ return results;
+
+ } else {
+ return searcher.search(query, filter, numHits,
+ new Sort(sortField, reverse));
+ }
+ }
+}
Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java (rev 0)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java 2010-03-24 01:08:53 UTC (rev 3005)
@@ -0,0 +1,434 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import java.io.*;
+import java.util.*;
+import javax.servlet.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.io.Closeable;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.indexer.*;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * One stop shopping for search-related functionality.
+ * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
+ */
+public class NutchBean
+ implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
+ DistributedSearch.Protocol, Closeable {
+
+ public static final Log LOG = LogFactory.getLog(NutchBean.class);
+ public static final String KEY = "nutchBean";
+
+// static {
+// LogFormatter.setShowThreadIDs(true);
+// }
+
+ private String[] segmentNames;
+
+ private Searcher searcher;
+ private HitDetailer detailer;
+ private HitSummarizer summarizer;
+ private HitContent content;
+ private HitInlinks linkDb;
+
+
+ /** BooleanQuery won't permit more than 32 required/prohibited clauses. We
+ * don't want to use too many of those. */
+ private static final int MAX_PROHIBITED_TERMS = 20;
+
+ private Configuration conf;
+
+ private FileSystem fs;
+
+ /** Returns the cached instance in the servlet context.
+ * @see NutchBeanConstructor*/
+ public static NutchBean get(ServletContext app, Configuration conf) throws IOException {
+ NutchBean bean = (NutchBean)app.getAttribute(KEY);
+ return bean;
+ }
+
+
+ /**
+ *
+ * @param conf
+ * @throws IOException
+ */
+ public NutchBean(Configuration conf) throws IOException {
+ this(conf, null);
+ }
+
+ /**
+ * Construct in a named directory.
+ * @param conf
+ * @param dir
+ * @throws IOException
+ */
+ public NutchBean(Configuration conf, Path dir) throws IOException {
+ this.conf = conf;
+ this.fs = FileSystem.get(this.conf);
+ if (dir == null) {
+ dir = new Path(this.conf.get("searcher.dir", "crawl"));
+ }
+ Path servers = new Path(dir, "search-servers.txt");
+ if (fs.exists(servers)) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("searching servers in " + servers);
+ }
+ init(new DistributedSearch.Client(servers, conf));
+ } else {
+ init(new Path(dir, "index"), new Path(dir, "indexes"), new Path(
+ dir, "segments"), new Path(dir, "linkdb"));
+ }
+ }
+
+ /**
+  * Wires this bean to local indexes, segments and link database.
+  *
+  * Opens the merged index in <code>indexDir</code> when that path
+  * exists; otherwise opens every directory under
+  * <code>indexesDir</code> that contains the indexer's "done" marker
+  * file.  Segments and the link database are then opened and the
+  * searcher, detailer, summarizer, content and linkDb fields are set.
+  *
+  * @param indexDir    path of the merged index
+  * @param indexesDir  path holding the individual index directories
+  * @param segmentsDir path of the segments
+  * @param linkDb      path of the link database
+  * @throws IOException if any of the resources cannot be opened
+  */
+ private void init(Path indexDir, Path indexesDir, Path segmentsDir,
+                   Path linkDb)
+   throws IOException {
+   IndexSearcher indexSearcher;
+   if (this.fs.exists(indexDir)) {
+     if (LOG.isInfoEnabled()) {
+       LOG.info("opening merged index in " + indexDir);
+     }
+     indexSearcher = new IndexSearcher(indexDir, this.conf);
+   } else {
+     if (LOG.isInfoEnabled()) {
+       LOG.info("opening indexes in " + indexesDir);
+     }
+
+     FileStatus[] fstats = fs.listStatus(indexesDir,
+         HadoopFSUtil.getPassDirectoriesFilter(fs));
+     Path[] candidates = HadoopFSUtil.getPaths(fstats);
+
+     // Keep only the index directories that carry the "done" marker,
+     // preserving their listing order.
+     List<Path> completed = new ArrayList<Path>(candidates.length);
+     for (int i = 0; i < candidates.length; i++) {
+       Path indexdone = new Path(candidates[i], Indexer.DONE_NAME);
+       if (fs.isFile(indexdone)) {
+         completed.add(candidates[i]);
+       }
+     }
+
+     Path[] directories = completed.toArray(new Path[completed.size()]);
+     indexSearcher = new IndexSearcher(directories, this.conf);
+   }
+
+   if (LOG.isInfoEnabled()) {
+     LOG.info("opening segments in " + segmentsDir);
+   }
+   FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.conf);
+
+   this.segmentNames = segments.getSegmentNames();
+
+   this.searcher = indexSearcher;
+   this.detailer = indexSearcher;
+   this.summarizer = segments;
+   this.content = segments;
+
+   if (LOG.isInfoEnabled()) { LOG.info("opening linkdb in " + linkDb); }
+   this.linkDb = new LinkDbInlinks(fs, linkDb, this.conf);
+ }
+
+ private void init(DistributedSearch.Client client) {
+ this.segmentNames = client.getSegmentNames();
+ this.searcher = client;
+ this.detailer = client;
+ this.summarizer = client;
+ this.content = client;
+ this.linkDb = client;
+ }
+
+
+ public String[] getSegmentNames() {
+ return segmentNames;
+ }
+
+ public Hits search(Query query, int numHits) throws IOException {
+ return search(query, numHits, null, null, false);
+ }
+
+ public Hits search(Query query, int numHits,
+ String dedupField, String sortField, boolean reverse)
+ throws IOException {
+
+ return searcher.search(query, numHits, 0, dedupField, sortField, reverse);
+ }
+
+ private class DupHits extends ArrayList {
+ private boolean maxSizeExceeded;
+ }
+
+ /** Search for pages matching a query, eliminating excessive hits from the
+ * same site. Hits after the first <code>maxHitsPerDup</code> from the same
+ * site are removed from results. The remaining hits have {@link
+ * Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero then all
+ * hits are returned.
+ *
+ * @param query query
+ * @param numHits number of requested hits
+ * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+ * @return Hits the matching hits
+ * @throws IOException
+ */
+ public Hits search(Query query, int numHits, int maxHitsPerDup)
+ throws IOException {
+ return search(query, numHits, maxHitsPerDup, "site", null, false);
+ }
+
+ /** Search for pages matching a query, eliminating excessive hits with
+ * matching values for a named field. Hits after the first
+ * <code>maxHitsPerDup</code> are removed from results. The remaining hits
+ * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero
+ * then all hits are returned.
+ *
+ * @param query query
+ * @param numHits number of requested hits
+ * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+ * @param dedupField field name to check for duplicates
+ * @return Hits the matching hits
+ * @throws IOException
+ */
+ public Hits search(Query query, int numHits,
+ int maxHitsPerDup, String dedupField)
+ throws IOException {
+ return search(query, numHits, maxHitsPerDup, dedupField, null, false);
+ }
+ /** Search for pages matching a query, eliminating excessive hits with
+ * matching values for a named field. Hits after the first
+ * <code>maxHitsPerDup</code> are removed from results. The remaining hits
+ * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero
+ * then all hits are returned.
+ *
+ * @param query query
+ * @param numHits number of requested hits
+ * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+ * @param dedupField field name to check for duplicates
+ * @param sortField Field to sort on (or null if no sorting).
+ * @param reverse True if we are to reverse sort by <code>sortField</code>.
+ * @return Hits the matching hits
+ * @throws IOException
+ */
+ public Hits search(Query query, int numHits,
+ int maxHitsPerDup,
+ String dedupField,
+ String sortField, boolean reverse)
+ throws IOException {
+ if (maxHitsPerDup <= 0) // disable dup checking
+ return search(query, numHits, dedupField, sortField, reverse);
+
+ float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
+ int numHitsRaw = (int)(numHits * rawHitsFactor);
+ if (LOG.isInfoEnabled()) {
+ LOG.info("searching for "+numHitsRaw+" raw hits");
+ }
+ Hits hits = searcher.search(query, numHitsRaw, maxHitsPerDup, dedupField, sortField, reverse);
+ long total = hits.getTotal();
+ Map dupToHits = new HashMap();
+ List resultList = new ArrayList();
+ Set seen = new HashSet();
+ List excludedValues = new ArrayList();
+ boolean totalIsExact = true;
+ for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
+ // get the next raw hit
+ if (rawHitNum >= hits.getLength()) {
+ // optimize query by prohibiting more matches on some excluded values
+ Query optQuery = (Query)query.clone();
+ for (int i = 0; i < excludedValues.size(); i++) {
+ if (i == MAX_PROHIBITED_TERMS)
+ break;
+ optQuery.addProhibitedTerm(((String)excludedValues.get(i)),
+ dedupField);
+ }
+ numHitsRaw = (int)(numHitsRaw * rawHitsFactor);
+ if (LOG.isInfoEnabled()) {
+ LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
+ }
+ hits = searcher.search(optQuery, numHitsRaw, maxHitsPerDup, dedupField, sortField, reverse);
+ if (LOG.isInfoEnabled()) {
+ LOG.info("found "+hits.getTotal()+" raw hits");
+ }
+ rawHitNum = -1;
+ continue;
+ }
+
+ Hit hit = hits.getHit(rawHitNum);
+ if (seen.contains(hit))
+ continue;
+ seen.add(hit);
+
+ // get dup hits for its value
+ String value = hit.getDedupValue();
+ DupHits dupHits = (DupHits)dupToHits.get(value);
+ if (dupHits == null)
+ dupToHits.put(value, dupHits = new DupHits());
+
+ // does this hit exceed maxHitsPerDup?
+ if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit
+ if (!dupHits.maxSizeExceeded) {
+
+ // mark prior hits with moreFromDupExcluded
+ for (int i = 0; i < dupHits.size(); i++) {
+ ((Hit)dupHits.get(i)).setMoreFromDupExcluded(true);
+ }
+ dupHits.maxSizeExceeded = true;
+
+ excludedValues.add(value); // exclude dup
+ }
+ totalIsExact = false;
+ } else { // no -- collect the hit
+ resultList.add(hit);
+ dupHits.add(hit);
+
+ // are we done?
+ // we need to find one more than asked for, so that we can tell if
+ // there are more hits to be shown
+ if (resultList.size() > numHits)
+ break;
+ }
+ }
+
+ Hits results =
+ new Hits(total,
+ (Hit[])resultList.toArray(new Hit[resultList.size()]));
+ results.setTotalIsExact(totalIsExact);
+ return results;
+ }
+
+
+ public String getExplanation(Query query, Hit hit) throws IOException {
+ return searcher.getExplanation(query, hit);
+ }
+
+ public HitDetails getDetails(Hit hit) throws IOException {
+ return detailer.getDetails(hit);
+ }
+
+ public HitDetails[] getDetails(Hit[] hits) throws IOException {
+ return detailer.getDetails(hits);
+ }
+
+ public Summary getSummary(HitDetails hit, Query query) throws IOException {
+ return summarizer.getSummary(hit, query);
+ }
+
+ public Summary[] getSummary(HitDetails[] hits, Query query)
+ throws IOException {
+ return summarizer.getSummary(hits, query);
+ }
+
+ public byte[] getContent(HitDetails hit) throws IOException {
+ return content.getContent(hit);
+ }
+
+ public ParseData getParseData(HitDetails hit) throws IOException {
+ return content.getParseData(hit);
+ }
+
+ public ParseText getParseText(HitDetails hit) throws IOException {
+ return content.getParseText(hit);
+ }
+
+ public String[] getAnchors(HitDetails hit) throws IOException {
+ return linkDb.getAnchors(hit);
+ }
+
+ public Inlinks getInlinks(HitDetails hit) throws IOException {
+ return linkDb.getInlinks(hit);
+ }
+
+ public long getFetchDate(HitDetails hit) throws IOException {
+ return content.getFetchDate(hit);
+ }
+
+ /**
+  * Closes the content source, the searcher, the link database and the
+  * file system, in that order.
+  *
+  * Every resource gets its close attempt even when an earlier one
+  * fails, so a failure cannot leave the later resources open.
+  *
+  * @throws IOException if closing any of the resources fails
+  */
+ public void close() throws IOException {
+   try {
+     if (content != null) { content.close(); }
+   } finally {
+     try {
+       if (searcher != null) { searcher.close(); }
+     } finally {
+       try {
+         if (linkDb != null) { linkDb.close(); }
+       } finally {
+         if (fs != null) { fs.close(); }
+       }
+     }
+   }
+ }
+
+ /** For debugging. */
+ public static void main(String[] args) throws Exception {
+ String usage = "NutchBean query";
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
+
+ Configuration conf = NutchConfiguration.create();
+ NutchBean bean = new NutchBean(conf);
+ Query query = Query.parse(args[0], conf);
+ Hits hits = bean.search(query, 10);
+ System.out.println("Total hits: " + hits.getTotal());
+ int length = (int)Math.min(hits.getTotal(), 10);
+ Hit[] show = hits.getHits(0, length);
+ HitDetails[] details = bean.getDetails(show);
+ Summary[] summaries = bean.getSummary(details, query);
+
+ for (int i = 0; i < hits.getLength(); i++) {
+ System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]);
+ }
+ }
+
+ public long getProtocolVersion(String className, long arg1) throws IOException {
+ if(DistributedSearch.Protocol.class.getName().equals(className)){
+ return 1;
+ } else {
+ throw new IOException("Unknown Protocol classname:" + className);
+ }
+ }
+
+ /** Responsible for constructing a NutchBean singleton instance and
+ * caching it in the servlet context. This class should be registered in
+ * the deployment descriptor as a listener
+ */
+ public static class NutchBeanConstructor implements ServletContextListener {
+
+ public void contextDestroyed(ServletContextEvent sce) { }
+
+ public void contextInitialized(ServletContextEvent sce) {
+ ServletContext app = sce.getServletContext();
+ Configuration conf = NutchConfiguration.get(app);
+
+ LOG.info("creating new bean");
+ NutchBean bean = null;
+ try {
+ bean = new NutchBean(conf);
+ app.setAttribute(KEY, bean);
+ }
+ catch (IOException ex) {
+ LOG.error(StringUtils.stringifyException(ex));
+ }
+ }
+ }
+
+}
Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java
===================================================================
--- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java (rev 0)
+++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java 2010-03-24 01:08:53 UTC (rev 3005)
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Closeable;
+
+/** Service that searches; it is also Closeable (Hadoop's org.apache.hadoop.io.Closeable). */
+public interface Searcher extends Closeable {
+ /** Return the top-scoring hits for a query; parameter notes below are assumptions, as no implementation is visible here. */
+ Hits search(Query query, int numHits, // numHits: presumably the maximum number of hits to return - confirm
+ int maxHitsPerDup, // presumably caps how many hits may share one dedupField value - confirm
+ String dedupField, // presumably the field whose value identifies duplicates - confirm
+ String sortField, boolean reverse) // presumably the field to sort on and whether to invert the order - confirm
+ throws IOException;
+
+ /** Return an HTML-formatted explanation of how a query scored for the given hit. */
+ String getExplanation(Query query, Hit hit) throws IOException;
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <al...@us...> - 2010-03-24 01:01:33
|
Revision: 3004
http://archive-access.svn.sourceforge.net/archive-access/?rev=3004&view=rev
Author: alexoz
Date: 2010-03-24 01:01:04 +0000 (Wed, 24 Mar 2010)
Log Message:
-----------
Rename the embargo field to 'applies after X seconds since capture' to try to make it clearer that this causes a rule to not apply until the embargo period ends. To set up an embargo, create an unconditional block rule and then relax it with an allow rule that has seconds since capture set.
Modified Paths:
--------------
trunk/archive-access/projects/access-control/oracle/src/main/webapp/WEB-INF/views/list_rules.jsp
Modified: trunk/archive-access/projects/access-control/oracle/src/main/webapp/WEB-INF/views/list_rules.jsp
===================================================================
--- trunk/archive-access/projects/access-control/oracle/src/main/webapp/WEB-INF/views/list_rules.jsp 2010-03-24 00:36:08 UTC (rev 3003)
+++ trunk/archive-access/projects/access-control/oracle/src/main/webapp/WEB-INF/views/list_rules.jsp 2010-03-24 01:01:04 UTC (rev 3004)
@@ -67,10 +67,10 @@
to <input name="retrievalEnd" id="retrievalEnd"
value="<fmt:formatDate value="${rule.rule.retrievalEnd }" type="both" pattern="yyyy-MM-dd HH:mm:ss"/>" /></p>
- <p><label for="secondsSinceCapture">Seconds since
- capture (embargo):</label> <input name="secondsSinceCapture"
+ <p><label for="secondsSinceCapture">Applies after
+ </label> <input name="secondsSinceCapture"
id="secondsSinceCapture"
- value="<c:out value="${rule.rule.secondsSinceCapture }"/>" /></p>
+ value="<c:out value="${rule.rule.secondsSinceCapture }"/>" /> seconds since capture</p>
<p><label for="policy">Policy:</label> <input name="policy"
id="policy" value="<c:out value="${rule.rule.policy}"/>" /></p>
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2010-03-24 00:36:16
|
Revision: 3003
http://archive-access.svn.sourceforge.net/archive-access/?rev=3003&view=rev
Author: binzino
Date: 2010-03-24 00:36:08 +0000 (Wed, 24 Mar 2010)
Log Message:
-----------
Create branch for hacks fixing JIRA ARI-2260.
Added Paths:
-----------
tags/nutchwax-0_12_9-JIRA-ARI-2260/
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-24 00:20:44
|
Revision: 3002
http://archive-access.svn.sourceforge.net/archive-access/?rev=3002&view=rev
Author: bradtofel
Date: 2010-03-24 00:20:31 +0000 (Wed, 24 Mar 2010)
Log Message:
-----------
TWEAK: added simplistic PerformanceLogger
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java
Added Paths:
-----------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2010-03-23 23:46:12 UTC (rev 3001)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2010-03-24 00:20:31 UTC (rev 3002)
@@ -424,7 +424,9 @@
throws IOException, ServletException, WaybackException {
Resource resource = null;
try {
+ PerformanceLogger p = new PerformanceLogger("replay");
SearchResults results = collection.getResourceIndex().query(wbRequest);
+ p.queried();
if(!(results instanceof CaptureSearchResults)) {
throw new ResourceNotAvailableException("Bad results...");
}
@@ -434,9 +436,12 @@
CaptureSearchResult closest = captureResults.getClosest(wbRequest,
useAnchorWindow);
resource = collection.getResourceStore().retrieveResource(closest);
+ p.retrieved();
ReplayRenderer renderer = replay.getRenderer(wbRequest, closest, resource);
renderer.renderResource(httpRequest, httpResponse, wbRequest,
closest, resource, uriConverter, captureResults);
+ p.rendered();
+ p.write(wbRequest.getReplayTimestamp() + " " + wbRequest.getRequestUrl());
} finally {
if(resource != null) {
resource.close();
@@ -448,7 +453,9 @@
HttpServletRequest httpRequest, HttpServletResponse httpResponse)
throws ServletException, IOException, WaybackException {
+ PerformanceLogger p = new PerformanceLogger("query");
SearchResults results = collection.getResourceIndex().query(wbRequest);
+ p.queried();
if(results instanceof CaptureSearchResults) {
CaptureSearchResults cResults = (CaptureSearchResults) results;
cResults.markClosest(wbRequest);
@@ -462,6 +469,8 @@
} else {
throw new WaybackException("Unknown index format");
}
+ p.rendered();
+ p.write(wbRequest.getRequestUrl());
}
/**
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java 2010-03-24 00:20:31 UTC (rev 3002)
@@ -0,0 +1,71 @@
+/* PerformanceLogger
+ *
+ * $Id$:
+ *
+ * Created on Mar 19, 2010.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+package org.archive.wayback.webapp;
+
+import org.apache.log4j.Logger;
+
+/**
+ * Records millisecond timestamps for the query, retrieve and render phases of one request and logs the elapsed intervals at debug level.
+ * @author brad
+ */
+public class PerformanceLogger {
+ private static final Logger LOGGER = Logger.getLogger(
+ PerformanceLogger.class.getName());
+
+ private static char delim = '\t'; // field separator used in the emitted log line
+
+ private String type = null; // label written as the first field of the log line
+ private long start = 0; // set in the constructor
+ private long query = 0; // set by queried()
+ private long retrieve = -1; // -1 is a sentinel meaning retrieved() was never called
+ private long render = 0; // set by rendered()
+ public PerformanceLogger(String type) { // captures the start time immediately on construction
+ this.type = type;
+ this.start = System.currentTimeMillis();
+ }
+ public void queried() { // marks the end of the query phase
+ this.query = System.currentTimeMillis();
+ }
+ public void retrieved() { // marks the end of the retrieve phase; optional, see write()
+ this.retrieve = System.currentTimeMillis();
+ }
+ public void rendered() { // marks the end of the render phase
+ this.render = System.currentTimeMillis();
+ }
+ public void write(String info) { // emits: type, query-start, [retrieve-query,] render-previous, info - separated by delim
+ StringBuilder sb = new StringBuilder(40);
+ sb.append(type).append(delim);
+ sb.append(query - start).append(delim); // time spent querying
+ if(retrieve == -1) { // no retrieve phase recorded: render time is measured from the end of the query
+ sb.append(render - query).append(delim); // negative if rendered() was never called, since render stays 0
+ } else {
+ sb.append(retrieve - query).append(delim); // time spent retrieving
+ sb.append(render - retrieve).append(delim); // time spent rendering
+ }
+ sb.append(info);
+ LOGGER.debug(sb.toString()); // the line is built before this call regardless of whether debug logging is enabled
+ }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 3001
http://archive-access.svn.sourceforge.net/archive-access/?rev=3001&view=rev
Author: bradtofel
Date: 2010-03-23 23:46:12 +0000 (Tue, 23 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): was not setting STRICT_REMARKS to false - causing problems with many web pages using <!--- ----> and such.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2010-03-23 23:45:07 UTC (rev 3000)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2010-03-23 23:46:12 UTC (rev 3001)
@@ -123,8 +123,9 @@
// and finally, parse, using the special lexer that knows how to
// handle javascript blocks containing unescaped HTML entities:
Page lexPage = new Page(resource,charSet);
- ContextAwareLexer lex = new ContextAwareLexer(new Lexer(lexPage),
- context);
+ Lexer lexer = new Lexer(lexPage);
+ Lexer.STRICT_REMARKS = false;
+ ContextAwareLexer lex = new ContextAwareLexer(lexer, context);
Node node;
try {
while((node = lex.nextNode()) != null) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-23 23:45:15
|
Revision: 3000
http://archive-access.svn.sourceforge.net/archive-access/?rev=3000&view=rev
Author: bradtofel
Date: 2010-03-23 23:45:07 +0000 (Tue, 23 Mar 2010)
Log Message:
-----------
INTERFACE: now passing AccessPoint reference into PathRequestParsers
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000)
@@ -31,6 +31,7 @@
import org.archive.wayback.requestparser.BaseRequestParser;
import org.archive.wayback.requestparser.PathRequestParser;
import org.archive.wayback.util.Timestamp;
+import org.archive.wayback.webapp.AccessPoint;
/**
* RequestParser implementation that extracts request info from an Archival Url
@@ -54,7 +55,7 @@
private final static Pattern WB_QUERY_REGEX = Pattern
.compile("^(\\d{0,13})\\*/(.*[^*])$");
- public WaybackRequest parse(String requestPath) {
+ public WaybackRequest parse(String requestPath, AccessPoint ap) {
WaybackRequest wbRequest = null;
Matcher matcher = WB_QUERY_REGEX.matcher(requestPath);
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000)
@@ -31,6 +31,7 @@
import org.archive.wayback.requestparser.BaseRequestParser;
import org.archive.wayback.requestparser.PathRequestParser;
import org.archive.wayback.util.Timestamp;
+import org.archive.wayback.webapp.AccessPoint;
/**
* RequestParser implementation that extracts request info from an Archival Url
@@ -56,7 +57,7 @@
.compile("^(\\d{1,14})-(\\d{1,14})\\*/(.*[^*])$");
- public WaybackRequest parse(String requestPath) {
+ public WaybackRequest parse(String requestPath, AccessPoint ap) {
WaybackRequest wbRequest = null;
Matcher matcher = WB_QUERY2_REGEX.matcher(requestPath);
if (matcher != null && matcher.matches()) {
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000)
@@ -31,6 +31,7 @@
import org.archive.wayback.requestparser.BaseRequestParser;
import org.archive.wayback.requestparser.PathRequestParser;
import org.archive.wayback.util.Timestamp;
+import org.archive.wayback.webapp.AccessPoint;
/**
* RequestParser implementation that extracts request info from an Archival Url
@@ -54,7 +55,7 @@
private final static Pattern WB_PATH_QUERY_REGEX = Pattern
.compile("^(\\d{0,13})\\*/(.*)\\*$");
- public WaybackRequest parse(String requestPath) {
+ public WaybackRequest parse(String requestPath, AccessPoint ap) {
WaybackRequest wbRequest = null;
Matcher matcher = WB_PATH_QUERY_REGEX.matcher(requestPath);
if (matcher != null && matcher.matches()) {
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000)
@@ -31,6 +31,7 @@
import org.archive.wayback.requestparser.BaseRequestParser;
import org.archive.wayback.requestparser.PathRequestParser;
import org.archive.wayback.util.Timestamp;
+import org.archive.wayback.webapp.AccessPoint;
/**
* RequestParser implementation that extracts request info from an Archival Url
@@ -54,7 +55,7 @@
private final static Pattern WB_PATH_QUERY2_REGEX = Pattern
.compile("^(\\d{1,14})-(\\d{1,14})\\*/(.*)\\*$");
- public WaybackRequest parse(String requestPath) {
+ public WaybackRequest parse(String requestPath, AccessPoint ap) {
WaybackRequest wbRequest = null;
Matcher matcher = WB_PATH_QUERY2_REGEX.matcher(requestPath);
if (matcher != null && matcher.matches()) {
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000)
@@ -27,11 +27,17 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.commons.httpclient.URIException;
+import org.archive.net.UURIFactory;
+import org.archive.wayback.ResultURIConverter;
import org.archive.wayback.archivalurl.ArchivalUrlRequestParser;
import org.archive.wayback.core.WaybackRequest;
+import org.archive.wayback.exception.BetterRequestException;
import org.archive.wayback.requestparser.BaseRequestParser;
import org.archive.wayback.requestparser.PathRequestParser;
import org.archive.wayback.util.Timestamp;
+import org.archive.wayback.util.url.UrlOperations;
+import org.archive.wayback.webapp.AccessPoint;
/**
* RequestParser implementation that extracts request info from a Replay
@@ -55,7 +61,8 @@
super(wrapped);
}
- public WaybackRequest parse(String requestPath) {
+ public WaybackRequest parse(String requestPath, AccessPoint ap)
+ throws BetterRequestException {
WaybackRequest wbRequest = null;
Matcher matcher = WB_REQUEST_REGEX.matcher(requestPath);
String urlStr = null;
@@ -105,6 +112,33 @@
wbRequest.setReplayRequest();
wbRequest.setRequestUrl(urlStr);
+ } else {
+ // see if the remainder looks like an URL:
+// String scheme = UrlOperations.urlToScheme(requestPath);
+// if(scheme != null) {
+// // lets interpret this as a replay request missing the
+// // timestamp: use "NOW"
+// String nowTS = Timestamp.currentTimestamp().getDateStr();
+// ResultURIConverter conv = ap.getUriConverter();
+//
+// String betterURI = conv.makeReplayURI(nowTS, requestPath);
+// throw new BetterRequestException(betterURI);
+// } else {
+// // not obviously an URL... see if UURI can handle it:
+// String httpUrl = UrlOperations.HTTP_SCHEME + requestPath;
+// try {
+// UURIFactory.getInstance(httpUrl);
+// // that worked. use httpUrl:
+// String nowTS = Timestamp.currentTimestamp().getDateStr();
+// ResultURIConverter conv = ap.getUriConverter();
+//
+// String betterURI = conv.makeReplayURI(nowTS, requestPath);
+// throw new BetterRequestException(betterURI);
+// } catch (URIException e) {
+// // oh well. lets just fail:
+// }
+// }
+
}
return wbRequest;
}
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000)
@@ -28,6 +28,7 @@
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.BadQueryException;
+import org.archive.wayback.exception.BetterRequestException;
import org.archive.wayback.webapp.AccessPoint;
/**
@@ -48,17 +49,21 @@
/**
* @param requestPath
+ * @param acessPoint
* @return WaybackRequest with information parsed from the requestPath, or
* null if information could not be extracted.
+ * @throws BetterRequestException
*/
- public abstract WaybackRequest parse(String requestPath);
+ public abstract WaybackRequest parse(String requestPath,
+ AccessPoint acessPoint) throws BetterRequestException;
/* (non-Javadoc)
* @see org.archive.wayback.requestparser.BaseRequestParser#parse(javax.servlet.http.HttpServletRequest, org.archive.wayback.webapp.WaybackContext)
*/
@Override
public WaybackRequest parse(HttpServletRequest httpRequest,
- AccessPoint wbContext) throws BadQueryException {
+ AccessPoint acessPoint)
+ throws BadQueryException, BetterRequestException {
String queryString = httpRequest.getQueryString();
String origRequestPath = httpRequest.getRequestURI();
@@ -66,13 +71,13 @@
if (queryString != null) {
origRequestPath += "?" + queryString;
}
- String contextPath = wbContext.getContextPath(httpRequest);
+ String contextPath = acessPoint.getContextPath(httpRequest);
if (!origRequestPath.startsWith(contextPath)) {
return null;
}
String requestPath = origRequestPath.substring(contextPath.length());
- WaybackRequest wbRequest = parse(requestPath);
+ WaybackRequest wbRequest = parse(requestPath, acessPoint);
if(wbRequest != null) {
wbRequest.setResultsPerPage(getMaxRecords());
}
Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java 2010-03-23 23:40:36 UTC (rev 2999)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java 2010-03-23 23:45:07 UTC (rev 3000)
@@ -26,7 +26,9 @@
import org.archive.wayback.archivalurl.ArchivalUrlRequestParser;
import org.archive.wayback.core.WaybackRequest;
+import org.archive.wayback.exception.BetterRequestException;
import org.archive.wayback.requestparser.BaseRequestParser;
+import org.archive.wayback.webapp.AccessPoint;
import junit.framework.TestCase;
@@ -41,87 +43,89 @@
/**
* Test method for {@link org.archive.wayback.archivalurl.requestparser.ReplayRequestParser#parse(java.lang.String)}.
+ * @throws BetterRequestException
*/
- public void testParseString() {
+ public void testParseString() throws BetterRequestException {
BaseRequestParser wrapped = new ArchivalUrlRequestParser();
ReplayRequestParser p = new ReplayRequestParser(wrapped);
WaybackRequest r;
- r = p.parse("");
+ AccessPoint ap = null;
+ r = p.parse("",ap);
assertNull("Should not parse empty string", r);
- r = p.parse("20070101000000/foo.com");
+ r = p.parse("20070101000000/foo.com",ap);
assertNotNull("Should parse legit request sans scheme", r);
assertEquals("parsed request Url",r.getRequestUrl(),"http://foo.com");
assertEquals("Parsed timestamp","20070101000000",r.getReplayTimestamp());
- r = p.parse("20070101000000/foo.com/");
+ r = p.parse("20070101000000/foo.com/",ap);
assertEquals("parsed request Url, maintaining trailing slash",
"http://foo.com/",r.getRequestUrl());
- r = p.parse("200701010000/foo.com");
+ r = p.parse("200701010000/foo.com",ap);
assertEquals("parsed partial date",
"http://foo.com",r.getRequestUrl());
assertEquals("Parsed partial timestamp to earliest",
"20070101000000",r.getReplayTimestamp());
- r = p.parse("20070101000000/http://foo.com");
+ r = p.parse("20070101000000/http://foo.com",ap);
assertEquals("parsed request Url with scheme",
"http://foo.com",r.getRequestUrl());
- r = p.parse("20070101000000/http://foo.com/");
+ r = p.parse("20070101000000/http://foo.com/",ap);
assertEquals("parsed request Url with scheme and trailing slash",
"http://foo.com/",r.getRequestUrl());
- r = p.parse("20070101000000/ftp://foo.com/");
+ r = p.parse("20070101000000/ftp://foo.com/",ap);
assertEquals("parsed request Url with ftp scheme",
"ftp://foo.com/",r.getRequestUrl());
- r = p.parse("20070101000000/https://foo.com/");
+ r = p.parse("20070101000000/https://foo.com/",ap);
assertEquals("parsed request Url with https scheme",
"https://foo.com/",r.getRequestUrl());
- r = p.parse("20070101000000js_/http://foo.com/");
+ r = p.parse("20070101000000js_/http://foo.com/",ap);
assertEquals("parsed request Url with js_ flag",
"http://foo.com/",r.getRequestUrl());
assertTrue("parsed js_ flag",r.isJSContext());
assertFalse("css not set",r.isCSSContext());
- r = p.parse("20070101000000cs_/http://foo.com/");
+ r = p.parse("20070101000000cs_/http://foo.com/",ap);
assertEquals("parsed request Url with cs_ flag",
"http://foo.com/",r.getRequestUrl());
assertTrue("parsed cs_ flag",r.isCSSContext());
assertFalse("js not set",r.isJSContext());
- r = p.parse("20070101000000cs_js_/http://foo.com/");
+ r = p.parse("20070101000000cs_js_/http://foo.com/",ap);
assertEquals("parsed request Url with cs_ and js_ flags",
"http://foo.com/",r.getRequestUrl());
assertTrue("parsed cs_ flag",r.isCSSContext());
assertTrue("parsed js_ flag",r.isJSContext());
- r = p.parse("20070101000000js_cs_/http://foo.com/");
+ r = p.parse("20070101000000js_cs_/http://foo.com/",ap);
assertEquals("parsed request Url with cs_ and js_ flags, backvards",
"http://foo.com/",r.getRequestUrl());
assertTrue("parsed cs_ flag",r.isCSSContext());
assertTrue("parsed js_ flag",r.isJSContext());
- r = p.parse("20070101000000un_/http://foo.com/");
+ r = p.parse("20070101000000un_/http://foo.com/",ap);
assertEquals("parsed request Url with unknown flag",
"http://foo.com/",r.getRequestUrl());
assertFalse("no cs_ flag",r.isCSSContext());
assertFalse("no js_ flag",r.isJSContext());
- r = p.parse("20070101000000un_js_cs_/http://foo.com/");
+ r = p.parse("20070101000000un_js_cs_/http://foo.com/",ap);
assertEquals("parsed request Url with falgs and unknown flag",
"http://foo.com/",r.getRequestUrl());
assertTrue("parsed cs_ flag",r.isCSSContext());
assertTrue("parsed js_ flag",r.isJSContext());
- r = p.parse("20070101000000js_cs_un_/http://foo.com/");
+ r = p.parse("20070101000000js_cs_un_/http://foo.com/",ap);
assertEquals("parsed request Url with falgs and unknown flag at end",
"http://foo.com/",r.getRequestUrl());
assertTrue("parsed cs_ flag",r.isCSSContext());
assertTrue("parsed js_ flag",r.isJSContext());
- r = p.parse("20070101000000un_js_cs_un_/http://foo.com/");
+ r = p.parse("20070101000000un_js_cs_un_/http://foo.com/",ap);
assertEquals("parsed request Url with falgs and unknown flags",
"http://foo.com/",r.getRequestUrl());
assertTrue("parsed cs_ flag",r.isCSSContext());
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2999
http://archive-access.svn.sourceforge.net/archive-access/?rev=2999&view=rev
Author: bradtofel
Date: 2010-03-23 23:40:36 +0000 (Tue, 23 Mar 2010)
Log Message:
-----------
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-03-23 23:39:21 UTC (rev 2998)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-03-23 23:40:36 UTC (rev 2999)
@@ -62,6 +62,9 @@
assertEquals("foo.com",UrlOperations.urlToHost("http://foo.com/path:/"));
assertEquals("foo.com",UrlOperations.urlToHost("https://foo.com/path:/"));
assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com/path:/"));
+ assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com\\"));
+ assertEquals("www.foo.com",UrlOperations.urlToHost("http://www.foo.com\\"));
+ assertEquals("www.foo.com",UrlOperations.urlToHost("http://www.foo.com:80\\"));
}
public void testResolveUrl() {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2998
http://archive-access.svn.sourceforge.net/archive-access/?rev=2998&view=rev
Author: bradtofel
Date: 2010-03-23 23:39:21 +0000 (Tue, 23 Mar 2010)
Log Message:
-----------
Added test for braces
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2010-03-23 23:38:10 UTC (rev 2997)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2010-03-23 23:39:21 UTC (rev 2998)
@@ -131,7 +131,11 @@
// unescape '%' (%25)
checkCanonicalization("foo.com/pa%25th","foo.com/pa%th");
+ //"http://wayback.archive-it.org/1726/20091231154920cs_/http://alumni.creighton.edu/atf/cf/%257B82F49357-B0BC-48DA-B47F-5701CAC6EDFE%257D/MENU-CSSPLAY.css"
+ checkCanonicalization("foo.com/{a}b","foo.com/%7Ba%7Db");
+ checkCanonicalization("foo.com/%7Ba%7Db","foo.com/%7Ba%7Db");
+
// replace escaped ' ' with '+' in path, unescape legal '!' in path
// no change in query escaping
checkCanonicalization("foo.com/pa%20t%21h?a%20a=b","foo.com/pa+t!h?a%20a=b");
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2997
http://archive-access.svn.sourceforge.net/archive-access/?rev=2997&view=rev
Author: bradtofel
Date: 2010-03-23 23:38:10 +0000 (Tue, 23 Mar 2010)
Log Message:
-----------
Added test for extra escaping regression
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java 2010-03-23 23:36:35 UTC (rev 2996)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java 2010-03-23 23:38:10 UTC (rev 2997)
@@ -67,6 +67,10 @@
pc.contextualizeUrl("/../../image/1.html#REF"));
assertEquals("http://base.com/image/1.html#REF FOO",
pc.contextualizeUrl("/../../image/1.html#REF FOO"));
+ assertEquals("http://base.com/image/foo?boo=baz",
+ pc.contextualizeUrl("/image/foo?boo=baz"));
+ assertEquals("http://base.com/image/foo?boo=baz%3A&gar=war",
+ pc.contextualizeUrl("/image/foo?boo=baz%3A&gar=war"));
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2996
http://archive-access.svn.sourceforge.net/archive-access/?rev=2996&view=rev
Author: bradtofel
Date: 2010-03-23 23:36:35 +0000 (Tue, 23 Mar 2010)
Log Message:
-----------
Added Paths:
-----------
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java
Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java 2010-03-23 23:36:35 UTC (rev 2996)
@@ -0,0 +1,59 @@
+/* RobotRulesTest
+ *
+ * $Id$:
+ *
+ * Created on Jan 15, 2010.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+package org.archive.wayback.accesscontrol.robotstxt;
+
+import java.io.ByteArrayInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+/**
+ * @author brad
+ *
+ */
+public class RobotRulesTest extends TestCase {
+
+ /**
+ * Test method for {@link org.archive.wayback.accesscontrol.robotstxt.RobotRules#blocksPathForUA(java.lang.String, java.lang.String)}.
+ */
+ public void testBlocksPathForUA() {
+ String testString = "User-agent: *\nDisallow:\n";
+ RobotRules rr = new RobotRules();
+ try {
+ rr.parse(new ByteArrayInputStream(testString.getBytes()));
+ assertFalse(rr.hasSyntaxErrors());
+ assertFalse(rr.blocksPathForUA("/", "ia_archiver"));
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ } catch (IOException e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2995
http://archive-access.svn.sourceforge.net/archive-access/?rev=2995&view=rev
Author: bradtofel
Date: 2010-03-20 01:21:00 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
LOGGING: added logging when unable to access a ResourceFile
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2010-03-20 01:19:20 UTC (rev 2994)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2010-03-20 01:21:00 UTC (rev 2995)
@@ -25,6 +25,7 @@
package org.archive.wayback.resourcestore;
import java.io.IOException;
+import java.util.logging.Logger;
import org.archive.wayback.ResourceStore;
import org.archive.wayback.core.Resource;
@@ -45,6 +46,8 @@
*/
public class SimpleResourceStore implements ResourceStore {
+ private final static Logger LOGGER = Logger.getLogger(
+ SimpleResourceStore.class.getName());
private String prefix = null;
public Resource retrieveResource(CaptureSearchResult result)
@@ -71,7 +74,7 @@
r = ResourceFactory.getResource(fileUrl, offset);
} catch (IOException e) {
-
+ LOGGER.warning("Unable to retrieve:" + fileUrl + ":" + offset);
e.printStackTrace();
throw new ResourceNotAvailableException("Unable to retrieve",
e.getLocalizedMessage());
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2994
http://archive-access.svn.sourceforge.net/archive-access/?rev=2994&view=rev
Author: bradtofel
Date: 2010-03-20 01:19:20 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): was not actually caching a robots.txt correctly, causing MANY robots.txt requests.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-03-20 01:16:21 UTC (rev 2993)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-03-20 01:19:20 UTC (rev 2994)
@@ -99,17 +99,29 @@
private String hostToRobotUrlString(String host) {
sb.setLength(0);
sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX);
- return sb.toString();
+ String robotUrl = sb.toString();
+ LOGGER.fine("Adding robot URL:" + robotUrl);
+ return robotUrl;
}
/*
- * Return a List of all robots.txt urls to attempt for this url:
- * If originalURL starts with "www.DOMAIN":
- * [originalURL,DOMAIN]
- * If url starts with "www[0-9]+.DOMAIN":
- * [originalURL,www.DOMAIN,DOMAIN]
+ * Return a List of all robots.txt urls to attempt for this HOST:
+ * If HOST starts with "www.DOMAIN":
+ * [
+ * http://HOST/robots.txt,
+ * http://DOMAIN/robots.txt
+ * ]
+ * If HOST starts with "www[0-9]+.DOMAIN":
+ * [
+ * http://HOST/robots.txt,
+ * http://www.DOMAIN/robots.txt,
+ * http://DOMAIN/robots.txt
+ * ]
* Otherwise:
- * [originalURL,www.originalURL]
+ * [
+ * http://HOST/robots.txt,
+ * http://www.HOST/robots.txt
+ * ]
*/
protected List<String> searchResultToRobotUrlStrings(String resultHost) {
ArrayList<String> list = new ArrayList<String>();
@@ -135,22 +147,41 @@
private RobotRules getRules(CaptureSearchResult result) {
RobotRules rules = null;
RobotRules tmpRules = null;
- String host = result.getOriginalHost();
+ String host;
+ try {
+ host = result.getOriginalHost();
+ } catch(Exception e) {
+ LOGGER.warning("ROBOT: Failed to get host from("+result.getOriginalUrl()+")");
+ return null;
+ }
List<String> urlStrings = searchResultToRobotUrlStrings(host);
Iterator<String> itr = urlStrings.iterator();
String firstUrlString = null;
-
+// StringBuilder sb = new StringBuilder();
+// for(String ttt : urlStrings) {
+// sb.append("RU(").append(ttt).append(")");
+// }
+// LOGGER.info("RobotUrls for("+host+")"+sb.toString());
+ // loop through them all. As soon as we get a response, store that
+ // in the cache for the FIRST url we tried and return it..
+ // If we get no responses for any of the robot URLs, use "empty" rules,
+ // and record that in the cache, too.
+
while(rules == null && itr.hasNext()) {
String urlString = (String) itr.next();
if(firstUrlString == null) {
firstUrlString = urlString;
}
if(rulesCache.containsKey(urlString)) {
- LOGGER.fine("ROBOT: Cached("+urlString+")");
+ LOGGER.info("ROBOT: Cached("+urlString+")");
rules = rulesCache.get(urlString);
+ if(!urlString.equals(firstUrlString)) {
+ LOGGER.info("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
+ rulesCache.put(firstUrlString, rules);
+ }
} else {
try {
- LOGGER.fine("ROBOT: NotCached("+urlString+")");
+ LOGGER.info("ROBOT: NotCached("+urlString+")");
tmpRules = new RobotRules();
Resource resource = webCache.getCachedResource(new URL(urlString),
@@ -165,18 +196,19 @@
LOGGER.info("ROBOT: Downloaded("+urlString+")");
} catch (LiveDocumentNotAvailableException e) {
- // cache an empty rule: all OK
-// rulesCache.put(firstUrlString, emptyRules);
-// rules = emptyRules;
- continue;
+ LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")");
+
} catch (MalformedURLException e) {
e.printStackTrace();
+ LOGGER.info("ROBOT: MalformedURLException("+urlString+")");
return null;
} catch (IOException e) {
- e.printStackTrace();
+ e.printStackTrace(System.err);
+ LOGGER.info("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
return null;
} catch (LiveWebCacheUnavailableException e) {
e.printStackTrace();
+ LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
return null;
}
}
@@ -185,6 +217,7 @@
// special-case, allow empty rules if no longer available.
rulesCache.put(firstUrlString,emptyRules);
rules = emptyRules;
+ LOGGER.info("No rules available, using emptyRules for:" + firstUrlString);
}
return rules;
}
@@ -203,6 +236,7 @@
url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL));
if(!rules.blocksPathForUA(url.getPath(), userAgent)) {
filterResult = ObjectFilter.FILTER_INCLUDE;
+ LOGGER.fine("ROBOT: ALLOWED("+resultURL+")");
} else {
LOGGER.info("ROBOT: BLOCKED("+resultURL+")");
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2993
http://archive-access.svn.sourceforge.net/archive-access/?rev=2993&view=rev
Author: bradtofel
Date: 2010-03-20 01:16:21 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
LOGGING: toned down logging level for a message
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java 2010-03-20 01:15:37 UTC (rev 2992)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java 2010-03-20 01:16:21 UTC (rev 2993)
@@ -61,7 +61,7 @@
if(nextSearch == null) {
break;
}
- LOGGER.info("EXCLUSION-MAP:Checking " + nextSearch);
+ LOGGER.trace("EXCLUSION-MAP:Checking " + nextSearch);
if(exclusionMap.containsKey(nextSearch)) {
LOGGER.info("EXCLUSION-MAP: EXCLUDED: \"" + nextSearch + "\" (" + url +")");
return true;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2992
http://archive-access.svn.sourceforge.net/archive-access/?rev=2992&view=rev
Author: bradtofel
Date: 2010-03-20 01:15:37 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
LOGGING: improved logging
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java 2010-03-20 01:14:08 UTC (rev 2991)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java 2010-03-20 01:15:37 UTC (rev 2992)
@@ -83,6 +83,8 @@
lastUpdated = -1;
currentMap = null;
e.printStackTrace();
+ LOGGER.error("Reload " + file.getAbsolutePath() + " FAILED:" +
+ e.getLocalizedMessage());
}
}
protected Map<String,Object> loadFile(String path) throws IOException {
@@ -97,7 +99,7 @@
}
String surt = line.startsWith("(") ? line :
SURTTokenizer.prefixKey(line);
- LOGGER.info("EXCLUSION-MAP: adding " + surt);
+ LOGGER.trace("EXCLUSION-MAP: adding " + surt);
newMap.put(surt, null);
}
itr.close();
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-20 01:14:14
|
Revision: 2991
http://archive-access.svn.sourceforge.net/archive-access/?rev=2991&view=rev
Author: bradtofel
Date: 2010-03-20 01:14:08 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
FEATURE: actually tries to divine whether a stream is chunked or not before setting the chunked input stream wrapper.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2010-03-20 01:11:51 UTC (rev 2990)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2010-03-20 01:14:08 UTC (rev 2991)
@@ -66,8 +66,49 @@
*/
public void setChunkedEncoding() throws IOException {
validate();
- is = new ChunkedInputStream(is);
+ // peek ahead and make sure we have a line with hex numbers:
+ int max = 50;
+ is.mark(max+2);
+ int cur = 0;
+ boolean isChunked = false;
+ while(cur < max) {
+ int nextC = is.read();
+ if(nextC == 10) {
+ // must have read at least 1 hex char:
+ if(cur > 0) {
+ nextC = is.read();
+ if(nextC == 13) {
+ isChunked = true;
+ break;
+ }
+ }
+ } else {
+ // better be a hex character:
+ if(!isHex(nextC)) {
+ break;
+ }
+ }
+ cur++;
+ }
+ is.reset();
+ if(isChunked) {
+ is = new ChunkedInputStream(is);
+ }
}
+
+ private boolean isHex(int c) {
+ if((c >= '0') && (c <= '9')) {
+ return true;
+ }
+ if((c >= 'a') && (c <= 'f')) {
+ return true;
+ }
+ if((c >= 'A') && (c <= 'F')) {
+ return true;
+ }
+ return false;
+ }
+
/**
* @return
* @throws IOException
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-20 01:11:59
|
Revision: 2990
http://archive-access.svn.sourceforge.net/archive-access/?rev=2990&view=rev
Author: bradtofel
Date: 2010-03-20 01:11:51 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
INITIAL REV: tests for 2 transformers.
Added Paths:
-----------
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java
Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java 2010-03-20 01:11:51 UTC (rev 2990)
@@ -0,0 +1,82 @@
+/* JSStringTransformerTest
+ *
+ * $Id$:
+ *
+ * Created on Dec 10, 2009.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+package org.archive.wayback.replay.html.transformer;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+
+import org.archive.wayback.replay.html.ContextResultURIConverterFactory;
+import org.archive.wayback.replay.html.ReplayParseContext;
+
+import junit.framework.TestCase;
+
+/**
+ * @author brad
+ *
+ */
+public class JSStringTransformerTest extends TestCase {
+
+ /**
+ * Test method for {@link org.archive.wayback.replay.html.transformer.JSStringTransformer#transform(org.archive.wayback.replay.html.ReplayParseContext, java.lang.String)}.
+ * @throws MalformedURLException
+ */
+ public void testTransform() throws MalformedURLException {
+ RecordingReplayParseContext rc = new RecordingReplayParseContext(null, new URL("http://foo.com/"), null);
+ String input = "'<a href=\'http://www.gavelgrab.org\' target=\'_blank\'>Learn more in Gavel Grab</a>'";
+ JSStringTransformer jst = new JSStringTransformer();
+ jst.transform(rc, input);
+ assertEquals(1,rc.got.size());
+ assertEquals("http://www.gavelgrab.org",rc.got.get(0));
+
+ input = "'<a href=\'http://www.gavelgrab.org/foobla/blah\' target=\'_blank\'>Learn more in Gavel Grab</a>'";
+ rc = new RecordingReplayParseContext(null, new URL("http://foo.com/"), null);
+ jst.transform(rc, input);
+ assertEquals(1,rc.got.size());
+ assertEquals("http://www.gavelgrab.org",rc.got.get(0));
+
+ }
+ public class RecordingReplayParseContext extends ReplayParseContext {
+ ArrayList<String> got = null;
+ /**
+ * @param uriConverterFactory
+ * @param baseUrl
+ * @param datespec
+ */
+ public RecordingReplayParseContext(
+ ContextResultURIConverterFactory uriConverterFactory,
+ URL baseUrl, String datespec) {
+ super(uriConverterFactory, baseUrl, datespec);
+ got = new ArrayList<String>();
+ // TODO Auto-generated constructor stub
+ }
+ public String contextualizeUrl(String url) {
+ got.add(url);
+ return url;
+ }
+
+ }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java 2010-03-20 01:11:51 UTC (rev 2990)
@@ -0,0 +1,64 @@
+/* MetaRefreshUrlStringTransformerTest
+ *
+ * $Id$:
+ *
+ * Created on Jan 12, 2010.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+package org.archive.wayback.replay.html.transformer;
+
+import junit.framework.TestCase;
+
+/**
+ * @author brad
+ *
+ */
+public class MetaRefreshUrlStringTransformerTest extends TestCase {
+
+ /**
+ * Test method for {@link org.archive.wayback.replay.html.transformer.MetaRefreshUrlStringTransformer#transform(org.archive.wayback.replay.html.ReplayParseContext, java.lang.String)}.
+ */
+ public void testTransform() {
+// cmpT("0; url=http://foo.com/bar","0; url=(((http://foo.com/bar)))");
+// cmpT("0; url=/bar","0; url=(((/bar)))");
+// cmpT("0; url =/bar","0; url =(((/bar)))");
+// cmpT("0; url =/bar","0; url =(((/bar)))");
+// cmpT("; url =/bar","; url =/bar");
+// cmpT("0; URL =/bar","0; URL =(((/bar)))");
+//
+// cmpT("0; URL = /bar","0; URL = (((/bar)))");
+// cmpT("0; URL = /bar ","0; URL = (((/bar))) ");
+// cmpT("0; URL = /bar ","0; URL = (((/bar))) ");
+// cmpT("0; URL = /baz foo","0; URL = (((/baz foo)))");
+// cmpT("0; URL = /baz foo ","0; URL = (((/baz foo))) ");
+// cmpT("0; URL=/baz foo ","0; URL=(((/baz foo))) ");
+//
+// cmpT("0; UrL=/baz foo ","0; UrL=(((/baz foo))) ");
+// cmpT("0; UrL=/baZefoo ","0; UrL=(((/baZefoo))) ");
+
+ }
+ private void cmpT(String source, String want) {
+ MetaRefreshUrlStringTransformer m = new MetaRefreshUrlStringTransformer();
+ String got = m.transform(null,source);
+ assertEquals(want, got);
+ }
+
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2989
http://archive-access.svn.sourceforge.net/archive-access/?rev=2989&view=rev
Author: bradtofel
Date: 2010-03-20 01:11:18 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
FEATURE: transformer for META refresh tags
Added Paths:
-----------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java 2010-03-20 01:11:18 UTC (rev 2989)
@@ -0,0 +1,78 @@
+/* MetaRefreshUrlStringTransformer
+ *
+ * $Id$:
+ *
+ * Created on Jan 12, 2010.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+package org.archive.wayback.replay.html.transformer;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.archive.wayback.replay.html.ReplayParseContext;
+import org.archive.wayback.replay.html.StringTransformer;
+
+/**
+ * @author brad
+ *
+ */
+public class MetaRefreshUrlStringTransformer extends URLStringTransformer
+implements StringTransformer {
+
+ private final static Pattern refreshURLPattern =
+ Pattern.compile("^\\d+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$",
+ Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+
+ /* (non-Javadoc)
+ * @see org.archive.wayback.replay.html.StringTransformer#transform(org.archive.wayback.replay.html.ReplayParseContext, java.lang.String)
+ */
+ public String transform(ReplayParseContext context, String input) {
+ /*
+ <META
+ HTTP-EQUIV="Refresh"
+ CONTENT="0; URL=/ics/default.asp">
+
+ Our argument "input" is set to the value of the "CONTENT" attribute.
+
+ So, we need to search for the "URL=", take everything to the right
+ of that, trim it, contextualize it, and return that.
+ */
+ Matcher m = refreshURLPattern.matcher(input);
+ if(m.matches()) {
+ if(m.groupCount() == 1) {
+ StringBuilder sb = new StringBuilder(input.length() * 2);
+
+ sb.append(input.substring(0,m.start(1)));
+
+ sb.append(super.transform(context, m.group(1)));
+
+ // This was temporarily used for testing the regex:
+// sb.append("(((").append(m.group(1)).append(")))");
+
+ sb.append(input.substring(m.end(1)));
+ return sb.toString();
+ }
+ }
+ return input;
+ }
+
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-20 01:05:45
|
Revision: 2988
http://archive-access.svn.sourceforge.net/archive-access/?rev=2988&view=rev
Author: bradtofel
Date: 2010-03-20 01:05:39 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): replaced URL to host processing with REGEX, to better handle URLs with freakish illegal characters before the port/path start.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-03-20 01:02:49 UTC (rev 2987)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-03-20 01:05:39 UTC (rev 2988)
@@ -92,6 +92,9 @@
Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" +
"(" + IP_PATTERN + ")");
+ private static final Pattern AUTHORITY_REGEX_SIMPLE =
+ Pattern.compile("([0-9a-z_.-]++)");
+
/**
* @param urlPart
* @return boolean indicating whether urlPart might be an Authority.
@@ -186,22 +189,11 @@
for(String scheme : ALL_SCHEMES) {
if(url.startsWith(scheme)) {
int hostIdx = scheme.length();
- int portIdx = url.indexOf(PORT_SEPARATOR, hostIdx + 1);
- int pathIdx = url.indexOf(PATH_START, hostIdx + 1);
- if(portIdx == -1 && pathIdx == -1) {
- return url.substring(hostIdx);
+
+ Matcher m = AUTHORITY_REGEX_SIMPLE.matcher(url.substring(hostIdx));
+ if(m.find()) {
+ return m.group(0);
}
- if(portIdx == -1) {
- return url.substring(hostIdx,pathIdx);
- }
- if(pathIdx == -1) {
- return url.substring(hostIdx,portIdx);
- }
- if(pathIdx > portIdx) {
- return url.substring(hostIdx,portIdx);
- } else {
- return url.substring(hostIdx,pathIdx);
- }
}
}
return url;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-20 01:03:00
|
Revision: 2987
http://archive-access.svn.sourceforge.net/archive-access/?rev=2987&view=rev
Author: bradtofel
Date: 2010-03-20 01:02:49 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): now returns closeable iterators, so filehandles/sockets can be cleaned up.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2010-03-20 01:00:50 UTC (rev 2986)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2010-03-20 01:02:49 UTC (rev 2987)
@@ -180,7 +180,7 @@
* @return Iterator for records beggining with key
* @throws IOException
*/
- public Iterator<String> getRecordIterator(final String prefix) throws IOException {
+ public CloseableIterator<String> getRecordIterator(final String prefix) throws IOException {
RecordIterator itr = null;
RandomAccessFile raf = new RandomAccessFile(file,"r");
long offset = findKeyOffset(raf,prefix);
@@ -190,7 +190,7 @@
return itr;
}
- public Iterator<String> getRecordIteratorLT(final String prefix) throws IOException {
+ public CloseableIterator<String> getRecordIteratorLT(final String prefix) throws IOException {
RecordIterator itr = null;
RandomAccessFile raf = new RandomAccessFile(file,"r");
long offset = findKeyOffsetLT(raf,prefix);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-20 01:00:57
|
Revision: 2986
http://archive-access.svn.sourceforge.net/archive-access/?rev=2986&view=rev
Author: bradtofel
Date: 2010-03-20 01:00:50 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): was not using correct resolve method, causing extra level of escaping on all GET arguments.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-03-20 00:59:42 UTC (rev 2985)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-03-20 01:00:50 UTC (rev 2986)
@@ -100,7 +100,7 @@
url = url.substring(0,hashIdx);
}
try {
- return baseUrl.resolve(url).toString() + frag;
+ return baseUrl.resolve(url,true).toString() + frag;
} catch (URIException e) {
e.printStackTrace();
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2985
http://archive-access.svn.sourceforge.net/archive-access/?rev=2985&view=rev
Author: bradtofel
Date: 2010-03-20 00:59:42 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
BUGFIX: now closes iterators so open filehandles don't stack up.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-03-20 00:58:14 UTC (rev 2984)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-03-20 00:59:42 UTC (rev 2985)
@@ -91,7 +91,7 @@
public void init() throws IOException {
chunkMap = new HashMap<String, String>();
FlatFile ff = new FlatFile(chunkMapPath);
- Iterator<String> lines = ff.getSequentialIterator();
+ CloseableIterator<String> lines = ff.getSequentialIterator();
while(lines.hasNext()) {
String line = lines.next();
String[] parts = line.split("\\s");
@@ -101,6 +101,7 @@
}
chunkMap.put(parts[0],parts[1]);
}
+ lines.close();
chunkIndex = new FlatFile(chunkIndexPath);
}
protected CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr)
@@ -130,7 +131,7 @@
}
public Iterator<String> getStringPrefixIterator(String prefix) throws ResourceIndexNotAvailableException, IOException {
- Iterator<String> itr = chunkIndex.getRecordIteratorLT(prefix);
+ CloseableIterator<String> itr = chunkIndex.getRecordIteratorLT(prefix);
ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>();
boolean first = true;
while(itr.hasNext()) {
@@ -161,6 +162,7 @@
long offset = Long.parseLong(parts[2]);
blocks.add(new ZiplinedBlock(url, offset));
}
+ itr.close();
return new StringPrefixIterator(new ZiplinesChunkIterator(blocks),prefix);
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
Revision: 2984
http://archive-access.svn.sourceforge.net/archive-access/?rev=2984&view=rev
Author: bradtofel
Date: 2010-03-20 00:58:14 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): checks that filters are not null; a null filter likely indicates a situation where no results can be returned anyway.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java 2010-03-19 21:18:47 UTC (rev 2983)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java 2010-03-20 00:58:14 UTC (rev 2984)
@@ -55,7 +55,11 @@
public int filterObject(CaptureSearchResult r) {
Iterator<ObjectFilter<CaptureSearchResult>> itr = filters.iterator();
while(itr.hasNext()) {
- int result = itr.next().filterObject(r);
+ ObjectFilter<CaptureSearchResult> filter = itr.next();
+ if(filter == null) {
+ return FILTER_EXCLUDE;
+ }
+ int result = filter.filterObject(r);
if(result != FILTER_INCLUDE) {
return result;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2010-03-19 21:18:55
|
Revision: 2983
http://archive-access.svn.sourceforge.net/archive-access/?rev=2983&view=rev
Author: bradtofel
Date: 2010-03-19 21:18:47 +0000 (Fri, 19 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): adding http:// if missing for server-relative redirect
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java 2010-03-19 02:19:23 UTC (rev 2982)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java 2010-03-19 21:18:47 UTC (rev 2983)
@@ -39,6 +39,7 @@
import org.apache.log4j.Logger;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
+import org.archive.util.ArchiveUtils;
import org.archive.wayback.exception.ConfigurationException;
import org.archive.wayback.util.url.UrlOperations;
@@ -131,7 +132,8 @@
int thirdSlash = remainder.indexOf('/');
if(thirdSlash > -1) {
String datespec = remainder.substring(0,thirdSlash);
- String url = remainder.substring(thirdSlash+1);
+ String url = ArchiveUtils.addImpliedHttpIfNecessary(
+ remainder.substring(thirdSlash+1));
String thisPath = httpRequest.getRequestURI();
String queryString = httpRequest.getQueryString();
if (queryString != null) {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2010-03-19 02:19:29
|
Revision: 2982
http://archive-access.svn.sourceforge.net/archive-access/?rev=2982&view=rev
Author: binzino
Date: 2010-03-19 02:19:23 +0000 (Fri, 19 Mar 2010)
Log Message:
-----------
Put back the accidentally removed line that configures the 'digest' metadata field for indexing.
Modified Paths:
--------------
tags/nutchwax-0_13/archive/src/nutch/conf/nutch-site.xml
Modified: tags/nutchwax-0_13/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- tags/nutchwax-0_13/archive/src/nutch/conf/nutch-site.xml 2010-03-18 23:05:40 UTC (rev 2981)
+++ tags/nutchwax-0_13/archive/src/nutch/conf/nutch-site.xml 2010-03-19 02:19:23 UTC (rev 2982)
@@ -48,6 +48,7 @@
site:false:false:untokenized
url:false:true:tokenized
+ digest:false:true:no
collection:true:true:no_norms
date:true:true:no_norms
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bi...@us...> - 2010-03-18 23:05:50
|
Revision: 2981
http://archive-access.svn.sourceforge.net/archive-access/?rev=2981&view=rev
Author: binzino
Date: 2010-03-18 23:05:40 +0000 (Thu, 18 Mar 2010)
Log Message:
-----------
NutchWAX 0.13 release tag/branch.
Added Paths:
-----------
tags/nutchwax-0_13/
tags/nutchwax-0_13/archive/
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|