You can subscribe to this list here.
2005 | Jan | Feb | Mar | Apr | May | Jun | Jul (1) | Aug (10) | Sep (36) | Oct (339) | Nov (103) | Dec (152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 | Jan (141) | Feb (102) | Mar (125) | Apr (203) | May (57) | Jun (30) | Jul (139) | Aug (46) | Sep (64) | Oct (105) | Nov (34) | Dec (162) |
2007 | Jan (81) | Feb (57) | Mar (141) | Apr (72) | May (9) | Jun (1) | Jul (144) | Aug (88) | Sep (40) | Oct (43) | Nov (34) | Dec (20) |
2008 | Jan (44) | Feb (45) | Mar (16) | Apr (36) | May (8) | Jun (77) | Jul (177) | Aug (66) | Sep (8) | Oct (33) | Nov (13) | Dec (37) |
2009 | Jan (2) | Feb (5) | Mar (8) | Apr | May (36) | Jun (19) | Jul (46) | Aug (8) | Sep (1) | Oct (66) | Nov (61) | Dec (10) |
2010 | Jan (13) | Feb (16) | Mar (38) | Apr (76) | May (47) | Jun (32) | Jul (35) | Aug (45) | Sep (20) | Oct (61) | Nov (24) | Dec (16) |
2011 | Jan (22) | Feb (34) | Mar (11) | Apr (8) | May (24) | Jun (23) | Jul (11) | Aug (42) | Sep (81) | Oct (48) | Nov (21) | Dec (20) |
2012 | Jan (30) | Feb (25) | Mar (4) | Apr (6) | May (1) | Jun (5) | Jul (5) | Aug (8) | Sep (6) | Oct (6) | Nov | Dec |
From: <bi...@us...> - 2010-03-24 01:09:00
|
Revision: 3005 http://archive-access.svn.sourceforge.net/archive-access/?rev=3005&view=rev Author: binzino Date: 2010-03-24 01:08:53 +0000 (Wed, 24 Mar 2010) Log Message: ----------- Various hacks for ARI-2260. Modified Paths: -------------- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java Added Paths: ----------- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java =================================================================== --- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java (rev 0) +++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/CollapsingHitCollector.java 2010-03-24 01:08:53 UTC (rev 3005) @@ -0,0 +1,246 @@ +package org.apache.nutch.searcher; + +import java.io.*; +import java.util.*; + +import org.apache.lucene.search.TopDocCollector; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; + +public class CollapsingHitCollector extends TopDocCollector +{ + public static final Comparator<Hit> SCORE_COMPARATOR = new Comparator<Hit>( ) + { + public int compare( Hit h1, Hit h2 ) + { + if ( h1.score < h2.score ) return -1; + if ( h1.score > h2.score ) return 1; + + // must be equal + return 0; + } + }; + + public static final Comparator<Hit> 
SITE_COMPARATOR_PARTIAL = new Comparator<Hit>( ) + { + public int compare( Hit h1, Hit h2 ) + { + return String.CASE_INSENSITIVE_ORDER.compare( h1.site, h2.site ); + } + }; + + public static final Comparator<Hit> SITE_COMPARATOR_TOTAL = new Comparator<Hit>( ) + { + public int compare( Hit h1, Hit h2 ) + { + return String.CASE_INSENSITIVE_ORDER.compare( h1.site, h2.site ); + } + }; + + public static class Hit + { + int id; + float score; + String site; + + public Hit( int id, float score, String site ) + { + this.id = id; + this.score = score; + this.site = site; + } + + public int compareTo( Hit that ) + { + if ( this.score < that.score ) return -1; + if ( this.score > that.score ) return 1; + + if ( this.id < that.id ) return -1; + if ( this.id > that.id ) return 1; + + return 0; + } + } + + public org.apache.lucene.search.Searcher searcher; + public int numHits; + public int hitsPerSite; + + final Hit[] sortedByScore; + final Hit[] sortedBySite; + final Hit candidate; + + int numUncollapsedHits = 0; + + public CollapsingHitCollector( org.apache.lucene.search.Searcher searcher, int numHits, int hitsPerSite ) + { + super( numHits ); + this.searcher = searcher; + this.numHits = numHits; + this.hitsPerSite = hitsPerSite; + + this.sortedByScore = new Hit[numHits]; + this.sortedBySite = new Hit[numHits]; + + for ( int i = 0; i < numHits; i++ ) + { + Hit sd = new Hit( -1, Float.NEGATIVE_INFINITY, "" ); + this.sortedByScore[i] = sd; + this.sortedBySite [i] = sd; + } + + this.candidate = new Hit( -1, Float.NEGATIVE_INFINITY, "" ); + + } + + public void collect( int doc, float score ) + { + this.numUncollapsedHits++; + + this.candidate.id = doc; + this.candidate.score = score; + + if ( this.candidate.score <= this.sortedByScore[0].score ) + { + return ; + } + + try + { + String url = this.searcher.doc( this.candidate.id ).get( "url"); + try + { + java.net.URL u = new java.net.URL( url ); + + this.candidate.site = u.getHost(); + } + catch ( java.net.MalformedURLException e 
) { } + } + catch ( IOException ioe ) { throw new RuntimeException( ioe ); } + + // Use "" rather than null to keep searching and sorting simple. + if ( this.candidate.site == null ) this.candidate.site = ""; + + int sitePos = findReplacementPosition( candidate ); + + // No existing hit to be replaced, so we replace the overall + // lowest-scoring one, which is always in position 0 in the + // sortedByScore list. + if ( sitePos < 0 ) + { + this.sortedByScore[0].id = candidate.id; + this.sortedByScore[0].score = candidate.score; + this.sortedByScore[0].site = candidate.site; + + // Since we just added a new site, re-sort them. + Arrays.sort( this.sortedByScore, SCORE_COMPARATOR ); + + // No need to re-sort the sites if not collapsing. + if ( this.hitsPerSite != 0 ) + { + Arrays.sort( this.sortedBySite, SITE_COMPARATOR_TOTAL ); + } + + // Done! + return ; + } + + // We have an existing Hit from the same site which can be + // replaced *if* the candidate's score is better. + if ( candidate.score > this.sortedBySite[sitePos].score ) + { + this.sortedBySite[sitePos].id = this.candidate.id; + this.sortedBySite[sitePos].score = this.candidate.score; + + // We have to re-sort by scores. + Arrays.sort( this.sortedByScore, SCORE_COMPARATOR ); + + // If our hitsPerSite > 1, then we have to re-sort by site to + // ensure that the hit we just inserted is put into the proper + // sorted position within the site group. If hitsPerSite==1, + // then the group size == 1 and therefore no need to re-sort. 
+ if ( this.hitsPerSite > 1 ) + { + Arrays.sort( this.sortedBySite, SITE_COMPARATOR_TOTAL ); + } + } + } + + private int findReplacementPosition( Hit candidate ) + { + if ( this.hitsPerSite == 0 ) return -1; + + int pos = Arrays.binarySearch( this.sortedBySite, candidate, SITE_COMPARATOR_PARTIAL ); + + if ( pos < 0 || this.hitsPerSite == 1 ) return pos; + + int i = pos, j = pos; + + final int mini = 0, maxj = this.sortedBySite.length - 1; + + for ( ; i > mini && SITE_COMPARATOR_PARTIAL.compare( this.sortedBySite[i], this.sortedBySite[i-1] ) == 0; i-- ) + ; + + for ( ; j < maxj && SITE_COMPARATOR_PARTIAL.compare( this.sortedBySite[i], this.sortedBySite[j+1] ) == 0; j++ ) + ; + + // The number of hits from this site is (j-i+1), so if we are less + // than the max number of hits per site, then we return -1 to + // indicate there is still room for more Hits from the candidate + // site. + if ( (j - i + 1) < this.hitsPerSite ) return -1; + + // Otherwise, the Hit to be potentially replaced is the lowest + // scoring hit, which is the one at position i. 
+ return i; + } + + public Hit[] getHits() + { + Hit[] hits = new Hit[this.getNumHits( )]; + + final int sortedByScoreEndPos = this.sortedByScore.length - 1; + for ( int i = 0; i < hits.length ; i++ ) + { + hits[i] = this.sortedByScore[ sortedByScoreEndPos - i ]; + } + + return hits; + } + + public int getNumHits( ) + { + for ( int i = this.sortedByScore.length - this.numHits ; i < this.sortedByScore.length ; i++ ) + { + if ( this.sortedByScore[i].score != Float.NEGATIVE_INFINITY ) + { + return this.sortedByScore.length - i; + } + } + return 0; + } + + public int getTotalHits() + { + if ( this.hitsPerSite == 0 ) return this.numUncollapsedHits; + + int numCollapsedHits = getNumHits( ); + + if ( numCollapsedHits < this.numHits ) + { + return numCollapsedHits; + } + return this.numUncollapsedHits; + } + + public TopDocs topDocs() + { + Hit[] hits = this.getHits( ); + ScoreDoc[] sd = new ScoreDoc[hits.length]; + for ( int i = 0 ; i < hits.length ; i++ ) + { + sd[i] = new ScoreDoc( hits[i].id, hits[i].score ); + } + return new TopDocs( getTotalHits(), sd, hits[hits.length-1].score ); + } +} \ No newline at end of file Modified: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java =================================================================== --- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java 2010-03-24 01:01:04 UTC (rev 3004) +++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java 2010-03-24 01:08:53 UTC (rev 3005) @@ -254,7 +254,9 @@ } public Hits search(final Query query, final int numHits, - final String dedupField, final String sortField, + final int maxHitsPerDup, + final String dedupField, + final String sortField, final boolean reverse) throws IOException { // Get the list of live servers. 
It would be nice to build this // list in updateSegments(), but that would create concurrency issues. @@ -282,8 +284,9 @@ params[i][0] = query; params[i][1] = new Integer(numHits); params[i][2] = dedupField; - params[i][3] = sortField; - params[i][4] = Boolean.valueOf(reverse); + params[i][3] = maxHitsPerDup; + params[i][4] = sortField; + params[i][5] = Boolean.valueOf(reverse); } Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses, this.conf); @@ -439,7 +442,7 @@ Client client = new Client(addresses, NutchConfiguration.create()); //client.setTimeout(Integer.MAX_VALUE); - Hits hits = client.search(query, 10, null, null, false); + Hits hits = client.search(query, 10, 0, null, null, false); System.out.println("Total hits: " + hits.getTotal()); for (int i = 0; i < hits.getLength(); i++) { System.out.println(" "+i+" "+ client.getDetails(hits.getHit(i))); Modified: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java =================================================================== --- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2010-03-24 01:01:04 UTC (rev 3004) +++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2010-03-24 01:08:53 UTC (rev 3005) @@ -90,7 +90,9 @@ } public Hits search(Query query, int numHits, - String dedupField, String sortField, boolean reverse) + int maxHitsPerDup, + String dedupField, + String sortField, boolean reverse) throws IOException { org.apache.lucene.search.BooleanQuery luceneQuery = @@ -100,7 +102,7 @@ System.out.println( "Lucene query: " + luceneQuery ); return translateHits - (optimizer.optimize(luceneQuery, luceneSearcher, numHits, + (optimizer.optimize(luceneQuery, luceneSearcher, numHits, maxHitsPerDup, sortField, reverse), dedupField, sortField); } Added: 
tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java =================================================================== --- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (rev 0) +++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 2010-03-24 01:08:53 UTC (rev 3005) @@ -0,0 +1,278 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.QueryFilter; +import org.apache.lucene.search.*; +import org.apache.lucene.index.Term; +import org.apache.lucene.misc.ChainedFilter; + +import org.apache.hadoop.conf.Configuration; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.ArrayList; + +import java.io.IOException; + +/** Utility which converts certain query clauses into {@link QueryFilter}s and + * caches these. Only required clauses whose boost is zero are converted to + * cached filters. Range queries are converted to range filters. 
This + * accellerates query constraints like date, language, document format, etc., + * which do not affect ranking but might otherwise slow search considerably. */ +class LuceneQueryOptimizer { + + // This thread provides a pseudo-clock service to all searching + // threads, so that they can count elapsed time with less overhead than + // repeatedly calling System.currentTimeMillis. + private TimerThread timerThread = null; + + private static class TimerThread extends Thread { + private int tick; + // NOTE: we can avoid explicit synchronization here for several reasons: + // * updates to 32-bit-sized variables are atomic + // * only single thread modifies this value + // * use of volatile keyword ensures that it does not reside in + // a register, but in main memory (so that changes are visible to + // other threads). + // * visibility of changes does not need to be instantanous, we can + // afford losing a tick or two. + // + // See section 17 of the Java Language Specification for details. 
+ public volatile int timeCounter = 0; + + boolean running = true; + + public TimerThread(int tick) { + super("LQO timer thread"); + this.tick = tick; + this.setDaemon(true); + } + + public void run() { + while(running) { + timeCounter++; + try { + Thread.sleep(tick); + } catch (InterruptedException ie) {}; + } + } + } + + private void initTimerThread(int p) { + if (timerThread == null || !timerThread.isAlive()) { + timerThread = new TimerThread(p); + timerThread.start(); + } + } + + + private static class TimeExceeded extends RuntimeException { + public long maxTime; + private int maxDoc; + public TimeExceeded(long maxTime, int maxDoc) { + super("Exceeded search time: " + maxTime + " ms."); + this.maxTime = maxTime; + this.maxDoc = maxDoc; + } + } + + private static class LimitedCollector extends TopDocCollector { + private int maxHits; + private int maxTicks; + private int startTicks; + private TimerThread timer; + private int curTicks; + + public LimitedCollector(int numHits, int maxHits, int maxTicks, + TimerThread timer) { + super(numHits); + this.maxHits = maxHits; + this.maxTicks = maxTicks; + if (timer != null) { + this.timer = timer; + this.startTicks = timer.timeCounter; + } + } + + public void collect(int doc, float score) { + if (maxHits > 0 && getTotalHits() >= maxHits) { + throw new LimitExceeded(doc); + } + if (timer != null) { + curTicks = timer.timeCounter; + // overflow check + if (curTicks < startTicks) curTicks += Integer.MAX_VALUE; + if (curTicks - startTicks > maxTicks) { + throw new TimeExceeded(timer.tick * (curTicks - startTicks), doc); + } + } + super.collect(doc, score); + } + } + + private static class LimitExceeded extends RuntimeException { + private int maxDoc; + public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; } + } + + private LinkedHashMap<BooleanQuery, Filter> cache; // an LRU cache of QueryFilter + + private float threshold; + + private int searcherMaxHits; + + private int tickLength; + + private int maxTickCount; + + /** 
+ * Construct an optimizer that caches and uses filters for required clauses + * whose boost is zero. + * + * @param cacheSize + * the number of QueryFilters to cache + * @param threshold + * the fraction of documents which must contain a term + */ + public LuceneQueryOptimizer(Configuration conf) { + final int cacheSize = conf.getInt("searcher.filter.cache.size", 16); + this.threshold = conf.getFloat("searcher.filter.cache.threshold", + 0.05f); + this.searcherMaxHits = conf.getInt("searcher.max.hits", -1); + this.cache = new LinkedHashMap<BooleanQuery, Filter>(cacheSize, 0.75f, true) { + protected boolean removeEldestEntry(Map.Entry eldest) { + return size() > cacheSize; // limit size of cache + } + }; + this.tickLength = conf.getInt("searcher.max.time.tick_length", 200); + this.maxTickCount = conf.getInt("searcher.max.time.tick_count", -1); + if (this.maxTickCount > 0) { + initTimerThread(this.tickLength); + } + } + + public TopDocs optimize(BooleanQuery original, + Searcher searcher, int numHits, int maxHitsPerDup, + String sortField, boolean reverse) + throws IOException { + + BooleanQuery query = new BooleanQuery(); + BooleanQuery cacheQuery = new BooleanQuery(); + BooleanQuery filterQuery = new BooleanQuery(); + ArrayList<Filter> filters = new ArrayList<Filter>(); + + BooleanClause[] clauses = original.getClauses(); + for (int i = 0; i < clauses.length; i++) { + BooleanClause c = clauses[i]; + if (c.isRequired() // required + && c.getQuery().getBoost() == 0.0f) { // boost is zero + + if (c.getQuery() instanceof TermQuery // TermQuery + && (searcher.docFreq(((TermQuery)c.getQuery()).getTerm()) + / (float)searcher.maxDoc()) < threshold) { // beneath threshold + query.add(c); // don't filterize + continue; + } + + if (c.getQuery() instanceof RangeQuery) { // RangeQuery + RangeQuery range = (RangeQuery)c.getQuery(); + boolean inclusive = range.isInclusive();// convert to RangeFilter + Term lower = range.getLowerTerm(); + Term upper = range.getUpperTerm(); + 
filters.add(new RangeFilter(lower!=null?lower.field():upper.field(), + lower != null ? lower.text() : null, + upper != null ? upper.text() : null, + inclusive, inclusive)); + cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it + continue; + } + + // all other query types + filterQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // filter it + cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it + continue; + } + + query.add(c); // query it + } + + Filter filter = null; + if (cacheQuery.getClauses().length != 0) { + synchronized (cache) { // check cache + filter = cache.get(cacheQuery); + } + if (filter == null) { // miss + + if (filterQuery.getClauses().length != 0) // add filterQuery to filters + filters.add(new CachingWrapperFilter(new QueryWrapperFilter(filterQuery))); + + if (filters.size() == 1) { // convert filters to filter + filter = (Filter)filters.get(0); + } else { + filter = new ChainedFilter((Filter[])filters.toArray + (new Filter[filters.size()]), + ChainedFilter.AND); + } + if (!(filter instanceof CachingWrapperFilter)) // make sure bits are cached + filter = new CachingWrapperFilter(filter); + + synchronized (cache) { + cache.put(cacheQuery, filter); // cache the filter + } + } + } + if (sortField == null && !reverse) { + + // no hit limit + if (this.searcherMaxHits <= 0 && timerThread == null) { + // FIXME: Need hitsPerSite value, using '1' to test. 
+ TopDocCollector c = new CollapsingHitCollector( searcher, numHits, maxHitsPerDup ); + searcher.search(query, filter, c ); + return c.topDocs( ); + } + + // hits limited in time or in count -- use a LimitedCollector + LimitedCollector collector = new LimitedCollector(numHits, searcherMaxHits, + maxTickCount, timerThread); + LimitExceeded exceeded = null; + TimeExceeded timeExceeded = null; + try { + searcher.search(query, filter, collector); + } catch (LimitExceeded le) { + exceeded = le; + } catch (TimeExceeded te) { + timeExceeded = te; + } + TopDocs results = collector.topDocs(); + if (exceeded != null) { // limit was exceeded + results.totalHits = (int) // must estimate totalHits + (results.totalHits*(searcher.maxDoc()/(float)exceeded.maxDoc)); + } else if (timeExceeded != null) { + // Estimate total hits. + results.totalHits = (int)(results.totalHits * (searcher.maxDoc()/(float)timeExceeded.maxDoc)); + } + return results; + + } else { + return searcher.search(query, filter, numHits, + new Sort(sortField, reverse)); + } + } +} Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java =================================================================== --- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java (rev 0) +++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java 2010-03-24 01:08:53 UTC (rev 3005) @@ -0,0 +1,434 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.*; +import java.util.*; +import javax.servlet.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.fs.*; +import org.apache.hadoop.io.Closeable; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.parse.*; +import org.apache.nutch.indexer.*; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.util.HadoopFSUtil; +import org.apache.nutch.util.NutchConfiguration; + +/** + * One stop shopping for search-related functionality. + * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $ + */ +public class NutchBean + implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks, + DistributedSearch.Protocol, Closeable { + + public static final Log LOG = LogFactory.getLog(NutchBean.class); + public static final String KEY = "nutchBean"; + +// static { +// LogFormatter.setShowThreadIDs(true); +// } + + private String[] segmentNames; + + private Searcher searcher; + private HitDetailer detailer; + private HitSummarizer summarizer; + private HitContent content; + private HitInlinks linkDb; + + + /** BooleanQuery won't permit more than 32 required/prohibited clauses. We + * don't want to use too many of those. */ + private static final int MAX_PROHIBITED_TERMS = 20; + + private Configuration conf; + + private FileSystem fs; + + /** Returns the cached instance in the servlet context. 
+ * @see NutchBeanConstructor*/ + public static NutchBean get(ServletContext app, Configuration conf) throws IOException { + NutchBean bean = (NutchBean)app.getAttribute(KEY); + return bean; + } + + + /** + * + * @param conf + * @throws IOException + */ + public NutchBean(Configuration conf) throws IOException { + this(conf, null); + } + + /** + * Construct in a named directory. + * @param conf + * @param dir + * @throws IOException + */ + public NutchBean(Configuration conf, Path dir) throws IOException { + this.conf = conf; + this.fs = FileSystem.get(this.conf); + if (dir == null) { + dir = new Path(this.conf.get("searcher.dir", "crawl")); + } + Path servers = new Path(dir, "search-servers.txt"); + if (fs.exists(servers)) { + if (LOG.isInfoEnabled()) { + LOG.info("searching servers in " + servers); + } + init(new DistributedSearch.Client(servers, conf)); + } else { + init(new Path(dir, "index"), new Path(dir, "indexes"), new Path( + dir, "segments"), new Path(dir, "linkdb")); + } + } + + private void init(Path indexDir, Path indexesDir, Path segmentsDir, + Path linkDb) + throws IOException { + IndexSearcher indexSearcher; + if (this.fs.exists(indexDir)) { + if (LOG.isInfoEnabled()) { + LOG.info("opening merged index in " + indexDir); + } + indexSearcher = new IndexSearcher(indexDir, this.conf); + } else { + if (LOG.isInfoEnabled()) { + LOG.info("opening indexes in " + indexesDir); + } + + Vector vDirs=new Vector(); + FileStatus[] fstats = fs.listStatus(indexesDir, + HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path [] directories = HadoopFSUtil.getPaths(fstats); + for(int i = 0; i < directories.length; i++) { + Path indexdone = new Path(directories[i], Indexer.DONE_NAME); + if(fs.isFile(indexdone)) { + vDirs.add(directories[i]); + } + } + + + directories = new Path[ vDirs.size() ]; + for(int i = 0; vDirs.size()>0; i++) { + directories[i]=(Path)vDirs.remove(0); + } + + indexSearcher = new IndexSearcher(directories, this.conf); + } + + if (LOG.isInfoEnabled()) { + 
LOG.info("opening segments in " + segmentsDir); + } + FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.conf); + + this.segmentNames = segments.getSegmentNames(); + + this.searcher = indexSearcher; + this.detailer = indexSearcher; + this.summarizer = segments; + this.content = segments; + + if (LOG.isInfoEnabled()) { LOG.info("opening linkdb in " + linkDb); } + this.linkDb = new LinkDbInlinks(fs, linkDb, this.conf); + } + + private void init(DistributedSearch.Client client) { + this.segmentNames = client.getSegmentNames(); + this.searcher = client; + this.detailer = client; + this.summarizer = client; + this.content = client; + this.linkDb = client; + } + + + public String[] getSegmentNames() { + return segmentNames; + } + + public Hits search(Query query, int numHits) throws IOException { + return search(query, numHits, null, null, false); + } + + public Hits search(Query query, int numHits, + String dedupField, String sortField, boolean reverse) + throws IOException { + + return searcher.search(query, numHits, 0, dedupField, sortField, reverse); + } + + private class DupHits extends ArrayList { + private boolean maxSizeExceeded; + } + + /** Search for pages matching a query, eliminating excessive hits from the + * same site. Hits after the first <code>maxHitsPerDup</code> from the same + * site are removed from results. The remaining hits have {@link + * Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero then all + * hits are returned. + * + * @param query query + * @param numHits number of requested hits + * @param maxHitsPerDup the maximum hits returned with matching values, or zero + * @return Hits the matching hits + * @throws IOException + */ + public Hits search(Query query, int numHits, int maxHitsPerDup) + throws IOException { + return search(query, numHits, maxHitsPerDup, "site", null, false); + } + + /** Search for pages matching a query, eliminating excessive hits with + * matching values for a named field. 
Hits after the first + * <code>maxHitsPerDup</code> are removed from results. The remaining hits + * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero + * then all hits are returned. + * + * @param query query + * @param numHits number of requested hits + * @param maxHitsPerDup the maximum hits returned with matching values, or zero + * @param dedupField field name to check for duplicates + * @return Hits the matching hits + * @throws IOException + */ + public Hits search(Query query, int numHits, + int maxHitsPerDup, String dedupField) + throws IOException { + return search(query, numHits, maxHitsPerDup, dedupField, null, false); + } + /** Search for pages matching a query, eliminating excessive hits with + * matching values for a named field. Hits after the first + * <code>maxHitsPerDup</code> are removed from results. The remaining hits + * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero + * then all hits are returned. + * + * @param query query + * @param numHits number of requested hits + * @param maxHitsPerDup the maximum hits returned with matching values, or zero + * @param dedupField field name to check for duplicates + * @param sortField Field to sort on (or null if no sorting). + * @param reverse True if we are to reverse sort by <code>sortField</code>. 
+ * @return Hits the matching hits + * @throws IOException + */ + public Hits search(Query query, int numHits, + int maxHitsPerDup, + String dedupField, + String sortField, boolean reverse) + throws IOException { + if (maxHitsPerDup <= 0) // disable dup checking + return search(query, numHits, dedupField, sortField, reverse); + + float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f); + int numHitsRaw = (int)(numHits * rawHitsFactor); + if (LOG.isInfoEnabled()) { + LOG.info("searching for "+numHitsRaw+" raw hits"); + } + Hits hits = searcher.search(query, numHitsRaw, maxHitsPerDup, dedupField, sortField, reverse); + long total = hits.getTotal(); + Map dupToHits = new HashMap(); + List resultList = new ArrayList(); + Set seen = new HashSet(); + List excludedValues = new ArrayList(); + boolean totalIsExact = true; + for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) { + // get the next raw hit + if (rawHitNum >= hits.getLength()) { + // optimize query by prohibiting more matches on some excluded values + Query optQuery = (Query)query.clone(); + for (int i = 0; i < excludedValues.size(); i++) { + if (i == MAX_PROHIBITED_TERMS) + break; + optQuery.addProhibitedTerm(((String)excludedValues.get(i)), + dedupField); + } + numHitsRaw = (int)(numHitsRaw * rawHitsFactor); + if (LOG.isInfoEnabled()) { + LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery); + } + hits = searcher.search(optQuery, numHitsRaw, maxHitsPerDup, dedupField, sortField, reverse); + if (LOG.isInfoEnabled()) { + LOG.info("found "+hits.getTotal()+" raw hits"); + } + rawHitNum = -1; + continue; + } + + Hit hit = hits.getHit(rawHitNum); + if (seen.contains(hit)) + continue; + seen.add(hit); + + // get dup hits for its value + String value = hit.getDedupValue(); + DupHits dupHits = (DupHits)dupToHits.get(value); + if (dupHits == null) + dupToHits.put(value, dupHits = new DupHits()); + + // does this hit exceed maxHitsPerDup? 
+ if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit + if (!dupHits.maxSizeExceeded) { + + // mark prior hits with moreFromDupExcluded + for (int i = 0; i < dupHits.size(); i++) { + ((Hit)dupHits.get(i)).setMoreFromDupExcluded(true); + } + dupHits.maxSizeExceeded = true; + + excludedValues.add(value); // exclude dup + } + totalIsExact = false; + } else { // no -- collect the hit + resultList.add(hit); + dupHits.add(hit); + + // are we done? + // we need to find one more than asked for, so that we can tell if + // there are more hits to be shown + if (resultList.size() > numHits) + break; + } + } + + Hits results = + new Hits(total, + (Hit[])resultList.toArray(new Hit[resultList.size()])); + results.setTotalIsExact(totalIsExact); + return results; + } + + + public String getExplanation(Query query, Hit hit) throws IOException { + return searcher.getExplanation(query, hit); + } + + public HitDetails getDetails(Hit hit) throws IOException { + return detailer.getDetails(hit); + } + + public HitDetails[] getDetails(Hit[] hits) throws IOException { + return detailer.getDetails(hits); + } + + public Summary getSummary(HitDetails hit, Query query) throws IOException { + return summarizer.getSummary(hit, query); + } + + public Summary[] getSummary(HitDetails[] hits, Query query) + throws IOException { + return summarizer.getSummary(hits, query); + } + + public byte[] getContent(HitDetails hit) throws IOException { + return content.getContent(hit); + } + + public ParseData getParseData(HitDetails hit) throws IOException { + return content.getParseData(hit); + } + + public ParseText getParseText(HitDetails hit) throws IOException { + return content.getParseText(hit); + } + + public String[] getAnchors(HitDetails hit) throws IOException { + return linkDb.getAnchors(hit); + } + + public Inlinks getInlinks(HitDetails hit) throws IOException { + return linkDb.getInlinks(hit); + } + + public long getFetchDate(HitDetails hit) throws IOException { + return 
content.getFetchDate(hit); + } + + public void close() throws IOException { + if (content != null) { content.close(); } + if (searcher != null) { searcher.close(); } + if (linkDb != null) { linkDb.close(); } + if (fs != null) { fs.close(); } + } + + /** For debugging. */ + public static void main(String[] args) throws Exception { + String usage = "NutchBean query"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + Configuration conf = NutchConfiguration.create(); + NutchBean bean = new NutchBean(conf); + Query query = Query.parse(args[0], conf); + Hits hits = bean.search(query, 10); + System.out.println("Total hits: " + hits.getTotal()); + int length = (int)Math.min(hits.getTotal(), 10); + Hit[] show = hits.getHits(0, length); + HitDetails[] details = bean.getDetails(show); + Summary[] summaries = bean.getSummary(details, query); + + for (int i = 0; i < hits.getLength(); i++) { + System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]); + } + } + + public long getProtocolVersion(String className, long arg1) throws IOException { + if(DistributedSearch.Protocol.class.getName().equals(className)){ + return 1; + } else { + throw new IOException("Unknown Protocol classname:" + className); + } + } + + /** Responsible for constructing a NutchBean singleton instance and + * caching it in the servlet context. 
This class should be registered in + * the deployment descriptor as a listener + */ + public static class NutchBeanConstructor implements ServletContextListener { + + public void contextDestroyed(ServletContextEvent sce) { } + + public void contextInitialized(ServletContextEvent sce) { + ServletContext app = sce.getServletContext(); + Configuration conf = NutchConfiguration.get(app); + + LOG.info("creating new bean"); + NutchBean bean = null; + try { + bean = new NutchBean(conf); + app.setAttribute(KEY, bean); + } + catch (IOException ex) { + LOG.error(StringUtils.stringifyException(ex)); + } + } + } + +} Added: tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java =================================================================== --- tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java (rev 0) +++ tags/nutchwax-0_12_9-JIRA-ARI-2260/archive/src/nutch/src/java/org/apache/nutch/searcher/Searcher.java 2010-03-24 01:08:53 UTC (rev 3005) @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.searcher; + +import java.io.IOException; + +import org.apache.hadoop.io.Closeable; + +/** Service that searches. */ +public interface Searcher extends Closeable { + /** Return the top-scoring hits for a query. */ + Hits search(Query query, int numHits, + int maxHitsPerDup, + String dedupField, + String sortField, boolean reverse) + throws IOException; + + /** Return an HTML-formatted explanation of how a query scored. */ + String getExplanation(Query query, Hit hit) throws IOException; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <al...@us...> - 2010-03-24 01:01:33
|
Revision: 3004 http://archive-access.svn.sourceforge.net/archive-access/?rev=3004&view=rev Author: alexoz Date: 2010-03-24 01:01:04 +0000 (Wed, 24 Mar 2010) Log Message: ----------- Rename the embargo field to 'applies after X seconds since capture' to try to make it clearer this causes a rule to not apply until the embargo period ends. To set up an embargo, create an unconditional block rule and then relax it with an allow that has seconds since capture. Modified Paths: -------------- trunk/archive-access/projects/access-control/oracle/src/main/webapp/WEB-INF/views/list_rules.jsp Modified: trunk/archive-access/projects/access-control/oracle/src/main/webapp/WEB-INF/views/list_rules.jsp =================================================================== --- trunk/archive-access/projects/access-control/oracle/src/main/webapp/WEB-INF/views/list_rules.jsp 2010-03-24 00:36:08 UTC (rev 3003) +++ trunk/archive-access/projects/access-control/oracle/src/main/webapp/WEB-INF/views/list_rules.jsp 2010-03-24 01:01:04 UTC (rev 3004) @@ -67,10 +67,10 @@ to <input name="retrievalEnd" id="retrievalEnd" value="<fmt:formatDate value="${rule.rule.retrievalEnd }" type="both" pattern="yyyy-MM-dd HH:mm:ss"/>" /></p> - <p><label for="secondsSinceCapture">Seconds since - capture (embargo):</label> <input name="secondsSinceCapture" + <p><label for="secondsSinceCapture">Applies after + </label> <input name="secondsSinceCapture" id="secondsSinceCapture" - value="<c:out value="${rule.rule.secondsSinceCapture }"/>" /></p> + value="<c:out value="${rule.rule.secondsSinceCapture }"/>" /> seconds since capture</p> <p><label for="policy">Policy:</label> <input name="policy" id="policy" value="<c:out value="${rule.rule.policy}"/>" /></p> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-03-24 00:36:16
|
Revision: 3003 http://archive-access.svn.sourceforge.net/archive-access/?rev=3003&view=rev Author: binzino Date: 2010-03-24 00:36:08 +0000 (Wed, 24 Mar 2010) Log Message: ----------- Create branch for hacks fixing JIRA ARI-2260. Added Paths: ----------- tags/nutchwax-0_12_9-JIRA-ARI-2260/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-24 00:20:44
|
Revision: 3002 http://archive-access.svn.sourceforge.net/archive-access/?rev=3002&view=rev Author: bradtofel Date: 2010-03-24 00:20:31 +0000 (Wed, 24 Mar 2010) Log Message: ----------- TWEAK: added simplistic PerformanceLogger Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2010-03-23 23:46:12 UTC (rev 3001) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2010-03-24 00:20:31 UTC (rev 3002) @@ -424,7 +424,9 @@ throws IOException, ServletException, WaybackException { Resource resource = null; try { + PerformanceLogger p = new PerformanceLogger("replay"); SearchResults results = collection.getResourceIndex().query(wbRequest); + p.queried(); if(!(results instanceof CaptureSearchResults)) { throw new ResourceNotAvailableException("Bad results..."); } @@ -434,9 +436,12 @@ CaptureSearchResult closest = captureResults.getClosest(wbRequest, useAnchorWindow); resource = collection.getResourceStore().retrieveResource(closest); + p.retrieved(); ReplayRenderer renderer = replay.getRenderer(wbRequest, closest, resource); renderer.renderResource(httpRequest, httpResponse, wbRequest, closest, resource, uriConverter, captureResults); + p.rendered(); + p.write(wbRequest.getReplayTimestamp() + " " + wbRequest.getRequestUrl()); } finally { if(resource != null) { resource.close(); @@ -448,7 +453,9 @@ HttpServletRequest httpRequest, HttpServletResponse httpResponse) throws ServletException, 
IOException, WaybackException { + PerformanceLogger p = new PerformanceLogger("query"); SearchResults results = collection.getResourceIndex().query(wbRequest); + p.queried(); if(results instanceof CaptureSearchResults) { CaptureSearchResults cResults = (CaptureSearchResults) results; cResults.markClosest(wbRequest); @@ -462,6 +469,8 @@ } else { throw new WaybackException("Unknown index format"); } + p.rendered(); + p.write(wbRequest.getRequestUrl()); } /** Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java 2010-03-24 00:20:31 UTC (rev 3002) @@ -0,0 +1,71 @@ +/* PerformanceLogger + * + * $Id$: + * + * Created on Mar 19, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.webapp; + +import org.apache.log4j.Logger; + +/** + * @author brad + * + */ +public class PerformanceLogger { + private static final Logger LOGGER = Logger.getLogger( + PerformanceLogger.class.getName()); + + private static char delim = '\t'; + + private String type = null; + private long start = 0; + private long query = 0; + private long retrieve = -1; + private long render = 0; + public PerformanceLogger(String type) { + this.type = type; + this.start = System.currentTimeMillis(); + } + public void queried() { + this.query = System.currentTimeMillis(); + } + public void retrieved() { + this.retrieve = System.currentTimeMillis(); + } + public void rendered() { + this.render = System.currentTimeMillis(); + } + public void write(String info) { + StringBuilder sb = new StringBuilder(40); + sb.append(type).append(delim); + sb.append(query - start).append(delim); + if(retrieve == -1) { + sb.append(render - query).append(delim); + } else { + sb.append(retrieve - query).append(delim); + sb.append(render - retrieve).append(delim); + } + sb.append(info); + LOGGER.debug(sb.toString()); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/PerformanceLogger.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3001 http://archive-access.svn.sourceforge.net/archive-access/?rev=3001&view=rev Author: bradtofel Date: 2010-03-23 23:46:12 +0000 (Tue, 23 Mar 2010) Log Message: ----------- BUGFIX(unreported): was not setting STRICT_REMARKS to false - causing problems with many web pages using <!--- ----> and such. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2010-03-23 23:45:07 UTC (rev 3000) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2010-03-23 23:46:12 UTC (rev 3001) @@ -123,8 +123,9 @@ // and finally, parse, using the special lexer that knows how to // handle javascript blocks containing unescaped HTML entities: Page lexPage = new Page(resource,charSet); - ContextAwareLexer lex = new ContextAwareLexer(new Lexer(lexPage), - context); + Lexer lexer = new Lexer(lexPage); + Lexer.STRICT_REMARKS = false; + ContextAwareLexer lex = new ContextAwareLexer(lexer, context); Node node; try { while((node = lex.nextNode()) != null) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-23 23:45:15
|
Revision: 3000 http://archive-access.svn.sourceforge.net/archive-access/?rev=3000&view=rev Author: bradtofel Date: 2010-03-23 23:45:07 +0000 (Tue, 23 Mar 2010) Log Message: ----------- INTERFACE: now passing AccessPoint reference into PathRequestParsers Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -31,6 +31,7 @@ import org.archive.wayback.requestparser.BaseRequestParser; import 
org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.webapp.AccessPoint; /** * RequestParser implementation that extracts request info from an Archival Url @@ -54,7 +55,7 @@ private final static Pattern WB_QUERY_REGEX = Pattern .compile("^(\\d{0,13})\\*/(.*[^*])$"); - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) { WaybackRequest wbRequest = null; Matcher matcher = WB_QUERY_REGEX.matcher(requestPath); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -31,6 +31,7 @@ import org.archive.wayback.requestparser.BaseRequestParser; import org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.webapp.AccessPoint; /** * RequestParser implementation that extracts request info from an Archival Url @@ -56,7 +57,7 @@ .compile("^(\\d{1,14})-(\\d{1,14})\\*/(.*[^*])$"); - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) { WaybackRequest wbRequest = null; Matcher matcher = WB_QUERY2_REGEX.matcher(requestPath); if (matcher != null && matcher.matches()) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -31,6 +31,7 @@ import org.archive.wayback.requestparser.BaseRequestParser; import org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.webapp.AccessPoint; /** * RequestParser implementation that extracts request info from an Archival Url @@ -54,7 +55,7 @@ private final static Pattern WB_PATH_QUERY_REGEX = Pattern .compile("^(\\d{0,13})\\*/(.*)\\*$"); - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) { WaybackRequest wbRequest = null; Matcher matcher = WB_PATH_QUERY_REGEX.matcher(requestPath); if (matcher != null && matcher.matches()) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -31,6 +31,7 @@ import org.archive.wayback.requestparser.BaseRequestParser; import org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.webapp.AccessPoint; /** * 
RequestParser implementation that extracts request info from an Archival Url @@ -54,7 +55,7 @@ private final static Pattern WB_PATH_QUERY2_REGEX = Pattern .compile("^(\\d{1,14})-(\\d{1,14})\\*/(.*)\\*$"); - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) { WaybackRequest wbRequest = null; Matcher matcher = WB_PATH_QUERY2_REGEX.matcher(requestPath); if (matcher != null && matcher.matches()) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -27,11 +27,17 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.httpclient.URIException; +import org.archive.net.UURIFactory; +import org.archive.wayback.ResultURIConverter; import org.archive.wayback.archivalurl.ArchivalUrlRequestParser; import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BetterRequestException; import org.archive.wayback.requestparser.BaseRequestParser; import org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.webapp.AccessPoint; /** * RequestParser implementation that extracts request info from a Replay @@ -55,7 +61,8 @@ super(wrapped); } - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) + throws BetterRequestException { WaybackRequest wbRequest = null; Matcher matcher 
= WB_REQUEST_REGEX.matcher(requestPath); String urlStr = null; @@ -105,6 +112,33 @@ wbRequest.setReplayRequest(); wbRequest.setRequestUrl(urlStr); + } else { + // see if the remainder looks like an URL: +// String scheme = UrlOperations.urlToScheme(requestPath); +// if(scheme != null) { +// // lets interpret this as a replay request missing the +// // timestamp: use "NOW" +// String nowTS = Timestamp.currentTimestamp().getDateStr(); +// ResultURIConverter conv = ap.getUriConverter(); +// +// String betterURI = conv.makeReplayURI(nowTS, requestPath); +// throw new BetterRequestException(betterURI); +// } else { +// // not obviously an URL... see if UURI can handle it: +// String httpUrl = UrlOperations.HTTP_SCHEME + requestPath; +// try { +// UURIFactory.getInstance(httpUrl); +// // that worked. use httpUrl: +// String nowTS = Timestamp.currentTimestamp().getDateStr(); +// ResultURIConverter conv = ap.getUriConverter(); +// +// String betterURI = conv.makeReplayURI(nowTS, requestPath); +// throw new BetterRequestException(betterURI); +// } catch (URIException e) { +// // oh well. 
lets just fail: +// } +// } + } return wbRequest; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -28,6 +28,7 @@ import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.BetterRequestException; import org.archive.wayback.webapp.AccessPoint; /** @@ -48,17 +49,21 @@ /** * @param requestPath + * @param acessPoint * @return WaybackRequest with information parsed from the requestPath, or * null if information could not be extracted. + * @throws BetterRequestException */ - public abstract WaybackRequest parse(String requestPath); + public abstract WaybackRequest parse(String requestPath, + AccessPoint acessPoint) throws BetterRequestException; /* (non-Javadoc) * @see org.archive.wayback.requestparser.BaseRequestParser#parse(javax.servlet.http.HttpServletRequest, org.archive.wayback.webapp.WaybackContext) */ @Override public WaybackRequest parse(HttpServletRequest httpRequest, - AccessPoint wbContext) throws BadQueryException { + AccessPoint acessPoint) + throws BadQueryException, BetterRequestException { String queryString = httpRequest.getQueryString(); String origRequestPath = httpRequest.getRequestURI(); @@ -66,13 +71,13 @@ if (queryString != null) { origRequestPath += "?" 
+ queryString; } - String contextPath = wbContext.getContextPath(httpRequest); + String contextPath = acessPoint.getContextPath(httpRequest); if (!origRequestPath.startsWith(contextPath)) { return null; } String requestPath = origRequestPath.substring(contextPath.length()); - WaybackRequest wbRequest = parse(requestPath); + WaybackRequest wbRequest = parse(requestPath, acessPoint); if(wbRequest != null) { wbRequest.setResultsPerPage(getMaxRecords()); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -26,7 +26,9 @@ import org.archive.wayback.archivalurl.ArchivalUrlRequestParser; import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BetterRequestException; import org.archive.wayback.requestparser.BaseRequestParser; +import org.archive.wayback.webapp.AccessPoint; import junit.framework.TestCase; @@ -41,87 +43,89 @@ /** * Test method for {@link org.archive.wayback.archivalurl.requestparser.ReplayRequestParser#parse(java.lang.String)}. 
+ * @throws BetterRequestException */ - public void testParseString() { + public void testParseString() throws BetterRequestException { BaseRequestParser wrapped = new ArchivalUrlRequestParser(); ReplayRequestParser p = new ReplayRequestParser(wrapped); WaybackRequest r; - r = p.parse(""); + AccessPoint ap = null; + r = p.parse("",ap); assertNull("Should not parse empty string", r); - r = p.parse("20070101000000/foo.com"); + r = p.parse("20070101000000/foo.com",ap); assertNotNull("Should parse legit request sans scheme", r); assertEquals("parsed request Url",r.getRequestUrl(),"http://foo.com"); assertEquals("Parsed timestamp","20070101000000",r.getReplayTimestamp()); - r = p.parse("20070101000000/foo.com/"); + r = p.parse("20070101000000/foo.com/",ap); assertEquals("parsed request Url, maintaining trailing slash", "http://foo.com/",r.getRequestUrl()); - r = p.parse("200701010000/foo.com"); + r = p.parse("200701010000/foo.com",ap); assertEquals("parsed partial date", "http://foo.com",r.getRequestUrl()); assertEquals("Parsed partial timestamp to earliest", "20070101000000",r.getReplayTimestamp()); - r = p.parse("20070101000000/http://foo.com"); + r = p.parse("20070101000000/http://foo.com",ap); assertEquals("parsed request Url with scheme", "http://foo.com",r.getRequestUrl()); - r = p.parse("20070101000000/http://foo.com/"); + r = p.parse("20070101000000/http://foo.com/",ap); assertEquals("parsed request Url with scheme and trailing slash", "http://foo.com/",r.getRequestUrl()); - r = p.parse("20070101000000/ftp://foo.com/"); + r = p.parse("20070101000000/ftp://foo.com/",ap); assertEquals("parsed request Url with ftp scheme", "ftp://foo.com/",r.getRequestUrl()); - r = p.parse("20070101000000/https://foo.com/"); + r = p.parse("20070101000000/https://foo.com/",ap); assertEquals("parsed request Url with https scheme", "https://foo.com/",r.getRequestUrl()); - r = p.parse("20070101000000js_/http://foo.com/"); + r = p.parse("20070101000000js_/http://foo.com/",ap); 
assertEquals("parsed request Url with js_ flag", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed js_ flag",r.isJSContext()); assertFalse("css not set",r.isCSSContext()); - r = p.parse("20070101000000cs_/http://foo.com/"); + r = p.parse("20070101000000cs_/http://foo.com/",ap); assertEquals("parsed request Url with cs_ flag", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertFalse("js not set",r.isJSContext()); - r = p.parse("20070101000000cs_js_/http://foo.com/"); + r = p.parse("20070101000000cs_js_/http://foo.com/",ap); assertEquals("parsed request Url with cs_ and js_ flags", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertTrue("parsed js_ flag",r.isJSContext()); - r = p.parse("20070101000000js_cs_/http://foo.com/"); + r = p.parse("20070101000000js_cs_/http://foo.com/",ap); assertEquals("parsed request Url with cs_ and js_ flags, backvards", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertTrue("parsed js_ flag",r.isJSContext()); - r = p.parse("20070101000000un_/http://foo.com/"); + r = p.parse("20070101000000un_/http://foo.com/",ap); assertEquals("parsed request Url with unknown flag", "http://foo.com/",r.getRequestUrl()); assertFalse("no cs_ flag",r.isCSSContext()); assertFalse("no js_ flag",r.isJSContext()); - r = p.parse("20070101000000un_js_cs_/http://foo.com/"); + r = p.parse("20070101000000un_js_cs_/http://foo.com/",ap); assertEquals("parsed request Url with falgs and unknown flag", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertTrue("parsed js_ flag",r.isJSContext()); - r = p.parse("20070101000000js_cs_un_/http://foo.com/"); + r = p.parse("20070101000000js_cs_un_/http://foo.com/",ap); assertEquals("parsed request Url with falgs and unknown flag at end", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertTrue("parsed js_ 
flag",r.isJSContext()); - r = p.parse("20070101000000un_js_cs_un_/http://foo.com/"); + r = p.parse("20070101000000un_js_cs_un_/http://foo.com/",ap); assertEquals("parsed request Url with falgs and unknown flags", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2999 http://archive-access.svn.sourceforge.net/archive-access/?rev=2999&view=rev Author: bradtofel Date: 2010-03-23 23:40:36 +0000 (Tue, 23 Mar 2010) Log Message: ----------- Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-03-23 23:39:21 UTC (rev 2998) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-03-23 23:40:36 UTC (rev 2999) @@ -62,6 +62,9 @@ assertEquals("foo.com",UrlOperations.urlToHost("http://foo.com/path:/")); assertEquals("foo.com",UrlOperations.urlToHost("https://foo.com/path:/")); assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com/path:/")); + assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com\\")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://www.foo.com\\")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://www.foo.com:80\\")); } public void testResolveUrl() { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2998 http://archive-access.svn.sourceforge.net/archive-access/?rev=2998&view=rev Author: bradtofel Date: 2010-03-23 23:39:21 +0000 (Tue, 23 Mar 2010) Log Message: ----------- Added test for braces Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2010-03-23 23:38:10 UTC (rev 2997) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2010-03-23 23:39:21 UTC (rev 2998) @@ -131,7 +131,11 @@ // unescape '%' (%25) checkCanonicalization("foo.com/pa%25th","foo.com/pa%th"); + //"http://wayback.archive-it.org/1726/20091231154920cs_/http://alumni.creighton.edu/atf/cf/%257B82F49357-B0BC-48DA-B47F-5701CAC6EDFE%257D/MENU-CSSPLAY.css" + checkCanonicalization("foo.com/{a}b","foo.com/%7Ba%7Db"); + checkCanonicalization("foo.com/%7Ba%7Db","foo.com/%7Ba%7Db"); + // replace escaped ' ' with '+' in path, unescape legal '!' in path // no change in query escaping checkCanonicalization("foo.com/pa%20t%21h?a%20a=b","foo.com/pa+t!h?a%20a=b"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2997 http://archive-access.svn.sourceforge.net/archive-access/?rev=2997&view=rev Author: bradtofel Date: 2010-03-23 23:38:10 +0000 (Tue, 23 Mar 2010) Log Message: ----------- Added test for extra escaping regression Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java 2010-03-23 23:36:35 UTC (rev 2996) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java 2010-03-23 23:38:10 UTC (rev 2997) @@ -67,6 +67,10 @@ pc.contextualizeUrl("/../../image/1.html#REF")); assertEquals("http://base.com/image/1.html#REF FOO", pc.contextualizeUrl("/../../image/1.html#REF FOO")); + assertEquals("http://base.com/image/foo?boo=baz", + pc.contextualizeUrl("/image/foo?boo=baz")); + assertEquals("http://base.com/image/foo?boo=baz%3A&gar=war", + pc.contextualizeUrl("/image/foo?boo=baz%3A&gar=war")); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2996 http://archive-access.svn.sourceforge.net/archive-access/?rev=2996&view=rev Author: bradtofel Date: 2010-03-23 23:36:35 +0000 (Tue, 23 Mar 2010) Log Message: ----------- Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java 2010-03-23 23:36:35 UTC (rev 2996) @@ -0,0 +1,59 @@ +/* RobotRulesTest + * + * $Id$: + * + * Created on Jan 15, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.ByteArrayInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class RobotRulesTest extends TestCase { + + /** + * Test method for {@link org.archive.wayback.accesscontrol.robotstxt.RobotRules#blocksPathForUA(java.lang.String, java.lang.String)}. + */ + public void testBlocksPathForUA() { + String testString = "User-agent: *\nDisallow:\n"; + RobotRules rr = new RobotRules(); + try { + rr.parse(new ByteArrayInputStream(testString.getBytes())); + assertFalse(rr.hasSyntaxErrors()); + assertFalse(rr.blocksPathForUA("/", "ia_archiver")); + } catch (FileNotFoundException e) { + e.printStackTrace(); + fail(e.getMessage()); + } catch (IOException e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotRulesTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2995 http://archive-access.svn.sourceforge.net/archive-access/?rev=2995&view=rev Author: bradtofel Date: 2010-03-20 01:21:00 +0000 (Sat, 20 Mar 2010) Log Message: ----------- LOGGING: added logging when unable to access a ResourceFile Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2010-03-20 01:19:20 UTC (rev 2994) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/SimpleResourceStore.java 2010-03-20 01:21:00 UTC (rev 2995) @@ -25,6 +25,7 @@ package org.archive.wayback.resourcestore; import java.io.IOException; +import java.util.logging.Logger; import org.archive.wayback.ResourceStore; import org.archive.wayback.core.Resource; @@ -45,6 +46,8 @@ */ public class SimpleResourceStore implements ResourceStore { + private final static Logger LOGGER = Logger.getLogger( + SimpleResourceStore.class.getName()); private String prefix = null; public Resource retrieveResource(CaptureSearchResult result) @@ -71,7 +74,7 @@ r = ResourceFactory.getResource(fileUrl, offset); } catch (IOException e) { - + LOGGER.warning("Unable to retrieve:" + fileUrl + ":" + offset); e.printStackTrace(); throw new ResourceNotAvailableException("Unable to retrieve", e.getLocalizedMessage()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2994 http://archive-access.svn.sourceforge.net/archive-access/?rev=2994&view=rev Author: bradtofel Date: 2010-03-20 01:19:20 +0000 (Sat, 20 Mar 2010) Log Message: ----------- BUGFIX(unreported): was not actually caching a robots.txt correctly, causing MANY robots.txt requests. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-03-20 01:16:21 UTC (rev 2993) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-03-20 01:19:20 UTC (rev 2994) @@ -99,17 +99,29 @@ private String hostToRobotUrlString(String host) { sb.setLength(0); sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX); - return sb.toString(); + String robotUrl = sb.toString(); + LOGGER.fine("Adding robot URL:" + robotUrl); + return robotUrl; } /* - * Return a List of all robots.txt urls to attempt for this url: - * If originalURL starts with "www.DOMAIN": - * [originalURL,DOMAIN] - * If url starts with "www[0-9]+.DOMAIN": - * [originalURL,www.DOMAIN,DOMAIN] + * Return a List of all robots.txt urls to attempt for this HOST: + * If HOST starts with "www.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://DOMAIN/robots.txt + * ] + * If HOST starts with "www[0-9]+.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://www.DOMAIN/robots.txt, + * http://DOMAIN/robots.txt + * ] * Otherwise: - * [originalURL,www.originalURL] + * [ + * http://HOST/robots.txt, + * http://www.HOST/robots.txt + * ] */ protected List<String> 
searchResultToRobotUrlStrings(String resultHost) { ArrayList<String> list = new ArrayList<String>(); @@ -135,22 +147,41 @@ private RobotRules getRules(CaptureSearchResult result) { RobotRules rules = null; RobotRules tmpRules = null; - String host = result.getOriginalHost(); + String host; + try { + host = result.getOriginalHost(); + } catch(Exception e) { + LOGGER.warning("ROBOT: Failed to get host from("+result.getOriginalUrl()+")"); + return null; + } List<String> urlStrings = searchResultToRobotUrlStrings(host); Iterator<String> itr = urlStrings.iterator(); String firstUrlString = null; - +// StringBuilder sb = new StringBuilder(); +// for(String ttt : urlStrings) { +// sb.append("RU(").append(ttt).append(")"); +// } +// LOGGER.info("RobotUrls for("+host+")"+sb.toString()); + // loop through them all. As soon as we get a response, store that + // in the cache for the FIRST url we tried and return it.. + // If we get no responses for any of the robot URLs, use "empty" rules, + // and record that in the cache, too. 
+ while(rules == null && itr.hasNext()) { String urlString = (String) itr.next(); if(firstUrlString == null) { firstUrlString = urlString; } if(rulesCache.containsKey(urlString)) { - LOGGER.fine("ROBOT: Cached("+urlString+")"); + LOGGER.info("ROBOT: Cached("+urlString+")"); rules = rulesCache.get(urlString); + if(!urlString.equals(firstUrlString)) { + LOGGER.info("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")"); + rulesCache.put(firstUrlString, rules); + } } else { try { - LOGGER.fine("ROBOT: NotCached("+urlString+")"); + LOGGER.info("ROBOT: NotCached("+urlString+")"); tmpRules = new RobotRules(); Resource resource = webCache.getCachedResource(new URL(urlString), @@ -165,18 +196,19 @@ LOGGER.info("ROBOT: Downloaded("+urlString+")"); } catch (LiveDocumentNotAvailableException e) { - // cache an empty rule: all OK -// rulesCache.put(firstUrlString, emptyRules); -// rules = emptyRules; - continue; + LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")"); + } catch (MalformedURLException e) { e.printStackTrace(); + LOGGER.info("ROBOT: MalformedURLException("+urlString+")"); return null; } catch (IOException e) { - e.printStackTrace(); + e.printStackTrace(System.err); + LOGGER.info("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage()); return null; } catch (LiveWebCacheUnavailableException e) { e.printStackTrace(); + LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")"); return null; } } @@ -185,6 +217,7 @@ // special-case, allow empty rules if no longer available. 
rulesCache.put(firstUrlString,emptyRules); rules = emptyRules; + LOGGER.info("No rules available, using emptyRules for:" + firstUrlString); } return rules; } @@ -203,6 +236,7 @@ url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL)); if(!rules.blocksPathForUA(url.getPath(), userAgent)) { filterResult = ObjectFilter.FILTER_INCLUDE; + LOGGER.fine("ROBOT: ALLOWED("+resultURL+")"); } else { LOGGER.info("ROBOT: BLOCKED("+resultURL+")"); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2993 http://archive-access.svn.sourceforge.net/archive-access/?rev=2993&view=rev Author: bradtofel Date: 2010-03-20 01:16:21 +0000 (Sat, 20 Mar 2010) Log Message: ----------- LOGGING: toned down logging level for a message Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java 2010-03-20 01:15:37 UTC (rev 2992) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java 2010-03-20 01:16:21 UTC (rev 2993) @@ -61,7 +61,7 @@ if(nextSearch == null) { break; } - LOGGER.info("EXCLUSION-MAP:Checking " + nextSearch); + LOGGER.trace("EXCLUSION-MAP:Checking " + nextSearch); if(exclusionMap.containsKey(nextSearch)) { LOGGER.info("EXCLUSION-MAP: EXCLUDED: \"" + nextSearch + "\" (" + url +")"); return true; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2992 http://archive-access.svn.sourceforge.net/archive-access/?rev=2992&view=rev Author: bradtofel Date: 2010-03-20 01:15:37 +0000 (Sat, 20 Mar 2010) Log Message: ----------- LOGGING: improved logging Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java 2010-03-20 01:14:08 UTC (rev 2991) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java 2010-03-20 01:15:37 UTC (rev 2992) @@ -83,6 +83,8 @@ lastUpdated = -1; currentMap = null; e.printStackTrace(); + LOGGER.error("Reload " + file.getAbsolutePath() + " FAILED:" + + e.getLocalizedMessage()); } } protected Map<String,Object> loadFile(String path) throws IOException { @@ -97,7 +99,7 @@ } String surt = line.startsWith("(") ? line : SURTTokenizer.prefixKey(line); - LOGGER.info("EXCLUSION-MAP: adding " + surt); + LOGGER.trace("EXCLUSION-MAP: adding " + surt); newMap.put(surt, null); } itr.close(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-20 01:14:14
|
Revision: 2991 http://archive-access.svn.sourceforge.net/archive-access/?rev=2991&view=rev Author: bradtofel Date: 2010-03-20 01:14:08 +0000 (Sat, 20 Mar 2010) Log Message: ----------- FEATURE: actually tries to divine if a stream is chunked or not before setting the chunked input stream wrapper. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2010-03-20 01:11:51 UTC (rev 2990) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/Resource.java 2010-03-20 01:14:08 UTC (rev 2991) @@ -66,8 +66,49 @@ */ public void setChunkedEncoding() throws IOException { validate(); - is = new ChunkedInputStream(is); + // peek ahead and make sure we have a line with hex numbers: + int max = 50; + is.mark(max+2); + int cur = 0; + boolean isChunked = false; + while(cur < max) { + int nextC = is.read(); + if(nextC == 10) { + // must have read at least 1 hex char: + if(cur > 0) { + nextC = is.read(); + if(nextC == 13) { + isChunked = true; + break; + } + } + } else { + // better be a hex character: + if(!isHex(nextC)) { + break; + } + } + cur++; + } + is.reset(); + if(isChunked) { + is = new ChunkedInputStream(is); + } } + + private boolean isHex(int c) { + if((c >= '0') && (c <= '9')) { + return true; + } + if((c >= 'a') && (c <= 'f')) { + return true; + } + if((c >= 'A') && (c <= 'F')) { + return true; + } + return false; + } + /** * @return * @throws IOException This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-20 01:11:59
|
Revision: 2990 http://archive-access.svn.sourceforge.net/archive-access/?rev=2990&view=rev Author: bradtofel Date: 2010-03-20 01:11:51 +0000 (Sat, 20 Mar 2010) Log Message: ----------- INITIAL REV: tests for 2 transformers. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java 2010-03-20 01:11:51 UTC (rev 2990) @@ -0,0 +1,82 @@ +/* JSStringTransformerTest + * + * $Id$: + * + * Created on Dec 10, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.replay.html.transformer; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; + +import org.archive.wayback.replay.html.ContextResultURIConverterFactory; +import org.archive.wayback.replay.html.ReplayParseContext; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class JSStringTransformerTest extends TestCase { + + /** + * Test method for {@link org.archive.wayback.replay.html.transformer.JSStringTransformer#transform(org.archive.wayback.replay.html.ReplayParseContext, java.lang.String)}. + * @throws MalformedURLException + */ + public void testTransform() throws MalformedURLException { + RecordingReplayParseContext rc = new RecordingReplayParseContext(null, new URL("http://foo.com/"), null); + String input = "'<a href=\'http://www.gavelgrab.org\' target=\'_blank\'>Learn more in Gavel Grab</a>'"; + JSStringTransformer jst = new JSStringTransformer(); + jst.transform(rc, input); + assertEquals(1,rc.got.size()); + assertEquals("http://www.gavelgrab.org",rc.got.get(0)); + + input = "'<a href=\'http://www.gavelgrab.org/foobla/blah\' target=\'_blank\'>Learn more in Gavel Grab</a>'"; + rc = new RecordingReplayParseContext(null, new URL("http://foo.com/"), null); + jst.transform(rc, input); + assertEquals(1,rc.got.size()); + assertEquals("http://www.gavelgrab.org",rc.got.get(0)); + + } + public class RecordingReplayParseContext extends ReplayParseContext { + ArrayList<String> got = null; + /** + * @param uriConverterFactory + * @param baseUrl + * @param datespec + */ + public RecordingReplayParseContext( + ContextResultURIConverterFactory uriConverterFactory, + URL baseUrl, String datespec) { + super(uriConverterFactory, baseUrl, datespec); + got = new 
ArrayList<String>(); + // TODO Auto-generated constructor stub + } + public String contextualizeUrl(String url) { + got.add(url); + return url; + } + + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java 2010-03-20 01:11:51 UTC (rev 2990) @@ -0,0 +1,64 @@ +/* MetaRefreshUrlStringTransformerTest + * + * $Id$: + * + * Created on Jan 12, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.replay.html.transformer; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class MetaRefreshUrlStringTransformerTest extends TestCase { + + /** + * Test method for {@link org.archive.wayback.replay.html.transformer.MetaRefreshUrlStringTransformer#transform(org.archive.wayback.replay.html.ReplayParseContext, java.lang.String)}. + */ + public void testTransform() { +// cmpT("0; url=http://foo.com/bar","0; url=(((http://foo.com/bar)))"); +// cmpT("0; url=/bar","0; url=(((/bar)))"); +// cmpT("0; url =/bar","0; url =(((/bar)))"); +// cmpT("0; url =/bar","0; url =(((/bar)))"); +// cmpT("; url =/bar","; url =/bar"); +// cmpT("0; URL =/bar","0; URL =(((/bar)))"); +// +// cmpT("0; URL = /bar","0; URL = (((/bar)))"); +// cmpT("0; URL = /bar ","0; URL = (((/bar))) "); +// cmpT("0; URL = /bar ","0; URL = (((/bar))) "); +// cmpT("0; URL = /baz foo","0; URL = (((/baz foo)))"); +// cmpT("0; URL = /baz foo ","0; URL = (((/baz foo))) "); +// cmpT("0; URL=/baz foo ","0; URL=(((/baz foo))) "); +// +// cmpT("0; UrL=/baz foo ","0; UrL=(((/baz foo))) "); +// cmpT("0; UrL=/baZefoo ","0; UrL=(((/baZefoo))) "); + + } + private void cmpT(String source, String want) { + MetaRefreshUrlStringTransformer m = new MetaRefreshUrlStringTransformer(); + String got = m.transform(null,source); + assertEquals(want, got); + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformerTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2989 http://archive-access.svn.sourceforge.net/archive-access/?rev=2989&view=rev Author: bradtofel Date: 2010-03-20 01:11:18 +0000 (Sat, 20 Mar 2010) Log Message: ----------- FEATURE: transformer for META refresh tags Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java 2010-03-20 01:11:18 UTC (rev 2989) @@ -0,0 +1,78 @@ +/* MetaRefreshUrlStringTransformer + * + * $Id$: + * + * Created on Jan 12, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.replay.html.transformer; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; + +/** + * @author brad + * + */ +public class MetaRefreshUrlStringTransformer extends URLStringTransformer +implements StringTransformer { + + private final static Pattern refreshURLPattern = + Pattern.compile("^\\d+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$", + Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); + + /* (non-Javadoc) + * @see org.archive.wayback.replay.html.StringTransformer#transform(org.archive.wayback.replay.html.ReplayParseContext, java.lang.String) + */ + public String transform(ReplayParseContext context, String input) { + /* + <META + HTTP-EQUIV="Refresh" + CONTENT="0; URL=/ics/default.asp"> + + Our argument "input" is set to the value of the "CONTENT" attribute. + + So, we need to search for the "URL=", take everything to the right + of that, trim it, contextualize it, and return that. 
+ */ + Matcher m = refreshURLPattern.matcher(input); + if(m.matches()) { + if(m.groupCount() == 1) { + StringBuilder sb = new StringBuilder(input.length() * 2); + + sb.append(input.substring(0,m.start(1))); + + sb.append(super.transform(context, m.group(1))); + + // This was temporarily used for testing the regex: +// sb.append("(((").append(m.group(1)).append(")))"); + + sb.append(input.substring(m.end(1))); + return sb.toString(); + } + } + return input; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/MetaRefreshUrlStringTransformer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-20 01:05:45
|
Revision: 2988 http://archive-access.svn.sourceforge.net/archive-access/?rev=2988&view=rev Author: bradtofel Date: 2010-03-20 01:05:39 +0000 (Sat, 20 Mar 2010) Log Message: ----------- BUGFIX(unreported): replaced URL to host processing with REGEX, to better handle URLs with freakish illegal characters before the port/path start. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-03-20 01:02:49 UTC (rev 2987) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-03-20 01:05:39 UTC (rev 2988) @@ -92,6 +92,9 @@ Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" + "(" + IP_PATTERN + ")"); + private static final Pattern AUTHORITY_REGEX_SIMPLE = + Pattern.compile("([0-9a-z_.-]++)"); + /** * @param urlPart * @return boolean indicating whether urlPart might be an Authority. 
@@ -186,22 +189,11 @@ for(String scheme : ALL_SCHEMES) { if(url.startsWith(scheme)) { int hostIdx = scheme.length(); - int portIdx = url.indexOf(PORT_SEPARATOR, hostIdx + 1); - int pathIdx = url.indexOf(PATH_START, hostIdx + 1); - if(portIdx == -1 && pathIdx == -1) { - return url.substring(hostIdx); + + Matcher m = AUTHORITY_REGEX_SIMPLE.matcher(url.substring(hostIdx)); + if(m.find()) { + return m.group(0); } - if(portIdx == -1) { - return url.substring(hostIdx,pathIdx); - } - if(pathIdx == -1) { - return url.substring(hostIdx,portIdx); - } - if(pathIdx > portIdx) { - return url.substring(hostIdx,portIdx); - } else { - return url.substring(hostIdx,pathIdx); - } } } return url; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-20 01:03:00
|
Revision: 2987 http://archive-access.svn.sourceforge.net/archive-access/?rev=2987&view=rev Author: bradtofel Date: 2010-03-20 01:02:49 +0000 (Sat, 20 Mar 2010) Log Message: ----------- BUGFIX(unreported): now returns closeable iterators, so filehandles/sockets can be cleaned up. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2010-03-20 01:00:50 UTC (rev 2986) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/flatfile/FlatFile.java 2010-03-20 01:02:49 UTC (rev 2987) @@ -180,7 +180,7 @@ * @return Iterator for records beggining with key * @throws IOException */ - public Iterator<String> getRecordIterator(final String prefix) throws IOException { + public CloseableIterator<String> getRecordIterator(final String prefix) throws IOException { RecordIterator itr = null; RandomAccessFile raf = new RandomAccessFile(file,"r"); long offset = findKeyOffset(raf,prefix); @@ -190,7 +190,7 @@ return itr; } - public Iterator<String> getRecordIteratorLT(final String prefix) throws IOException { + public CloseableIterator<String> getRecordIteratorLT(final String prefix) throws IOException { RecordIterator itr = null; RandomAccessFile raf = new RandomAccessFile(file,"r"); long offset = findKeyOffsetLT(raf,prefix); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-20 01:00:57
|
Revision: 2986 http://archive-access.svn.sourceforge.net/archive-access/?rev=2986&view=rev Author: bradtofel Date: 2010-03-20 01:00:50 +0000 (Sat, 20 Mar 2010) Log Message: ----------- BUGFIX(unreported): was not using correct resolve method, causing extra level of escaping on all GET arguments. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-03-20 00:59:42 UTC (rev 2985) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2010-03-20 01:00:50 UTC (rev 2986) @@ -100,7 +100,7 @@ url = url.substring(0,hashIdx); } try { - return baseUrl.resolve(url).toString() + frag; + return baseUrl.resolve(url,true).toString() + frag; } catch (URIException e) { e.printStackTrace(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2985 http://archive-access.svn.sourceforge.net/archive-access/?rev=2985&view=rev Author: bradtofel Date: 2010-03-20 00:59:42 +0000 (Sat, 20 Mar 2010) Log Message: ----------- BUGFIX: now closes iterators so open filehandles don't stack up. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-03-20 00:58:14 UTC (rev 2984) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-03-20 00:59:42 UTC (rev 2985) @@ -91,7 +91,7 @@ public void init() throws IOException { chunkMap = new HashMap<String, String>(); FlatFile ff = new FlatFile(chunkMapPath); - Iterator<String> lines = ff.getSequentialIterator(); + CloseableIterator<String> lines = ff.getSequentialIterator(); while(lines.hasNext()) { String line = lines.next(); String[] parts = line.split("\\s"); @@ -101,6 +101,7 @@ } chunkMap.put(parts[0],parts[1]); } + lines.close(); chunkIndex = new FlatFile(chunkIndexPath); } protected CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr) @@ -130,7 +131,7 @@ } public Iterator<String> getStringPrefixIterator(String prefix) throws ResourceIndexNotAvailableException, IOException { - Iterator<String> itr = chunkIndex.getRecordIteratorLT(prefix); + CloseableIterator<String> itr = chunkIndex.getRecordIteratorLT(prefix); ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); boolean first = true; while(itr.hasNext()) { @@ -161,6 +162,7 @@ long offset = 
Long.parseLong(parts[2]); blocks.add(new ZiplinedBlock(url, offset)); } + itr.close(); return new StringPrefixIterator(new ZiplinesChunkIterator(blocks),prefix); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2984 http://archive-access.svn.sourceforge.net/archive-access/?rev=2984&view=rev Author: bradtofel Date: 2010-03-20 00:58:14 +0000 (Sat, 20 Mar 2010) Log Message: ----------- BUGFIX(unreported): checks that filters are not null, which likely indicates a situation where no results can be returned anyways. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java 2010-03-19 21:18:47 UTC (rev 2983) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java 2010-03-20 00:58:14 UTC (rev 2984) @@ -55,7 +55,11 @@ public int filterObject(CaptureSearchResult r) { Iterator<ObjectFilter<CaptureSearchResult>> itr = filters.iterator(); while(itr.hasNext()) { - int result = itr.next().filterObject(r); + ObjectFilter<CaptureSearchResult> filter = itr.next(); + if(filter == null) { + return FILTER_EXCLUDE; + } + int result = filter.filterObject(r); if(result != FILTER_INCLUDE) { return result; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-19 21:18:55
|
Revision: 2983 http://archive-access.svn.sourceforge.net/archive-access/?rev=2983&view=rev Author: bradtofel Date: 2010-03-19 21:18:47 +0000 (Fri, 19 Mar 2010) Log Message: ----------- BUGFIX(unreported): adding http:// if missing for server-relative redirect Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java 2010-03-19 02:19:23 UTC (rev 2982) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/RequestFilter.java 2010-03-19 21:18:47 UTC (rev 2983) @@ -39,6 +39,7 @@ import org.apache.log4j.Logger; import org.archive.net.UURI; import org.archive.net.UURIFactory; +import org.archive.util.ArchiveUtils; import org.archive.wayback.exception.ConfigurationException; import org.archive.wayback.util.url.UrlOperations; @@ -131,7 +132,8 @@ int thirdSlash = remainder.indexOf('/'); if(thirdSlash > -1) { String datespec = remainder.substring(0,thirdSlash); - String url = remainder.substring(thirdSlash+1); + String url = ArchiveUtils.addImpliedHttpIfNecessary( + remainder.substring(thirdSlash+1)); String thisPath = httpRequest.getRequestURI(); String queryString = httpRequest.getQueryString(); if (queryString != null) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-03-19 02:19:29
|
Revision: 2982 http://archive-access.svn.sourceforge.net/archive-access/?rev=2982&view=rev Author: binzino Date: 2010-03-19 02:19:23 +0000 (Fri, 19 Mar 2010) Log Message: ----------- Restored the accidentally removed line that configures the 'digest' metadata field for indexing. Modified Paths: -------------- tags/nutchwax-0_13/archive/src/nutch/conf/nutch-site.xml Modified: tags/nutchwax-0_13/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13/archive/src/nutch/conf/nutch-site.xml 2010-03-18 23:05:40 UTC (rev 2981) +++ tags/nutchwax-0_13/archive/src/nutch/conf/nutch-site.xml 2010-03-19 02:19:23 UTC (rev 2982) @@ -48,6 +48,7 @@ site:false:false:untokenized url:false:true:tokenized + digest:false:true:no collection:true:true:no_norms date:true:true:no_norms This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-03-18 23:05:50
|
Revision: 2981 http://archive-access.svn.sourceforge.net/archive-access/?rev=2981&view=rev Author: binzino Date: 2010-03-18 23:05:40 +0000 (Thu, 18 Mar 2010) Log Message: ----------- NutchWAX 0.13 release tag/branch. Added Paths: ----------- tags/nutchwax-0_13/ tags/nutchwax-0_13/archive/ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |