From: <bra...@us...> - 2007-07-25 00:19:23
|
Revision: 1857 http://archive-access.svn.sourceforge.net/archive-access/?rev=1857&view=rev Author: bradtofel Date: 2007-07-24 17:19:26 -0700 (Tue, 24 Jul 2007) Log Message: ----------- REFACTOR: moved PeekableIterator into it's own class TWEAK: added type safety Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CompositeSortedIterator.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/PeekableIterator.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CompositeSortedIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CompositeSortedIterator.java 2007-07-25 00:17:15 UTC (rev 1856) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CompositeSortedIterator.java 2007-07-25 00:19:26 UTC (rev 1857) @@ -39,26 +39,27 @@ * * @author brad * @version $Date$, $Revision$ + * @param <E> */ -public class CompositeSortedIterator implements CloseableIterator { +public class CompositeSortedIterator<E> implements CloseableIterator<E> { - private ArrayList<PeekableIterator> components; - private Object next; - private Comparator<Object> comparator; + private ArrayList<PeekableIterator<E>> components; + private E next; + private Comparator<E> comparator; /** * @param comparator Comparator to use for sorting order */ - public CompositeSortedIterator(Comparator<Object> comparator) { + public CompositeSortedIterator(Comparator<E> comparator) { this.comparator = comparator; - components = new ArrayList<PeekableIterator>(); + components = new ArrayList<PeekableIterator<E>>(); next = null; } /** * @param itr Iterator which is a component of this composite */ - public void addComponent(Iterator itr) { - components.add(new 
PeekableIterator(itr)); + public void addComponent(Iterator<E> itr) { + components.add(new PeekableIterator<E>(itr)); } /* (non-Javadoc) * @see java.util.Iterator#hasNext() @@ -68,11 +69,11 @@ return true; } // find lowest next: - PeekableIterator nextSource = null; + PeekableIterator<E> nextSource = null; for(int i = 0; i < components.size(); i++) { - PeekableIterator pi = components.get(i); + PeekableIterator<E> pi = components.get(i); if(pi.hasNext()) { - Object piNext = pi.peekNext(); + E piNext = pi.peekNext(); if((next == null) || (comparator.compare(next,piNext) > 0)) { nextSource = pi; next = piNext; @@ -87,11 +88,11 @@ /* (non-Javadoc) * @see java.util.Iterator#next() */ - public Object next() { + public E next() { if(!hasNext()) { throw new NoSuchElementException(); } - Object retObject = next; + E retObject = next; next = null; return retObject; } @@ -107,66 +108,8 @@ */ public void close() throws IOException { for(int i = 0; i < components.size(); i++) { - PeekableIterator pi = (PeekableIterator) components.get(i); + PeekableIterator<E> pi = (PeekableIterator<E>) components.get(i); pi.close(); } } - - private class PeekableIterator implements CloseableIterator { - private Object cachedNext; - private Iterator itr; - /** - * @param itr - */ - public PeekableIterator(Iterator itr) { - this.itr = itr; - this.cachedNext = null; - } - /** - * @return true if this Iterator has another element. 
- */ - public boolean hasNext() { - if(cachedNext != null) { - return true; - } - return itr.hasNext(); - } - /** - * @return Object that will be returned from next(), or null - */ - public Object peekNext() { - if(cachedNext == null) { - if(itr.hasNext()) { - cachedNext = itr.next(); - } - } - return cachedNext; - } - /** - * @return next Object - */ - public Object next() { - if(cachedNext != null) { - Object retObject = cachedNext; - cachedNext = null; - return retObject; - } - return itr.next(); - } - /* (non-Javadoc) - * @see org.archive.wayback.util.Cleanable#clean() - */ - public void close() throws IOException { - if(itr instanceof CloseableIterator) { - CloseableIterator toBeClosed = (CloseableIterator) itr; - toBeClosed.close(); - } - } - /* (non-Javadoc) - * @see java.util.Iterator#remove() - */ - public void remove() { - throw new NotImplementedException(); - } - } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/PeekableIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/PeekableIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/PeekableIterator.java 2007-07-25 00:19:26 UTC (rev 1857) @@ -0,0 +1,95 @@ +/* PeekableIterator + * + * $Id$ + * + * Created on 4:37:15 PM Jul 24, 2007. + * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback-core. + * + * wayback-core is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback-core is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-core; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util; + +import java.io.IOException; +import java.util.Iterator; + +import sun.reflect.generics.reflectiveObjects.NotImplementedException; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + * @param <E> + */ +public class PeekableIterator<E> implements CloseableIterator<E> { + private E cachedNext; + private Iterator<E> itr; + /** + * @param itr + */ + public PeekableIterator(Iterator<E> itr) { + this.itr = itr; + this.cachedNext = null; + } + /** + * @return true if this Iterator has another element. + */ + public boolean hasNext() { + if(cachedNext != null) { + return true; + } + return itr.hasNext(); + } + /** + * @return Object that will be returned from next(), or null + */ + public E peekNext() { + if(cachedNext == null) { + if(itr.hasNext()) { + cachedNext = itr.next(); + } + } + return cachedNext; + } + /** + * @return next Object + */ + public E next() { + if(cachedNext != null) { + E retObject = cachedNext; + cachedNext = null; + return retObject; + } + return itr.next(); + } + /* (non-Javadoc) + * @see org.archive.wayback.util.Cleanable#clean() + */ + public void close() throws IOException { + if(itr instanceof CloseableIterator) { + CloseableIterator<E> toBeClosed = (CloseableIterator<E>) itr; + toBeClosed.close(); + } + } + /* (non-Javadoc) + * @see java.util.Iterator#remove() + */ + public void remove() { + throw new NotImplementedException(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-07-25 00:34:55
|
Revision: 1872 http://archive-access.svn.sourceforge.net/archive-access/?rev=1872&view=rev Author: bradtofel Date: 2007-07-24 17:34:39 -0700 (Tue, 24 Jul 2007) Log Message: ----------- TWEAK: type safety Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/AdaptedIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Adapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CachedFile.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CloseableIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ObjectFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ObjectFilterChain.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/AdaptedIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/AdaptedIterator.java 2007-07-25 00:33:53 UTC (rev 1871) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/AdaptedIterator.java 2007-07-25 00:34:39 UTC (rev 1872) @@ -33,16 +33,18 @@ * * @author brad * @version $Date$, $Revision$ + * @param <S> + * @param <T> */ -public class AdaptedIterator implements CloseableIterator { - protected Iterator itr; - protected Adapter adapter; - private Object cachedNext = null; +public class AdaptedIterator<S,T> implements CloseableIterator<T> { + protected Iterator<S> itr; + protected Adapter<S,T> adapter; + private T cachedNext = null; /** * @param itr * @param adapter */ - public AdaptedIterator(Iterator itr, Adapter adapter) { + public AdaptedIterator(Iterator<S> itr, Adapter<S,T> adapter) { this.itr = itr; this.adapter = adapter; } @@ 
-53,8 +55,8 @@ public boolean hasNext() { if(cachedNext != null) return true; while(itr.hasNext()) { - Object o = itr.next(); - Object adapted = adapter.adapt(o); + S o = itr.next(); + T adapted = adapter.adapt(o); if(adapted != null) { cachedNext = adapted; return true; @@ -65,11 +67,11 @@ /* (non-Javadoc) * @see java.util.Iterator#next() */ - public Object next() { + public T next() { if(cachedNext == null) { throw new NoSuchElementException("call hasNext first!"); } - Object o = cachedNext; + T o = cachedNext; cachedNext = null; return o; } @@ -85,7 +87,7 @@ */ public void close() throws IOException { if(itr instanceof CloseableIterator) { - CloseableIterator toBeClosed = (CloseableIterator) itr; + CloseableIterator<S> toBeClosed = (CloseableIterator<S>) itr; toBeClosed.close(); } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Adapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Adapter.java 2007-07-25 00:33:53 UTC (rev 1871) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/Adapter.java 2007-07-25 00:34:39 UTC (rev 1872) @@ -29,13 +29,15 @@ * * @author brad * @version $Date$, $Revision$ + * @param <S> + * @param <T> */ -public interface Adapter { +public interface Adapter<S,T> { /** * Transform one object into another * * @param o * @return new object that is adapted from the old */ - public Object adapt(Object o); + public T adapt(S o); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CachedFile.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CachedFile.java 2007-07-25 00:33:53 UTC (rev 1871) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CachedFile.java 2007-07-25 00:34:39 UTC (rev 1872) @@ -31,7 +31,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.net.URL; -import java.util.Iterator; import org.archive.wayback.util.flatfile.FlatFile; @@ -86,7 +85,7 @@ * @return Iterator of lines in File * @throws IOException */ - public Iterator getSequentialIterator() throws IOException { + public CloseableIterator<String> getSequentialIterator() throws IOException { long nowMS = System.currentTimeMillis(); if(nowMS > nextCheckMS) { refreshFromSource(); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CloseableIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CloseableIterator.java 2007-07-25 00:33:53 UTC (rev 1871) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/CloseableIterator.java 2007-07-25 00:34:39 UTC (rev 1872) @@ -33,6 +33,7 @@ * * @author brad * @version $Date$, $Revision$ + * @param <E> */ -public interface CloseableIterator extends Iterator, Closeable { +public interface CloseableIterator<E> extends Iterator<E>, Closeable { } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ObjectFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ObjectFilter.java 2007-07-25 00:33:53 UTC (rev 1871) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ObjectFilter.java 2007-07-25 00:34:39 UTC (rev 1872) @@ -29,8 +29,9 @@ * * @author brad * @version $Date$, $Revision$ + * @param <E> */ -public interface ObjectFilter { +public interface ObjectFilter<E> { /** * constant 
indicating record should be included in the result set */ @@ -52,6 +53,6 @@ * @param o Object which should be checked for inclusion/exclusion or abort * @return int of FILTER_INCLUDE, FILTER_EXCLUDE, or FILTER_ABORT */ - public int filterObject(Object o); + public int filterObject(E o); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ObjectFilterChain.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ObjectFilterChain.java 2007-07-25 00:33:53 UTC (rev 1871) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/ObjectFilterChain.java 2007-07-25 00:34:39 UTC (rev 1872) @@ -33,31 +33,32 @@ * * @author brad * @version $Date$, $Revision$ + * @param <E> */ -public class ObjectFilterChain implements ObjectFilter { +public class ObjectFilterChain<E> implements ObjectFilter<E> { - private ArrayList<ObjectFilter> filters = null; + private ArrayList<ObjectFilter<E>> filters = null; /** * Constructor */ public ObjectFilterChain() { - this.filters = new ArrayList<ObjectFilter>(); + this.filters = new ArrayList<ObjectFilter<E>>(); } /** * @param filter to be added to the chain. filters are processed in the * order they are added to the chain. */ - public void addFilter(ObjectFilter filter) { + public void addFilter(ObjectFilter<E> filter) { filters.add(filter); } /* (non-Javadoc) * @see org.archive.wayback.cdx.filter.RecordFilter#filterRecord(org.archive.wayback.cdx.CDXRecord) */ - public int filterObject(Object o) { + public int filterObject(E o) { int size = filters.size(); int result = FILTER_ABORT; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-15 01:39:59
|
Revision: 2126 http://archive-access.svn.sourceforge.net/archive-access/?rev=2126&view=rev Author: bradtofel Date: 2008-01-14 17:39:58 -0800 (Mon, 14 Jan 2008) Log Message: ----------- REFACTOR: moved isAuthority() and resolveUrl() from the generic UrlCanonicalizer class, in preparation for making UrlCanonicalizers configurable. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2008-01-15 01:39:58 UTC (rev 2126) @@ -0,0 +1,76 @@ +package org.archive.wayback.util.url; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.httpclient.URIException; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; + +/** + * Class containing common static URL methods. Primarily resolveUrl() and + * the (currently) unused isAuthority(). 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class UrlOperations { + + private static final String CC_TLDS = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq" + + "|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs" + + "|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx" + + "|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo" + + "|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk" + + "|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg" + + "|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma" + + "|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz" + + "|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm" + + "|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj" + + "|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn" + + "|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu" + + "|wf|ws|ye|yt|yu|za|zm|zw"; + + private static final String GEN_TLDS = "aero|biz|cat|com|coop|edu|gov" + + "|info|int|jobs|mil|mobi|museum|name|net|org|pro|travel"; + + + private static final String ALL_TLD_PATTERN = CC_TLDS + "|" + GEN_TLDS; + + private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+"; + + private static final Pattern AUTHORITY_REGEX = + Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" + + "(" + IP_PATTERN + ")"); + + /** + * @param urlPart + * @return boolean indicating whether urlPart might be an Authority. 
+ */ + public static boolean isAuthority(String urlPart) { + Matcher m = AUTHORITY_REGEX.matcher(urlPart); + + return (m != null) && m.matches(); + } + + /** + * @param baseUrl + * @param url + * @return url resolved against baseUrl, unless it is absolute already + */ + public static String resolveUrl(String baseUrl, String url) { + // TODO: this only works for http:// + if(url.startsWith("http://")) { + return url; + } + UURI absBaseURI; + UURI resolvedURI = null; + try { + absBaseURI = UURIFactory.getInstance(baseUrl); + resolvedURI = UURIFactory.getInstance(absBaseURI, url); + } catch (URIException e) { + e.printStackTrace(); + return url; + } + return resolvedURI.getEscapedURI(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-15 02:17:06
|
Revision: 2130 http://archive-access.svn.sourceforge.net/archive-access/?rev=2130&view=rev Author: bradtofel Date: 2008-01-14 18:17:09 -0800 (Mon, 14 Jan 2008) Log Message: ----------- REFACTOR: moved UrlCanonicalizer to org.archive.wayback.util.url Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2008-01-15 01:46:54 UTC (rev 2129) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2008-01-15 02:17:09 UTC (rev 2130) @@ -1,391 +0,0 @@ -/* UrlCanonicalizer - * - * $Id$ - * - * Created on 2:08:07 PM Oct 11, 2006. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.util; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.httpclient.URIException; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; - -/** - * Class that performs the standard Heritrix URL canonicalization. Eventually, - * this should all be configurable, or perhaps be able to read the settings - * used within a Heritrix crawler... or even multiple crawlers... this is hard. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class UrlCanonicalizer { - - - private static final String CDX_PREFIX = " CDX "; - /** - * Strip leading 'www.' - */ - private static final Pattern STRIP_WWW_REGEX = - Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$"); - /** - * Strip leading 'www44.', 'www3.', etc. - */ - private static final Pattern STRIP_WWWN_REGEX = - Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$"); - /** - * Strip userinfo. - */ - private static final Pattern STRIP_USERINFO_REGEX = - Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$", - Pattern.CASE_INSENSITIVE); - - /** - * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A. - * Example: PHPSESSID=9682993c8daa2c5497996114facdc805. - */ - private static final Pattern STRIP_SESSION_ID_REGEX = - Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" + - "[0-9a-zA-Z]{32})(?:&(.*))?$", - Pattern.CASE_INSENSITIVE); - - /** - * Example: sid=9682993c8daa2c5497996114facdc805. - * 'sid=' can be tricky but all sid= followed by 32 byte string - * so far seen have been session ids. 
Sid is a 32 byte string - * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid' - * so have to have it run after the phpsessid elimination. - */ - private static final Pattern STRIP_SID_REGEX = - Pattern.compile("^(.+)" + - "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE); - - /** - * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM. - */ - private static final Pattern STRIP_ASPSESSION_REGEX = - Pattern.compile("^(.+)" + - "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", - Pattern.CASE_INSENSITIVE); - - /** - * Examples: - * - * (.NET 2.0) - * http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx - * => http://legislature.mi.gov/mileg.aspx - * - * (.NET 1.0/1.1) - * http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx - * => http://legislature.mi.gov/mileg.aspx - * - * For more info, see: - * http://msdn2.microsoft.com/en-us/library/aa479315.aspx - * - */ - private static final Pattern STRIP_ASPSESSION2_REGEX = - Pattern.compile("^([^\\?]+/)" + - "(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$", - Pattern.CASE_INSENSITIVE); - - /** - * Examples: - * - * (.NET 2.0) - * http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules - * => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules - * - * For more info, see: - * http://msdn2.microsoft.com/en-us/library/aa479315.aspx - * - */ - - private static final Pattern STRIP_ASPSESSION3_REGEX = - Pattern.compile("^([^\\?]+/" + - "\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" + - "((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$", - Pattern.CASE_INSENSITIVE); - - /** - * Strip ColdFusion session IDs. 
Remove sessionids that look like the - * following: - * CFID=12412453&CFTOKEN=15501799 - * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A - */ - private static final Pattern STRIP_CFSESSION_REGEX = - Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" + - "(?:&(.*))?$",Pattern.CASE_INSENSITIVE); - - /** - * Run a regex that strips elements of a string. - * - * Assumes the regex has a form that wants to strip elements of the passed - * string. Assumes that if a match, appending group 1 - * and group 2 yields desired result. - * @param url Url to search in. - * @param matcher Matcher whose form yields a group 1 and group 2 if a - * match (non-null. - * @return Original <code>url</code> else concatenization of group 1 - * and group 2. - */ - protected String doStripRegexMatch(String url, Matcher matcher) { - return (matcher != null && matcher.matches())? - checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)): - url; - } - - /** - * @param string String to check. - * @return <code>string</code> if non-null, else empty string (""). - */ - private String checkForNull(String string) { - return (string != null)? string: ""; - } - - /** - * return the canonical string key for the URL argument. - * - * @param urlString - * @return String lookup key for URL argument. - * @throws URIException - */ - public String urlStringToKey(final String urlString) throws URIException { - - String searchUrl = canonicalize(urlString); - - // TODO: force https into http for the moment... - if(searchUrl.startsWith("https://")) { - searchUrl = searchUrl.substring(8); - } - - // TODO: this will only work with http:// scheme. should work with all? 
- // force add of scheme and possible add '/' with empty path: - if (searchUrl.startsWith("http://")) { - if (-1 == searchUrl.indexOf('/', 8)) { - searchUrl = searchUrl + "/"; - } - } else { - if (-1 == searchUrl.indexOf("/")) { - searchUrl = searchUrl + "/"; - } - searchUrl = "http://" + searchUrl; - } - - // unescape anythying that can be: - UURI tmpURI = UURIFactory.getInstance(searchUrl); - tmpURI.setPath(tmpURI.getPath()); - - - // convert to UURI to perform require URI fixup: - UURI searchURI = UURIFactory.getInstance(tmpURI.getURI()); - - - - - // replace ' ' with '+' (this is only to match Alexa's canonicalization) - String newPath = searchURI.getEscapedPath().replace("%20","+"); -// String newPath = searchURI.getPath().replace(' ','+'); - - // replace multiple consecutive '/'s in the path. - while(newPath.contains("//")) { - newPath = newPath.replace("//","/"); - } - - // this would remove trailing a '/' character, unless the path is empty - // but we're not going to do this just yet.. -// if((newPath.length() > 1) && newPath.endsWith("/")) { -// newPath = newPath.substring(0,newPath.length()-1); -// } -// searchURI.setEscapedPath(newPath); -// searchURI.setRawPath(newPath.toCharArray()); -// String query = searchURI.getEscapedQuery(); - - // TODO: handle non HTTP port stripping, too. 
-// String portStr = ""; -// if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { -// portStr = ":" + searchURI.getPort(); -// } -// return searchURI.getHostBasename() + portStr + -// searchURI.getEscapedPathQuery(); - - StringBuilder sb = new StringBuilder(searchUrl.length()); - sb.append(searchURI.getHostBasename()); - if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { - sb.append(":").append(searchURI.getPort()); - } - sb.append(newPath); - if(searchURI.getEscapedQuery() != null) { - sb.append("?").append(searchURI.getEscapedQuery()); - } - - - return sb.toString(); - } - - - /** - * Idempotent operation that will determine the 'fuzziest' - * form of the url argument. This operation is done prior to adding records - * to the ResourceIndex, and prior to lookup. Current version is exactly - * the default found in Heritrix. When the configuration system for - * Heritrix stabilizes, hopefully this can use the system directly within - * Heritrix. - * - * @param url to be canonicalized. - * @return canonicalized version of url argument. - */ - public String canonicalize(String url) { - url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url)); - url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url)); - url = url.toLowerCase(); - if (url == null || url.length() <= 0) { - return url; - } - - int index = url.lastIndexOf('?'); - if (index > 0) { - if (index == (url.length() - 1)) { - // '?' is last char in url. Strip it. 
- url = url.substring(0, url.length() - 1); - } else if (url.charAt(index + 1) == '&') { - // Next char is '&'. Strip it. - if (url.length() == (index + 2)) { - // Then url ends with '?&'. Strip them. - url = url.substring(0, url.length() - 2); - } else { - // The '&' is redundant. Strip it. - url = url.substring(0, index + 1) + - url.substring(index + 2); - } - } else if (url.charAt(url.length() - 1) == '&') { - // If we have a lone '&' on end of query str, - // strip it. - url = url.substring(0, url.length() - 1); - } - } - return url; - } - - private static void USAGE() { - System.err.println("Usage: [-f FIELD] [-d DELIM]"); - System.exit(3); - } - /** - * @param args - */ - public static void main(String[] args) { - UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); - int n = 0; - int i = 0; - ArrayList<Integer> columns = new ArrayList<Integer>(); - - long lineNumber = 0; - boolean cdxPassThru = false; - String delimiter = " "; - while(n < args.length) { - String arg = args[n]; - if(arg.compareTo("-cdx") == 0) { - cdxPassThru = true; - n++; - continue; - } - if(n == (args.length -1)) { - USAGE(); - } - String val = args[n+1]; - if(arg.compareTo("-f") == 0) { - columns.add(new Integer(val)); - } else if(arg.compareTo("-d") == 0) { - delimiter = val; - } else { - USAGE(); - } - n += 2; - } - // place default '0' in case none specified: - if(columns.size() == 0) { - columns.add(new Integer(1)); - } - - // convert to int[]: - int[] cols = new int[columns.size()]; - for(int idx = 0; idx < columns.size(); idx++) { - cols[idx] = columns.get(idx).intValue() - 1; - } - BufferedReader r = new BufferedReader(new InputStreamReader(System.in)); - StringBuilder sb = new StringBuilder(); - String line = null; - - while(true) { - try { - line = r.readLine(); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); - } - if(line == null) { - break; - } - lineNumber++; - if(cdxPassThru && line.startsWith(CDX_PREFIX)) { - System.out.println(line); - continue; - 
} - String parts[] = line.split(delimiter); - for(int column : cols) { - if(column >= parts.length) { - System.err.println("Invalid line " + lineNumber + " (" + - line + ") skipped"); - } else { - try { - parts[column] = canonicalizer.urlStringToKey(parts[column]); - } catch (URIException e) { - System.err.println("Invalid URL in line " + lineNumber + " (" + - line + ") skipped (" + parts[column] + ")"); - e.printStackTrace(); - continue; - } - } - } - sb.setLength(0); - for(i = 0; i < parts.length; i++) { - sb.append(parts[i]); - if(i < (parts.length-1)) { - sb.append(delimiter); - } - } - System.out.println(sb.toString()); - } - } -} \ No newline at end of file Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java (from rev 2128, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java 2008-01-15 02:17:09 UTC (rev 2130) @@ -0,0 +1,391 @@ +/* UrlCanonicalizer + * + * $Id$ + * + * Created on 2:08:07 PM Oct 11, 2006. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.util.url; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.httpclient.URIException; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; + +/** + * Class that performs the standard Heritrix URL canonicalization. Eventually, + * this should all be configurable, or perhaps be able to read the settings + * used within a Heritrix crawler... or even multiple crawlers... this is hard. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class UrlCanonicalizer { + + + private static final String CDX_PREFIX = " CDX "; + /** + * Strip leading 'www.' + */ + private static final Pattern STRIP_WWW_REGEX = + Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$"); + /** + * Strip leading 'www44.', 'www3.', etc. + */ + private static final Pattern STRIP_WWWN_REGEX = + Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$"); + /** + * Strip userinfo. + */ + private static final Pattern STRIP_USERINFO_REGEX = + Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$", + Pattern.CASE_INSENSITIVE); + + /** + * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A. + * Example: PHPSESSID=9682993c8daa2c5497996114facdc805. + */ + private static final Pattern STRIP_SESSION_ID_REGEX = + Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" + + "[0-9a-zA-Z]{32})(?:&(.*))?$", + Pattern.CASE_INSENSITIVE); + + /** + * Example: sid=9682993c8daa2c5497996114facdc805. + * 'sid=' can be tricky but all sid= followed by 32 byte string + * so far seen have been session ids. 
Sid is a 32 byte string + * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid' + * so have to have it run after the phpsessid elimination. + */ + private static final Pattern STRIP_SID_REGEX = + Pattern.compile("^(.+)" + + "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE); + + /** + * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM. + */ + private static final Pattern STRIP_ASPSESSION_REGEX = + Pattern.compile("^(.+)" + + "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", + Pattern.CASE_INSENSITIVE); + + /** + * Examples: + * + * (.NET 2.0) + * http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx + * => http://legislature.mi.gov/mileg.aspx + * + * (.NET 1.0/1.1) + * http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx + * => http://legislature.mi.gov/mileg.aspx + * + * For more info, see: + * http://msdn2.microsoft.com/en-us/library/aa479315.aspx + * + */ + private static final Pattern STRIP_ASPSESSION2_REGEX = + Pattern.compile("^([^\\?]+/)" + + "(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$", + Pattern.CASE_INSENSITIVE); + + /** + * Examples: + * + * (.NET 2.0) + * http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules + * => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules + * + * For more info, see: + * http://msdn2.microsoft.com/en-us/library/aa479315.aspx + * + */ + + private static final Pattern STRIP_ASPSESSION3_REGEX = + Pattern.compile("^([^\\?]+/" + + "\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" + + "((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$", + Pattern.CASE_INSENSITIVE); + + /** + * Strip ColdFusion session IDs. 
Remove sessionids that look like the + * following: + * CFID=12412453&CFTOKEN=15501799 + * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A + */ + private static final Pattern STRIP_CFSESSION_REGEX = + Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" + + "(?:&(.*))?$",Pattern.CASE_INSENSITIVE); + + /** + * Run a regex that strips elements of a string. + * + * Assumes the regex has a form that wants to strip elements of the passed + * string. Assumes that, on a match, appending group 1 + * and group 2 yields desired result. + * @param url Url to search in. + * @param matcher Matcher whose form yields a group 1 and group 2 if a + * match (non-null). + * @return Original <code>url</code> else concatenation of group 1 + * and group 2. + */ + protected String doStripRegexMatch(String url, Matcher matcher) { + return (matcher != null && matcher.matches())? + checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)): + url; + } + + /** + * @param string String to check. + * @return <code>string</code> if non-null, else empty string (""). + */ + private String checkForNull(String string) { + return (string != null)? string: ""; + } + + /** + * Return the canonical string key for the URL argument. + * + * @param urlString + * @return String lookup key for URL argument. + * @throws URIException + */ + public String urlStringToKey(final String urlString) throws URIException { + + String searchUrl = canonicalize(urlString); + + // TODO: force https into http for the moment... + if(searchUrl.startsWith("https://")) { + searchUrl = searchUrl.substring(8); + } + + // TODO: this will only work with http:// scheme. should work with all? 
+ // force add of scheme and possibly add '/' with empty path: + if (searchUrl.startsWith("http://")) { + if (-1 == searchUrl.indexOf('/', 8)) { + searchUrl = searchUrl + "/"; + } + } else { + if (-1 == searchUrl.indexOf("/")) { + searchUrl = searchUrl + "/"; + } + searchUrl = "http://" + searchUrl; + } + + // unescape anything that can be: + UURI tmpURI = UURIFactory.getInstance(searchUrl); + tmpURI.setPath(tmpURI.getPath()); + + + // convert to UURI to perform required URI fixup: + UURI searchURI = UURIFactory.getInstance(tmpURI.getURI()); + + + + + // replace ' ' with '+' (this is only to match Alexa's canonicalization) + String newPath = searchURI.getEscapedPath().replace("%20","+"); +// String newPath = searchURI.getPath().replace(' ','+'); + + // replace multiple consecutive '/'s in the path. + while(newPath.contains("//")) { + newPath = newPath.replace("//","/"); + } + + // this would remove a trailing '/' character, unless the path is empty + // but we're not going to do this just yet.. +// if((newPath.length() > 1) && newPath.endsWith("/")) { +// newPath = newPath.substring(0,newPath.length()-1); +// } +// searchURI.setEscapedPath(newPath); +// searchURI.setRawPath(newPath.toCharArray()); +// String query = searchURI.getEscapedQuery(); + + // TODO: handle non HTTP port stripping, too. 
+// String portStr = ""; +// if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { +// portStr = ":" + searchURI.getPort(); +// } +// return searchURI.getHostBasename() + portStr + +// searchURI.getEscapedPathQuery(); + + StringBuilder sb = new StringBuilder(searchUrl.length()); + sb.append(searchURI.getHostBasename()); + if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { + sb.append(":").append(searchURI.getPort()); + } + sb.append(newPath); + if(searchURI.getEscapedQuery() != null) { + sb.append("?").append(searchURI.getEscapedQuery()); + } + + + return sb.toString(); + } + + + /** + * Idempotent operation that will determine the 'fuzziest' + * form of the url argument. This operation is done prior to adding records + * to the ResourceIndex, and prior to lookup. Current version is exactly + * the default found in Heritrix. When the configuration system for + * Heritrix stabilizes, hopefully this can use the system directly within + * Heritrix. + * + * @param url to be canonicalized. + * @return canonicalized version of url argument. + */ + public String canonicalize(String url) { + url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url)); + url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url)); + url = url.toLowerCase(); + if (url == null || url.length() <= 0) { + return url; + } + + int index = url.lastIndexOf('?'); + if (index > 0) { + if (index == (url.length() - 1)) { + // '?' is last char in url. Strip it. 
+ url = url.substring(0, url.length() - 1); + } else if (url.charAt(index + 1) == '&') { + // Next char is '&'. Strip it. + if (url.length() == (index + 2)) { + // Then url ends with '?&'. Strip them. + url = url.substring(0, url.length() - 2); + } else { + // The '&' is redundant. Strip it. + url = url.substring(0, index + 1) + + url.substring(index + 2); + } + } else if (url.charAt(url.length() - 1) == '&') { + // If we have a lone '&' on end of query str, + // strip it. + url = url.substring(0, url.length() - 1); + } + } + return url; + } + + private static void USAGE() { + System.err.println("Usage: [-f FIELD] [-d DELIM]"); + System.exit(3); + } + /** + * @param args + */ + public static void main(String[] args) { + UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + int n = 0; + int i = 0; + ArrayList<Integer> columns = new ArrayList<Integer>(); + + long lineNumber = 0; + boolean cdxPassThru = false; + String delimiter = " "; + while(n < args.length) { + String arg = args[n]; + if(arg.compareTo("-cdx") == 0) { + cdxPassThru = true; + n++; + continue; + } + if(n == (args.length -1)) { + USAGE(); + } + String val = args[n+1]; + if(arg.compareTo("-f") == 0) { + columns.add(new Integer(val)); + } else if(arg.compareTo("-d") == 0) { + delimiter = val; + } else { + USAGE(); + } + n += 2; + } + // place default '0' in case none specified: + if(columns.size() == 0) { + columns.add(new Integer(1)); + } + + // convert to int[]: + int[] cols = new int[columns.size()]; + for(int idx = 0; idx < columns.size(); idx++) { + cols[idx] = columns.get(idx).intValue() - 1; + } + BufferedReader r = new BufferedReader(new InputStreamReader(System.in)); + StringBuilder sb = new StringBuilder(); + String line = null; + + while(true) { + try { + line = r.readLine(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + if(line == null) { + break; + } + lineNumber++; + if(cdxPassThru && line.startsWith(CDX_PREFIX)) { + System.out.println(line); + continue; + 
} + String parts[] = line.split(delimiter); + for(int column : cols) { + if(column >= parts.length) { + System.err.println("Invalid line " + lineNumber + " (" + + line + ") skipped"); + } else { + try { + parts[column] = canonicalizer.urlStringToKey(parts[column]); + } catch (URIException e) { + System.err.println("Invalid URL in line " + lineNumber + " (" + + line + ") skipped (" + parts[column] + ")"); + e.printStackTrace(); + continue; + } + } + } + sb.setLength(0); + for(i = 0; i < parts.length; i++) { + sb.append(parts[i]); + if(i < (parts.length-1)) { + sb.append(delimiter); + } + } + System.out.println(sb.toString()); + } + } +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |