You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2011-03-09 05:54:03
|
Revision: 3428 http://archive-access.svn.sourceforge.net/archive-access/?rev=3428&view=rev Author: bradtofel Date: 2011-03-09 05:53:57 +0000 (Wed, 09 Mar 2011) Log Message: ----------- FEATURE: now sets Thread.name to something hopefully helpful when debugging via jstack Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-03-09 05:51:36 UTC (rev 3427) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-03-09 05:53:57 UTC (rev 3428) @@ -175,6 +175,11 @@ boolean handled = false; try { + String inputPath = translateRequestPathQuery(httpRequest); + Thread.currentThread().setName("Thread " + + Thread.currentThread().getId() + " " + getBeanName() + + " handling: " + inputPath); + wbRequest = getParser().parse(httpRequest, this); if(wbRequest != null) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-03-09 05:51:36 UTC (rev 3427) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-03-09 05:53:57 UTC (rev 3428) @@ -77,6 +77,10 @@ wbRequest.setRequestUrl(urlString); URL url = null; try { + Thread.currentThread().setName("Thread " + + Thread.currentThread().getId() + " " + 
getBeanName() + + " handling: " + urlString); + try { url = new URL(urlString); } catch(MalformedURLException e) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-03-09 05:51:42
|
Revision: 3427 http://archive-access.svn.sourceforge.net/archive-access/?rev=3427&view=rev Author: bradtofel Date: 2011-03-09 05:51:36 +0000 (Wed, 09 Mar 2011) Log Message: ----------- FEATURE: resets thread name after handling a response, in case it was changed Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java 2011-03-09 05:50:11 UTC (rev 3426) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/RequestFilter.java 2011-03-09 05:51:36 UTC (rev 3427) @@ -103,12 +103,16 @@ public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, ServletException { boolean handled = false; - - if (request instanceof HttpServletRequest) { - if (response instanceof HttpServletResponse) { - handled = mapper.handleRequest((HttpServletRequest) request, - (HttpServletResponse) response); + String origThreadName = Thread.currentThread().getName(); + try { + if (request instanceof HttpServletRequest) { + if (response instanceof HttpServletResponse) { + handled = mapper.handleRequest((HttpServletRequest) request, + (HttpServletResponse) response); + } } + } finally { + Thread.currentThread().setName(origThreadName); } if (!handled) { chain.doFilter(request, response); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3426 http://archive-access.svn.sourceforge.net/archive-access/?rev=3426&view=rev Author: bradtofel Date: 2011-03-09 05:50:11 +0000 (Wed, 09 Mar 2011) Log Message: ----------- LOGGING: added extra log message about adding a block to be searched, added -debug command line option to force-enable logging in a few related classes Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-03-09 05:48:30 UTC (rev 3425) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2011-03-09 05:50:11 UTC (rev 3426) @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import java.util.logging.Level; import java.util.logging.Logger; import org.archive.wayback.core.CaptureSearchResult; @@ -163,6 +164,7 @@ // add this and keep lookin... 
String url = chunkMap.get(parts[1]); long offset = Long.parseLong(parts[2]); + LOGGER.info("Adding block source(" + parts[1] + "):" + offset); blocks.add(new ZiplinedBlock(url, offset)); } } finally { @@ -298,6 +300,14 @@ System.exit(1); } + } else if(args[idx].equals("-debug")) { + Logger.getLogger( + ZiplinesSearchResultSource.class.getName()).setLevel(Level.ALL); + Logger.getLogger( + ZiplinesChunkIterator.class.getName()).setLevel(Level.ALL); + Logger.getLogger( + ZiplinedBlock.class.getName()).setLevel(Level.ALL); + } else { break; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3425 http://archive-access.svn.sourceforge.net/archive-access/?rev=3425&view=rev Author: bradtofel Date: 2011-03-09 05:48:30 +0000 (Wed, 09 Mar 2011) Log Message: ----------- LOGGING: changed max record warning to be more sensible/correct Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/GuardRailFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/GuardRailFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/GuardRailFilter.java 2011-03-09 05:47:15 UTC (rev 3424) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/GuardRailFilter.java 2011-03-09 05:48:30 UTC (rev 3425) @@ -51,7 +51,7 @@ public int filterObject(CaptureSearchResult r) { recordsScanned++; if(recordsScanned > maxRecordsToScan) { - LOGGER.warning("Hit max requests on " + r.getUrlKey() + " " + LOGGER.warning("Hit max results on " + r.getUrlKey() + " " + r.getCaptureTimestamp()); return FILTER_ABORT; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-03-09 05:47:21
|
Revision: 3424 http://archive-access.svn.sourceforge.net/archive-access/?rev=3424&view=rev Author: bradtofel Date: 2011-03-09 05:47:15 +0000 (Wed, 09 Mar 2011) Log Message: ----------- FEATURE: now throws LiveWebTimeoutException if there's a SocketTimeout Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2011-03-09 05:44:56 UTC (rev 3423) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2011-03-09 05:47:15 UTC (rev 3424) @@ -25,6 +25,7 @@ import org.archive.wayback.core.Resource; import org.archive.wayback.exception.LiveDocumentNotAvailableException; import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; /** @@ -53,7 +54,7 @@ */ public Resource getCachedResource(URL url, long maxCacheMS, boolean bUseOlder) throws LiveDocumentNotAvailableException, - LiveWebCacheUnavailableException, IOException; + LiveWebCacheUnavailableException, LiveWebTimeoutException, IOException; /** * closes all resources */ Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java 2011-03-09 05:44:56 UTC (rev 3423) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java 2011-03-09 05:47:15 UTC (rev 3424) @@ -33,10 +33,12 @@ import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.params.HttpClientParams; import org.archive.io.arc.ARCRecord; import org.archive.wayback.core.Resource; import org.archive.wayback.exception.LiveDocumentNotAvailableException; import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.resourcefile.ArcResource; import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; @@ -59,7 +61,9 @@ public RemoteLiveWebCache() { connectionManager = new MultiThreadedHttpConnectionManager(); hostConfiguration = new HostConfiguration(); - http = new HttpClient(connectionManager); + HttpClientParams params = new HttpClientParams(); + params.setParameter(HttpClientParams.RETRY_HANDLER, new NoRetryHandler()); + http = new HttpClient(params,connectionManager); http.setHostConfiguration(hostConfiguration); } @@ -68,7 +72,7 @@ */ public Resource getCachedResource(URL url, long maxCacheMS, boolean bUseOlder) throws LiveDocumentNotAvailableException, - LiveWebCacheUnavailableException, IOException { + LiveWebCacheUnavailableException, LiveWebTimeoutException, IOException { String urlString = url.toExternalForm(); HttpMethod method = null; try { @@ -102,7 +106,7 @@ throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + " : " + urlString); } catch (SocketTimeoutException e) { - throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + throw new LiveWebTimeoutException(e.getLocalizedMessage() + " : " + urlString); } catch(ConnectTimeoutException 
e) { throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-03-09 05:45:02
|
Revision: 3423 http://archive-access.svn.sourceforge.net/archive-access/?rev=3423&view=rev Author: bradtofel Date: 2011-03-09 05:44:56 +0000 (Wed, 09 Mar 2011) Log Message: ----------- BUGFIXES(unreported): Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2011-03-09 05:41:43 UTC (rev 3422) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2011-03-09 05:44:56 UTC (rev 3423) @@ -24,6 +24,8 @@ import java.io.IOException; import java.io.InputStream; import java.net.ConnectException; +import java.net.NoRouteToHostException; +import java.net.SocketException; import java.net.UnknownHostException; import java.util.Date; import java.util.logging.Logger; @@ -34,10 +36,13 @@ import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpConnection; import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.HttpMethodRetryHandler; import org.apache.commons.httpclient.HttpState; import org.apache.commons.httpclient.SimpleHttpConnectionManager; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.cookie.CookiePolicy; +import org.apache.commons.httpclient.params.HttpClientParams; import org.apache.commons.io.IOUtils; import org.archive.httpclient.HttpRecorderGetMethod; import org.archive.io.RecordingInputStream; @@ -76,16 +81,18 @@ private int inBufferSize = 1024 * 100; // private int outBufferSize = 10; // private int inBufferSize = 100; + private final static 
HttpMethodRetryHandler noRetryHandler = + new NoRetryHandler(); private final ThreadLocal<HttpClient> tl = new ThreadLocal<HttpClient>() { protected synchronized HttpClient initialValue() { - HttpClient http = new HttpClient(); + HttpClientParams params = new HttpClientParams(); + params.setParameter(HttpClientParams.RETRY_HANDLER, noRetryHandler); IPHttpConnectionManager manager = new IPHttpConnectionManager(); manager.getParams().setConnectionTimeout(connectionTimeoutMS); manager.getParams().setSoTimeout(socketTimeoutMS); - http.setHttpConnectionManager(manager); - return http; + return new HttpClient(params, manager); } }; @@ -134,9 +141,11 @@ getMethod.setRequestHeader("User-Agent", userAgent); int code = client.executeMethod(getMethod); LOGGER.info("URL(" + url + ") HTTP:" + code); - InputStream responseIS = getMethod.getResponseBodyAsStream(); - ByteOp.discardStream(responseIS); - responseIS.close(); + InputStream responseIS = getMethod.getResponseBodyAsStream(); + if(responseIS != null) { + ByteOp.discardStream(responseIS); + responseIS.close(); + } gotUrl = true; } catch (URIException e) { @@ -148,7 +157,11 @@ // LOGGER.warning("Timeout out connecting to " + url); } catch (ConnectException e) { LOGGER.warning("ConnectionRefused to " + url); - + } catch (NoRouteToHostException e) { + LOGGER.warning("NoRouteToHost for " + url); + } catch (SocketException e) { + // should only be things like "Connection Reset", etc.. + LOGGER.warning("SocketException for " + url); } catch (HttpException e) { e.printStackTrace(); // we have to let IOExceptions out, problems caused by local disk This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-03-09 05:41:49
|
Revision: 3422 http://archive-access.svn.sourceforge.net/archive-access/?rev=3422&view=rev Author: bradtofel Date: 2011-03-09 05:41:43 +0000 (Wed, 09 Mar 2011) Log Message: ----------- INITIAL REV: RetryHandler that never retries Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/NoRetryHandler.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/NoRetryHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/NoRetryHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/NoRetryHandler.java 2011-03-09 05:41:43 UTC (rev 3422) @@ -0,0 +1,32 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.liveweb; + +import java.io.IOException; + +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.HttpMethodRetryHandler; + +public class NoRetryHandler implements HttpMethodRetryHandler { + + public boolean retryMethod(HttpMethod arg0, IOException arg1, int arg2) { + return false; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-03-09 05:39:11
|
Revision: 3421 http://archive-access.svn.sourceforge.net/archive-access/?rev=3421&view=rev Author: bradtofel Date: 2011-03-09 05:39:05 +0000 (Wed, 09 Mar 2011) Log Message: ----------- COMMENT: added license Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/LiveWebTimeoutException.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotNotAvailableException.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotTimedOutAccessControlException.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/LiveWebTimeoutException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/LiveWebTimeoutException.java 2011-03-09 05:37:24 UTC (rev 3420) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/LiveWebTimeoutException.java 2011-03-09 05:39:05 UTC (rev 3421) @@ -1,3 +1,22 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.archive.wayback.exception; import javax.servlet.http.HttpServletResponse; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotNotAvailableException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotNotAvailableException.java 2011-03-09 05:37:24 UTC (rev 3420) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotNotAvailableException.java 2011-03-09 05:39:05 UTC (rev 3421) @@ -1,3 +1,22 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.archive.wayback.exception; import javax.servlet.http.HttpServletResponse; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotTimedOutAccessControlException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotTimedOutAccessControlException.java 2011-03-09 05:37:24 UTC (rev 3420) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotTimedOutAccessControlException.java 2011-03-09 05:39:05 UTC (rev 3421) @@ -1,3 +1,22 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.archive.wayback.exception; import javax.servlet.http.HttpServletResponse; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-03-09 05:37:30
|
Revision: 3420 http://archive-access.svn.sourceforge.net/archive-access/?rev=3420&view=rev Author: bradtofel Date: 2011-03-09 05:37:24 +0000 (Wed, 09 Mar 2011) Log Message: ----------- INITIAL REV: additional Exception classes to specify some timeout and robot not available conditions Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/LiveWebTimeoutException.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotNotAvailableException.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotTimedOutAccessControlException.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/LiveWebTimeoutException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/LiveWebTimeoutException.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/LiveWebTimeoutException.java 2011-03-09 05:37:24 UTC (rev 3420) @@ -0,0 +1,16 @@ +package org.archive.wayback.exception; + +import javax.servlet.http.HttpServletResponse; + +public class LiveWebTimeoutException extends WaybackException { + + public LiveWebTimeoutException(String message) { + super(message); + } + /** + * @return the HTTP status code appropriate to this exception class. 
+ */ + public int getStatus() { + return HttpServletResponse.SC_SERVICE_UNAVAILABLE; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotNotAvailableException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotNotAvailableException.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotNotAvailableException.java 2011-03-09 05:37:24 UTC (rev 3420) @@ -0,0 +1,19 @@ +package org.archive.wayback.exception; + +import javax.servlet.http.HttpServletResponse; + +public class RobotNotAvailableException extends AccessControlException { + protected static final String ID = "accessWebNotAvailable"; + + public RobotNotAvailableException(String message) { + super(message); + id = ID; + } + /** + * @return the HTTP status code appropriate to this exception class. 
+ */ + public int getStatus() { + return HttpServletResponse.SC_SERVICE_UNAVAILABLE; + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotTimedOutAccessControlException.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotTimedOutAccessControlException.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/RobotTimedOutAccessControlException.java 2011-03-09 05:37:24 UTC (rev 3420) @@ -0,0 +1,21 @@ +package org.archive.wayback.exception; + +import javax.servlet.http.HttpServletResponse; + +public class RobotTimedOutAccessControlException extends AccessControlException { + protected static final String ID = "accessRobotTimeout"; + + /** + * @param message + */ + public RobotTimedOutAccessControlException(String message) { + super("Robot.txt timed out",message); + id = ID; + } + /** + * @return the HTTP status code appropriate to this exception class. + */ + public int getStatus() { + return HttpServletResponse.SC_SERVICE_UNAVAILABLE; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3419 http://archive-access.svn.sourceforge.net/archive-access/?rev=3419&view=rev Author: bradtofel Date: 2011-03-09 05:35:36 +0000 (Wed, 09 Mar 2011) Log Message: ----------- BUGFIX: (unreported) wasn't checking for null WaybackRequest that caused a NPE Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/BaseExceptionRenderer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/BaseExceptionRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/BaseExceptionRenderer.java 2011-02-28 23:55:04 UTC (rev 3418) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/exception/BaseExceptionRenderer.java 2011-03-09 05:35:36 UTC (rev 3419) @@ -69,6 +69,9 @@ protected boolean requestIsImage(HttpServletRequest httpRequest, WaybackRequest wbRequest) { + if (wbRequest == null) { + return false; + } if(wbRequest.isIMGContext()) { return true; } @@ -81,6 +84,9 @@ protected boolean requestIsJavascript(HttpServletRequest httpRequest, WaybackRequest wbRequest) { + if (wbRequest == null) { + return false; + } if(wbRequest.isJSContext()) { return true; } @@ -90,6 +96,9 @@ protected boolean requestIsCSS(HttpServletRequest httpRequest, WaybackRequest wbRequest) { + if (wbRequest == null) { + return false; + } if(wbRequest.isCSSContext()) { return true; } @@ -105,7 +114,7 @@ httpRequest.setAttribute("exception", exception); UIResults uiResults = new UIResults(wbRequest,uriConverter,exception); boolean handled = false; - if(!wbRequest.isReplayRequest()) { + if((wbRequest != null) && !wbRequest.isReplayRequest()) { if(wbRequest.isXMLMode()) { uiResults.forward(httpRequest, httpResponse, xmlErrorJsp); This was sent by the SourceForge.net collaborative development platform, the world's largest 
Open Source development site. |
From: <bra...@us...> - 2011-02-28 23:55:10
|
Revision: 3418 http://archive-access.svn.sourceforge.net/archive-access/?rev=3418&view=rev Author: bradtofel Date: 2011-02-28 23:55:04 +0000 (Mon, 28 Feb 2011) Log Message: ----------- Added dedupeRecords:true property to LocalResourceIndex as default Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml 2011-02-17 22:35:47 UTC (rev 3417) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/BDBCollection.xml 2011-02-28 23:55:04 UTC (rev 3418) @@ -65,6 +65,7 @@ </bean> </property> <property name="maxRecords" value="100000000" /> + <property name="dedupeRecords" value="true" /> </bean> <!-- Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml 2011-02-17 22:35:47 UTC (rev 3417) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/CDXCollection.xml 2011-02-28 23:55:04 UTC (rev 3418) @@ -67,6 +67,7 @@ --> </property> <property name="maxRecords" value="10000" /> + <property name="dedupeRecords" value="true" /> </bean> </property> </bean> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: Aaron B. <aa...@ar...> - 2011-02-18 06:33:15
|
An odd mailing list to reply to.....yet there was much rejoicing. Aaron |
From: Gordon M. <go...@ar...> - 2011-02-18 01:27:45
|
Re: MultiMemberGZIPInputStream As of a couple days ago, Heritrix TRUNK has a GZIPMembersInputStream, that offers not just reads across member-boundaries, but... • behaves the same pre and post JDK6u23! • gives calling-code option of stop-on-member-end or read-straight-thru! • reports member start,end offsets in compressed stream! • offers skip/seek in compressed-offsets! ...and that's not all! It's also got: • comments! • tests! • general compliance with our coding standards (like braces around 1-line if/then/else clauses)! Check it out. - Gordon On 2/17/11 2:35 PM, bra...@us... wrote: > Revision: 3417 > http://archive-access.svn.sourceforge.net/archive-access/?rev=3417&view=rev > Author: bradtofel > Date: 2011-02-17 22:35:47 +0000 (Thu, 17 Feb 2011) > > Log Message: > ----------- > Initial checkin of early gzip line dereferencing CDX processing code. > > Modified Paths: > -------------- > trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java > trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java > trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java > trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java > > Added Paths: > ----------- > trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingInputFormat.java > trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java > trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportJob.java > trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportMapper.java > 
trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/MultiMemberGZIPInputStream.java > > Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java 2011-02-09 22:04:47 UTC (rev 3416) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -100,7 +100,9 @@ > private void mapFull(Object y, Text value, Context context) > throws IOException, InterruptedException { > String s = value.toString(); > - > + if(s.startsWith(" CDX ")) { > + return; > + } > boolean problems = true; > i1 = s.indexOf(delim); > if(i1> 0) { > > Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java 2011-02-09 22:04:47 UTC (rev 3416) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -69,6 +69,10 @@ > System.out.println("cdxsort [OPTIONS]<split> <input> <output>"); > System.out.println("\tOPTIONS can be:"); > System.out.println("\t\t-m NUM - try to run with approximately NUM map tasks"); > + System.out.println("\t\t--compressed-input - assume input is compressed, even without .gz suffix"); > + System.out.println("\t\t--gzip-range - assume input lines are PATH START LENGTH such that a"); > + System.out.println("\t\t\t valid gzip record exists in PATH between START and START+LENGTH"); > + 
System.out.println("\t\t\t that contains the records to process"); > System.out.println("\t\t--compress-output - compress output files with GZip"); > System.out.println("\t\t--delimiter DELIM - assume DELIM delimter for input and output, instead of default<SPACE>"); > System.out.println("\t\t--map-global - use the GLOBAL CDX map function, which implies:"); > @@ -93,6 +97,8 @@ > > long desiredMaps = 10; > boolean compressOutput = false; > + boolean compressedInput = false; > + boolean gzipRange = false; > List<String> otherArgs = new ArrayList<String>(); > int mapMode = CDXCanonicalizingMapper.MODE_FULL; > for (int i = 0; i< args.length; ++i) { > @@ -101,6 +107,10 @@ > desiredMaps = Integer.parseInt(args[++i]); > } else if ("--compress-output".equals(args[i])) { > compressOutput = true; > + } else if ("--compressed-input".equals(args[i])) { > + compressedInput = true; > + } else if ("--gzip-range".equals(args[i])) { > + gzipRange = true; > } else if ("--delimiter".equals(args[i])) { > delim = args[++i]; > } else if ("--map-full".equals(args[i])) { > @@ -175,8 +185,14 @@ > > FileInputFormat.addInputPath(job, inputPath); > FileInputFormat.setMaxInputSplitSize(job, bytesPerMap); > - job.setInputFormatClass(LineDereferencingInputFormat.class); > - > + if(gzipRange) { > + job.setInputFormatClass(GZIPRangeLineDereferencingInputFormat.class); > + } else { > + job.setInputFormatClass(LineDereferencingInputFormat.class); > + if(compressedInput) { > + LineDereferencingRecordReader.forceCompressed(conf); > + } > + } > FileOutputFormat.setOutputPath(job, outputPath); > > return (job.waitForCompletion(true) ? 
0 : 1); > > Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingInputFormat.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingInputFormat.java (rev 0) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingInputFormat.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -0,0 +1,17 @@ > +package org.archive.wayback.hadoop; > + > +import java.io.IOException; > + > +import org.apache.hadoop.io.Text; > +import org.apache.hadoop.mapreduce.InputSplit; > +import org.apache.hadoop.mapreduce.RecordReader; > +import org.apache.hadoop.mapreduce.TaskAttemptContext; > + > +public class GZIPRangeLineDereferencingInputFormat extends LineDereferencingInputFormat { > + @Override > + public RecordReader<Text, Text> createRecordReader(InputSplit split, > + TaskAttemptContext context) throws IOException, > + InterruptedException { > + return new GZIPRangeLineDereferencingRecordReader(); > + } > +} > > Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java (rev 0) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -0,0 +1,85 @@ > +package org.archive.wayback.hadoop; > + > +import java.io.BufferedReader; > +import java.io.ByteArrayInputStream; > +import java.io.IOException; > +import java.io.InputStream; > +import java.io.InputStreamReader; > 
+import java.util.zip.GZIPInputStream; > + > +import org.apache.hadoop.fs.FSDataInputStream; > +import org.apache.hadoop.fs.Path; > +import org.apache.hadoop.io.Text; > + > +public class GZIPRangeLineDereferencingRecordReader extends LineDereferencingRecordReader{ > + String curInputLine = null; > + FSDataInputStream fsdis = null; > + long curStart = 0; > + byte[] buffer = null; > + @Override > + public boolean nextKeyValue() throws IOException, InterruptedException { > + if(key == null) { > + key = new Text(); > + } > + if(value == null) { > + value = new Text(); > + } > + while(true) { > + if(curReader == null) { > + // are there more? > + if(internal.nextKeyValue()) { > + progress = internal.getProgress(); > + curInputLine = internal.getCurrentValue().toString(); > + String[] parts = curInputLine.split(" "); > + if(parts.length != 3) { > + throw new IOException("Bad format line(" + curInputLine +")"); > + } > + String newFile = parts[0]; > + if(fsdis != null) { > + if(!newFile.equals(curFile)) { > + // close old and open new, otherwise we can just > + // do another read on the current one: > + fsdis.close(); > + curFile = newFile; > + Path path = new Path(curFile); > + fsdis = fileSystem.open(path); > + } > + } else { > + curFile = newFile; > + Path path = new Path(curFile); > + fsdis = fileSystem.open(path); > + } > + curFile = parts[0]; > + curStart = Long.parseLong(parts[1]); > + int length = Integer.parseInt(parts[2]); > + if(buffer == null) { > + buffer = new byte[length]; > + } else if (buffer.length< length) { > + buffer = new byte[length]; > + } > + fsdis.read(curStart,buffer,0,length); > + // the whole chunk is now in buffer: > + InputStream is = > + new GZIPInputStream(new ByteArrayInputStream(buffer,0,length)); > + curReader = new BufferedReader(new InputStreamReader(is)); > + curLine = 0; > + > + } else { > + // all done: > + return false; > + } > + } > + // try to read another line: > + String nextLine = curReader.readLine(); > + if(nextLine != 
null) { > + key.set(curFile+":"+curStart+":"+curLine); > + value.set(nextLine); > + curLine++; > + return true; > + } > + curReader.close(); > + curReader = null; > + } > + } > + > +} > > Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportJob.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportJob.java (rev 0) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportJob.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -0,0 +1,81 @@ > +package org.archive.wayback.hadoop; > + > +import org.apache.hadoop.conf.Configuration; > +import org.apache.hadoop.conf.Configured; > +import org.apache.hadoop.fs.FileStatus; > +import org.apache.hadoop.fs.FileSystem; > +import org.apache.hadoop.fs.Path; > +import org.apache.hadoop.io.Text; > +import org.apache.hadoop.mapred.MapRunner; > +import org.apache.hadoop.mapreduce.Job; > +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; > +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; > +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; > +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; > +import org.apache.hadoop.util.Tool; > +import org.apache.hadoop.util.ToolRunner; > + > +public class HTTPImportJob extends Configured implements Tool { > + Configuration conf = null; > + public final static String HTTP_IMPORT_TARGET = "http-import.target"; > + public Configuration getConf() { > + return conf; > + } > + > + public void setConf(Configuration conf) { > + this.conf = conf; > + > + } > + > + > + > + public int run(String[] args) throws Exception { > + Job job = new Job(getConf(), "http-import"); > + Configuration conf = job.getConfiguration(); > + job.setJarByClass(HTTPImportJob.class); > + 
job.setInputFormatClass(TextInputFormat.class); > + job.setOutputFormatClass(TextOutputFormat.class); > + job.setOutputKeyClass(Text.class); > + job.setOutputValueClass(Text.class); > + job.setMapperClass(HTTPImportMapper.class); > + > + int i = 0; > + int numMaps = 10; > + while(i< args.length -1) { > + if(args[i].equals("-m")) { > + i++; > + numMaps = Integer.parseInt(args[i]); > + i++; > + } else { > + break; > + } > + } > + if(args.length - 3 != i) { > + throw new IllegalArgumentException("wrong number of args..."); > + } > + Path inputPath = new Path(args[i]); > + Path outputPath = new Path(args[i+1]); > + Path targetPath = new Path(args[i+2]); > + > + TextInputFormat.addInputPath(job, inputPath); > + FileOutputFormat.setOutputPath(job, outputPath); > + conf.set(HTTP_IMPORT_TARGET, targetPath.toString()); > + > + conf.setBoolean("mapred.map.tasks.speculative.execution", false); > + > + FileSystem fs = inputPath.getFileSystem(conf); > + FileStatus inputStatus = fs.getFileStatus(inputPath); > + long inputLen = inputStatus.getLen(); > + long bytesPerMap = (int) inputLen / numMaps; > + > + FileInputFormat.setMaxInputSplitSize(job, bytesPerMap); > + > + > + return (job.waitForCompletion(true) ? 
0 : 1); > + } > + public static void main(String[] args) throws Exception { > + int res = ToolRunner.run(new Configuration(), new HTTPImportJob(), args); > + System.exit(res); > + } > + > +} > > Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportMapper.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportMapper.java (rev 0) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportMapper.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -0,0 +1,145 @@ > +package org.archive.wayback.hadoop; > + > +import java.io.FileNotFoundException; > +import java.io.IOException; > +import java.io.InputStream; > +import java.net.URL; > + > +import org.apache.commons.httpclient.HttpClient; > +import org.apache.commons.httpclient.methods.GetMethod; > +import org.apache.commons.httpclient.methods.HeadMethod; > +import org.apache.hadoop.conf.Configuration; > +import org.apache.hadoop.fs.FSDataOutputStream; > +import org.apache.hadoop.fs.FileStatus; > +import org.apache.hadoop.fs.FileSystem; > +import org.apache.hadoop.fs.Path; > +import org.apache.hadoop.io.LongWritable; > +import org.apache.hadoop.io.Text; > +import org.apache.hadoop.mapreduce.Mapper; > +import org.apache.hadoop.mapreduce.Mapper.Context; > + > +public class HTTPImportMapper extends Mapper<LongWritable, Text, Text, Text> { > + public final int BUFSIZ = 4096; > + Path target = null; > + FileSystem filesystem = null; > + Text doneText = null; > + HttpClient client = null; > + public HTTPImportMapper() { > + > + } > + public void init2() { > + System.err.println("Init map..."); > + } > + @Override > + protected void setup(Context context) throws IOException, > + InterruptedException { > + super.setup(context); > + Configuration conf = context.getConfiguration(); > + 
String targetString = conf.get(HTTPImportJob.HTTP_IMPORT_TARGET); > + if(targetString == null) { > + throw new IOException("No " + HTTPImportJob.HTTP_IMPORT_TARGET > + + " specified"); > + } > + target = new Path(targetString); > + filesystem = target.getFileSystem(conf); > + doneText = new Text("Done"); > + client = new HttpClient(); > + } > + > + @Override > + protected void map(LongWritable key, Text value, Context context) > + throws IOException, InterruptedException { > + > + String valueS = value.toString(); > + String name; > + String url = valueS; > + int idx = valueS.indexOf(' '); > + if(idx == -1) { > + URL tmpUrl = new URL(valueS); > + name = tmpUrl.getPath(); > + if(name.contains("/")) { > + name = name.substring(name.lastIndexOf('/')+1); > + } > + } else { > + name = valueS.substring(0,idx); > + url = valueS.substring(idx+1); > + } > + Path thisTarget = new Path(target,name); > + doCopy(url, thisTarget); > + context.write(value, doneText); > + } > + > + private long getURLLengthByHead(String url) throws IOException { > + HeadMethod head = new HeadMethod(url); > + long urlLen = -1; > + // execute the method and handle any error responses. > + try { > + int code = client.executeMethod(head); > + if(code != 200) { > + throw new IOException("Non-200 for HEAD:" + url); > + } > + urlLen = head.getResponseContentLength(); > + // discard: hope it's really empty (HEAD) and thus small... > + head.getResponseBody(); > + } finally { > + head.releaseConnection(); > + } > + return urlLen; > + } > + > + private long getPathLength(Path path) throws IOException { > + FileStatus stat = null; > + try { > + stat = filesystem.getFileStatus(path); > + // present.. 
check by size: > + } catch (FileNotFoundException e) { > + return -1; > + } > + return stat.getLen(); > + } > + > + > + private void doCopy(String url, Path target) throws IOException { > + // Check if the target exists (from previous map) > + long targetLen = getPathLength(target); > + long urlLen = -1; > + if(targetLen> -1) { > + // there's a file in the filesystem already, see if it's the > + // same length: > + urlLen = getURLLengthByHead(url); > + if(urlLen == targetLen) { > + // same size, assume it's done: > + return; > + } > + // diff length, do copy again, first remove old: > + if(!filesystem.delete(target, false)) { > + throw new IOException("Failed to delete old copy"); > + } > + } > + // do the copy: > + FSDataOutputStream out = filesystem.create(target, false); > + GetMethod get = new GetMethod(url); > + long copied = 0; > + try { > + int code = client.executeMethod(get); > + if(code != 200) { > + throw new IOException("Non 200 on GET: " + url); > + } > + urlLen = get.getResponseContentLength(); > + InputStream in = get.getResponseBodyAsStream(); > + byte buffer[] = new byte[BUFSIZ]; > + for(int cbread; (cbread = in.read(buffer))>= 0; ) { > + out.write(buffer, 0, cbread); > + copied += cbread; > + } > + } finally { > + get.releaseConnection(); > + out.close(); > + } > + if(copied != urlLen) { > + // ack.. what went wrong? 
> + throw new IOException("Wrong copy length want(" + urlLen > + + ") got(" + copied + ") URL:" + url); > + } > + } > +} > > Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java 2011-02-09 22:04:47 UTC (rev 3416) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -25,6 +25,7 @@ > import java.io.InputStreamReader; > import java.util.zip.GZIPInputStream; > > +import org.apache.hadoop.conf.Configuration; > import org.apache.hadoop.fs.FileSystem; > import org.apache.hadoop.fs.Path; > import org.apache.hadoop.io.Text; > @@ -45,6 +46,9 @@ > public class LineDereferencingRecordReader extends RecordReader<Text, Text>{ > LineRecordReader internal = new LineRecordReader(); > > + > + protected static final String FORCE_COMPRESSED_FLAG = "line-reref.force-compressed"; > + > FileSystem fileSystem = null; > Text key = null; > Text value = null; > @@ -52,11 +56,18 @@ > String curFile = null; > long curLine = 0; > float progress = 0.0f; > + boolean forceCompressed = false; > + public static void forceCompressed(Configuration conf) { > + conf.setBoolean(FORCE_COMPRESSED_FLAG, true); > + } > + > @Override > public void initialize(InputSplit split, TaskAttemptContext context) > throws IOException, InterruptedException { > + Configuration conf = context.getConfiguration(); > + forceCompressed = conf.getBoolean(FORCE_COMPRESSED_FLAG, false); > FileSplit fileSplit = (FileSplit) split; > - fileSystem = fileSplit.getPath().getFileSystem(context.getConfiguration()); > + fileSystem = fileSplit.getPath().getFileSystem(conf); > internal.initialize(split, context); > } > 
> @@ -77,8 +88,9 @@ > Path path = new Path(curFile); > InputStream is = fileSystem.open(path); > // TODO: use the real Codec stuff.. > - if(curFile.endsWith(".gz")) { > - is = new GZIPInputStream(is); > + if(forceCompressed || curFile.endsWith(".gz")) { > +// is = new GZIPInputStream(is); > + is = new MultiMemberGZIPInputStream(is); > } > curReader = new BufferedReader(new InputStreamReader(is)); > > > Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/MultiMemberGZIPInputStream.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/MultiMemberGZIPInputStream.java (rev 0) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/MultiMemberGZIPInputStream.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -0,0 +1,96 @@ > +package org.archive.wayback.hadoop; > + > +import java.io.InputStream; > +import java.io.PushbackInputStream; > +import java.io.IOException; > +import java.util.zip.GZIPInputStream; > + > +public class MultiMemberGZIPInputStream extends GZIPInputStream { > + > + public MultiMemberGZIPInputStream(InputStream in, int size) throws IOException > + { > + // Wrap the stream in a PushbackInputStream... > + super(new PushbackInputStream(in, size), size); > + this.size=size; > + } > + > + public MultiMemberGZIPInputStream(InputStream in) throws IOException > + { > + // Wrap the stream in a PushbackInputStream... > + super(new PushbackInputStream(in, 1024)); > + this.size=-1; > + } > + > + private MultiMemberGZIPInputStream(MultiMemberGZIPInputStream parent) throws IOException > + { > + super(parent.in); > + this.size=-1; > + this.parent=parent.parent==null ? 
parent : parent.parent; > + this.parent.child=this; > + } > + > + private MultiMemberGZIPInputStream(MultiMemberGZIPInputStream parent, int size) throws IOException > + { > + super(parent.in, size); > + this.size=size; > + this.parent=parent.parent==null ? parent : parent.parent; > + this.parent.child=this; > + } > + > + private MultiMemberGZIPInputStream parent; > + private MultiMemberGZIPInputStream child; > + private int size; > + private boolean eos; > + > + public int read(byte[] inputBuffer, int inputBufferOffset, int inputBufferLen) throws IOException { > + > + if (eos) { return -1;} > + if (this.child!=null) > + return this.child.read(inputBuffer, inputBufferOffset, inputBufferLen); > + > + int charsRead=super.read(inputBuffer, inputBufferOffset, inputBufferLen); > + if (charsRead==-1) > + { > + // Push any remaining buffered data back onto the stream > + // If the stream is then not empty, use it to construct > + // a new instance of this class and delegate this and any > + // future calls to it... > + int n = inf.getRemaining() - 8; > + if (n> 0) > + { > + // More than 8 bytes remaining in deflater > + // First 8 are gzip trailer. Add the rest to > + // any un-read data... > + ((PushbackInputStream)this.in).unread(buf, len-n, n); > + } > + else > + { > + // Nothing in the buffer. We need to know whether or not > + // there is unread data available in the underlying stream > + // since the base class will not handle an empty file. > + // Read a byte to see if there is data and if so, > + // push it back onto the stream... 
> + byte[] b=new byte[1]; > + int ret=in.read(b,0,1); > + if (ret==-1) > + { > + eos=true; > + return -1; > + } > + else > + ((PushbackInputStream)this.in).unread(b, 0, 1); > + } > + > + MultiMemberGZIPInputStream child; > + if (this.size==-1) > + child=new MultiMemberGZIPInputStream(this); > + else > + child=new MultiMemberGZIPInputStream(this, this.size); > + return child.read(inputBuffer, inputBufferOffset, inputBufferLen); > + } > + else > + return charsRead; > + } > + > +} > + > > Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java > =================================================================== > --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java 2011-02-09 22:04:47 UTC (rev 3416) > +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java 2011-02-17 22:35:47 UTC (rev 3417) > @@ -32,6 +32,8 @@ > try { > pgd.addClass("cdxsort", CDXSortDriver.class, > "A map/reduce program that canonicalizes and provides a total order sort into multiple CDX files"); > + pgd.addClass("http-import", HTTPImportJob.class, > + "A map/reduce program that imports a bunch of URLs into an HDFS directory"); > pgd.driver(args); > // Success > exitCode = 0; > > > This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. > > ------------------------------------------------------------------------------ > The ultimate all-in-one performance toolkit: Intel(R) Parallel Studio XE: > Pinpoint memory and threading errors before they happen. > Find and fix more than 250 security defects in the development cycle. > Locate bottlenecks in serial and parallel code that limit performance. > http://p.sf.net/sfu/intel-dev2devfeb > _______________________________________________ > Archive-access-cvs mailing list > Arc...@li... 
> https://lists.sourceforge.net/lists/listinfo/archive-access-cvs |
From: <bra...@us...> - 2011-02-17 22:35:54
|
Revision: 3417 http://archive-access.svn.sourceforge.net/archive-access/?rev=3417&view=rev Author: bradtofel Date: 2011-02-17 22:35:47 +0000 (Thu, 17 Feb 2011) Log Message: ----------- Initial checkin of early gzip line dereferencing CDX processing code. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingInputFormat.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportJob.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportMapper.java trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/MultiMemberGZIPInputStream.java Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java 2011-02-09 22:04:47 UTC (rev 3416) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXCanonicalizingMapper.java 2011-02-17 22:35:47 UTC (rev 
3417) @@ -100,7 +100,9 @@ private void mapFull(Object y, Text value, Context context) throws IOException, InterruptedException { String s = value.toString(); - + if(s.startsWith(" CDX ")) { + return; + } boolean problems = true; i1 = s.indexOf(delim); if(i1 > 0) { Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java 2011-02-09 22:04:47 UTC (rev 3416) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/CDXSortDriver.java 2011-02-17 22:35:47 UTC (rev 3417) @@ -69,6 +69,10 @@ System.out.println("cdxsort [OPTIONS] <split> <input> <output>"); System.out.println("\tOPTIONS can be:"); System.out.println("\t\t-m NUM - try to run with approximately NUM map tasks"); + System.out.println("\t\t--compressed-input - assume input is compressed, even without .gz suffix"); + System.out.println("\t\t--gzip-range - assume input lines are PATH START LENGTH such that a"); + System.out.println("\t\t\t valid gzip record exists in PATH between START and START+LENGTH"); + System.out.println("\t\t\t that contains the records to process"); System.out.println("\t\t--compress-output - compress output files with GZip"); System.out.println("\t\t--delimiter DELIM - assume DELIM delimter for input and output, instead of default <SPACE>"); System.out.println("\t\t--map-global - use the GLOBAL CDX map function, which implies:"); @@ -93,6 +97,8 @@ long desiredMaps = 10; boolean compressOutput = false; + boolean compressedInput = false; + boolean gzipRange = false; List<String> otherArgs = new ArrayList<String>(); int mapMode = CDXCanonicalizingMapper.MODE_FULL; for (int i = 0; i < args.length; ++i) { @@ -101,6 +107,10 @@ desiredMaps = Integer.parseInt(args[++i]); } else if 
("--compress-output".equals(args[i])) { compressOutput = true; + } else if ("--compressed-input".equals(args[i])) { + compressedInput = true; + } else if ("--gzip-range".equals(args[i])) { + gzipRange = true; } else if ("--delimiter".equals(args[i])) { delim = args[++i]; } else if ("--map-full".equals(args[i])) { @@ -175,8 +185,14 @@ FileInputFormat.addInputPath(job, inputPath); FileInputFormat.setMaxInputSplitSize(job, bytesPerMap); - job.setInputFormatClass(LineDereferencingInputFormat.class); - + if(gzipRange) { + job.setInputFormatClass(GZIPRangeLineDereferencingInputFormat.class); + } else { + job.setInputFormatClass(LineDereferencingInputFormat.class); + if(compressedInput) { + LineDereferencingRecordReader.forceCompressed(conf); + } + } FileOutputFormat.setOutputPath(job, outputPath); return (job.waitForCompletion(true) ? 0 : 1); Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingInputFormat.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingInputFormat.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingInputFormat.java 2011-02-17 22:35:47 UTC (rev 3417) @@ -0,0 +1,17 @@ +package org.archive.wayback.hadoop; + +import java.io.IOException; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; + +public class GZIPRangeLineDereferencingInputFormat extends LineDereferencingInputFormat { + @Override + public RecordReader<Text, Text> createRecordReader(InputSplit split, + TaskAttemptContext context) throws IOException, + InterruptedException { + return new GZIPRangeLineDereferencingRecordReader(); + } +} Added: 
trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/GZIPRangeLineDereferencingRecordReader.java 2011-02-17 22:35:47 UTC (rev 3417) @@ -0,0 +1,85 @@ +package org.archive.wayback.hadoop; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.zip.GZIPInputStream; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; + +public class GZIPRangeLineDereferencingRecordReader extends LineDereferencingRecordReader{ + String curInputLine = null; + FSDataInputStream fsdis = null; + long curStart = 0; + byte[] buffer = null; + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + if(key == null) { + key = new Text(); + } + if(value == null) { + value = new Text(); + } + while(true) { + if(curReader == null) { + // are there more? 
+ if(internal.nextKeyValue()) { + progress = internal.getProgress(); + curInputLine = internal.getCurrentValue().toString(); + String[] parts = curInputLine.split(" "); + if(parts.length != 3) { + throw new IOException("Bad format line(" + curInputLine +")"); + } + String newFile = parts[0]; + if(fsdis != null) { + if(!newFile.equals(curFile)) { + // close old and open new, otherwise we can just + // do another read on the current one: + fsdis.close(); + curFile = newFile; + Path path = new Path(curFile); + fsdis = fileSystem.open(path); + } + } else { + curFile = newFile; + Path path = new Path(curFile); + fsdis = fileSystem.open(path); + } + curFile = parts[0]; + curStart = Long.parseLong(parts[1]); + int length = Integer.parseInt(parts[2]); + if(buffer == null) { + buffer = new byte[length]; + } else if (buffer.length < length) { + buffer = new byte[length]; + } + fsdis.read(curStart,buffer,0,length); + // the whole chunk is now in buffer: + InputStream is = + new GZIPInputStream(new ByteArrayInputStream(buffer,0,length)); + curReader = new BufferedReader(new InputStreamReader(is)); + curLine = 0; + + } else { + // all done: + return false; + } + } + // try to read another line: + String nextLine = curReader.readLine(); + if(nextLine != null) { + key.set(curFile+":"+curStart+":"+curLine); + value.set(nextLine); + curLine++; + return true; + } + curReader.close(); + curReader = null; + } + } + +} Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportJob.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportJob.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportJob.java 2011-02-17 22:35:47 UTC (rev 3417) @@ -0,0 +1,81 @@ +package org.archive.wayback.hadoop; + +import 
org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapRunner; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +public class HTTPImportJob extends Configured implements Tool { + Configuration conf = null; + public final static String HTTP_IMPORT_TARGET = "http-import.target"; + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + + } + + + + public int run(String[] args) throws Exception { + Job job = new Job(getConf(), "http-import"); + Configuration conf = job.getConfiguration(); + job.setJarByClass(HTTPImportJob.class); + job.setInputFormatClass(TextInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Text.class); + job.setMapperClass(HTTPImportMapper.class); + + int i = 0; + int numMaps = 10; + while(i < args.length -1) { + if(args[i].equals("-m")) { + i++; + numMaps = Integer.parseInt(args[i]); + i++; + } else { + break; + } + } + if(args.length - 3 != i) { + throw new IllegalArgumentException("wrong number of args..."); + } + Path inputPath = new Path(args[i]); + Path outputPath = new Path(args[i+1]); + Path targetPath = new Path(args[i+2]); + + TextInputFormat.addInputPath(job, inputPath); + FileOutputFormat.setOutputPath(job, outputPath); + conf.set(HTTP_IMPORT_TARGET, targetPath.toString()); + + conf.setBoolean("mapred.map.tasks.speculative.execution", false); + + FileSystem 
fs = inputPath.getFileSystem(conf); + FileStatus inputStatus = fs.getFileStatus(inputPath); + long inputLen = inputStatus.getLen(); + long bytesPerMap = (int) inputLen / numMaps; + + FileInputFormat.setMaxInputSplitSize(job, bytesPerMap); + + + return (job.waitForCompletion(true) ? 0 : 1); + } + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(new Configuration(), new HTTPImportJob(), args); + System.exit(res); + } + +} Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportMapper.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportMapper.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/HTTPImportMapper.java 2011-02-17 22:35:47 UTC (rev 3417) @@ -0,0 +1,145 @@ +package org.archive.wayback.hadoop; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.methods.GetMethod; +import org.apache.commons.httpclient.methods.HeadMethod; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Mapper.Context; + +public class HTTPImportMapper extends Mapper<LongWritable, Text, Text, Text> { + public final int BUFSIZ = 4096; + Path target = null; + FileSystem filesystem = null; + Text doneText = null; + HttpClient client = null; + public HTTPImportMapper() { + + } + public void init2() { + System.err.println("Init map..."); + 
} + @Override + protected void setup(Context context) throws IOException, + InterruptedException { + super.setup(context); + Configuration conf = context.getConfiguration(); + String targetString = conf.get(HTTPImportJob.HTTP_IMPORT_TARGET); + if(targetString == null) { + throw new IOException("No " + HTTPImportJob.HTTP_IMPORT_TARGET + + " specified"); + } + target = new Path(targetString); + filesystem = target.getFileSystem(conf); + doneText = new Text("Done"); + client = new HttpClient(); + } + + @Override + protected void map(LongWritable key, Text value, Context context) + throws IOException, InterruptedException { + + String valueS = value.toString(); + String name; + String url = valueS; + int idx = valueS.indexOf(' '); + if(idx == -1) { + URL tmpUrl = new URL(valueS); + name = tmpUrl.getPath(); + if(name.contains("/")) { + name = name.substring(name.lastIndexOf('/')+1); + } + } else { + name = valueS.substring(0,idx); + url = valueS.substring(idx+1); + } + Path thisTarget = new Path(target,name); + doCopy(url, thisTarget); + context.write(value, doneText); + } + + private long getURLLengthByHead(String url) throws IOException { + HeadMethod head = new HeadMethod(url); + long urlLen = -1; + // execute the method and handle any error responses. + try { + int code = client.executeMethod(head); + if(code != 200) { + throw new IOException("Non-200 for HEAD:" + url); + } + urlLen = head.getResponseContentLength(); + // discard: hope it's really empty (HEAD) and thus small... + head.getResponseBody(); + } finally { + head.releaseConnection(); + } + return urlLen; + } + + private long getPathLength(Path path) throws IOException { + FileStatus stat = null; + try { + stat = filesystem.getFileStatus(path); + // present.. 
check by size: + } catch (FileNotFoundException e) { + return -1; + } + return stat.getLen(); + } + + + private void doCopy(String url, Path target) throws IOException { + // Check if the target exists (from previous map) + long targetLen = getPathLength(target); + long urlLen = -1; + if(targetLen > -1) { + // there's a file in the filesystem already, see if it's the + // same length: + urlLen = getURLLengthByHead(url); + if(urlLen == targetLen) { + // same size, assume it's done: + return; + } + // diff length, do copy again, first remove old: + if(!filesystem.delete(target, false)) { + throw new IOException("Failed to delete old copy"); + } + } + // do the copy: + FSDataOutputStream out = filesystem.create(target, false); + GetMethod get = new GetMethod(url); + long copied = 0; + try { + int code = client.executeMethod(get); + if(code != 200) { + throw new IOException("Non 200 on GET: " + url); + } + urlLen = get.getResponseContentLength(); + InputStream in = get.getResponseBodyAsStream(); + byte buffer[] = new byte[BUFSIZ]; + for(int cbread; (cbread = in.read(buffer)) >= 0; ) { + out.write(buffer, 0, cbread); + copied += cbread; + } + } finally { + get.releaseConnection(); + out.close(); + } + if(copied != urlLen) { + // ack.. what went wrong? 
+ throw new IOException("Wrong copy length want(" + urlLen + + ") got(" + copied + ") URL:" + url); + } + } +} Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java 2011-02-09 22:04:47 UTC (rev 3416) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/LineDereferencingRecordReader.java 2011-02-17 22:35:47 UTC (rev 3417) @@ -25,6 +25,7 @@ import java.io.InputStreamReader; import java.util.zip.GZIPInputStream; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; @@ -45,6 +46,9 @@ public class LineDereferencingRecordReader extends RecordReader<Text, Text>{ LineRecordReader internal = new LineRecordReader(); + + protected static final String FORCE_COMPRESSED_FLAG = "line-reref.force-compressed"; + FileSystem fileSystem = null; Text key = null; Text value = null; @@ -52,11 +56,18 @@ String curFile = null; long curLine = 0; float progress = 0.0f; + boolean forceCompressed = false; + public static void forceCompressed(Configuration conf) { + conf.setBoolean(FORCE_COMPRESSED_FLAG, true); + } + @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { + Configuration conf = context.getConfiguration(); + forceCompressed = conf.getBoolean(FORCE_COMPRESSED_FLAG, false); FileSplit fileSplit = (FileSplit) split; - fileSystem = fileSplit.getPath().getFileSystem(context.getConfiguration()); + fileSystem = fileSplit.getPath().getFileSystem(conf); internal.initialize(split, context); } @@ -77,8 +88,9 @@ Path path = new Path(curFile); InputStream is = fileSystem.open(path); // 
TODO: use the real Codec stuff.. - if(curFile.endsWith(".gz")) { - is = new GZIPInputStream(is); + if(forceCompressed || curFile.endsWith(".gz")) { +// is = new GZIPInputStream(is); + is = new MultiMemberGZIPInputStream(is); } curReader = new BufferedReader(new InputStreamReader(is)); Added: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/MultiMemberGZIPInputStream.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/MultiMemberGZIPInputStream.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/MultiMemberGZIPInputStream.java 2011-02-17 22:35:47 UTC (rev 3417) @@ -0,0 +1,96 @@ +package org.archive.wayback.hadoop; + +import java.io.InputStream; +import java.io.PushbackInputStream; +import java.io.IOException; +import java.util.zip.GZIPInputStream; + +public class MultiMemberGZIPInputStream extends GZIPInputStream { + + public MultiMemberGZIPInputStream(InputStream in, int size) throws IOException + { + // Wrap the stream in a PushbackInputStream... + super(new PushbackInputStream(in, size), size); + this.size=size; + } + + public MultiMemberGZIPInputStream(InputStream in) throws IOException + { + // Wrap the stream in a PushbackInputStream... + super(new PushbackInputStream(in, 1024)); + this.size=-1; + } + + private MultiMemberGZIPInputStream(MultiMemberGZIPInputStream parent) throws IOException + { + super(parent.in); + this.size=-1; + this.parent=parent.parent==null ? parent : parent.parent; + this.parent.child=this; + } + + private MultiMemberGZIPInputStream(MultiMemberGZIPInputStream parent, int size) throws IOException + { + super(parent.in, size); + this.size=size; + this.parent=parent.parent==null ? 
parent : parent.parent; + this.parent.child=this; + } + + private MultiMemberGZIPInputStream parent; + private MultiMemberGZIPInputStream child; + private int size; + private boolean eos; + + public int read(byte[] inputBuffer, int inputBufferOffset, int inputBufferLen) throws IOException { + + if (eos) { return -1;} + if (this.child!=null) + return this.child.read(inputBuffer, inputBufferOffset, inputBufferLen); + + int charsRead=super.read(inputBuffer, inputBufferOffset, inputBufferLen); + if (charsRead==-1) + { + // Push any remaining buffered data back onto the stream + // If the stream is then not empty, use it to construct + // a new instance of this class and delegate this and any + // future calls to it... + int n = inf.getRemaining() - 8; + if (n > 0) + { + // More than 8 bytes remaining in deflater + // First 8 are gzip trailer. Add the rest to + // any un-read data... + ((PushbackInputStream)this.in).unread(buf, len-n, n); + } + else + { + // Nothing in the buffer. We need to know whether or not + // there is unread data available in the underlying stream + // since the base class will not handle an empty file. + // Read a byte to see if there is data and if so, + // push it back onto the stream... 
+ byte[] b=new byte[1]; + int ret=in.read(b,0,1); + if (ret==-1) + { + eos=true; + return -1; + } + else + ((PushbackInputStream)this.in).unread(b, 0, 1); + } + + MultiMemberGZIPInputStream child; + if (this.size==-1) + child=new MultiMemberGZIPInputStream(this); + else + child=new MultiMemberGZIPInputStream(this, this.size); + return child.read(inputBuffer, inputBufferOffset, inputBufferLen); + } + else + return charsRead; + } + +} + Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java 2011-02-09 22:04:47 UTC (rev 3416) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/src/main/java/org/archive/wayback/hadoop/SortDriver.java 2011-02-17 22:35:47 UTC (rev 3417) @@ -32,6 +32,8 @@ try { pgd.addClass("cdxsort", CDXSortDriver.class, "A map/reduce program that canonicalizes and provides a total order sort into multiple CDX files"); + pgd.addClass("http-import", HTTPImportJob.class, + "A map/reduce program that imports a bunch of URLs into an HDFS directory"); pgd.driver(args); // Success exitCode = 0; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-09 22:04:54
|
Revision: 3416 http://archive-access.svn.sourceforge.net/archive-access/?rev=3416&view=rev Author: bradtofel Date: 2011-02-09 22:04:47 +0000 (Wed, 09 Feb 2011) Log Message: ----------- BUGFIX(WWM-36): now only bounces to live web handling for Replay requests Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-02-09 07:03:26 UTC (rev 3415) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2011-02-09 22:04:47 UTC (rev 3416) @@ -245,18 +245,15 @@ handled = true; } catch(WaybackException e) { - boolean drawError = true; - if(e instanceof ResourceNotInArchiveException) { - if((getLiveWebPrefix() != null) - && (getLiveWebPrefix().length() > 0)) { + if((e instanceof ResourceNotInArchiveException) + && wbRequest.isReplayRequest() + && (getLiveWebPrefix() != null) + && (getLiveWebPrefix().length() > 0)) { - String liveUrl = - getLiveWebPrefix() + wbRequest.getRequestUrl(); - httpResponse.sendRedirect(liveUrl); - drawError = false; - } - } - if(drawError) { + String liveUrl = + getLiveWebPrefix() + wbRequest.getRequestUrl(); + httpResponse.sendRedirect(liveUrl); + } else { logNotInArchive(e,wbRequest); getException().renderException(httpRequest, httpResponse, wbRequest, e, getUriConverter()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-09 07:03:32
|
Revision: 3415 http://archive-access.svn.sourceforge.net/archive-access/?rev=3415&view=rev Author: bradtofel Date: 2011-02-09 07:03:26 +0000 (Wed, 09 Feb 2011) Log Message: ----------- Swapped order of admin & robot exclusion checking - admin is now first Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-02-06 15:00:40 UTC (rev 3414) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-02-09 07:03:26 UTC (rev 3415) @@ -86,13 +86,7 @@ CaptureSearchResult result = new CaptureSearchResult(); result.setOriginalUrl(urlString); result.setUrlKey(urlString); - // check robots first, if configured - if(robotFactory != null) { - int ruling = robotFactory.get().filterObject(result); - if(ruling == ExclusionFilter.FILTER_EXCLUDE) { - throw new RobotAccessControlException(urlString + "is blocked by robots.txt"); - } - } + // check admin excludes first, if configured: if(adminFactory != null) { ExclusionFilter f = adminFactory.get(); if(f == null) { @@ -104,6 +98,13 @@ throw new AdministrativeAccessControlException(urlString + "is blocked."); } } + // check robots next, if configured + if(robotFactory != null) { + int ruling = robotFactory.get().filterObject(result); + if(ruling == ExclusionFilter.FILTER_EXCLUDE) { + throw new RobotAccessControlException(urlString + "is blocked by robots.txt"); + } + } // no robots check, or robots.txt says GO: ArcResource r = (ArcResource) cache.getCachedResource(url, maxCacheMS , false); ARCRecord ar = (ARCRecord) r.getArcRecord(); This was sent by the 
SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 15:00:46
|
Revision: 3414 http://archive-access.svn.sourceforge.net/archive-access/?rev=3414&view=rev Author: bradtofel Date: 2011-02-06 15:00:40 +0000 (Sun, 06 Feb 2011) Log Message: ----------- BUGFIX (unreported) now attempts to use Location HTTP header if CDX has empty ("-") redirect URL Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/UrlRedirectNotice.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/UrlRedirectNotice.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/UrlRedirectNotice.jsp 2011-02-06 14:59:39 UTC (rev 3413) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/UrlRedirectNotice.jsp 2011-02-06 15:00:40 UTC (rev 3414) @@ -1,5 +1,7 @@ <%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8" %><%@ page import="java.util.Date" +%><%@ page import="java.util.Map" +%><%@ page import="java.util.Iterator" %><%@ page import="java.lang.StringBuffer" %><%@ page import="org.archive.wayback.archivalurl.ArchivalUrlDateRedirectReplayRenderer" %><%@ page import="org.archive.wayback.ResultURIConverter" @@ -7,6 +9,7 @@ %><%@ page import="org.archive.wayback.core.WaybackRequest" %><%@ page import="org.archive.wayback.core.CaptureSearchResult" %><%@ page import="org.archive.wayback.util.StringFormatter" +%><%@ page import="org.archive.wayback.util.url.UrlOperations" %><% UIResults results = UIResults.extractReplay(request); @@ -19,7 +22,21 @@ String targetUrl = cResult.getRedirectUrl(); String captureTS = cResult.getCaptureTimestamp(); Date captureDate = cResult.getCaptureDate(); - +if(targetUrl.equals("-")) { + Map<String,String> headers = results.getResource().getHttpHeaders(); + Iterator<String> headerNameItr = headers.keySet().iterator(); + while(headerNameItr.hasNext()) { + String name = 
headerNameItr.next(); + if(name.toUpperCase().equals("LOCATION")) { + targetUrl = headers.get(name); + // by the spec, these should be absolute already, but just in case: + targetUrl = UrlOperations.resolveUrl(sourceUrl, targetUrl); + + + } + } +} +// TODO: Handle replay if we still don't have a redirect.. String dateSpec = ArchivalUrlDateRedirectReplayRenderer.makeFlagDateSpec(captureTS, wbr); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 14:59:46
|
Revision: 3413 http://archive-access.svn.sourceforge.net/archive-access/?rev=3413&view=rev Author: bradtofel Date: 2011-02-06 14:59:39 +0000 (Sun, 06 Feb 2011) Log Message: ----------- BUGFIX(unreported): was not properly escaping several fields Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchCaptureResults.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchCaptureResults.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchCaptureResults.jsp 2011-02-06 14:56:37 UTC (rev 3412) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/OpenSearchCaptureResults.jsp 2011-02-06 14:59:39 UTC (rev 3413) @@ -52,12 +52,12 @@ <channel> <title>Wayback OpenSearch Results</title> <link><%= queryPrefix %>></link> - <description><%= fmt.format("PathQueryClassic.searchedFor",searchString) %></description> + <description><%= fmt.format("PathQueryClassic.searchedFor",fmt.escapeHtml(searchString)) %></description> <opensearch:totalResults><%= resultCount %></opensearch:totalResults> <opensearch:startIndex><%= firstResult %></opensearch:startIndex> <opensearch:itemsPerPage><%= shownResultCount %></opensearch:itemsPerPage> <atom:link rel="search" type="application/opensearchdescription+xml" href="<%= staticPrefix %>/opensearchdescription.xml"/> - <opensearch:Query role="request" searchTerms="<%= UIResults.encodeXMLContent(searchTerms) %>" startPage="<%= wbRequest.getPageNum() %>" /> + <opensearch:Query role="request" searchTerms="<%= fmt.escapeHtml(searchTerms) %>" startPage="<%= wbRequest.getPageNum() %>" /> <% while(itr.hasNext()) { %> @@ -65,14 +65,12 @@ <% CaptureSearchResult result = itr.next(); - String replayUrl = UIResults.encodeXMLEntity( - uiResults.resultToReplayUrl(result)); + String replayUrl = 
fmt.escapeHtml(uiResults.resultToReplayUrl(result)); - String prettyDate = UIResults.encodeXMLEntity( + String prettyDate = fmt.escapeHtml( fmt.format("MetaReplay.captureDateDisplay",result.getCaptureDate())); - String requestUrl = UIResults.encodeXMLEntity( - wbRequest.getRequestUrl()); + String requestUrl = fmt.escapeHtml(wbRequest.getRequestUrl()); %> <title><%= prettyDate %></title> <link><%= replayUrl %></link> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 14:56:43
|
Revision: 3412 http://archive-access.svn.sourceforge.net/archive-access/?rev=3412&view=rev Author: bradtofel Date: 2011-02-06 14:56:37 +0000 (Sun, 06 Feb 2011) Log Message: ----------- Added full date "global" pattern for general use Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties 2011-02-06 14:56:00 UTC (rev 3411) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties 2011-02-06 14:56:37 UTC (rev 3412) @@ -27,6 +27,8 @@ Exception.anchorWindowTooSmall.title=No Resources within Window Range Exception.anchorWindowTooSmall.message=The Window Range specified does not match any Resources. You can try expanding the Anchor Window size and repeating the request. +UIGlobal.fullDate={0,date,H:mm:ss MMM d, yyyy} + UIGlobal.pageTitle=Internet Archive Wayback Machine UIGlobal.helpLink=Help UIGlobal.enterWebAddress=Enter Web Address: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 14:56:06
|
Revision: 3411 http://archive-access.svn.sourceforge.net/archive-access/?rev=3411&view=rev Author: bradtofel Date: 2011-02-06 14:56:00 +0000 (Sun, 06 Feb 2011) Log Message: ----------- BUGFIX: was not working before - now uses info that should be set by AccessPoint prior to delegating to this jsp Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Interstitial.jsp Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Interstitial.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Interstitial.jsp 2011-02-06 14:54:55 UTC (rev 3410) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/jsp/Interstitial.jsp 2011-02-06 14:56:00 UTC (rev 3411) @@ -1,5 +1,6 @@ <%@ page import="org.archive.wayback.webapp.AccessPoint" %><%@ page import="org.archive.wayback.util.StringFormatter" +%><%@ page import="java.util.Date" %><% String toUrl = request.getParameter(AccessPoint.INTERSTITIAL_TARGET); if(toUrl == null) { @@ -8,22 +9,38 @@ Bad request. 
require argument <%= AccessPoint.INTERSTITIAL_TARGET %> <% } else { - String secsS = request.getParameter(AccessPoint.INTERSTITIAL_SECONDS); - int secs = 5; + String secsS = request.getParameter(AccessPoint.INTERSTITIAL_SECONDS); + String dateString = request.getParameter(AccessPoint.INTERSTITIAL_DATE); + String replayUrl = request.getParameter(AccessPoint.INTERSTITIAL_URL); + long dateLong = 0; + int secs = 5; try { secs = Integer.parseInt(secsS); } catch (NumberFormatException e) { - } + try { + dateLong = Long.parseLong(dateString); + } catch (NumberFormatException e) { + } if(secs < 1) { secs = 5; } - StringFormatter f = new StringFormatter(null,null); + StringFormatter f = new StringFormatter(null); + String safeReplayUrl = null; + String prettyReplayDate = null; + if(replayUrl != null) { + safeReplayUrl = f.escapeHtml(replayUrl); + } + if(dateLong > 0) { + Date rd = new Date(dateLong); + prettyReplayDate = + f.format("{0,date,H:mm:ss MMM d, yyyy}",rd); + } String safeTargetUrl = f.escapeHtml(toUrl); String safeTargetUrlJS = f.escapeJavaScript(toUrl); %> <jsp:include page="/WEB-INF/template/UI-header.jsp" flush="true" /> - + <!-- dateLong <%= dateLong %> --> <div id="positionHome"> <section> <div id="logoHome"> @@ -38,11 +55,18 @@ } window.setTimeout("go()",<%= secs * 1000 %>); </script> - <h2 class="blue">Hello.</h2> + <h2 class="blue">Welcome to Wayback.</h2> <p class="code">Loading...</p> - <p class="code shift">[Month 00, 0000]</p> - <p class="code">Loading...</p> - <p class="code shift"><%= safeTargetUrl %></p> + <% + if(safeReplayUrl != null && prettyReplayDate != null) { + %> + <p class="code shift"><%= safeReplayUrl %></p> + <p class="code">as close to the date:</p> + <p class="code shift"><%= prettyReplayDate %></p> + <p class="code">as is available..</p> + <% + } + %> <p class="impatient"><a href="<%= safeTargetUrl %>">Impatient?</a></p> <% } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source 
development site. |
Revision: 3410 http://archive-access.svn.sourceforge.net/archive-access/?rev=3410&view=rev Author: bradtofel Date: 2011-02-06 14:54:55 +0000 (Sun, 06 Feb 2011) Log Message: ----------- initial rev, attempting end-to-end testing of HTML rewrite Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandlerTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandlerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandlerTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandlerTest.java 2011-02-06 14:54:55 UTC (rev 3410) @@ -0,0 +1,91 @@ +package org.archive.wayback.archivalurl; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; + +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.util.htmllex.ContextAwareLexer; +import org.htmlparser.Node; +import org.htmlparser.lexer.Lexer; +import org.htmlparser.lexer.Page; +import org.htmlparser.util.ParserException; + +import junit.framework.TestCase; + +public class FastArchivalUrlReplayParseEventHandlerTest extends TestCase { + + + + + public void testRewrite() throws Exception { + assertEquals("<html><a href=\"http://replay.archive.org/2001/http://www.example.com/foo.html\">foo</a></html>",doEndToEnd("<html><a href=\"/foo.html\">foo</a></html>")); + assertEquals("<html><a 
href=\"http://replay.archive.org/2001/http://www.example.com/foo.html\">foo</a></html>",doEndToEnd("<html><a href=\"foo.html\">foo</a></html>")); + assertEquals("<html><a href=\"javascript:doWin('http://replay.archive.org/2001/http://www.symphony.org/')\">American Symphony Orchestra League</a></html>",doEndToEnd("<html><a href=\"javascript:doWin('http://www.symphony.org')\">American Symphony Orchestra League</a></html>")); + } + + public String doEndToEnd(String input) throws Exception { + String baseUrl = "http://www.example.com/"; + String timestamp = "2001"; + String outputCharset = "UTF-8"; + String charSet = "UTF-8"; + + ByteArrayInputStream bais = new ByteArrayInputStream(input.getBytes(charSet)); + + FastArchivalUrlReplayParseEventHandler delegator = new FastArchivalUrlReplayParseEventHandler(); + delegator.setCommentJsp(null); + delegator.setJspInsertPath(null); + + ArchivalUrlResultURIConverter uriConverter = new ArchivalUrlResultURIConverter(); + uriConverter.setReplayURIPrefix("http://replay.archive.org/"); + + ArchivalUrlContextResultURIConverterFactory fact = + new ArchivalUrlContextResultURIConverterFactory( + (ArchivalUrlResultURIConverter) uriConverter); + + // The URL of the page, for resolving in-page relative URLs: + URL url = null; + try { + url = new URL(baseUrl); + } catch (MalformedURLException e1) { + // TODO: this shouldn't happen... + e1.printStackTrace(); + throw new IOException(e1.getMessage()); + } + + // To make sure we get the length, we have to buffer it all up... 
+ ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + // set up the context: + ReplayParseContext context = + new ReplayParseContext(fact,url,timestamp); + context.setOutputCharset(outputCharset); + context.setOutputStream(baos); + context.setJspExec(null); + + // and finally, parse, using the special lexer that knows how to + // handle javascript blocks containing unescaped HTML entities: + Page lexPage = new Page(bais,charSet); + Lexer lexer = new Lexer(lexPage); + Lexer.STRICT_REMARKS = false; + ContextAwareLexer lex = new ContextAwareLexer(lexer, context); + Node node; + try { + while((node = lex.nextNode()) != null) { + delegator.handleNode(context, node); + } + delegator.handleParseComplete(context); + } catch (ParserException e) { + e.printStackTrace(); + throw new IOException(e.getMessage()); + } + + // At this point, baos contains the utf-8 encoded bytes of our result: + return new String(baos.toByteArray(),outputCharset); + + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 14:54:09
|
Revision: 3409 http://archive-access.svn.sourceforge.net/archive-access/?rev=3409&view=rev Author: bradtofel Date: 2011-02-06 14:54:02 +0000 (Sun, 06 Feb 2011) Log Message: ----------- BUGFIX: was not canonicalizing URLs prior to lookup.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java 2011-02-06 14:52:48 UTC (rev 3408) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilter.java 2011-02-06 14:54:02 UTC (rev 3409) @@ -23,10 +23,12 @@ import java.util.logging.Logger; import org.apache.commons.httpclient.URIException; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.surt.SURTTokenizer; import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * @@ -43,11 +45,13 @@ private boolean notifiedSeen = false; private boolean notifiedPassed = false; Map<String,Object> exclusionMap = null; + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); /** * @param map where each String key is a SURT that is blocked. 
*/ - public StaticMapExclusionFilter(Map<String,Object> map) { + public StaticMapExclusionFilter(Map<String,Object> map, UrlCanonicalizer canonicalizer) { exclusionMap = map; + this.canonicalizer = canonicalizer; } protected boolean isExcluded(String url) { @@ -80,7 +84,14 @@ } notifiedSeen = true; } - String url = r.getOriginalUrl(); + String url; + try { + url = canonicalizer.urlStringToKey(r.getOriginalUrl()); + } catch (URIException e) { + + //e.printStackTrace(); + return FILTER_EXCLUDE; + } if(lastChecked != null) { if(lastChecked.equals(url)) { if(lastCheckedExcluded) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java 2011-02-06 14:52:48 UTC (rev 3408) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticMapExclusionFilterFactory.java 2011-02-06 14:54:02 UTC (rev 3409) @@ -25,11 +25,13 @@ import java.util.Map; import java.util.logging.Logger; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.accesscontrol.ExclusionFilterFactory; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.surt.SURTTokenizer; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.flatfile.FlatFile; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * @@ -45,6 +47,8 @@ private Map<String,Object> currentMap = null; private File file = null; long lastUpdated = 0; + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + /** * Thread object of update thread -- also is flag indicating if the thread * has already been started -- static, and access to it is synchronized. 
@@ -93,6 +97,7 @@ if(line.length() == 0) { continue; } + line = canonicalizer.urlStringToKey(line); String surt = line.startsWith("(") ? line : SURTTokenizer.prefixKey(line); LOGGER.fine("EXCLUSION-MAP: adding " + surt); @@ -110,7 +115,7 @@ if(currentMap == null) { return null; } - return new StaticMapExclusionFilter(currentMap); + return new StaticMapExclusionFilter(currentMap, canonicalizer); } private synchronized void startUpdateThread() { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3408 http://archive-access.svn.sourceforge.net/archive-access/?rev=3408&view=rev Author: bradtofel Date: 2011-02-06 14:52:48 +0000 (Sun, 06 Feb 2011) Log Message: ----------- FEATURE: now attempting to rewrite javascript: content in anchor URLs Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2011-02-06 14:52:07 UTC (rev 3407) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2011-02-06 14:52:48 UTC (rev 3408) @@ -74,8 +74,11 @@ new JSStringTransformer(); private static MetaRefreshUrlStringTransformer metaRefreshTrans = new MetaRefreshUrlStringTransformer(); - private static URLStringTransformer anchorUrlTrans = - new URLStringTransformer(); + private static URLStringTransformer anchorUrlTrans = null; + static { + anchorUrlTrans = new URLStringTransformer(); + anchorUrlTrans.setJsTransformer(jsBlockTrans); + } private static URLStringTransformer cssUrlTrans = new URLStringTransformer("cs_"); private static URLStringTransformer jsUrlTrans = This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3407 http://archive-access.svn.sourceforge.net/archive-access/?rev=3407&view=rev Author: bradtofel Date: 2011-02-06 14:52:07 +0000 (Sun, 06 Feb 2011) Log Message: ----------- FEATURE: now treating http:/ as http:// Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/DatelessReplayRequestParser.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/DatelessReplayRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/DatelessReplayRequestParser.java 2011-02-06 14:51:25 UTC (rev 3406) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/DatelessReplayRequestParser.java 2011-02-06 14:52:07 UTC (rev 3407) @@ -69,6 +69,12 @@ String scheme = UrlOperations.urlToScheme(requestPath); if(scheme == null) { + if(requestPath.startsWith("http:/")) { + requestPath = "http://" + requestPath.substring(6); + scheme = "http://"; + } + } + if(scheme == null) { try { URL u = new URL(UrlOperations.HTTP_SCHEME + requestPath); // does the authority look legit? This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 14:51:31
|
Revision: 3406 http://archive-access.svn.sourceforge.net/archive-access/?rev=3406&view=rev Author: bradtofel Date: 2011-02-06 14:51:25 +0000 (Sun, 06 Feb 2011) Log Message: ----------- BUGFIX: no longer catching and eating ConnectTimeoutExceptions Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2011-02-06 14:49:24 UTC (rev 3405) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2011-02-06 14:51:25 UTC (rev 3406) @@ -38,8 +38,10 @@ import org.apache.commons.httpclient.SimpleHttpConnectionManager; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.cookie.CookiePolicy; +import org.apache.commons.io.IOUtils; import org.archive.httpclient.HttpRecorderGetMethod; import org.archive.io.RecordingInputStream; +import org.archive.io.ReplayInputStream; import org.archive.io.arc.ARCWriter; import org.archive.net.LaxURI; import org.archive.util.Recorder; @@ -140,10 +142,10 @@ } catch (URIException e) { e.printStackTrace(); } catch (UnknownHostException e) { - LOGGER.warning("Unknown host for " + url); - } catch (ConnectTimeoutException e) { - // TODO: should we act like it's a full block? - LOGGER.warning("Timeout out connecting to " + url); + LOGGER.warning("Unknown host for " + url); +// } catch (ConnectTimeoutException e) { +// // TODO: should we act like it's a full block? 
+// LOGGER.warning("Timeout out connecting to " + url); } catch (ConnectException e) { LOGGER.warning("ConnectionRefused to " + url); @@ -164,21 +166,24 @@ // now write the content, or a fake record: ARCWriter writer = null; + ReplayInputStream replayIS = null; try { writer = cache.getWriter(); if(gotUrl) { RecordingInputStream ris = recorder.getRecordedInput(); + replayIS = ris.getReplayInputStream(); region = storeInputStreamARCRecord(writer, url, getMethod.getMime(), getMethod.getRemoteIP(), getMethod.getCaptureDate(), - ris.getReplayInputStream(), (int) ris.getSize()); + replayIS, (int) ris.getSize()); } else { region = storeNotAvailable(writer, url); } } finally { + IOUtils.closeQuietly(replayIS); if(writer != null) { cache.returnWriter(writer); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |