From: <ikr...@us...> - 2012-02-28 04:09:52
|
Revision: 3621 http://archive-access.svn.sourceforge.net/archive-access/?rev=3621&view=rev Author: ikreymer Date: 2012-02-28 04:09:45 +0000 (Tue, 28 Feb 2012) Log Message: ----------- FEATURE: Add support for automatically decoding text files that contain a "Content-Encoding: gzip". This functionality is enabled by default for HTML, JS, and CSS replay renderers Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2012-02-28 04:06:57 UTC (rev 3620) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2012-02-28 04:09:45 UTC (rev 3621) @@ -19,12 +19,10 @@ */ package org.archive.wayback.archivalurl; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; -import java.nio.charset.Charset; import java.util.Map; import javax.servlet.ServletException; @@ -46,14 +44,11 @@ import org.archive.wayback.replay.charset.CharsetDetector; import org.archive.wayback.replay.charset.StandardCharsetDetector; import 
org.archive.wayback.replay.html.ReplayParseContext; -import org.archive.wayback.util.ByteOp; import org.archive.wayback.util.htmllex.ContextAwareLexer; import org.archive.wayback.util.htmllex.ParseEventHandler; import org.htmlparser.Node; -import org.htmlparser.lexer.InputStreamSource; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; -import org.htmlparser.lexer.Source; import org.htmlparser.util.ParserException; /** @@ -91,6 +86,8 @@ CaptureSearchResult result, Resource resource, ResultURIConverter uriConverter, CaptureSearchResults results) throws ServletException, IOException, WaybackException { + + resource = TextReplayRenderer.decodeResource(resource); // The URL of the page, for resolving in-page relative URLs: URL url = null; Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java 2012-02-28 04:09:45 UTC (rev 3621) @@ -0,0 +1,76 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * + * Provide a wrapper for a Resource that is gzip encoded, that is, + * Resources that have the header: + * Content-Encoding: gzip + * + * Used by TextReplayRenderers and other ReplayRenderers that add content to the resulting output + * + */ + +package org.archive.wayback.replay; + +import java.io.IOException; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +import org.archive.wayback.core.Resource; + +public class GzipDecodingResource extends Resource { + + public static final String GZIP = "gzip"; + + private Resource source; + + public GzipDecodingResource(Resource source) + { + this.source = source; + + try { + this.setInputStream(new GZIPInputStream(source)); + } catch (IOException io) { + // If it can't be read as gzip, might as well send back the raw data. + this.setInputStream(source); + } + } + + @Override + public long getRecordLength() { + return source.getRecordLength(); + } + + @Override + public Map<String, String> getHttpHeaders() { + return source.getHttpHeaders(); + } + + @Override + public void close() throws IOException { + source.close(); + } + + @Override + public int getStatusCode() { + return source.getStatusCode(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java ___________________________________________________________________ Added: svn:mime-type + text/plain Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java 2012-02-28 04:06:57 UTC (rev 3620) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java 2012-02-28 04:09:45 UTC (rev 3621) @@ -44,6 +44,7 @@ "Transfer-Encoding".toUpperCase(); public final static String HTTP_CHUNKED_ENCODING_HEADER = "chunked".toUpperCase(); + public final static String HTTP_CONTENT_ENCODING = "Content-Encoding"; /** Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java 2012-02-28 04:06:57 UTC (rev 3620) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java 2012-02-28 04:09:45 UTC (rev 3621) @@ -76,6 +76,9 @@ Map<String,String> headers = HttpHeaderOperation.processHeaders( resource, result, uriConverter, httpHeaderProcessor); + + // Decode resource (such as if gzip encoded) + resource = decodeResource(resource); String charSet = charsetDetector.getCharset(resource, wbRequest); // Load content into an HTML page, and resolve load-time URLs: @@ -150,4 +153,22 @@ public void setGuessedCharsetHeader(String guessedCharsetHeader) { this.guessedCharsetHeader = guessedCharsetHeader; } + + public static Resource decodeResource(Resource resource) throws IOException + { + Map<String, String> headers = resource.getHttpHeaders(); + + if (headers != null) { + String encoding = headers.get(HttpHeaderOperation.HTTP_CONTENT_ENCODING); + if (encoding != null) { + if (encoding.toLowerCase().equals(GzipDecodingResource.GZIP)) { + return new GzipDecodingResource(resource); + } + + //TODO: check for other encodings? + } + } + + return resource; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |