|
From: <ikr...@us...> - 2012-02-28 04:09:52
|
Revision: 3621
http://archive-access.svn.sourceforge.net/archive-access/?rev=3621&view=rev
Author: ikreymer
Date: 2012-02-28 04:09:45 +0000 (Tue, 28 Feb 2012)
Log Message:
-----------
FEATURE: Add support for automatically decoding text files that contain a "Content-Encoding: gzip" header. This functionality is enabled by default for the HTML, JS, and CSS replay renderers.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java
Added Paths:
-----------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2012-02-28 04:06:57 UTC (rev 3620)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2012-02-28 04:09:45 UTC (rev 3621)
@@ -19,12 +19,10 @@
*/
package org.archive.wayback.archivalurl;
-import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
-import java.nio.charset.Charset;
import java.util.Map;
import javax.servlet.ServletException;
@@ -46,14 +44,11 @@
import org.archive.wayback.replay.charset.CharsetDetector;
import org.archive.wayback.replay.charset.StandardCharsetDetector;
import org.archive.wayback.replay.html.ReplayParseContext;
-import org.archive.wayback.util.ByteOp;
import org.archive.wayback.util.htmllex.ContextAwareLexer;
import org.archive.wayback.util.htmllex.ParseEventHandler;
import org.htmlparser.Node;
-import org.htmlparser.lexer.InputStreamSource;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
-import org.htmlparser.lexer.Source;
import org.htmlparser.util.ParserException;
/**
@@ -91,6 +86,8 @@
CaptureSearchResult result, Resource resource,
ResultURIConverter uriConverter, CaptureSearchResults results)
throws ServletException, IOException, WaybackException {
+
+ resource = TextReplayRenderer.decodeResource(resource);
// The URL of the page, for resolving in-page relative URLs:
URL url = null;
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java 2012-02-28 04:09:45 UTC (rev 3621)
@@ -0,0 +1,76 @@
+/*
+ * This file is part of the Wayback archival access software
+ * (http://archive-access.sourceforge.net/projects/wayback/).
+ *
+ * Licensed to the Internet Archive (IA) by one or more individual
+ * contributors.
+ *
+ * The IA licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *
+ * Provide a wrapper for a Resource that is gzip encoded, that is,
+ * Resources that have the header:
+ * Content-Encoding: gzip
+ *
+ * Used by TextReplayRenderers and other ReplayRenderers that add content to the resulting output
+ *
+ */
+
+package org.archive.wayback.replay;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+import org.archive.wayback.core.Resource;
+
+public class GzipDecodingResource extends Resource {
+
+ public static final String GZIP = "gzip";
+
+ private Resource source;
+
+ public GzipDecodingResource(Resource source)
+ {
+ this.source = source;
+
+ try {
+ this.setInputStream(new GZIPInputStream(source));
+ } catch (IOException io) {
+ // If the stream can't be read as gzip, we might as well send back the raw data.
+ this.setInputStream(source);
+ }
+ }
+
+ @Override
+ public long getRecordLength() {
+ return source.getRecordLength();
+ }
+
+ @Override
+ public Map<String, String> getHttpHeaders() {
+ return source.getHttpHeaders();
+ }
+
+ @Override
+ public void close() throws IOException {
+ source.close();
+ }
+
+ @Override
+ public int getStatusCode() {
+ return source.getStatusCode();
+ }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/GzipDecodingResource.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java 2012-02-28 04:06:57 UTC (rev 3620)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HttpHeaderOperation.java 2012-02-28 04:09:45 UTC (rev 3621)
@@ -44,6 +44,7 @@
"Transfer-Encoding".toUpperCase();
public final static String HTTP_CHUNKED_ENCODING_HEADER =
"chunked".toUpperCase();
+ public final static String HTTP_CONTENT_ENCODING = "Content-Encoding";
/**
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java 2012-02-28 04:06:57 UTC (rev 3620)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java 2012-02-28 04:09:45 UTC (rev 3621)
@@ -76,6 +76,9 @@
Map<String,String> headers = HttpHeaderOperation.processHeaders(
resource, result, uriConverter, httpHeaderProcessor);
+
+ // Decode the resource if it is content-encoded (e.g. gzip)
+ resource = decodeResource(resource);
String charSet = charsetDetector.getCharset(resource, wbRequest);
// Load content into an HTML page, and resolve load-time URLs:
@@ -150,4 +153,22 @@
public void setGuessedCharsetHeader(String guessedCharsetHeader) {
this.guessedCharsetHeader = guessedCharsetHeader;
}
+
+ public static Resource decodeResource(Resource resource) throws IOException
+ {
+ Map<String, String> headers = resource.getHttpHeaders();
+
+ if (headers != null) {
+ String encoding = headers.get(HttpHeaderOperation.HTTP_CONTENT_ENCODING);
+ if (encoding != null) {
+ if (encoding.toLowerCase().equals(GzipDecodingResource.GZIP)) {
+ return new GzipDecodingResource(resource);
+ }
+
+ //TODO: check for other encodings?
+ }
+ }
+
+ return resource;
+ }
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|