From: <bra...@us...> - 2009-11-05 23:50:24
|
Revision: 2882 http://archive-access.svn.sourceforge.net/archive-access/?rev=2882&view=rev Author: bradtofel Date: 2009-11-05 23:50:17 +0000 (Thu, 05 Nov 2009) Log Message: ----------- INITIAL REV: SAX based, configurable server side rewriting of HTML content. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java 2009-11-05 23:50:17 UTC (rev 2882) @@ -0,0 +1,52 @@ +/* ArchivalUrlContextResultURIConverterFactory + * + * $Id$: + * + * Created on Nov 5, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.archivalurl; + +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.replay.html.ContextResultURIConverterFactory; + +/** + * @author brad + * + */ +public class ArchivalUrlContextResultURIConverterFactory + implements ContextResultURIConverterFactory { + private ArchivalUrlResultURIConverter converter = null; + public ArchivalUrlContextResultURIConverterFactory( + ArchivalUrlResultURIConverter converter) { + this.converter = converter; + } + /* (non-Javadoc) + * @see org.archive.wayback.replay.html.ContextResultURIConverterFactory#getContextConverter(java.lang.String) + */ + public ResultURIConverter getContextConverter(String flags) { + if(flags == null) { + return converter; + } + return new ArchivalUrlSpecialContextResultURIConverter(converter,flags); + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlContextResultURIConverterFactory.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java 2009-11-05 23:50:17 UTC (rev 2882) @@ -0,0 +1,174 @@ +/* ArchivalUrlSAXRewriteReplayRenderer + * + * $Id$ + * + * Created on 12:15:33 PM Feb 12, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.archivalurl; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.ReplayRenderer; +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.WaybackException; +import org.archive.wayback.replay.HttpHeaderOperation; +import org.archive.wayback.replay.HttpHeaderProcessor; +import org.archive.wayback.replay.JSPExecutor; +import org.archive.wayback.replay.charset.CharsetDetector; +import org.archive.wayback.replay.charset.StandardCharsetDetector; +import org.archive.wayback.replay.html.ReplayParseEventDelegator; +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.util.htmllex.ContextAwareLexer; +import org.htmlparser.Node; +import org.htmlparser.lexer.Lexer; +import org.htmlparser.lexer.Page; +import org.htmlparser.util.ParserException; + +public class ArchivalUrlSAXRewriteReplayRenderer implements ReplayRenderer { + private ReplayParseEventDelegator delegator = null; + private HttpHeaderProcessor httpHeaderProcessor; + private CharsetDetector charsetDetector = new StandardCharsetDetector(); + private final static String OUTPUT_CHARSET = "utf-8"; + + public ArchivalUrlSAXRewriteReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { + this.httpHeaderProcessor = httpHeaderProcessor; + } + + // assume this is only called for appropriate doc types: html + public void renderResource(HttpServletRequest httpRequest, + HttpServletResponse httpResponse, WaybackRequest wbRequest, + CaptureSearchResult result, Resource resource, + ResultURIConverter uriConverter, CaptureSearchResults results) + throws ServletException, IOException, WaybackException { + + // copy the HTTP response code: + HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse); + + // transform the original headers according to our headerProcessor: + Map<String,String> headers = HttpHeaderOperation.processHeaders( + resource, result, uriConverter, httpHeaderProcessor); + + // prepare several objects for the parse: + + // a JSPExecutor: + JSPExecutor jspExec = new JSPExecutor(uriConverter, httpRequest, + httpResponse, wbRequest, results, result, resource); + + // The URL of the page, for resolving in-page relative URLs: + URL url = null; + try { + url = new URL(result.getOriginalUrl()); + } catch (MalformedURLException e1) { + // TODO: this shouldn't happen... + throw new IOException(e1); + } + + // To make sure we get the length, we have to buffer it all up... + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + ArchivalUrlContextResultURIConverterFactory fact = + new ArchivalUrlContextResultURIConverterFactory( + (ArchivalUrlResultURIConverter) uriConverter); + // set up the context: + ReplayParseContext context = + new ReplayParseContext(fact,url,result.getCaptureTimestamp()); + context.setOutputCharset(OUTPUT_CHARSET); + context.setOutputStream(baos); + context.setJspExec(jspExec); + + // determine the character set used to encode the document bytes: + String charSet = charsetDetector.getCharset(resource, wbRequest); + + // and finally, parse, using the special lexer that knows how to + // handle javascript blocks containing unescaped HTML entities: + Page lexPage = new Page(resource,charSet); + ContextAwareLexer lex = new ContextAwareLexer(new Lexer(lexPage), + context); + Node node; + try { + while((node = lex.nextNode()) != null) { + delegator.handleNode(context, node); + } + delegator.handleParseComplete(context); + } catch (ParserException e) { + e.printStackTrace(); + throw new IOException(e); + } + + // At this point, baos contains the utf-8 encoded bytes of our result: + byte[] utf8Bytes = baos.toByteArray(); + // set the corrected length: + headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, + String.valueOf(utf8Bytes.length)); + headers.put("X-Wayback-Guessed-Charset", charSet); + + // send back the headers: + HttpHeaderOperation.sendHeaders(headers, httpResponse); + // Tomcat will always send a charset... It's trying to be smarter than + // we are. If the original page didn't include a "charset" as part of + // the "Content-Type" HTTP header, then Tomcat will use the default.. + // who knows what that is, or what that will do to the page.. + // let's try explicitly setting it to what we used: + httpResponse.setCharacterEncoding(OUTPUT_CHARSET); + + httpResponse.getOutputStream().write(utf8Bytes); + } + + /** + * @return the charsetDetector + */ + public CharsetDetector getCharsetDetector() { + return charsetDetector; + } + + /** + * @param charsetDetector the charsetDetector to set + */ + public void setCharsetDetector(CharsetDetector charsetDetector) { + this.charsetDetector = charsetDetector; + } + + /** + * @return the delegator + */ + public ReplayParseEventDelegator getDelegator() { + return delegator; + } + + /** + * @param delegator the delegator to set + */ + public void setDelegator(ReplayParseEventDelegator delegator) { + this.delegator = delegator; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSAXRewriteReplayRenderer.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java 2009-11-05 23:50:17 UTC (rev 2882) @@ -0,0 +1,63 @@ +/* ArchivalUrlSpecialContextResultURIConverter + * + * $Id$ + * + * Created on 12:15:33 PM Feb 12, 2009. + * + * Copyright (C) 2009 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.archivalurl; + +import org.archive.wayback.ResultURIConverter; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class ArchivalUrlSpecialContextResultURIConverter +implements ResultURIConverter { + + + private String replayURIPrefix = null; + private String context; + + public ArchivalUrlSpecialContextResultURIConverter( + ArchivalUrlResultURIConverter converter, String context) { + replayURIPrefix = converter.getReplayURIPrefix(); + this.context = context; + } + + /* (non-Javadoc) + * @see org.archive.wayback.ResultURIConverter#makeReplayURI(java.lang.String, java.lang.String) + */ + public String makeReplayURI(String datespec, String url) { + String suffix = datespec + context + "/" + url; + if(replayURIPrefix == null) { + return suffix; + } else { + if(url.startsWith(replayURIPrefix)) { + return url; + } + return replayURIPrefix + suffix; + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlSpecialContextResultURIConverter.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |