Revision: 3093 http://archive-access.svn.sourceforge.net/archive-access/?rev=3093&view=rev Author: bradtofel Date: 2010-05-17 19:27:12 +0000 (Mon, 17 May 2010) Log Message: ----------- INITIAL REV: leaner default implementation of current server-side rewrite rules. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2010-05-17 19:27:12 UTC (rev 3093) @@ -0,0 +1,346 @@ +/* FastArchivalUrlReplayParseEventHandler + * + * $Id$: + * + * Created on May 4, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.archivalurl; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.HashMap; + +import javax.servlet.ServletException; + +import org.archive.wayback.replay.html.ReplayParseContext; +import org.archive.wayback.replay.html.StringTransformer; +import org.archive.wayback.replay.html.transformer.BlockCSSStringTransformer; +import org.archive.wayback.replay.html.transformer.InlineCSSStringTransformer; +import org.archive.wayback.replay.html.transformer.JSStringTransformer; +import org.archive.wayback.replay.html.transformer.MetaRefreshUrlStringTransformer; +import org.archive.wayback.replay.html.transformer.URLStringTransformer; +import org.archive.wayback.util.htmllex.NodeUtils; +import org.archive.wayback.util.htmllex.ParseContext; +import org.archive.wayback.util.htmllex.ParseEventHandler; +import org.htmlparser.Node; +import org.htmlparser.nodes.TagNode; +import org.htmlparser.nodes.TextNode; + +/** + * Lean and mean ParseEventHandler implementing current best-known server-side + * HTML rewrite rules, and should be much faster than the fully configurable + * version. + * + * @author brad + * + */ +public class FastArchivalUrlReplayParseEventHandler implements + ParseEventHandler { + + private final static String FERRET_DONE_KEY = + FastArchivalUrlReplayParseEventHandler.class.toString(); + + private String jspInsertPath = "/WEB-INF/replay/DisclaimChooser.jsp"; + + private final String[] okHeadTags = { "!DOCTYPE", "HTML", "HEAD", "BASE", + "LINK", "META", "TITLE", "STYLE", "SCRIPT", "BODY" }; + private HashMap<String, Object> okHeadTagMap = null; + private final static String FRAMESET_TAG = "FRAMESET"; + private final static String BODY_TAG = "BODY"; + + private static BlockCSSStringTransformer cssBlockTrans = + new BlockCSSStringTransformer(); + private static InlineCSSStringTransformer cssInlineTrans = + new InlineCSSStringTransformer(); + private static JSStringTransformer jsBlockTrans = + new JSStringTransformer(); + private static MetaRefreshUrlStringTransformer metaRefreshTrans = + new MetaRefreshUrlStringTransformer(); + private static URLStringTransformer anchorUrlTrans = + new URLStringTransformer(); + private static URLStringTransformer cssUrlTrans = + new URLStringTransformer("cs_"); + private static URLStringTransformer jsUrlTrans = + new URLStringTransformer("js_"); + private static URLStringTransformer imageUrlTrans = + new URLStringTransformer("im_"); + + /** Constructor... */ + public FastArchivalUrlReplayParseEventHandler() { + okHeadTagMap = new HashMap<String, Object>(okHeadTags.length); + for (String tag : okHeadTags) { + okHeadTagMap.put(tag, null); + } + } + + // TODO: This should all be refactored up into an abstract base class with + // default no-op methods, allowing a subclass to only override the ones they + // want... + public void handleNode(ParseContext pContext, Node node) + throws IOException { + ReplayParseContext context = (ReplayParseContext) pContext; + if(NodeUtils.isRemarkNode(node)) { +// RemarkNode remarkNode = (RemarkNode) node; +// handleRemarkTextNode(context,remarkNode); + emit(context,null,node,null); + + } else if(NodeUtils.isTextNode(node)) { + TextNode textNode = (TextNode) node; + if(context.isInCSS()) { + handleCSSTextNode(context,textNode); + + } else if(context.isInScriptText()) { + handleJSTextNode(context,textNode); + } else { + emit(context,null,textNode,null); +// handleContentTextNode(context,textNode); + } + } else if(NodeUtils.isTagNode(node)) { + TagNode tagNode = (TagNode) node; + if(tagNode.isEndTag()) { + emit(context,null,tagNode,null); +// handleCloseTagNode(context,tagNode); + } else { + // assume start, possibly empty: + handleOpenTagNode(context,tagNode); + } + } else { + throw new IllegalArgumentException("Unknown node type.."); + } + } + + /** + * @param context + * @param textNode + * @throws IOException + */ + private void handleCSSTextNode(ReplayParseContext context, TextNode textNode) throws IOException { + textNode.setText(cssBlockTrans.transform(context, textNode.getText())); + emit(context,null,textNode,null); + } + /** + * @param context + * @param textNode + * @throws IOException + */ + private void handleJSTextNode(ReplayParseContext context, TextNode textNode) throws IOException { + textNode.setText(jsBlockTrans.transform(context, textNode.getText())); + emit(context,null,textNode,null); + } + + private void handleOpenTagNode(ReplayParseContext context, TagNode tagNode) + throws IOException { + + boolean insertedJsp = context.getData(FERRET_DONE_KEY) != null; + String preEmit = null; + String postEmit = null; + + String tagName = tagNode.getTagName(); + // Time to insert the JSP header? + if(!insertedJsp) { + if(!okHeadTagMap.containsKey(tagName)) { + if(tagName.equals(FRAMESET_TAG)) { + // don't put the insert in framsets: + } else { + String tmp = null; + try { + tmp = + context.getJspExec().jspToString(jspInsertPath); + } catch (ServletException e) { + e.printStackTrace(); + } + if (tagName.equals(BODY_TAG)) { + // insert it now, *after* the current Tag: + postEmit = tmp; + } else { + // hrm... we are seeing a node that should be in + // the body.. lets emit the jsp now, *before* + // the current Tag: + preEmit = tmp; + } + } + context.putData(FERRET_DONE_KEY,""); + } + } + // now do all the usual attribute rewriting: + // this could be slightly optimized by moving tags more likely to occur + // to the front of the if/else if/else if routing... + + if(tagName.equals("A")) { + transformAttr(context, tagNode, "HREF", anchorUrlTrans); + + } else if(tagName.equals("APPLET")) { + transformAttr(context, tagNode, "CODEBASE", anchorUrlTrans); + transformAttr(context, tagNode, "ARCHIVE", anchorUrlTrans); + + } else if(tagName.equals("AREA")) { + transformAttr(context, tagNode, "HREF", anchorUrlTrans); + + } else if(tagName.equals("BASE")) { + String orig = tagNode.getAttribute("HREF"); + if(orig != null) { + try { + context.setBaseUrl(new URL(orig)); + } catch (MalformedURLException e) { + e.printStackTrace(); + } + } + + } else if(tagName.equals("EMBED")) { + transformAttr(context, tagNode, "SRC", anchorUrlTrans); + + } else if(tagName.equals("IFRAME")) { + transformAttr(context, tagNode, "SRC", anchorUrlTrans); + + } else if(tagName.equals("IMG")) { + transformAttr(context, tagNode, "SRC", imageUrlTrans); + + } else if(tagName.equals("INPUT")) { + transformAttr(context, tagNode, "SRC", imageUrlTrans); + + } else if(tagName.equals("FORM")) { + transformAttr(context, tagNode, "ACTION", anchorUrlTrans); + + } else if(tagName.equals("FRAME")) { + transformAttr(context, tagNode, "SRC", anchorUrlTrans); + + } else if(tagName.equals("LINK")) { + if(transformAttrWhere(context, tagNode, "REL", "STYLESHEET", + "HREF",cssUrlTrans)) { + // no-op + } else if(transformAttrWhere(context,tagNode,"REL","SHORTCUT ICON", + "HREF", imageUrlTrans)) { + // no-op + } else { + transformAttr(context, tagNode, "HREF", anchorUrlTrans); + } + + } else if(tagName.equals("META")) { + transformAttrWhere(context, tagNode, "HTTP-EQUIV", "REFRESH", + "CONTENT", metaRefreshTrans); + transformAttr(context, tagNode, "URL", anchorUrlTrans); + + } else if(tagName.equals("OBJECT")) { + transformAttr(context, tagNode, "CODEBASE", anchorUrlTrans); + transformAttr(context, tagNode, "CDATA", anchorUrlTrans); + + } else if(tagName.equals("SCRIPT")) { + transformAttr(context, tagNode, "SRC", jsUrlTrans); + } + // now, for *all* tags... + transformAttr(context,tagNode,"BACKGROUND", imageUrlTrans); + transformAttr(context,tagNode,"STYLE", cssInlineTrans); + transformAttr(context,tagNode,"onclick", jsBlockTrans); + + emit(context,preEmit,tagNode,postEmit); + } + + private void emit(ReplayParseContext context, String pre, Node node, + String post) throws IOException { + + OutputStream out = context.getOutputStream(); + if(out != null) { + Charset charset = Charset.forName(context.getOutputCharset()); + + if(pre != null) { + + out.write(pre.getBytes(charset)); + } + + out.write(node.toHtml(true).getBytes(charset)); + + if(post != null) { + + out.write(post.getBytes(charset)); + } + } + } + + /** + * Transform a particular attribute on a TagNode, if that TagNode has a + * previous value for the updated attribute, AND if that TagNode contains + * another named attribute with a specific value. + * + * @param context the ReplayParseContext + * @param node the TagNode to be updated + * @param attrName update only occurs if the TagNode has an attribute with + * this name. + * @param attrVal update only occurs if the TagNode has an attribute + * attrName has this value, case insensitive. In fact as an optimization, + * it is ASSUMED that this argument is already UPPER-CASED + * @param modAttr the attribute value to update + * @param transformer the StringTransformer responsible for creating the + * new value based on the old one. + * @return true if the attribute was updated. + */ + private boolean transformAttrWhere(ReplayParseContext context, TagNode node, + String attrName, String attrVal, String modAttr, + StringTransformer transformer) { + String val = node.getAttribute(attrName); + if(val != null) { + if(val.toUpperCase().equals(attrVal)) { + return transformAttr(context,node,modAttr,transformer); + } + } + return false; + } + /** + * Transform a particular attribute on a TagNode, iff that attribute exists + * + * @param context The ReplayParseContext being transformed + * @param node the TagNode to update + * @param attr the attribute name to transform + * @param transformer the StringTransformer responsible for creating the + * new value + * @return true if the attribute was found and updated + */ + private boolean transformAttr(ReplayParseContext context, TagNode node, + String attr, StringTransformer transformer) { + String orig = node.getAttribute(attr); + if(orig != null) { + node.setAttribute(attr, + transformer.transform(context, orig)); + return true; + } + return false; + } + public void handleParseComplete(ParseContext context) throws IOException { + // Nothing to do. + } + + /** + * @return the jspInsertPath + */ + public String getJspInsertPath() { + return jspInsertPath; + } + + /** + * @param jspInsertPath the jspInsertPath to set + */ + public void setJspInsertPath(String jspInsertPath) { + this.jspInsertPath = jspInsertPath; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |