|
From: <bra...@us...> - 2009-11-05 23:02:01
|
Revision: 2879
http://archive-access.svn.sourceforge.net/archive-access/?rev=2879&view=rev
Author: bradtofel
Date: 2009-11-05 23:01:46 +0000 (Thu, 05 Nov 2009)
Log Message:
-----------
INITIAL REV: Library which sits on top of htmlparser, enabling SAX stream handling for both indexing, text & link extraction, and streaming modifications to HTML documents.
Added Paths:
-----------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,84 @@
+/* ContextAwareLexer
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex;
+
+import org.htmlparser.Node;
+import org.htmlparser.lexer.Lexer;
+import org.htmlparser.util.ParserException;
+
+/**
+ * Lexer wrapper that tracks SCRIPT and STYLE context while tokenizing.
+ *
+ * The Lexer that comes with htmlparser does not handle non-escaped HTML
+ * entities within SCRIPT tags - by default, something like:
+ *
+ * <pre>
+ * &lt;script&gt;
+ * for(var i=0; i&lt;23; i++) { j+=i; }
+ * &lt;/script&gt;
+ * </pre>
+ * can cause the lexer to skip over a large part of the document. Technically,
+ * the above isn't legit HTML, but of course, folks do stuff like that all the
+ * time. So, this class updates a ParseContext object, passed in at
+ * construction, as it sees SCRIPT and STYLE tags, and uses that state to
+ * perform a parseCDATA() call instead of a nextNode() call right after an open
+ * SCRIPT tag, to try to keep the SAX parsing in sync with the document.
+ *
+ * @author brad
+ */
+public class ContextAwareLexer extends NodeUtils {
+
+ private Lexer lexer = null; // underlying htmlparser lexer that produces the nodes
+ private ParseContext context = null; // state read and updated on every nextNode() call
+ public ContextAwareLexer(Lexer lexer, ParseContext context) {
+ this.lexer = lexer;
+ this.context = context;
+ }
+ public Node nextNode() throws ParserException { // returns null when the underlying lexer returns null
+ Node node = null;
+ if(context.isInJS()) { // set below once a non-empty open SCRIPT tag has been returned
+ node = lexer.parseCDATA(true); // NOTE(review): 'true' is presumably htmlparser's quotesmart flag - confirm
+ if(node != null) {
+ context.setInScriptText(true); // marks the node being returned as script text
+ context.setInJS(false); // the next call goes back to the nextNode() path
+ return node;
+ }
+ }
+ context.setInScriptText(false); // nodes from the path below are never flagged as script text
+ node = lexer.nextNode(context.isInJS()); // inJS is still true here only if parseCDATA() returned null
+ if(node != null) {
+ if(isNonEmptyOpenTagNodeNamed(node, SCRIPT_TAG_NAME)) {
+ context.setInJS(true); // makes the next call try parseCDATA() first
+ } else if(isCloseTagNodeNamed(node, SCRIPT_TAG_NAME)) {
+ context.setInJS(false);
+ } else if(isNonEmptyOpenTagNodeNamed(node, STYLE_TAG_NAME)) {
+ context.setInCSS(true); // STYLE content is still read via nextNode(); only the flag changes
+ } else if(isCloseTagNodeNamed(node, STYLE_TAG_NAME)) {
+ context.setInCSS(false);
+ }
+ }
+ return node;
+ }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ContextAwareLexer.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,83 @@
+/* NodeUtils
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex;
+
+import org.htmlparser.Node;
+import org.htmlparser.nodes.RemarkNode;
+import org.htmlparser.nodes.TagNode;
+import org.htmlparser.nodes.TextNode;
+
+/**
+ * Static helpers for classifying htmlparser Nodes by type and tag name.
+ * Names are compared exactly (case-sensitively) with TagNode.getTagName();
+ * a tag whose getTagName() is null never matches, instead of throwing.
+ */
+public class NodeUtils {
+    public static final String SCRIPT_TAG_NAME = "SCRIPT";
+    public static final String STYLE_TAG_NAME = "STYLE";
+    /** @return true if node is a TagNode (open or close tag) */
+    public static boolean isTagNode(Node node) {
+        return (node instanceof TagNode);
+    }
+    /** @return true if node is a TextNode */
+    public static boolean isTextNode(Node node) {
+        return (node instanceof TextNode);
+    }
+    /** @return true if node is a RemarkNode */
+    public static boolean isRemarkNode(Node node) {
+        return (node instanceof RemarkNode);
+    }
+    /** @return true if node is an open or close tag named name */
+    public static boolean isTagNodeNamed(Node node, String name) {
+        return isTagNode(node) && hasTagName((TagNode) node, name);
+    }
+    /** @return true if node is a tag named name that is not an end tag */
+    public static boolean isOpenTagNodeNamed(Node node, String name) {
+        return isTagNode(node) && !((TagNode) node).isEndTag()
+                && hasTagName((TagNode) node, name);
+    }
+    /** @return true if node is an open tag named name and not an empty XML tag */
+    public static boolean isNonEmptyOpenTagNodeNamed(Node node, String name) {
+        if(!isTagNode(node)) {
+            return false;
+        }
+        TagNode tagNode = (TagNode) node;
+        return !tagNode.isEndTag() && !tagNode.isEmptyXmlTag()
+                && hasTagName(tagNode, name);
+    }
+    /** @return true if node is an end tag named name */
+    public static boolean isCloseTagNodeNamed(Node node, String name) {
+        return isTagNode(node) && ((TagNode) node).isEndTag()
+                && hasTagName((TagNode) node, name);
+    }
+    /**
+     * Null-safe name test: htmlparser's getTagName() can return null for a
+     * tag without a name, which made a bare nodeName.equals(name) throw.
+     */
+    private static boolean hasTagName(TagNode tagNode, String name) {
+        String nodeName = tagNode.getTagName();
+        return nodeName != null && nodeName.equals(name);
+    }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/NodeUtils.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,118 @@
+/* ParseContext
+ *
+ * $Id$
+ *
+ * Created on 2:06:46 PM Feb 19, 2009.
+ *
+ * Copyright (C) 2009 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashMap;
+
+/**
+ * Class which tracks the context and state involved with parsing an HTML
+ * document via SAX events.
+ *
+ * Also holds some page URL information, and provides some URL resolving
+ * functionality.
+ *
+ * Lastly, this class exposes a general purpose HashMap<String,String> for use
+ * by specific applications.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+
+public class ParseContext {
+    private static final java.util.logging.Logger LOGGER =
+        java.util.logging.Logger.getLogger(ParseContext.class.getName());
+    private static final String JS_SCHEME = "javascript:";
+
+    protected URL baseUrl = null;
+    private boolean inCSS = false;
+    private boolean inJS = false;
+    private boolean inScriptText = false;
+    private HashMap<String,String> data = null;
+
+    public ParseContext() {
+        data = new HashMap<String, String>();
+    }
+    /** Stores an application-specific key/value pair. */
+    public void putData(String key, String value) {
+        data.put(key, value);
+    }
+    /** @return the value stored under key by putData(), or null if none */
+    public String getData(String key) {
+        return data.get(key);
+    }
+    /** @param url the base URL that resolve() resolves against */
+    public void setBaseUrl(URL url) {
+        baseUrl = url;
+    }
+    /** @return url resolved against the base URL via new URL(baseUrl, url) */
+    public String resolve(String url) throws MalformedURLException {
+        return new URL(baseUrl, url).toString();
+    }
+    /**
+     * Best-effort resolve(): "javascript:" URLs (matched case-insensitively,
+     * since URL schemes are case-insensitive) and URLs that fail to resolve
+     * are returned unchanged.
+     */
+    public String contextualizeUrl(String url) {
+        if(url.regionMatches(true, 0, JS_SCHEME, 0, JS_SCHEME.length())) {
+            return url;
+        }
+        try {
+            return resolve(url);
+        } catch (MalformedURLException e) {
+            // Best effort: keep the URL as found; log quietly, not to stderr.
+            LOGGER.log(java.util.logging.Level.FINE,
+                    "Unable to resolve URL '" + url + "'", e);
+            return url;
+        }
+    }
+
+    /** @return the inCSS flag */
+    public boolean isInCSS() {
+        return inCSS;
+    }
+    /** @param inCSS the inCSS to set */
+    public void setInCSS(boolean inCSS) {
+        this.inCSS = inCSS;
+    }
+    /** @return the inJS flag */
+    public boolean isInJS() {
+        return inJS;
+    }
+    /** @param inJS the inJS to set */
+    public void setInJS(boolean inJS) {
+        this.inJS = inJS;
+    }
+    /** @return the inScriptText flag */
+    public boolean isInScriptText() {
+        return inScriptText;
+    }
+    /** @param inScriptText the inScriptText to set */
+    public void setInScriptText(boolean inScriptText) {
+        this.inScriptText = inScriptText;
+    }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,255 @@
+/* ParseEventDelegator
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.archive.wayback.util.htmllex.handlers.CSSTextHandler;
+import org.archive.wayback.util.htmllex.handlers.CloseTagHandler;
+import org.archive.wayback.util.htmllex.handlers.ContentTextHandler;
+import org.archive.wayback.util.htmllex.handlers.JSTextHandler;
+import org.archive.wayback.util.htmllex.handlers.OpenTagHandler;
+import org.archive.wayback.util.htmllex.handlers.ParseCompleteHandler;
+import org.archive.wayback.util.htmllex.handlers.RemarkTextHandler;
+import org.htmlparser.Node;
+import org.htmlparser.nodes.RemarkNode;
+import org.htmlparser.nodes.TagNode;
+import org.htmlparser.nodes.TextNode;
+
+/**
+ *
+ * This class provides an abstraction between high-level SAX events, and
+ * application specific low-level SAX event handlers.
+ *
+ * Any object which wishes to receive any low-level SAX events is placed in the
+ * parserVisitors List, and at initialization of this class, each element in
+ * that list is given an opportunity to register to receive whatever low-level
+ * SAX events it is interested in.
+ *
+ * This class also manages casting of Node objects into more event-specific
+ * casts, and uses the ParseContext to route specific nodes to the registered
+ * handlers of each low-level event types.
+ *
+ * This class attempts to be efficient about targeting specific TagNodes:
+ * When registering to receive events, handlers can register for a specific
+ * tag name, or for the global-tag ("*") name.
+ *
+ * As TagNodes are handled, all tag-specific handlers are called, followed by
+ * all global-tag handlers.
+ *
+ * @author brad
+ */
+public class ParseEventDelegator implements ParseEventHandler {
+
+ public static final String WILDCARD_TAG_NAME = "*"; // registration key meaning "every tag name"
+
+ private Map<String,List<CloseTagHandler>> closeTagHandlers = null; // tag name or "*" -> handlers, in registration order
+ private Map<String,List<OpenTagHandler>> openTagHandlers = null; // tag name or "*" -> handlers, in registration order
+ private List<CSSTextHandler> cssTextHandlers = null;
+ private List<JSTextHandler> jsTextHandler = null;
+ private List<RemarkTextHandler> remarkTextHandler = null;
+ private List<ContentTextHandler> contentTextHandler = null;
+ private List<ParseCompleteHandler> parseCompleteHandlers = null;
+ // every handler collection above stays null until its first add*Handler() call
+ private List<ParseEventDelegatorVisitor> parserVisitors = null;
+
+ // gives each configured visitor the chance to register its handlers; no-op when no visitors are set
+ public void init() {
+ if(parserVisitors != null) {
+ for(ParseEventDelegatorVisitor visitor : parserVisitors) {
+ visitor.visit(this);
+ }
+ }
+ }
+ // routes node by type: remark, text (CSS / script / content, per the context flags), or tag (close / open)
+ public void handleNode(ParseContext context, Node node)
+ throws IOException {
+
+ if(NodeUtils.isRemarkNode(node)) {
+ RemarkNode remarkNode = (RemarkNode) node;
+ handleRemarkTextNode(context,remarkNode);
+
+ } else if(NodeUtils.isTextNode(node)) {
+ TextNode textNode = (TextNode) node;
+ if(context.isInCSS()) { // checked first, so inCSS wins over inScriptText
+ handleCSSTextNode(context,textNode);
+
+ } else if(context.isInScriptText()) {
+ handleJSTextNode(context,textNode);
+ } else {
+ handleContentTextNode(context,textNode);
+ }
+ } else if(NodeUtils.isTagNode(node)) {
+ TagNode tagNode = (TagNode) node;
+ if(tagNode.isEndTag()) {
+ handleCloseTagNode(context,tagNode);
+ } else {
+ // assume start, possibly empty:
+ handleOpenTagNode(context,tagNode);
+ }
+ } else {
+ throw new IllegalArgumentException("Unknown node type.."); // also reached when node is null
+ }
+ }
+
+ // CLOSE TAG:
+ public void addCloseTagHandler(CloseTagHandler v) { // registers v for close tags of every name
+ addCloseTagHandler(v, WILDCARD_TAG_NAME);
+ }
+ public void addCloseTagHandler(CloseTagHandler v, String name) { // name is looked up exactly as TagNode.getTagName() returns it
+ if(closeTagHandlers == null) {
+ closeTagHandlers = new HashMap<String,List<CloseTagHandler>>();
+ }
+ if(!closeTagHandlers.containsKey(name)) {
+ closeTagHandlers.put(name, new ArrayList<CloseTagHandler>());
+ }
+ closeTagHandlers.get(name).add(v);
+ }
+ public void handleCloseTagNode(ParseContext context, TagNode node) throws IOException {
+ String name = node.getTagName(); // NOTE(review): NodeUtils uses "SCRIPT"/"STYLE", so names are presumably upper-case - confirm
+ if(closeTagHandlers != null) {
+ for(String n : new String[]{name,WILDCARD_TAG_NAME}) { // name-specific handlers run first, then wildcard ones
+ if(closeTagHandlers.containsKey(n)) {
+ for(CloseTagHandler v : closeTagHandlers.get(n)) {
+ v.handleCloseTagNode(context,node);
+ }
+ }
+ }
+ }
+ }
+
+ // OPEN TAG:
+ public void addOpenTagHandler(OpenTagHandler v) { // registers v for open tags of every name
+ addOpenTagHandler(v, WILDCARD_TAG_NAME);
+ }
+ public void addOpenTagHandler(OpenTagHandler v, String name) { // name is looked up exactly as TagNode.getTagName() returns it
+ if(openTagHandlers == null) {
+ openTagHandlers = new HashMap<String,List<OpenTagHandler>>();
+ }
+ if(!openTagHandlers.containsKey(name)) {
+ openTagHandlers.put(name, new ArrayList<OpenTagHandler>());
+ }
+ openTagHandlers.get(name).add(v);
+ }
+
+ public void handleOpenTagNode(ParseContext context, TagNode node) throws IOException {
+ String name = node.getTagName();
+ if(openTagHandlers != null) {
+ for(String n : new String[]{name,WILDCARD_TAG_NAME}) { // name-specific handlers run first, then wildcard ones
+ if(openTagHandlers.containsKey(n)) {
+ for(OpenTagHandler v : openTagHandlers.get(n)) {
+ v.handleOpenTagNode(context,node);
+ }
+ }
+ }
+ }
+ }
+ public void addCSSTextHandler(CSSTextHandler v) { // handlers are invoked in the order they were added
+ if(cssTextHandlers == null) {
+ cssTextHandlers = new ArrayList<CSSTextHandler>();
+ }
+ cssTextHandlers.add(v);
+ }
+ public void handleCSSTextNode(ParseContext context, TextNode node) throws IOException { // no-op when nothing is registered
+ if(cssTextHandlers != null) {
+ for(CSSTextHandler v : cssTextHandlers) {
+ v.handleCSSTextNode(context,node);
+ }
+ }
+ }
+ public void addJSTextHandler(JSTextHandler v) { // handlers are invoked in the order they were added
+ if(jsTextHandler == null) {
+ jsTextHandler = new ArrayList<JSTextHandler>();
+ }
+ jsTextHandler.add(v);
+ }
+ public void handleJSTextNode(ParseContext context, TextNode node) throws IOException { // no-op when nothing is registered
+ if(jsTextHandler != null) {
+ for(JSTextHandler v : jsTextHandler) {
+ v.handleJSTextNode(context,node);
+ }
+ }
+ }
+
+ public void addRemarkTextHandler(RemarkTextHandler v) { // handlers are invoked in the order they were added
+ if(remarkTextHandler == null) {
+ remarkTextHandler = new ArrayList<RemarkTextHandler>();
+ }
+ remarkTextHandler.add(v);
+ }
+ public void handleRemarkTextNode(ParseContext context, RemarkNode node) throws IOException { // no-op when nothing is registered
+ if(remarkTextHandler != null) {
+ for(RemarkTextHandler v : remarkTextHandler) {
+ v.handleRemarkTextNode(context,node);
+ }
+ }
+ }
+
+ public void addContentTextHandler(ContentTextHandler v) { // handlers are invoked in the order they were added
+ if(contentTextHandler == null) {
+ contentTextHandler = new ArrayList<ContentTextHandler>();
+ }
+ contentTextHandler.add(v);
+ }
+ public void handleContentTextNode(ParseContext context, TextNode node) throws IOException { // no-op when nothing is registered
+ if(contentTextHandler != null) {
+ for(ContentTextHandler v : contentTextHandler) {
+ v.handleContentTextNode(context,node);
+ }
+ }
+ }
+
+ public void addParseCompleteHandler(ParseCompleteHandler v) { // handlers are invoked in the order they were added
+ if(parseCompleteHandlers == null) {
+ parseCompleteHandlers = new ArrayList<ParseCompleteHandler>();
+ }
+ parseCompleteHandlers.add(v);
+ }
+ public void handleParseComplete(ParseContext context) throws IOException { // forwards to every registered ParseCompleteHandler
+ if(parseCompleteHandlers != null) {
+ for(ParseCompleteHandler v : parseCompleteHandlers) {
+ v.handleParseComplete(context);
+ }
+ }
+ }
+
+ /**
+ * @return the parserVisitors
+ */
+ public List<ParseEventDelegatorVisitor> getParserVisitors() {
+ return parserVisitors;
+ }
+
+ /**
+ * @param parserVisitors the parserVisitors to set
+ */
+ public void setParserVisitors(List<ParseEventDelegatorVisitor> parserVisitors) {
+ this.parserVisitors = parserVisitors;
+ }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegator.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,42 @@
+/* ParseEventDelegatorVisitor
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex;
+
+
+/**
+ *
+ * Common interface to decouple application-specific handlers from the
+ * ParseEventDelegator object: Any object interested in registering for specific
+ * low-level events can implement this interface, and can be added to the
+ * ParseEventDelegator parserVisitors list, and it will be given an opportunity
+ * to register with the ParseEventDelegator for specific events it is
+ * interested in.
+ *
+ * @author brad
+ *
+ */
+public interface ParseEventDelegatorVisitor {
+ public void visit(ParseEventDelegator rules); // ParseEventDelegator.init() passes itself as rules; register handlers via its add*Handler() methods
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventDelegatorVisitor.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,43 @@
+/* ParseEventHandler
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex;
+
+import java.io.IOException;
+
+import org.htmlparser.Node;
+
+/**
+ * General interface used with the ContextAwareLexer to handle high-level SAX
+ * stream events. See ParseEventDelegator and ParseEventDelegatorVisitor for
+ * more detailed usage.
+ *
+ * @author brad
+ *
+ */
+public interface ParseEventHandler {
+ public void handleNode(ParseContext context, Node node) // NOTE(review): presumably called once per node returned by ContextAwareLexer.nextNode() - confirm against the driving loop
+ throws IOException;
+ public void handleParseComplete(ParseContext context) throws IOException; // NOTE(review): presumably called once after the last node - confirm
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseEventHandler.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,35 @@
+/* CSSTextHandler
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex.handlers;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.htmlparser.nodes.TextNode;
+
+public interface CSSTextHandler { // callback for TextNodes that ParseEventDelegator sees while ParseContext.isInCSS() is true
+ public void handleCSSTextNode(ParseContext context, TextNode node)
+ throws IOException;
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CSSTextHandler.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,35 @@
+/* CloseTagHandler
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex.handlers;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.htmlparser.nodes.TagNode;
+
+public interface CloseTagHandler { // callback for TagNodes whose isEndTag() is true; registered per tag name or for "*" on ParseEventDelegator
+ public void handleCloseTagNode(ParseContext context, TagNode node)
+ throws IOException;
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/CloseTagHandler.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,35 @@
+/* ContentTextHandler
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex.handlers;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.htmlparser.nodes.TextNode;
+
+public interface ContentTextHandler { // callback for TextNodes seen while neither isInCSS() nor isInScriptText() is true
+ public void handleContentTextNode(ParseContext context, TextNode node)
+ throws IOException;
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ContentTextHandler.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,35 @@
+/* JSTextHandler
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex.handlers;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.htmlparser.nodes.TextNode;
+
+/** Receives a TextNode together with a ParseContext; by name, intended for JavaScript text (TODO confirm). */
+public interface JSTextHandler {
+ void handleJSTextNode(ParseContext context, TextNode node) throws IOException;
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/JSTextHandler.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,35 @@
+/* OpenTagHandler
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex.handlers;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.htmlparser.nodes.TagNode;
+
+/** Receives a TagNode together with a ParseContext; by name, intended for opening tags (TODO confirm). */
+public interface OpenTagHandler {
+ void handleOpenTagNode(ParseContext context, TagNode node) throws IOException;
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/OpenTagHandler.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,35 @@
+/* ParseCompleteHandler
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex.handlers;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseContext;
+
+
+/** Receives a ParseContext; by name, intended to signal that a parse has finished (TODO confirm). */
+public interface ParseCompleteHandler {
+ void handleParseComplete(ParseContext context) throws IOException;
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/ParseCompleteHandler.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java 2009-11-05 23:01:46 UTC (rev 2879)
@@ -0,0 +1,35 @@
+/* RemarkTextHandler
+ *
+ * $Id$
+ *
+ * Created on 12:36:59 PM Nov 5, 2009.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.util.htmllex.handlers;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.htmlparser.nodes.RemarkNode;
+
+/** Receives a RemarkNode together with a ParseContext; by name, intended for remark nodes (TODO confirm). */
+public interface RemarkTextHandler {
+ void handleRemarkTextNode(ParseContext context, RemarkNode node) throws IOException;
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/handlers/RemarkTextHandler.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|