[Htmlparser-cvs] htmlparser/src/org/htmlparser AbstractNode.java,NONE,1.1 Node.java,1.23,1.24 Remark
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv17851/src/org/htmlparser Modified Files: Node.java RemarkNode.java RemarkNodeParser.java Parser.java StringNode.java NodeReader.java Added Files: AbstractNode.java Log Message: Renamed Node to AbstractNode, extracted the new interface, Node, and moved line separator code from AbstractNode to Parser. --- NEW FILE: AbstractNode.java --- // HTMLParser Library v1_4_20030601 - A java-based parser for HTML // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // For any questions or suggestions, you can write to me at : // Email :so...@in... 
// // Postal Address : // Somik Raha // Extreme Programmer & Coach // Industrial Logic Corporation // 2583 Cedar Street, Berkeley, // CA 94708, USA // Website : http://www.industriallogic.com package org.htmlparser; import java.io.*; import org.htmlparser.tags.*; import org.htmlparser.util.*; import org.htmlparser.visitors.*; /** * AbstractNode, which implements the Node interface, is the base class for all types of nodes, including tags, string elements, etc */ public abstract class AbstractNode implements Node, Serializable { /** * The beginning position of the tag in the line */ protected int nodeBegin; /** * The ending position of the tag in the line */ protected int nodeEnd; /** * The parent of this tag, or <code>null</code> if no parent has been set */ protected CompositeTag parent = null; /** * Creates a node that records the given begin and end positions. * @param nodeBegin the beginning position of the node * @param nodeEnd the ending position of the node */ public AbstractNode(int nodeBegin, int nodeEnd) { this.nodeBegin = nodeBegin; this.nodeEnd = nodeEnd; } /** * Returns a string representation of the node. This is an important method, it allows a simple string transformation * of a web page, regardless of a node.<br> * Typical application code (for extracting only the text from a web page) would then be simplified to :<br> * <pre> * Node node; * for (Enumeration e = parser.elements();e.hasMoreElements();) { * node = (Node)e.nextElement(); * System.out.println(node.toPlainTextString()); // Or do whatever processing you wish with the plain text string * } * </pre> */ public abstract String toPlainTextString(); /** * This method will make it easier when using html parser to reproduce html pages (with or without modifications) * Applications reproducing html can use this method on nodes which are to be used or transferred as they were * received, with the original html */ public abstract String toHtml(); /** * Return the string representation of the node. 
* Subclasses must define this method, and this is typically to be used in the manner<br> * <pre>System.out.println(node)</pre> * @return java.lang.String */ public abstract String toString(); /** * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node * satisfies the filtering criteria. <P/> * * This mechanism allows powerful filtering code to be written very easily, without bothering about collection * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links * out by checking if the current node is a form tag, and going through its contents. However, this ties us down * to specific tags, and is not a very clean approach. <P/> * * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look * like : * <pre> * NodeList collectionList = new NodeList(); * Node node; * String filter = LinkTag.LINK_TAG_FILTER; * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { * node = e.nextNode(); * node.collectInto (collectionList, filter); * } * </pre> * Thus, collectionList will hold all the link nodes, irrespective of how * deep the links are embedded. This of course implies that tags must * fulfill their responsibilities toward honouring certain filters. * * <B>Important:</B> In order to keep performance optimal, <B>do not create</B> your own filter strings, as * the internal matching occurs with the pre-existing filter string object (in the relevant class). i.e. do not * make calls like : * <I>collectInto(collectionList,"-l")</I>, instead, make calls only like : * <I>collectInto(collectionList,LinkTag.LINK_TAG_FILTER)</I>.<P/> * * To find out if your desired tag has filtering support, check the API of the tag. * @param collectionList the list that matching nodes are added to * @param filter the filter string that selects which nodes are collected 
*/ public abstract void collectInto(NodeList collectionList, String filter); /** * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node * satisfies the filtering criteria. <P/> * * This mechanism allows powerful filtering code to be written very easily, without bothering about collection * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links * out by checking if the current node is a form tag, and going through its contents. However, this ties us down * to specific tags, and is not a very clean approach. <P/> * * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look * like : * <pre> * NodeList collectionList = new NodeList(); * Node node; * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { * node = e.nextNode(); * node.collectInto (collectionList, LinkTag.class); * } * </pre> * Thus, collectionList will hold all the link nodes, irrespective of how * deep the links are embedded. <P/> * * This base implementation tests only this node: it is added when the name of its class equals the name of * <code>nodeType</code>, so instances of subclasses of <code>nodeType</code> are not matched and child nodes * are not visited here. * @param collectionList the list this node is added to on a match * @param nodeType the class whose name is compared with the name of this node's class */ public void collectInto(NodeList collectionList, Class nodeType) { if (nodeType.getName().equals(this.getClass().getName())) { collectionList.add(this); } } /** * Returns the beginning position of the tag. * @return the value of nodeBegin */ public int elementBegin() { return nodeBegin; } /** * Returns the ending position of the tag * @return the value of nodeEnd */ public int elementEnd() { return nodeEnd; } public abstract void accept(NodeVisitor visitor); /** * @return the result of toHtml() * @deprecated - use toHtml() instead */ public final String toHTML() { return toHtml(); } /** * Get the parent of this tag * @return The parent of this node, if it's been set, <code>null</code> otherwise. 
*/ public CompositeTag getParent() { return parent; } /** * Sets the parent of this tag * @param tag the composite tag to store as this node's parent */ public void setParent(CompositeTag tag) { parent = tag; } } Index: Node.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Node.java,v retrieving revision 1.23 retrieving revision 1.24 diff -C2 -d -r1.23 -r1.24 *** Node.java 1 Jun 2003 20:50:08 -0000 1.23 --- Node.java 13 Jun 2003 20:27:04 -0000 1.24 *************** *** 1,91 **** - // HTMLParser Library v1_4_20030601 - A java-based parser for HTML - // Copyright (C) Dec 31, 2000 Somik Raha - // - // This library is free software; you can redistribute it and/or - // modify it under the terms of the GNU Lesser General Public - // License as published by the Free Software Foundation; either - // version 2.1 of the License, or (at your option) any later version. - // - // This library is distributed in the hope that it will be useful, - // but WITHOUT ANY WARRANTY; without even the implied warranty of - // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - // Lesser General Public License for more details. - // - // You should have received a copy of the GNU Lesser General Public - // License along with this library; if not, write to the Free Software - // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - // - // For any questions or suggestions, you can write to me at : - // Email :so...@in... - // - // Postal Address : - // Somik Raha - // Extreme Programmer & Coach - // Industrial Logic Corporation - // 2583 Cedar Street, Berkeley, - // CA 94708, USA - // Website : http://www.industriallogic.com - package org.htmlparser; ! import java.io.*; ! ! import org.htmlparser.tags.*; ! import org.htmlparser.util.*; ! import org.htmlparser.visitors.*; ! ! /** ! * A Node interface is implemented by all types of nodes (tags, string elements, etc) ! */ ! public abstract class Node implements Serializable { ! 
/** ! * The beginning position of the tag in the line ! */ ! protected int nodeBegin; ! ! /** ! * The ending position of the tag in the line ! */ ! protected int nodeEnd; ! ! /** ! * If parent of this tag ! */ ! protected CompositeTag parent; ! ! /** ! * Variable to store lineSeparator. ! * This is setup to read <code>line.separator</code> from the System property. ! * However it can also be changed using the mutator methods. ! * This will be used in the toHTML() methods in all the sub-classes of Node. ! */ ! protected static String lineSeparator = System.getProperty("line.separator", "\n"); ! ! public Node(int nodeBegin, int nodeEnd) { ! this.nodeBegin = nodeBegin; ! this.nodeEnd = nodeEnd; ! this.parent = null; ! } ! ! public Node(int nodeBegin, int nodeEnd, CompositeTag parent) { ! this.nodeBegin = nodeBegin; ! this.nodeEnd = nodeEnd; ! this.parent = parent; ! } ! ! /** ! * @param lineSeparator New Line separator to be used ! */ ! public static void setLineSeparator(String lineSeparator) { ! Node.lineSeparator = lineSeparator; ! } ! ! /** ! * @return String lineSeparator that will be used in toHTML() ! */ ! public static String getLineSeparator() { ! return Node.lineSeparator; ! } /** * Returns a string representation of the node. This is an important method, it allows a simple string transformation --- 1,9 ---- package org.htmlparser; ! import org.htmlparser.tags.CompositeTag; ! import org.htmlparser.util.NodeList; ! import org.htmlparser.visitors.NodeVisitor; + public interface Node { /** * Returns a string representation of the node. This is an important method, it allows a simple string transformation *************** *** 101,105 **** */ public abstract String toPlainTextString(); - /** * This method will make it easier when using html parser to reproduce html pages (with or without modifications) --- 19,22 ---- *************** *** 108,112 **** */ public abstract String toHtml(); - /** * Return the string representation of the node. 
--- 25,28 ---- *************** *** 116,120 **** */ public abstract String toString(); - /** * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node --- 32,35 ---- *************** *** 151,155 **** */ public abstract void collectInto(NodeList collectionList, String filter); - /** * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node --- 66,69 ---- *************** *** 175,222 **** * deep the links are embedded. */ ! public void collectInto(NodeList collectionList, Class nodeType) { ! if (nodeType.getName().equals(this.getClass().getName())) { ! collectionList.add(this); ! } ! } ! /** * Returns the beginning position of the tag. */ ! public int elementBegin() { ! return nodeBegin; ! } ! /** * Returns the ending position fo the tag */ ! public int elementEnd() { ! return nodeEnd; ! } ! public abstract void accept(NodeVisitor visitor); - /** * @deprecated - use toHtml() instead */ ! public final String toHTML() { ! return toHtml(); ! } ! /** * Get the parent of this tag * @return The parent of this node, if it's been set, <code>null</code> otherwise. */ ! public CompositeTag getParent() { ! return parent; ! } ! /** * Sets the parent of this tag * @param tag */ ! public void setParent(CompositeTag tag) { ! parent = tag; ! } ! ! } --- 89,115 ---- * deep the links are embedded. */ ! public abstract void collectInto(NodeList collectionList, Class nodeType); /** * Returns the beginning position of the tag. */ ! public abstract int elementBegin(); /** * Returns the ending position fo the tag */ ! public abstract int elementEnd(); public abstract void accept(NodeVisitor visitor); /** * @deprecated - use toHtml() instead */ ! public abstract String toHTML(); /** * Get the parent of this tag * @return The parent of this node, if it's been set, <code>null</code> otherwise. */ ! public abstract CompositeTag getParent(); /** * Sets the parent of this tag * @param tag */ ! 
public abstract void setParent(CompositeTag tag); ! } \ No newline at end of file Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** RemarkNode.java 1 Jun 2003 20:50:08 -0000 1.18 --- RemarkNode.java 13 Jun 2003 20:27:04 -0000 1.19 *************** *** 36,40 **** * The remark tag is identified and represented by this class. */ ! public class RemarkNode extends Node { public final static String REMARK_NODE_FILTER="-r"; --- 36,40 ---- * The remark tag is identified and represented by this class. */ ! public class RemarkNode extends AbstractNode { public final static String REMARK_NODE_FILTER="-r"; Index: RemarkNodeParser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNodeParser.java,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** RemarkNodeParser.java 1 Jun 2003 20:50:08 -0000 1.20 --- RemarkNodeParser.java 13 Jun 2003 20:27:04 -0000 1.21 *************** *** 149,153 **** // We need to continue parsing to the next line //input = reader.getNextLine(); ! tagContents.append(Node.getLineSeparator()); do { input = reader.getNextLine(); --- 149,153 ---- // We need to continue parsing to the next line //input = reader.getNextLine(); ! 
tagContents.append(Parser.getLineSeparator()); do { input = reader.getNextLine(); Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** Parser.java 1 Jun 2003 20:50:08 -0000 1.43 --- Parser.java 13 Jun 2003 20:27:04 -0000 1.44 *************** *** 217,220 **** --- 217,228 ---- protected transient BufferedInputStream input; + /** + * Variable to store lineSeparator. + * This is setup to read <code>line.separator</code> from the System property. + * However it can also be changed using the mutator methods. + * This will be used in the toHTML() methods in all the sub-classes of Node. + */ + protected static String lineSeparator = System.getProperty("line.separator", "\n"); + /** * A quiet message sink. *************** *** 238,244 **** * @param lineSeparator New Line separator to be used */ ! public static void setLineSeparator(String lineSeparator) { ! Node.setLineSeparator(lineSeparator); } --- 246,252 ---- * @param lineSeparator New Line separator to be used */ ! public static void setLineSeparator(String lineSeparatorString) { ! 
lineSeparator = lineSeparatorString; } *************** *** 1193,1196 **** --- 1201,1211 ---- parser.addScanner(new LinkScanner(LinkTag.LINK_TAG_FILTER)); return parser; + } + + /** + * @return String lineSeparator that will be used in toHTML() + */ + public static String getLineSeparator() { + return lineSeparator; } } Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** StringNode.java 1 Jun 2003 20:50:08 -0000 1.19 --- StringNode.java 13 Jun 2003 20:27:04 -0000 1.20 *************** *** 36,40 **** * Normal text in the html document is identified and represented by this class. */ ! public class StringNode extends Node { public static final String STRING_FILTER="-string"; --- 36,40 ---- * Normal text in the html document is identified and represented by this class. */ ! public class StringNode extends AbstractNode { public static final String STRING_FILTER="-string"; Index: NodeReader.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/NodeReader.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** NodeReader.java 1 Jun 2003 20:50:08 -0000 1.32 --- NodeReader.java 13 Jun 2003 20:27:04 -0000 1.33 *************** *** 336,340 **** public static void setLineSeparator(String lineSeparator) { ! Node.setLineSeparator(lineSeparator); } --- 336,340 ---- public static void setLineSeparator(String lineSeparator) { ! Parser.setLineSeparator(lineSeparator); } *************** *** 345,349 **** public static String getLineSeparator() { ! return (Node.getLineSeparator()); } /** --- 345,349 ---- public static String getLineSeparator() { ! return (Parser.getLineSeparator()); } /** |