[Htmlparser-cvs] htmlparser/src/org/htmlparser AbstractNode.java,1.10,1.11 Node.java,1.36,1.37 NodeR
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv31228 Modified Files: AbstractNode.java Node.java NodeReader.java RemarkNode.java StringNode.java StringNodeFactory.java Log Message: Change tabs to spaces in all source files. Index: AbstractNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/AbstractNode.java,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** AbstractNode.java 24 Aug 2003 21:59:41 -0000 1.10 --- AbstractNode.java 3 Sep 2003 23:36:18 -0000 1.11 *************** *** 37,54 **** */ public abstract class AbstractNode implements Node, Serializable { ! /** ! * The beginning position of the tag in the line ! */ ! protected int nodeBegin; ! /** ! * The ending position of the tag in the line ! */ ! protected int nodeEnd; ! /** ! * The parent of this node. ! */ ! protected Node parent; /** --- 37,54 ---- */ public abstract class AbstractNode implements Node, Serializable { ! /** ! * The beginning position of the tag in the line ! */ ! protected int nodeBegin; ! /** ! * The ending position of the tag in the line ! */ ! protected int nodeEnd; ! /** ! * The parent of this node. ! */ ! protected Node parent; /** *************** *** 62,214 **** * @param end The ending position of the node. */ ! public AbstractNode (int begin, int end) { ! nodeBegin = begin; ! nodeEnd = end; parent = null; ! } ! /** ! * Returns a string representation of the node. This is an important method, it allows a simple string transformation ! * of a web page, regardless of a node.<br> ! * Typical application code (for extracting only the text from a web page) would then be simplified to :<br> ! * <pre> ! * Node node; ! * for (Enumeration e = parser.elements();e.hasMoreElements();) { ! * node = (Node)e.nextElement(); ! * System.out.println(node.toPlainTextString()); // Or do whatever processing you wish with the plain text string ! * } ! * </pre> ! */ ! public abstract String toPlainTextString(); ! /** ! * This method will make it easier when using html parser to reproduce html pages (with or without modifications) ! * Applications reproducing html can use this method on nodes which are to be used or transferred as they were ! * recieved, with the original html ! */ ! public abstract String toHtml(); ! /** ! * Return the string representation of the node. ! * Subclasses must define this method, and this is typically to be used in the manner<br> ! * <pre>System.out.println(node)</pre> ! * @return java.lang.String ! */ ! public abstract String toString(); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : ! * <pre> ! * NodeList collectionList = new NodeList(); ! * Node node; ! * String filter = LinkTag.LINK_TAG_FILTER; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, filter); ! * } ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. This of course implies that tags must ! * fulfill their responsibilities toward honouring certain filters. ! * ! * <B>Important:</B> In order to keep performance optimal, <B>do not create</B> you own filter strings, as ! * the internal matching occurs with the pre-existing filter string object (in the relevant class). i.e. do not ! * make calls like : ! * <I>collectInto(collectionList,"-l")</I>, instead, make calls only like : ! * <I>collectInto(collectionList,LinkTag.LINK_TAG_FILTER)</I>.<P/> ! * ! * To find out if your desired tag has filtering support, check the API of the tag. ! */ ! public abstract void collectInto(NodeList collectionList, String filter); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : ! * <pre> ! * NodeList collectionList = new NodeList(); ! * Node node; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, LinkTag.class); ! * } ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. ! */ ! public void collectInto(NodeList collectionList, Class nodeType) { ! if (nodeType.getName().equals(this.getClass().getName())) { ! collectionList.add(this); ! } ! } ! /** ! * Returns the beginning position of the tag. ! */ ! public int elementBegin() { ! return nodeBegin; ! } ! /** ! * Returns the ending position fo the tag ! */ ! public int elementEnd() { ! return nodeEnd; ! } ! public abstract void accept(Object visitor); ! /** ! * @deprecated - use toHtml() instead ! */ ! public final String toHTML() { ! return toHtml(); ! } ! ! /** ! * Get the parent of this node. * This will always return null when parsing without scanners, * i.e. if semantic parsing was not performed. * The object returned from this method can be safely cast to a <code>CompositeTag</code>. ! * @return The parent of this node, if it's been set, <code>null</code> otherwise. ! */ ! public Node getParent () { ! return (parent); ! } /** ! * Sets the parent of this node. ! * @param node The node that contains this node. Must be a <code>CompositeTag</code>. ! */ ! public void setParent (Node node) { ! parent = node; ! } ! /** * Get the children of this node. * @return The list of children contained by this node, if it's been set, <code>null</code> otherwise. */ ! public NodeList getChildren () { return (children); --- 62,214 ---- * @param end The ending position of the node. */ ! public AbstractNode (int begin, int end) { ! nodeBegin = begin; ! nodeEnd = end; parent = null; ! } ! /** ! * Returns a string representation of the node. This is an important method, it allows a simple string transformation ! * of a web page, regardless of a node.<br> ! * Typical application code (for extracting only the text from a web page) would then be simplified to :<br> ! * <pre> ! * Node node; ! * for (Enumeration e = parser.elements();e.hasMoreElements();) { ! * node = (Node)e.nextElement(); ! * System.out.println(node.toPlainTextString()); // Or do whatever processing you wish with the plain text string ! * } ! * </pre> ! */ ! public abstract String toPlainTextString(); ! /** ! * This method will make it easier when using html parser to reproduce html pages (with or without modifications) ! * Applications reproducing html can use this method on nodes which are to be used or transferred as they were ! * recieved, with the original html ! */ ! public abstract String toHtml(); ! /** ! * Return the string representation of the node. ! * Subclasses must define this method, and this is typically to be used in the manner<br> ! * <pre>System.out.println(node)</pre> ! * @return java.lang.String ! */ ! public abstract String toString(); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : ! * <pre> ! * NodeList collectionList = new NodeList(); ! * Node node; ! * String filter = LinkTag.LINK_TAG_FILTER; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, filter); ! * } ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. This of course implies that tags must ! * fulfill their responsibilities toward honouring certain filters. ! * ! * <B>Important:</B> In order to keep performance optimal, <B>do not create</B> you own filter strings, as ! * the internal matching occurs with the pre-existing filter string object (in the relevant class). i.e. do not ! * make calls like : ! * <I>collectInto(collectionList,"-l")</I>, instead, make calls only like : ! * <I>collectInto(collectionList,LinkTag.LINK_TAG_FILTER)</I>.<P/> ! * ! * To find out if your desired tag has filtering support, check the API of the tag. ! */ ! public abstract void collectInto(NodeList collectionList, String filter); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : ! * <pre> ! * NodeList collectionList = new NodeList(); ! * Node node; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, LinkTag.class); ! * } ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. ! */ ! public void collectInto(NodeList collectionList, Class nodeType) { ! if (nodeType.getName().equals(this.getClass().getName())) { ! collectionList.add(this); ! } ! } ! /** ! * Returns the beginning position of the tag. ! */ ! public int elementBegin() { ! return nodeBegin; ! } ! /** ! * Returns the ending position fo the tag ! */ ! public int elementEnd() { ! return nodeEnd; ! } ! public abstract void accept(Object visitor); ! /** ! * @deprecated - use toHtml() instead ! */ ! public final String toHTML() { ! return toHtml(); ! } ! ! /** ! * Get the parent of this node. * This will always return null when parsing without scanners, * i.e. if semantic parsing was not performed. * The object returned from this method can be safely cast to a <code>CompositeTag</code>. ! * @return The parent of this node, if it's been set, <code>null</code> otherwise. ! */ ! public Node getParent () { ! return (parent); ! } /** ! * Sets the parent of this node. ! * @param node The node that contains this node. Must be a <code>CompositeTag</code>. ! */ ! public void setParent (Node node) { ! parent = node; ! } ! /** * Get the children of this node. * @return The list of children contained by this node, if it's been set, <code>null</code> otherwise. */ ! public NodeList getChildren () { return (children); *************** *** 219,223 **** * @param children The new list of children this node contains. */ ! public void setChildren (NodeList children) { this.children = children; --- 219,223 ---- * @param children The new list of children this node contains. */ ! public void setChildren (NodeList children) { this.children = children; *************** *** 225,241 **** /** ! * Returns the text of the string line ! */ ! public String getText() { ! return null; ! } ! ! /** ! * Sets the string contents of the node. ! * @param text The new text for the node. ! */ ! public void setText(String text) { ! } } --- 225,241 ---- /** ! * Returns the text of the string line ! */ ! public String getText() { ! return null; ! } ! ! /** ! * Sets the string contents of the node. ! * @param text The new text for the node. ! */ ! public void setText(String text) { ! } } Index: Node.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Node.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** Node.java 24 Aug 2003 21:59:41 -0000 1.36 --- Node.java 3 Sep 2003 23:36:18 -0000 1.37 *************** *** 34,146 **** public interface Node { ! /** ! * Returns a string representation of the node. This is an important method, it allows a simple string transformation ! * of a web page, regardless of a node.<br> ! * Typical application code (for extracting only the text from a web page) would then be simplified to :<br> ! * <pre> ! * Node node; ! * for (Enumeration e = parser.elements();e.hasMoreElements();) { ! * node = (Node)e.nextElement(); ! * System.out.println(node.toPlainTextString()); // Or do whatever processing you wish with the plain text string ! * } ! * </pre> ! */ ! public abstract String toPlainTextString(); ! /** ! * This method will make it easier when using html parser to reproduce html pages (with or without modifications) ! * Applications reproducing html can use this method on nodes which are to be used or transferred as they were ! * recieved, with the original html ! */ ! public abstract String toHtml(); ! /** ! * Return the string representation of the node. ! * Subclasses must define this method, and this is typically to be used in the manner<br> ! * <pre>System.out.println(node)</pre> ! * @return java.lang.String ! */ ! public abstract String toString(); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : ! * <pre> ! * NodeList collectionList = new NodeList(); ! * Node node; ! * String filter = LinkTag.LINK_TAG_FILTER; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, filter); ! * } ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. This of course implies that tags must ! * fulfill their responsibilities toward honouring certain filters. ! * ! * <B>Important:</B> In order to keep performance optimal, <B>do not create</B> you own filter strings, as ! * the internal matching occurs with the pre-existing filter string object (in the relevant class). i.e. do not ! * make calls like : ! * <I>collectInto(collectionList,"-l")</I>, instead, make calls only like : ! * <I>collectInto(collectionList,LinkTag.LINK_TAG_FILTER)</I>.<P/> ! * ! * To find out if your desired tag has filtering support, check the API of the tag. ! */ ! public abstract void collectInto(NodeList collectionList, String filter); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : ! * <pre> ! * NodeList collectionList = new NodeList(); ! * Node node; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, LinkTag.class); ! * } ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. ! */ ! public abstract void collectInto(NodeList collectionList, Class nodeType); ! /** ! * Returns the beginning position of the tag. ! */ ! public abstract int elementBegin(); ! /** ! * Returns the ending position fo the tag ! */ ! public abstract int elementEnd(); ! public abstract void accept(Object visitor); ! /** ! * Get the parent of this node. * This will always return null when parsing without scanners, * i.e. if semantic parsing was not performed. * The object returned from this method can be safely cast to a <code>CompositeTag</code>. ! * @return The parent of this node, if it's been set, <code>null</code> otherwise. ! */ ! public abstract Node getParent (); /** ! * Sets the parent of this node. ! * @param node The node that contains this node. Must be a <code>CompositeTag</code>. ! */ ! public abstract void setParent (Node node); /** --- 34,146 ---- public interface Node { ! /** ! * Returns a string representation of the node. This is an important method, it allows a simple string transformation ! * of a web page, regardless of a node.<br> ! * Typical application code (for extracting only the text from a web page) would then be simplified to :<br> ! * <pre> ! * Node node; ! * for (Enumeration e = parser.elements();e.hasMoreElements();) { ! * node = (Node)e.nextElement(); ! * System.out.println(node.toPlainTextString()); // Or do whatever processing you wish with the plain text string ! * } ! * </pre> ! */ ! public abstract String toPlainTextString(); ! /** ! * This method will make it easier when using html parser to reproduce html pages (with or without modifications) ! * Applications reproducing html can use this method on nodes which are to be used or transferred as they were ! * recieved, with the original html ! */ ! public abstract String toHtml(); ! /** ! * Return the string representation of the node. ! * Subclasses must define this method, and this is typically to be used in the manner<br> ! * <pre>System.out.println(node)</pre> ! * @return java.lang.String ! */ ! public abstract String toString(); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : ! * <pre> ! * NodeList collectionList = new NodeList(); ! * Node node; ! * String filter = LinkTag.LINK_TAG_FILTER; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, filter); ! * } ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. This of course implies that tags must ! * fulfill their responsibilities toward honouring certain filters. ! * ! * <B>Important:</B> In order to keep performance optimal, <B>do not create</B> you own filter strings, as ! * the internal matching occurs with the pre-existing filter string object (in the relevant class). i.e. do not ! * make calls like : ! * <I>collectInto(collectionList,"-l")</I>, instead, make calls only like : ! * <I>collectInto(collectionList,LinkTag.LINK_TAG_FILTER)</I>.<P/> ! * ! * To find out if your desired tag has filtering support, check the API of the tag. ! */ ! public abstract void collectInto(NodeList collectionList, String filter); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : ! * <pre> ! * NodeList collectionList = new NodeList(); ! * Node node; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, LinkTag.class); ! * } ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. ! */ ! public abstract void collectInto(NodeList collectionList, Class nodeType); ! /** ! * Returns the beginning position of the tag. ! */ ! public abstract int elementBegin(); ! /** ! * Returns the ending position fo the tag ! */ ! public abstract int elementEnd(); ! public abstract void accept(Object visitor); ! /** ! * Get the parent of this node. * This will always return null when parsing without scanners, * i.e. if semantic parsing was not performed. * The object returned from this method can be safely cast to a <code>CompositeTag</code>. ! * @return The parent of this node, if it's been set, <code>null</code> otherwise. ! */ ! public abstract Node getParent (); /** ! * Sets the parent of this node. ! * @param node The node that contains this node. Must be a <code>CompositeTag</code>. ! */ ! public abstract void setParent (Node node); /** *************** *** 148,152 **** * @return The list of children contained by this node, if it's been set, <code>null</code> otherwise. */ ! public abstract NodeList getChildren (); /** --- 148,152 ---- * @return The list of children contained by this node, if it's been set, <code>null</code> otherwise. */ ! public abstract NodeList getChildren (); /** *************** *** 154,168 **** * @param children The new list of children this node contains. */ ! public abstract void setChildren (NodeList children); ! /** ! * Returns the text of the string line ! */ ! public String getText(); ! ! /** ! * Sets the string contents of the node. ! * @param text The new text for the node. ! */ ! public void setText(String text); } --- 154,168 ---- * @param children The new list of children this node contains. */ ! public abstract void setChildren (NodeList children); ! /** ! * Returns the text of the string line ! */ ! public String getText(); ! ! /** ! * Sets the string contents of the node. ! * @param text The new text for the node. ! */ ! public void setText(String text); } Index: NodeReader.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/NodeReader.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** NodeReader.java 24 Aug 2003 21:59:41 -0000 1.40 --- NodeReader.java 3 Sep 2003 23:36:18 -0000 1.41 *************** *** 52,98 **** public class NodeReader extends BufferedReader { ! public static final String DECIPHER_ERROR="NodeReader.readElement() : Error occurred while trying to decipher the tag using scanners"; ! protected int posInLine=-1; ! protected String line; ! protected Node node = null; ! protected TagScanner previousOpenScanner = null; ! protected String url; ! private Parser parser; ! private int lineCount; ! private String previousLine; ! private StringParser stringParser = new StringParser(); ! private RemarkNodeParser remarkNodeParser = new RemarkNodeParser(); ! private NodeList nextParsedNode = new NodeList(); ! private boolean dontReadNextLine=false; ! /** ! * The constructor takes in a reader object, it's length and the url to be read. ! */ ! public NodeReader(Reader in,int len,String url) ! { ! super(in, len); ! this.url = url; ! this.parser = null; ! this.lineCount = 1; ! } ! /** ! * This constructor basically overrides the existing constructor in the ! * BufferedReader class. * The URL defaults to an empty string. * @see #NodeReader(Reader,int,String) ! */ ! public NodeReader(Reader in, int len) ! { ! this(in,len,""); ! } ! /** ! * The constructor takes in a reader object, and the url to be read. * The buffer size defaults to 8192. * @see #NodeReader(Reader,int,String) ! */ ! public NodeReader(Reader in,String url) ! { ! this(in, 8192, url); ! } /** --- 52,98 ---- public class NodeReader extends BufferedReader { ! public static final String DECIPHER_ERROR="NodeReader.readElement() : Error occurred while trying to decipher the tag using scanners"; ! protected int posInLine=-1; ! protected String line; ! protected Node node = null; ! protected TagScanner previousOpenScanner = null; ! protected String url; ! private Parser parser; ! private int lineCount; ! private String previousLine; ! private StringParser stringParser = new StringParser(); ! private RemarkNodeParser remarkNodeParser = new RemarkNodeParser(); ! private NodeList nextParsedNode = new NodeList(); ! private boolean dontReadNextLine=false; ! /** ! * The constructor takes in a reader object, it's length and the url to be read. ! */ ! public NodeReader(Reader in,int len,String url) ! { ! super(in, len); ! this.url = url; ! this.parser = null; ! this.lineCount = 1; ! } ! /** ! * This constructor basically overrides the existing constructor in the ! * BufferedReader class. * The URL defaults to an empty string. * @see #NodeReader(Reader,int,String) ! */ ! public NodeReader(Reader in, int len) ! { ! this(in,len,""); ! } ! /** ! * The constructor takes in a reader object, and the url to be read. * The buffer size defaults to 8192. * @see #NodeReader(Reader,int,String) ! */ ! public NodeReader(Reader in,String url) ! { ! this(in, 8192, url); ! } /** *************** *** 105,177 **** } ! /** ! * This method is intended to be called only by scanners, when a situation of dirty html has arisen, ! * and action has been taken to correct the parsed tags. For e.g. if we have html of the form : ! * <pre> ! * <a href="somelink.html"><img src=...><td><tr><a href="someotherlink.html">...</a> ! * </pre> ! * Now to salvage the first link, we'd probably like to insert an end tag somewhere (typically before the ! * second begin link tag). So that the parsing continues uninterrupted, we will need to change the existing ! * line being parsed, to contain the end tag in it. ! */ ! public void changeLine(String line) { ! this.line = line; ! } ! public String getCurrentLine() { ! return line; ! } ! /** ! * Get the last line number that the reader has read ! * @return int last line number read by the reader ! */ ! public int getLastLineNumber() { ! return lineCount-1; ! } ! /** ! * This method is useful when designing your own scanners. You might need to find out what is the location where the ! * reader has stopped last. ! * @return int Last position read by the reader ! */ ! public int getLastReadPosition() { ! if (node!=null) return node.elementEnd(); else ! return 0; ! } ! /* ! * Read the next line ! * @return String containing the line ! */ ! public String getNextLine() ! { ! try ! { ! previousLine = line; ! line = readLine(); ! if (line!=null) ! lineCount++; ! posInLine = 0; ! return line; ! } ! catch (IOException e) ! { ! System.err.println("I/O Exception occurred while reading!"); ! } ! return null; ! } ! /** ! * Returns the parser object for which this reader exists ! * @return org.htmlparser.Parser ! */ ! public Parser getParser() { ! return parser; ! } ! /** ! * Gets the previousOpenScanner. ! * @return Returns a TagScanner ! */ ! public TagScanner getPreviousOpenScanner() { ! return previousOpenScanner; ! } /** --- 105,177 ---- } ! /** ! * This method is intended to be called only by scanners, when a situation of dirty html has arisen, ! * and action has been taken to correct the parsed tags. For e.g. if we have html of the form : ! * <pre> ! * <a href="somelink.html"><img src=...><td><tr><a href="someotherlink.html">...</a> ! * </pre> ! * Now to salvage the first link, we'd probably like to insert an end tag somewhere (typically before the ! * second begin link tag). So that the parsing continues uninterrupted, we will need to change the existing ! * line being parsed, to contain the end tag in it. ! */ ! public void changeLine(String line) { ! this.line = line; ! } ! public String getCurrentLine() { ! return line; ! } ! /** ! * Get the last line number that the reader has read ! * @return int last line number read by the reader ! */ ! public int getLastLineNumber() { ! return lineCount-1; ! } ! /** ! * This method is useful when designing your own scanners. You might need to find out what is the location where the ! * reader has stopped last. ! * @return int Last position read by the reader ! */ ! public int getLastReadPosition() { ! if (node!=null) return node.elementEnd(); else ! return 0; ! } ! /* ! * Read the next line ! * @return String containing the line ! */ ! public String getNextLine() ! { ! try ! { ! previousLine = line; ! line = readLine(); ! if (line!=null) ! lineCount++; ! posInLine = 0; ! return line; ! } ! catch (IOException e) ! { ! System.err.println("I/O Exception occurred while reading!"); ! } ! return null; ! } ! /** ! * Returns the parser object for which this reader exists ! * @return org.htmlparser.Parser ! */ ! public Parser getParser() { ! return parser; ! } ! /** ! * Gets the previousOpenScanner. ! * @return Returns a TagScanner ! */ ! public TagScanner getPreviousOpenScanner() { ! return previousOpenScanner; ! } /** *************** *** 202,241 **** /** ! * Read the next element ! * @return Node - The next node ! */ ! public Node readElement() throws ParserException { return (readElement (false)); } ! /** ! * Read the next element * @param balance_quotes If <code>true</code> string nodes are parsed * paying attention to single and double quotes, such that tag-like * strings are ignored if they are quoted. ! * @return Node - The next node ! */ ! public Node readElement(boolean balance_quotes) throws ParserException ! { ! try { ! if (nextParsedNode.size()>0) { ! node = nextParsedNode.elementAt(0); ! nextParsedNode.remove(0); ! return node; ! } ! if (readNextLine()) { ! do ! { ! line = getNextLine(); ! } ! while (line!=null && line.length()==0); ! ! } else if (dontReadNextLine) { ! dontReadNextLine = false; } else posInLine = getLastReadPosition() + 1; ! if (line==null) return null; --- 202,241 ---- /** ! * Read the next element ! * @return Node - The next node ! */ ! public Node readElement() throws ParserException { return (readElement (false)); } ! /** ! * Read the next element * @param balance_quotes If <code>true</code> string nodes are parsed * paying attention to single and double quotes, such that tag-like * strings are ignored if they are quoted. ! * @return Node - The next node ! */ ! public Node readElement(boolean balance_quotes) throws ParserException ! { ! try { ! if (nextParsedNode.size()>0) { ! node = nextParsedNode.elementAt(0); ! nextParsedNode.remove(0); ! return node; ! } ! if (readNextLine()) { ! do ! { ! line = getNextLine(); ! } ! while (line!=null && line.length()==0); ! ! } else if (dontReadNextLine) { ! dontReadNextLine = false; } else posInLine = getLastReadPosition() + 1; ! if (line==null) return null; *************** *** 255,263 **** } catch (Exception e) ! { StringBuffer msgBuffer = new StringBuffer(); msgBuffer.append(DECIPHER_ERROR+"\n" + ! " Tag being processed : "+tag.getTagName()+"\n" + ! " Current Tag Line : "+tag.getTagLine() ); appendLineDetails(msgBuffer); --- 255,263 ---- } catch (Exception e) ! { StringBuffer msgBuffer = new StringBuffer(); msgBuffer.append(DECIPHER_ERROR+"\n" + ! " Tag being processed : "+tag.getTagName()+"\n" + ! " Current Tag Line : "+tag.getTagLine() ); appendLineDetails(msgBuffer); *************** *** 277,400 **** if (node!=null) return node; } ! ! return null; ! } catch (ParserException pe) { throw pe; } ! catch (Exception e) { ! StringBuffer msgBuffer = new StringBuffer("NodeReader.readElement() : Error occurred while trying to read the next element,"); ! StringWriter sw = new StringWriter(); ! e.printStackTrace(new PrintWriter(sw)); ! appendLineDetails(msgBuffer); ! msgBuffer.append("\n Caused by:\n").append(sw.getBuffer().toString ()); ! ParserException ex = new ParserException(msgBuffer.toString(),e); ! parser.getFeedback().error(msgBuffer.toString(),ex); ! throw ex; ! } ! } ! public void appendLineDetails(StringBuffer msgBuffer) { ! msgBuffer.append("\nat Line "); ! msgBuffer.append(getLineCount()); ! msgBuffer.append(" : "); ! msgBuffer.append(getLine()); ! msgBuffer.append("\nPrevious Line ").append(getLineCount()-1); ! msgBuffer.append(" : ").append(getPreviousLine()); ! } ! /** ! * Do we need to read the next line ? ! * @return true - yes/ false - no ! */ ! protected boolean readNextLine() ! { ! if (dontReadNextLine) { ! return false; ! } ! if (posInLine==-1 || (line!=null && node.elementEnd()+1>=line.length())) ! return true; ! else return false; ! } ! /** ! * The setParser method is used by the parser to put its own object into the reader. This happens internally, ! * so this method is not generally for use by the developer or the user. ! */ ! public void setParser(Parser newParser) { ! parser = newParser; ! } ! /** ! * Sets the previousOpenScanner. ! * @param previousOpenScanner The previousOpenScanner to set ! */ ! public void setPreviousOpenScanner(TagScanner previousOpenScanner) { ! this.previousOpenScanner = previousOpenScanner; ! } ! ! /** ! * @param lineSeparator New Line separator to be used ! */ ! public static void setLineSeparator(String lineSeparator) ! { ! Parser.setLineSeparator(lineSeparator); ! } ! ! /** ! * Gets the line seperator that is being used ! * @return String ! */ ! public static String getLineSeparator() ! { ! return (Parser.getLineSeparator()); ! } ! /** ! * Returns the lineCount. ! * @return int ! */ ! public int getLineCount() { ! return lineCount; ! } ! /** ! * Returns the previousLine. ! * @return String ! */ ! public String getPreviousLine() { ! return previousLine; ! } ! /** ! * Returns the line. ! * @return String ! */ ! public String getLine() { ! return line; ! } ! /** ! * Sets the lineCount. ! * @param lineCount The lineCount to set ! */ ! public void setLineCount(int lineCount) { ! this.lineCount = lineCount; ! } ! /** ! * Sets the posInLine. ! * @param posInLine The posInLine to set ! */ ! public void setPosInLine(int posInLine) { ! this.posInLine = posInLine; ! } ! public void reset() throws IOException { ! super.reset(); ! lineCount = 1; ! posInLine = -1; ! } ! public StringParser getStringParser() { ! return stringParser; ! } /** --- 277,400 ---- if (node!=null) return node; } ! ! return null; ! } catch (ParserException pe) { throw pe; } ! catch (Exception e) { ! StringBuffer msgBuffer = new StringBuffer("NodeReader.readElement() : Error occurred while trying to read the next element,"); ! StringWriter sw = new StringWriter(); ! e.printStackTrace(new PrintWriter(sw)); ! appendLineDetails(msgBuffer); ! msgBuffer.append("\n Caused by:\n").append(sw.getBuffer().toString ()); ! ParserException ex = new ParserException(msgBuffer.toString(),e); ! parser.getFeedback().error(msgBuffer.toString(),ex); ! throw ex; ! } ! } ! public void appendLineDetails(StringBuffer msgBuffer) { ! msgBuffer.append("\nat Line "); ! msgBuffer.append(getLineCount()); ! msgBuffer.append(" : "); ! msgBuffer.append(getLine()); ! msgBuffer.append("\nPrevious Line ").append(getLineCount()-1); ! msgBuffer.append(" : ").append(getPreviousLine()); ! } ! /** ! * Do we need to read the next line ? ! * @return true - yes/ false - no ! */ ! protected boolean readNextLine() ! { ! if (dontReadNextLine) { ! return false; ! } ! if (posInLine==-1 || (line!=null && node.elementEnd()+1>=line.length())) ! return true; ! else return false; ! } ! /** ! * The setParser method is used by the parser to put its own object into the reader. This happens internally, ! * so this method is not generally for use by the developer or the user. ! */ ! public void setParser(Parser newParser) { ! parser = newParser; ! } ! /** ! * Sets the previousOpenScanner. ! * @param previousOpenScanner The previousOpenScanner to set ! */ ! public void setPreviousOpenScanner(TagScanner previousOpenScanner) { ! this.previousOpenScanner = previousOpenScanner; ! } ! ! /** ! * @param lineSeparator New Line separator to be used ! */ ! public static void setLineSeparator(String lineSeparator) ! { ! Parser.setLineSeparator(lineSeparator); ! } ! ! /** ! * Gets the line seperator that is being used ! * @return String ! */ ! public static String getLineSeparator() ! { ! return (Parser.getLineSeparator()); ! } ! /** ! * Returns the lineCount. ! * @return int ! */ ! public int getLineCount() { ! return lineCount; ! } ! /** ! * Returns the previousLine. ! * @return String ! */ ! public String getPreviousLine() { ! return previousLine; ! } ! /** ! * Returns the line. ! * @return String ! */ ! public String getLine() { ! return line; ! } ! /** ! * Sets the lineCount. ! * @param lineCount The lineCount to set ! */ ! public void setLineCount(int lineCount) { ! this.lineCount = lineCount; ! } ! /** ! * Sets the posInLine. ! * @param posInLine The posInLine to set ! */ ! public void setPosInLine(int posInLine) { ! this.posInLine = posInLine; ! } ! public void reset() throws IOException { ! super.reset(); ! lineCount = 1; ! posInLine = -1; ! } ! public StringParser getStringParser() { ! return stringParser; ! } /** *************** *** 404,417 **** * @param nextParsedNode The node that will be returned next by the reader. */ ! public void addNextParsedNode(Node nextParsedNode) { ! this.nextParsedNode.prepend(nextParsedNode); ! } ! ! public boolean isDontReadNextLine() { ! return dontReadNextLine; ! } ! public void setDontReadNextLine(boolean dontReadNextLine) { ! this.dontReadNextLine = dontReadNextLine; ! } } --- 404,417 ---- * @param nextParsedNode The node that will be returned next by the reader. */ ! public void addNextParsedNode(Node nextParsedNode) { ! this.nextParsedNode.prepend(nextParsedNode); ! } ! ! public boolean isDontReadNextLine() { ! return dontReadNextLine; ! } ! public void setDontReadNextLine(boolean dontReadNextLine) { ! this.dontReadNextLine = dontReadNextLine; ! } } Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** RemarkNode.java 24 Aug 2003 21:59:41 -0000 1.27 --- RemarkNode.java 3 Sep 2003 23:36:18 -0000 1.28 *************** *** 38,88 **** public class RemarkNode extends AbstractNode { ! public final static String REMARK_NODE_FILTER="-r"; ! ! /** ! * Tag contents will have the contents of the comment tag. ! */ ! String tagContents; ! /** ! * The HTMLRemarkTag is constructed by providing the beginning posn, ending posn ! * and the tag contents. ! * @param nodeBegin beginning position of the tag ! * @param nodeEnd ending position of the tag ! * @param tagContents contents of the remark tag ! */ ! public RemarkNode(int nodeBegin, int nodeEnd, String tagContents) ! { ! super(nodeBegin,nodeEnd); ! this.tagContents = tagContents; ! } ! /** ! * Returns the text contents of the comment tag. ! */ ! public String getText() ! { ! return tagContents; ! } ! public String toPlainTextString() { ! return tagContents; ! } ! public String toHtml() { ! return "<!--"+tagContents+"-->"; ! } ! /** ! * Print the contents of the remark tag. ! */ ! public String toString() ! { ! return "Comment Tag : "+tagContents+"; begins at : "+elementBegin()+"; ends at : "+elementEnd()+"\n"; ! } ! public void collectInto(NodeList collectionList, String filter) { ! if (filter==REMARK_NODE_FILTER) collectionList.add(this); ! } ! public void accept(Object visitor) { ! ((NodeVisitor)visitor).visitRemarkNode(this); ! } } --- 38,88 ---- public class RemarkNode extends AbstractNode { ! public final static String REMARK_NODE_FILTER="-r"; ! ! /** ! * Tag contents will have the contents of the comment tag. ! */ ! String tagContents; ! /** ! * The HTMLRemarkTag is constructed by providing the beginning posn, ending posn ! * and the tag contents. ! * @param nodeBegin beginning position of the tag ! * @param nodeEnd ending position of the tag ! * @param tagContents contents of the remark tag ! */ ! public RemarkNode(int nodeBegin, int nodeEnd, String tagContents) ! { ! super(nodeBegin,nodeEnd); ! this.tagContents = tagContents; ! } ! /** ! * Returns the text contents of the comment tag. ! */ ! public String getText() ! { ! return tagContents; ! } ! public String toPlainTextString() { ! return tagContents; ! } ! public String toHtml() { ! return "<!--"+tagContents+"-->"; ! } ! /** ! * Print the contents of the remark tag. ! */ ! public String toString() ! { ! return "Comment Tag : "+tagContents+"; begins at : "+elementBegin()+"; ends at : "+elementEnd()+"\n"; ! } ! public void collectInto(NodeList collectionList, String filter) { ! if (filter==REMARK_NODE_FILTER) collectionList.add(this); ! } ! public void accept(Object visitor) { ! ((NodeVisitor)visitor).visitRemarkNode(this); ! } } Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** StringNode.java 24 Aug 2003 21:59:41 -0000 1.35 --- StringNode.java 3 Sep 2003 23:36:18 -0000 1.36 *************** *** 38,93 **** public class StringNode extends AbstractNode { ! public static final String STRING_FILTER="-string"; ! ! /** ! * The text of the string. ! */ ! protected StringBuffer textBuffer; ! /** ! * Constructor takes in the text string, beginning and ending posns. ! * @param text The contents of the string line ! * @param textBegin The beginning position of the string ! * @param textEnd The ending positiong of the string ! */ ! public StringNode (StringBuffer text, int textBegin,int textEnd) ! { ! super(textBegin,textEnd); ! this.textBuffer = text; ! } ! /** ! * Returns the text of the string line ! */ ! public String getText() { ! return textBuffer.toString(); ! } /** * Sets the string contents of the node. * @param text The new text for the node. */ ! public void setText(String text) ! { ! textBuffer = new StringBuffer (text); ! } ! ! public String toPlainTextString() { ! return textBuffer.toString(); ! } ! ! public String toHtml() { ! return textBuffer.toString(); ! } ! ! public String toString() { ! return "Text = "+getText()+"; begins at : "+elementBegin()+"; ends at : "+elementEnd(); ! } ! ! public void collectInto(NodeList collectionList, String filter) { ! if (filter==STRING_FILTER) collectionList.add(this); ! } ! public void accept(Object visitor) { ! ((NodeVisitor)visitor).visitStringNode(this); ! } } --- 38,93 ---- public class StringNode extends AbstractNode { ! public static final String STRING_FILTER="-string"; ! ! /** ! * The text of the string. ! */ ! protected StringBuffer textBuffer; ! /** ! * Constructor takes in the text string, beginning and ending posns. ! * @param text The contents of the string line ! * @param textBegin The beginning position of the string ! * @param textEnd The ending positiong of the string ! */ ! public StringNode (StringBuffer text, int textBegin,int textEnd) ! { ! super(textBegin,textEnd); ! this.textBuffer = text; ! } ! /** ! * Returns the text of the string line ! */ ! public String getText() { ! return textBuffer.toString(); ! } /** * Sets the string contents of the node. * @param text The new text for the node. */ ! public void setText(String text) ! { ! textBuffer = new StringBuffer (text); ! } ! ! public String toPlainTextString() { ! return textBuffer.toString(); ! } ! ! public String toHtml() { ! return textBuffer.toString(); ! } ! ! public String toString() { ! return "Text = "+getText()+"; begins at : "+elementBegin()+"; ends at : "+elementEnd(); ! } ! ! public void collectInto(NodeList collectionList, String filter) { ! if (filter==STRING_FILTER) collectionList.add(this); ! } ! public void accept(Object visitor) { ! ((NodeVisitor)visitor).visitStringNode(this); ! } } Index: StringNodeFactory.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNodeFactory.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** StringNodeFactory.java 12 Jul 2003 00:33:59 -0000 1.1 --- StringNodeFactory.java 3 Sep 2003 23:36:18 -0000 1.2 *************** *** 8,70 **** public class StringNodeFactory implements Serializable { ! ! /** ! * Flag to tell the parser to decode strings returned by StringNode's toPlainTextString. ! * Decoding occurs via the method, org.htmlparser.util.Translate.decode() ! */ ! private boolean shouldDecodeNodes = false; ! /** ! * Flag to tell the parser to remove escape characters, like \n and \t, returned by StringNode's toPlainTextString. ! * Escape character removal occurs via the method, org.htmlparser.util.ParserUtils.removeEscapeCharacters() ! */ ! private boolean shouldRemoveEscapeCharacters = false; ! ! /** ! * Flag to tell the parser to convert non breaking space ! * (i.e. \u00a0) to a space (" "). If true, this will happen inside StringNode's toPlainTextString. ! */ ! private boolean shouldConvertNonBreakingSpace = false; ! public Node createStringNode( ! StringBuffer textBuffer, ! int textBegin, ! int textEnd) { ! Node newNode = new StringNode(textBuffer, textBegin, textEnd); ! if (shouldDecodeNodes()) ! newNode = new DecodingNode(newNode); ! if (shouldRemoveEscapeCharacters()) ! newNode = new EscapeCharacterRemovingNode(newNode); ! if (shouldConvertNonBreakingSpace()) ! newNode = new NonBreakingSpaceConvertingNode(newNode); ! return newNode; ! } ! ! /** ! * Tells the parser to decode nodes using org.htmlparser.util.Translate.decode() ! */ ! public void setNodeDecoding(boolean shouldDecodeNodes) { ! this.shouldDecodeNodes = shouldDecodeNodes; ! } ! public boolean shouldDecodeNodes() { ! return shouldDecodeNodes; ! } ! public void setEscapeCharacterRemoval(boolean shouldRemoveEscapeCharacters) { ! this.shouldRemoveEscapeCharacters = shouldRemoveEscapeCharacters; ! } ! public boolean shouldRemoveEscapeCharacters() { ! return shouldRemoveEscapeCharacters; ! } ! public void setNonBreakSpaceConversion(boolean shouldConvertNonBreakSpace) { ! this.shouldConvertNonBreakingSpace = shouldConvertNonBreakSpace; ! } ! ! public boolean shouldConvertNonBreakingSpace() { ! return shouldConvertNonBreakingSpace; ! } } --- 8,70 ---- public class StringNodeFactory implements Serializable { ! ! /** ! * Flag to tell the parser to decode strings returned by StringNode's toPlainTextString. ! * Decoding occurs via the method, org.htmlparser.util.Translate.decode() ! */ ! private boolean shouldDecodeNodes = false; ! /** ! * Flag to tell the parser to remove escape characters, like \n and \t, returned by StringNode's toPlainTextString. ! * Escape character removal occurs via the method, org.htmlparser.util.ParserUtils.removeEscapeCharacters() ! */ ! private boolean shouldRemoveEscapeCharacters = false; ! ! /** ! * Flag to tell the parser to convert non breaking space ! * (i.e. \u00a0) to a space (" "). If true, this will happen inside StringNode's toPlainTextString. ! */ ! private boolean shouldConvertNonBreakingSpace = false; ! public Node createStringNode( ! StringBuffer textBuffer, ! int textBegin, ! int textEnd) { ! Node newNode = new StringNode(textBuffer, textBegin, textEnd); ! if (shouldDecodeNodes()) ! newNode = new DecodingNode(newNode); ! if (shouldRemoveEscapeCharacters()) ! newNode = new EscapeCharacterRemovingNode(newNode); ! if (shouldConvertNonBreakingSpace()) ! newNode = new NonBreakingSpaceConvertingNode(newNode); ! return newNode; ! } ! ! /** ! * Tells the parser to decode nodes using org.htmlparser.util.Translate.decode() ! */ ! public void setNodeDecoding(boolean shouldDecodeNodes) { ! this.shouldDecodeNodes = shouldDecodeNodes; ! } ! public boolean shouldDecodeNodes() { ! return shouldDecodeNodes; ! } ! public void setEscapeCharacterRemoval(boolean shouldRemoveEscapeCharacters) { ! this.shouldRemoveEscapeCharacters = shouldRemoveEscapeCharacters; ! } ! public boolean shouldRemoveEscapeCharacters() { ! return shouldRemoveEscapeCharacters; ! } ! public void setNonBreakSpaceConversion(boolean shouldConvertNonBreakSpace) { ! this.shouldConvertNonBreakingSpace = shouldConvertNonBreakSpace; ! } ! ! public boolean shouldConvertNonBreakingSpace() { ! return shouldConvertNonBreakingSpace; ! } } |