[Htmlparser-cvs] htmlparser/src/org/htmlparser AbstractNode.java,NONE,1.1 Node.java,1.23,1.24 Remark

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser
In directory sc8-pr-cvs1:/tmp/cvs-serv17851/src/org/htmlparser

Modified Files:
	Node.java RemarkNode.java RemarkNodeParser.java Parser.java 
	StringNode.java NodeReader.java 
Added Files:
	AbstractNode.java 
Log Message:
Renamed Node to AbstractNode, extracted the new interface, Node, and moved line separator code from AbstractNode to Parser.

--- NEW FILE: AbstractNode.java ---
// HTMLParser Library v1_4_20030601 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// For any questions or suggestions, you can write to me at :
// Email :so...@in...
// 
// Postal Address : 
// Somik Raha
// Extreme Programmer & Coach
// Industrial Logic Corporation
// 2583 Cedar Street, Berkeley, 
// CA 94708, USA
// Website : http://www.industriallogic.com

package org.htmlparser;

import java.io.*;

import org.htmlparser.tags.*;
import org.htmlparser.util.*;
import org.htmlparser.visitors.*;

/**
 * Skeletal implementation of the {@link Node} interface and common base class for
 * every kind of node (tags, string elements, etc).
 * <p>
 * It stores the begin/end positions of the node and an optional parent tag, and
 * supplies default implementations of the position accessors, the parent accessors,
 * the class-based {@link #collectInto(NodeList, Class)} and the deprecated
 * {@link #toHTML()}. Everything else is left to concrete subclasses.
 */
public abstract class AbstractNode implements Node, Serializable {
	/**
	 * Position at which the node begins in the line.
	 */
	protected int nodeBegin;

	/**
	 * Position at which the node ends in the line.
	 */
	protected int nodeEnd;

	/**
	 * The tag that contains this node; <code>null</code> until a parent is assigned
	 * through {@link #setParent(CompositeTag)}.
	 */
	protected CompositeTag parent;

	/**
	 * Creates a node covering the given positions. The parent is left unset.
	 * @param nodeBegin the beginning position of the node
	 * @param nodeEnd the ending position of the node
	 */
	public AbstractNode(int nodeBegin, int nodeEnd) {
		this.nodeEnd   = nodeEnd;
		this.nodeBegin = nodeBegin;
	}

	/**
	 * Returns the plain-text rendering of this node. Having this on every node type
	 * allows a page to be turned into text without caring what kind of node is at hand.<br>
	 * Extracting only the text of a web page then boils down to:<br>
	 * <pre>
	 * Node node;
	 * for (Enumeration e = parser.elements(); e.hasMoreElements();) {
	 *    node = (Node)e.nextElement();
	 *    System.out.println(node.toPlainTextString()); // or any other processing of the plain text
	 * }
	 * </pre>
	 * @return the plain text of this node
	 */
	public abstract String toPlainTextString();

	/**
	 * Returns the html rendering of this node. Intended for applications that reproduce
	 * html pages (with or without modifications) and need nodes to be emitted as html.
	 * @return the html of this node
	 */
	public abstract String toHtml();

	/**
	 * Returns a string describing this node. Subclasses must provide it; the typical use is<br>
	 * <pre>System.out.println(node)</pre>
	 * @return java.lang.String
	 */
	public abstract String toString();

	/**
	 * Adds this node, and its child nodes where applicable, to the given list when the
	 * node satisfies the given filter. <p>
	 *
	 * The point of this method is that callers need not walk embedded tags themselves.
	 * For instance, links cannot all be found at the top level of a page because other
	 * tags (such as form tags) may contain links; checking for each such container by
	 * hand would tie the caller to specific tags. <p>
	 *
	 * With collectInto(), extracting all links from a page looks like:
	 * <pre>
	 * NodeList collectionList = new NodeList();
	 * Node node;
	 * String filter = LinkTag.LINK_TAG_FILTER;
	 * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
	 * 		node = e.nextNode();
	 * 		node.collectInto (collectionList, filter);
	 * }
	 * </pre>
	 * Afterwards collectionList holds every link node, however deeply it was embedded.
	 * This relies on each tag honouring the filters it is responsible for. <p>
	 *
	 * <B>Important:</B> pass the filter constant published by the relevant class, e.g.
	 * <I>collectInto(collectionList,LinkTag.LINK_TAG_FILTER)</I>, and <B>do not create</B>
	 * your own filter strings such as <I>collectInto(collectionList,"-l")</I>;
	 * implementations may match against the pre-existing filter string object. <p>
	 *
	 * Check the API of a tag to find out whether it supports filtering.
	 * @param collectionList the list that receives matching nodes
	 * @param filter the filter string identifying the wanted nodes
	 */
	public abstract void collectInto(NodeList collectionList, String filter);

	/**
	 * Adds this node to the given list when its runtime class has the same name as
	 * <code>nodeType</code>. Subclasses of <code>nodeType</code> do not match, since
	 * only the fully qualified class names are compared. This base implementation
	 * does not descend into children. <p>
	 *
	 * Typical use, collecting all links of a page:
	 * <pre>
	 * NodeList collectionList = new NodeList();
	 * Node node;
	 * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
	 * 		node = e.nextNode();
	 * 		node.collectInto (collectionList, LinkTag.class);
	 * }
	 * </pre>
	 * @param collectionList the list that receives this node on a match
	 * @param nodeType the class of node being looked for
	 */
	public void collectInto(NodeList collectionList, Class nodeType) {
		String wantedName = nodeType.getName();
		String actualName = getClass().getName();
		if (!wantedName.equals(actualName)) {
			return;
		}
		collectionList.add(this);
	}

	/**
	 * Returns the beginning position of the node.
	 * @return the value of {@link #nodeBegin}
	 */
	public int elementBegin() {
		return this.nodeBegin;
	}

	/**
	 * Returns the ending position of the node.
	 * @return the value of {@link #nodeEnd}
	 */
	public int elementEnd() {
		return this.nodeEnd;
	}

	/**
	 * Accepts a visitor; the behaviour is defined by each concrete node type.
	 * @param visitor the visitor to accept
	 */
	public abstract void accept(NodeVisitor visitor);

	/**
	 * Delegates to {@link #toHtml()}.
	 * @deprecated - use toHtml() instead
	 */
	public final String toHTML() {
		return this.toHtml();
	}

	/**
	 * Gets the parent of this node.
	 * @return The parent of this node, if it's been set, <code>null</code> otherwise.
	 */
	public CompositeTag getParent() {
		return this.parent;
	}

	/**
	 * Sets the parent of this node.
	 * @param tag the tag to record as this node's parent
	 */
	public void setParent(CompositeTag tag) {
		this.parent = tag;
	}

}

Index: Node.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Node.java,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** Node.java	1 Jun 2003 20:50:08 -0000	1.23
--- Node.java	13 Jun 2003 20:27:04 -0000	1.24
***************
*** 1,91 ****
- // HTMLParser Library v1_4_20030601 - A java-based parser for HTML
- // Copyright (C) Dec 31, 2000 Somik Raha
- //
- // This library is free software; you can redistribute it and/or
- // modify it under the terms of the GNU Lesser General Public
- // License as published by the Free Software Foundation; either
- // version 2.1 of the License, or (at your option) any later version.
- //
- // This library is distributed in the hope that it will be useful,
- // but WITHOUT ANY WARRANTY; without even the implied warranty of
- // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- // Lesser General Public License for more details.
- // 
- // You should have received a copy of the GNU Lesser General Public
- // License along with this library; if not, write to the Free Software
- // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- //
- // For any questions or suggestions, you can write to me at :
- // Email :so...@in...
- // 
- // Postal Address : 
- // Somik Raha
- // Extreme Programmer & Coach
- // Industrial Logic Corporation
- // 2583 Cedar Street, Berkeley, 
- // CA 94708, USA
- // Website : http://www.industriallogic.com
- 
  package org.htmlparser;
  
! import java.io.*;
! 
! import org.htmlparser.tags.*;
! import org.htmlparser.util.*;
! import org.htmlparser.visitors.*;
! 
! /**
!  * A Node interface is implemented by all types of nodes (tags, string elements, etc)
!  */
! public abstract class Node implements Serializable {
! 	/** 
! 	 * The beginning position of the tag in the line
! 	 */
! 	protected int nodeBegin;
! 
! 	/**
! 	 * The ending position of the tag in the line
! 	 */
! 	protected int nodeEnd;
! 
! 	/**
! 	 * If parent of this tag
! 	 */
! 	protected CompositeTag parent;
! 	
! 	/**
! 	 * Variable to store lineSeparator.
! 	 * This is setup to read <code>line.separator</code> from the System property.
! 	 * However it can also be changed using the mutator methods.
! 	 * This will be used in the toHTML() methods in all the sub-classes of Node.
! 	 */
! 	protected static String lineSeparator = System.getProperty("line.separator", "\n");
! 	
! 	public Node(int nodeBegin, int nodeEnd) {
! 		this.nodeBegin = nodeBegin;
! 		this.nodeEnd   = nodeEnd;
! 		this.parent    = null;
! 	}
! 
! 	public Node(int nodeBegin, int nodeEnd, CompositeTag parent) {
! 		this.nodeBegin = nodeBegin;
! 		this.nodeEnd   = nodeEnd;
! 		this.parent    = parent;
! 	}
! 
! 	/**
! 	 * @param lineSeparator New Line separator to be used
! 	 */
! 	public static void setLineSeparator(String lineSeparator) {
! 		Node.lineSeparator = lineSeparator;
! 	}
! 
! 	/**
! 	 * @return String lineSeparator that will be used in toHTML()
! 	 */
! 	public static String getLineSeparator() {
! 		return Node.lineSeparator;
! 	}
  
  	/**
  	 * Returns a string representation of the node. This is an important method, it allows a simple string transformation
--- 1,9 ----
  package org.htmlparser;
  
! import org.htmlparser.tags.CompositeTag;
! import org.htmlparser.util.NodeList;
! import org.htmlparser.visitors.NodeVisitor;
  
+ public interface Node {
  	/**
  	 * Returns a string representation of the node. This is an important method, it allows a simple string transformation
***************
*** 101,105 ****
  	 */
  	public abstract String toPlainTextString();
- 
  	/**
  	 * This method will make it easier when using html parser to reproduce html pages (with or without modifications)
--- 19,22 ----
***************
*** 108,112 ****
  	 */
  	public abstract String toHtml();
- 
  	/**
  	 * Return the string representation of the node.
--- 25,28 ----
***************
*** 116,120 ****
  	 */
  	public abstract String toString();
- 
  	/**
  	 * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node
--- 32,35 ----
***************
*** 151,155 ****
  	 */
  	public abstract void collectInto(NodeList collectionList, String filter);
- 
  	/**
  	 * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node
--- 66,69 ----
***************
*** 175,222 ****
  	 * deep the links are embedded. 
  	 */
! 	public void collectInto(NodeList collectionList, Class nodeType) {
! 		if (nodeType.getName().equals(this.getClass().getName())) {
! 			collectionList.add(this);
! 		}
! 	}
! 
  	/**
  	 * Returns the beginning position of the tag.
  	 */
! 	public int elementBegin() {
! 		return nodeBegin;
! 	}
! 
  	/**
  	 * Returns the ending position fo the tag
  	 */
! 	public int elementEnd() {
! 		return nodeEnd;
! 	}
! 
  	public abstract void accept(NodeVisitor visitor);
- 
  	/**
  	 * @deprecated - use toHtml() instead
  	 */
! 	public final String toHTML() {
! 		return toHtml();
! 	}
! 	
  	/**
  	 * Get the parent of this tag
  	 * @return The parent of this node, if it's been set, <code>null</code> otherwise.
  	 */
! 	public CompositeTag getParent() {
! 		return parent;
! 	}
! 
  	/**
  	 * Sets the parent of this tag
  	 * @param tag
  	 */
! 	public void setParent(CompositeTag tag) {
! 		parent = tag;
! 	}
! 
! }
--- 89,115 ----
  	 * deep the links are embedded. 
  	 */
! 	public abstract void collectInto(NodeList collectionList, Class nodeType);
  	/**
  	 * Returns the beginning position of the tag.
  	 */
! 	public abstract int elementBegin();
  	/**
  	 * Returns the ending position fo the tag
  	 */
! 	public abstract int elementEnd();
  	public abstract void accept(NodeVisitor visitor);
  	/**
  	 * @deprecated - use toHtml() instead
  	 */
! 	public abstract String toHTML();
  	/**
  	 * Get the parent of this tag
  	 * @return The parent of this node, if it's been set, <code>null</code> otherwise.
  	 */
! 	public abstract CompositeTag getParent();
  	/**
  	 * Sets the parent of this tag
  	 * @param tag
  	 */
! 	public abstract void setParent(CompositeTag tag);
! }
\ No newline at end of file

Index: RemarkNode.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v
retrieving revision 1.18
retrieving revision 1.19
diff -C2 -d -r1.18 -r1.19
*** RemarkNode.java	1 Jun 2003 20:50:08 -0000	1.18
--- RemarkNode.java	13 Jun 2003 20:27:04 -0000	1.19
***************
*** 36,40 ****
   * The remark tag is identified and represented by this class.
   */
! public class RemarkNode extends Node
  {
  	public final static String REMARK_NODE_FILTER="-r";
--- 36,40 ----
   * The remark tag is identified and represented by this class.
   */
! public class RemarkNode extends AbstractNode
  {
  	public final static String REMARK_NODE_FILTER="-r";

Index: RemarkNodeParser.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNodeParser.java,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -d -r1.20 -r1.21
*** RemarkNodeParser.java	1 Jun 2003 20:50:08 -0000	1.20
--- RemarkNodeParser.java	13 Jun 2003 20:27:04 -0000	1.21
***************
*** 149,153 ****
  				// We need to continue parsing to the next line
  				//input = reader.getNextLine();
! 				tagContents.append(Node.getLineSeparator());
  				do {
  					input = reader.getNextLine();		
--- 149,153 ----
  				// We need to continue parsing to the next line
  				//input = reader.getNextLine();
! 				tagContents.append(Parser.getLineSeparator());
  				do {
  					input = reader.getNextLine();		

Index: Parser.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v
retrieving revision 1.43
retrieving revision 1.44
diff -C2 -d -r1.43 -r1.44
*** Parser.java	1 Jun 2003 20:50:08 -0000	1.43
--- Parser.java	13 Jun 2003 20:27:04 -0000	1.44
***************
*** 217,220 ****
--- 217,228 ----
      protected transient BufferedInputStream input;
  
+ 	/**
+ 	 * Variable to store lineSeparator.
+ 	 * This is setup to read <code>line.separator</code> from the System property.
+ 	 * However it can also be changed using the mutator methods.
+ 	 * This will be used in the toHTML() methods in all the sub-classes of Node.
+ 	 */
+ 	protected static String lineSeparator = System.getProperty("line.separator", "\n");
+ 
      /**
       * A quiet message sink.
***************
*** 238,244 ****
  	 * @param lineSeparator New Line separator to be used
  	 */
! 	public static void setLineSeparator(String lineSeparator)
  	{
! 		Node.setLineSeparator(lineSeparator);	
  	}
  	
--- 246,252 ----
  	 * @param lineSeparator New Line separator to be used
  	 */
! 	public static void setLineSeparator(String lineSeparatorString)
  	{
! 		lineSeparator = lineSeparatorString;	
  	}
  	
***************
*** 1193,1196 ****
--- 1201,1211 ----
  		parser.addScanner(new LinkScanner(LinkTag.LINK_TAG_FILTER));
  		return parser;
+ 	}
+ 
+ 	/**
+ 	 * @return String lineSeparator that will be used in toHTML()
+ 	 */
+ 	public static String getLineSeparator() {
+ 		return lineSeparator;
  	}
  }

Index: StringNode.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** StringNode.java	1 Jun 2003 20:50:08 -0000	1.19
--- StringNode.java	13 Jun 2003 20:27:04 -0000	1.20
***************
*** 36,40 ****
   * Normal text in the html document is identified and represented by this class.
   */
! public class StringNode extends Node
  {
  	public static final String STRING_FILTER="-string";
--- 36,40 ----
   * Normal text in the html document is identified and represented by this class.
   */
! public class StringNode extends AbstractNode
  {
  	public static final String STRING_FILTER="-string";

Index: NodeReader.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/NodeReader.java,v
retrieving revision 1.32
retrieving revision 1.33
diff -C2 -d -r1.32 -r1.33
*** NodeReader.java	1 Jun 2003 20:50:08 -0000	1.32
--- NodeReader.java	13 Jun 2003 20:27:04 -0000	1.33
***************
*** 336,340 ****
  	public static void setLineSeparator(String lineSeparator)
  	{
! 		Node.setLineSeparator(lineSeparator);	
  	}
  	
--- 336,340 ----
  	public static void setLineSeparator(String lineSeparator)
  	{
! 		Parser.setLineSeparator(lineSeparator);	
  	}
  	
***************
*** 345,349 ****
  	public static String getLineSeparator()
  	{
! 		return (Node.getLineSeparator());
  	}
  	/**
--- 345,349 ----
  	public static String getLineSeparator()
  	{
! 		return (Parser.getLineSeparator());
  	}
  	/**

[Htmlparser-cvs] htmlparser/src/org/htmlparser AbstractNode.java,NONE,1.1 Node.java,1.23,1.24 Remark

[Htmlparser-cvs] htmlparser/src/org/htmlparser AbstractNode.java,NONE,1.1 Node.java,1.23,1.24 RemarkNode.java,1.18,1.19 RemarkNodeParser.java,1.20,1.21 Parser.java,1.43,1.44 StringNode.java,1.19,1.20 NodeReader.java,1.32,1.33