[Htmlparser-cvs] htmlparser/src/org/htmlparser NodeFilter.java,NONE,1.1 AbstractNode.java,1.19,1.20
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv18855/src/org/htmlparser Modified Files: AbstractNode.java Node.java Parser.java RemarkNode.java StringNode.java Added Files: NodeFilter.java Log Message: Implement generic node filtering. Added the NodeFilter interface and the filter package. Sideline tag specific scanners; tags now use only one scanner of each type, TagScanner or CompositeTagScanner (except for ScriptScanner). Obviated PeekingIterator by moving the META tag semantics to doSemanticAction, much simpler, old IteratorImpl is now PeekingIteratorImpl but deprecated. --- NEW FILE: NodeFilter.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/NodeFilter.java,v $ // $Author: derrickoswald $ // $Date: 2003/11/08 21:30:56 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser; /** * Implement this interface to select particular nodes. */ public interface NodeFilter { /** * Predicate to determine whether or not to keep the given node. 
* The behaviour based on this outcome is determined by the context * in which it is called. It may lead to the node being added to a list * or printed out. See the calling routine for details. * @return <code>true</code> if the node is to be kept, <code>false</code> * if it is to be discarded. */ boolean accept (Node node); } Index: AbstractNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/AbstractNode.java,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** AbstractNode.java 1 Nov 2003 21:55:42 -0000 1.19 --- AbstractNode.java 8 Nov 2003 21:30:56 -0000 1.20 *************** *** 30,36 **** import java.io.Serializable; - import org.htmlparser.lexer.Page; import org.htmlparser.util.NodeList; /** --- 30,37 ---- import java.io.Serializable; + import org.htmlparser.lexer.Page; import org.htmlparser.util.NodeList; + import org.htmlparser.util.ParserException; /** *************** *** 110,174 **** /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : * <pre> * NodeList collectionList = new NodeList(); ! * Node node; ! * String filter = LinkTag.LINK_TAG_FILTER; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! 
* node = e.nextNode(); ! * node.collectInto (collectionVector, filter); ! * } * </pre> * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. This of course implies that tags must ! * fulfill their responsibilities toward honouring certain filters. ! * ! * <B>Important:</B> In order to keep performance optimal, <B>do not create</B> you own filter strings, as ! * the internal matching occurs with the pre-existing filter string object (in the relevant class). i.e. do not ! * make calls like : ! * <I>collectInto(collectionList,"-l")</I>, instead, make calls only like : ! * <I>collectInto(collectionList,LinkTag.LINK_TAG_FILTER)</I>.<P/> ! * ! * To find out if your desired tag has filtering support, check the API of the tag. ! */ ! public abstract void collectInto(NodeList collectionList, String filter); ! ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : * <pre> * NodeList collectionList = new NodeList(); ! * Node node; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, LinkTag.class); ! * } * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. */ ! 
public void collectInto(NodeList collectionList, Class nodeType) { ! if (nodeType.getName().equals(this.getClass().getName())) ! collectionList.add(this); } --- 111,150 ---- /** ! * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node ! * satisfies the filtering criteria.<P> * ! * This mechanism allows powerful filtering code to be written very easily, ! * without bothering about collection of embedded tags separately. ! * e.g. when we try to get all the links on a page, it is not possible to ! * get it at the top-level, as many tags (like form tags), can contain ! * links embedded in them. We could get the links out by checking if the ! * current node is a {@link CompositeTag}, and going through its children. ! * So this method provides a convenient way to do this.<P> * ! * Using collectInto(), programs get a lot shorter. Now, the code to ! * extract all links from a page would look like: * <pre> * NodeList collectionList = new NodeList(); ! * NodeFilter filter = new TagNameFilter ("A"); ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) ! * e.nextNode().collectInto(collectionList, filter); * </pre> * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded.<P> * ! * Another way to accomplish the same objective is: * <pre> * NodeList collectionList = new NodeList(); ! * NodeFilter filter = new TagClassFilter (LinkTag.class); ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) ! * e.nextNode().collectInto(collectionList, filter); * </pre> ! * This is slightly less specific because the LinkTag class may be ! * registered for more than one node name, e.g. <LINK> tags too. */ ! public void collectInto (NodeList list, NodeFilter filter) { ! if (filter.accept (this)) ! list.add (this); } *************** *** 312,316 **** * The default action is to do nothing. */ ! 
public void doSemanticAction () { } --- 288,292 ---- * The default action is to do nothing. */ ! public void doSemanticAction () throws ParserException { } Index: Node.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Node.java,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** Node.java 1 Nov 2003 21:55:42 -0000 1.43 --- Node.java 8 Nov 2003 21:30:56 -0000 1.44 *************** *** 32,37 **** import org.htmlparser.util.NodeList; ! public interface Node { /** * Returns a string representation of the node. This is an important method, it allows a simple string transformation --- 32,39 ---- import org.htmlparser.util.NodeList; + import org.htmlparser.util.ParserException; ! public interface Node ! { /** * Returns a string representation of the node. This is an important method, it allows a simple string transformation *************** *** 47,50 **** --- 49,53 ---- */ public abstract String toPlainTextString(); + /** * This method will make it easier when using html parser to reproduce html pages (with or without modifications) *************** *** 53,56 **** --- 56,60 ---- */ public abstract String toHtml(); + /** * Return the string representation of the node. *************** *** 60,124 **** */ public abstract String toString(); /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! 
* to specific tags, and is not a very clean approach. <P/> * ! * Using collectInto(), programs get a lot shorter. Now, the code to extract all links from a page would look ! * like : * <pre> * NodeList collectionList = new NodeList(); ! * Node node; ! * String filter = LinkTag.LINK_TAG_FILTER; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, filter); ! * } * </pre> * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. This of course implies that tags must ! * fulfill their responsibilities toward honouring certain filters. ! * ! * <B>Important:</B> In order to keep performance optimal, <B>do not create</B> you own filter strings, as ! * the internal matching occurs with the pre-existing filter string object (in the relevant class). i.e. do not ! * make calls like : ! * <I>collectInto(collectionList,"-l")</I>, instead, make calls only like : ! * <I>collectInto(collectionList,LinkTag.LINK_TAG_FILTER)</I>.<P/> ! * ! * To find out if your desired tag has filtering support, check the API of the tag. ! */ ! public abstract void collectInto(NodeList collectionList, String filter); ! /** ! * Collect this node and its child nodes (if-applicable) into the collection parameter, provided the node ! * satisfies the filtering criteria. <P/> ! * ! * This mechanism allows powerful filtering code to be written very easily, without bothering about collection ! * of embedded tags separately. e.g. when we try to get all the links on a page, it is not possible to get it ! * at the top-level, as many tags (like form tags), can contain links embedded in them. We could get the links ! * out by checking if the current node is a form tag, and going through its contents. However, this ties us down ! * to specific tags, and is not a very clean approach. <P/> * ! * Using collectInto(), programs get a lot shorter. 
Now, the code to extract all links from a page would look ! * like : * <pre> * NodeList collectionList = new NodeList(); ! * Node node; ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { ! * node = e.nextNode(); ! * node.collectInto (collectionVector, LinkTag.class); ! * } * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded. */ ! public abstract void collectInto(NodeList collectionList, Class nodeType); /** * Returns the beginning position of the tag. ! * <br>deprecated Use {@link #getEndPosition} */ public abstract int elementBegin(); --- 64,106 ---- */ public abstract String toString(); + /** ! * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node ! * satisfies the filtering criteria.<P> * ! * This mechanism allows powerful filtering code to be written very easily, ! * without bothering about collection of embedded tags separately. ! * e.g. when we try to get all the links on a page, it is not possible to ! * get it at the top-level, as many tags (like form tags), can contain ! * links embedded in them. We could get the links out by checking if the ! * current node is a {@link CompositeTag}, and going through its children. ! * So this method provides a convenient way to do this.<P> * ! * Using collectInto(), programs get a lot shorter. Now, the code to ! * extract all links from a page would look like: * <pre> * NodeList collectionList = new NodeList(); ! * NodeFilter filter = new TagNameFilter ("A"); ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) ! * e.nextNode().collectInto(collectionList, filter); * </pre> * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded.<P> * ! * Another way to accomplish the same objective is: * <pre> * NodeList collectionList = new NodeList(); ! * NodeFilter filter = new TagClassFilter (LinkTag.class); ! 
* for (NodeIterator e = parser.elements(); e.hasMoreNodes();) ! * e.nextNode().collectInto(collectionList, filter); * </pre> ! * This is slightly less specific because the LinkTag class may be ! * registered for more than one node name, e.g. <LINK> tags too. */ ! public abstract void collectInto(NodeList collectionList, NodeFilter filter); ! /** * Returns the beginning position of the tag. ! * <br>deprecated Use {@link #getStartPosition} */ public abstract int elementBegin(); *************** *** 154,157 **** --- 136,142 ---- public abstract void setEndPosition (int position); + /** + * Apply the visitor object (of type NodeVisitor) to this node. + */ public abstract void accept(Object visitor); *************** *** 184,188 **** /** ! * Returns the text of the string line */ public String getText(); --- 169,173 ---- /** ! * Returns the text of the node. */ public String getText(); *************** *** 193,197 **** */ public void setText(String text); ! /** * Perform the meaning of this tag. --- 178,182 ---- */ public void setText(String text); ! /** * Perform the meaning of this tag. *************** *** 201,206 **** * with the character set to use (<META>), the base URL to use * (<BASE>). Other than that, the semantic meaning is up to the ! * application. */ ! public void doSemanticAction (); } --- 186,191 ---- * with the character set to use (<META>), the base URL to use * (<BASE>). Other than that, the semantic meaning is up to the ! * application and its custom nodes. */ ! 
public void doSemanticAction () throws ParserException; } Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.73 retrieving revision 1.74 diff -C2 -d -r1.73 -r1.74 *** Parser.java 1 Nov 2003 21:55:42 -0000 1.73 --- Parser.java 8 Nov 2003 21:30:56 -0000 1.74 *************** *** 35,39 **** import java.net.URL; import java.net.URLConnection; - import java.util.HashMap; import java.util.Hashtable; import java.util.Iterator; --- 35,38 ---- *************** *** 41,50 **** import java.util.Vector; ! import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.lexer.nodes.NodeFactory; - import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.nodeDecorators.DecodingNode; import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; --- 40,49 ---- import java.util.Vector; ! import org.htmlparser.filters.TagNameFilter; ! 
import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.lexer.nodes.NodeFactory; import org.htmlparser.nodeDecorators.DecodingNode; import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; *************** *** 72,76 **** import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; - import org.htmlparser.tags.MetaTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.DefaultParserFeedback; --- 71,74 ---- *************** *** 573,577 **** { tag = ((CompositeTagScanner)scanner).createTag (null, 0, 0, null, null, null, null); - tag.setThisScanner (scanner); mBlastocyst.put (ids[i], tag); } --- 571,574 ---- *************** *** 579,583 **** { tag = scanner.createTag (null, 0, 0, null, null, null); - tag.setThisScanner (scanner); mBlastocyst.put (ids[i], tag); } --- 576,579 ---- *************** *** 612,677 **** * } * </pre> */ ! public NodeIterator elements() throws ParserException { ! boolean remove_scanner; ! Node node; ! TagNode tag; ! MetaTag meta; ! String httpEquiv; ! String charset; ! String original; ! IteratorImpl ret; ! ! ret = new IteratorImpl (getLexer (), feedback); ! original = getLexer ().getPage ().getEncoding (); ! remove_scanner = false; ! try ! { ! if (null == mScanners.get ("META")) ! { ! addScanner (new MetaTagScanner ("-m")); ! remove_scanner = true; ! } ! ! /* pre-read up to </HEAD> looking for charset directive */ ! while (null != (node = ret.peek ())) ! { ! if (node instanceof TagNode) ! { ! tag = (TagNode)node; ! if (tag instanceof MetaTag) ! { // check for charset on Content-Type ! meta = (MetaTag)node; ! httpEquiv = meta.getAttribute ("HTTP-EQUIV"); ! if ("Content-Type".equalsIgnoreCase (httpEquiv)) ! { ! charset = getLexer ().getPage ().getCharset (meta.getAttribute ("CONTENT")); ! if (!charset.equalsIgnoreCase (original)) ! { // oops, different character set, restart ! 
getLexer ().getPage ().setEncoding (charset); ! getLexer ().setPosition (0); ! ret = new IteratorImpl (getLexer (), feedback); ! } ! // once we see the Content-Type meta tag we're finished the pre-read ! break; ! } ! } ! else if (tag.isEndTag ()) ! { ! if (tag.getTagName ().equalsIgnoreCase ("HEAD")) ! // or, once we see the </HEAD> tag we're finished the pre-read ! break; ! } ! } ! } ! } ! finally ! { ! if (remove_scanner) ! mScanners.remove ("META"); ! } ! ! return ret; } --- 608,616 ---- * } * </pre> + * @param filter The filter to apply to the nodes. */ ! public NodeIterator elements () throws ParserException { ! return (new IteratorImpl (getLexer (), feedback)); } *************** *** 707,743 **** /** ! * Parse the given resource, using the filter provided */ ! public void parse(String filter) throws Exception { Node node; ! for (NodeIterator e=elements();e.hasMoreNodes();) { ! node = e.nextNode(); ! if (node!=null) { ! if (filter==null) ! System.out.println(node.toString()); ! else ! { ! // There is a filter. Find if the associated filter of this node ! // matches the specified filter ! if (!(node instanceof Tag)) ! continue; ! Tag tag=(Tag)node; ! TagScanner scanner = tag.getThisScanner(); ! if (scanner==null) ! continue; ! ! String tagFilter = scanner.getFilter(); ! if (tagFilter==null) ! continue; ! if (tagFilter.equals(filter)) ! System.out.println(node.toString()); ! } } ! else System.out.println("Node is null"); } - } --- 646,672 ---- /** ! * Parse the given resource, using the filter provided. ! * @param filter The filter to apply to the parsed nodes. */ ! public void parse (NodeFilter filter) throws ParserException { + NodeIterator e; Node node; ! NodeList list; ! ! list = new NodeList (); ! for (e = elements (); e.hasMoreNodes (); ) { ! node = e.nextNode (); ! if (null != filter) { ! node.collectInto (list, filter); ! for (int i = 0; i < list.size (); i++) ! System.out.println (list.elementAt (i)); ! list.removeAll (); } ! else ! 
System.out.println (node); } } *************** *** 928,966 **** { System.out.println(); ! System.out.println("Syntax : java -jar htmlparser.jar <resourceLocn/website> -l"); ! System.out.println(" <resourceLocn> the name of the file to be parsed (with complete path if not in current directory)"); ! System.out.println(" -l Show only the link tags extracted from the document"); ! System.out.println(" -i Show only the image tags extracted from the document"); ! System.out.println(" -s Show only the Javascript code extracted from the document"); ! System.out.println(" -t Show only the Style code extracted from the document"); ! System.out.println(" -a Show only the Applet tag extracted from the document"); ! System.out.println(" -j Parse JSP tags"); ! System.out.println(" -m Parse Meta tags"); ! System.out.println(" -T Extract the Title"); ! System.out.println(" -f Extract forms"); ! System.out.println(" -r Extract frameset"); ! System.out.println(" -help This screen"); ! System.out.println(); ! System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net"); System.out.println(); System.out.println("Example : java -jar htmlparser.jar http://www.yahoo.com"); System.out.println(); ! System.out.println("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. "); System.exit(-1); } ! try { ! Parser parser = new Parser(args[0]); ! System.out.println("Parsing " + parser.getURL ()); ! parser.registerScanners(); ! try { ! if (args.length==2) ! { ! parser.parse(args[1]); ! } else ! parser.parse(null); ! } ! catch (Exception e) { ! e.printStackTrace(); ! } } catch (ParserException e) { --- 857,885 ---- { System.out.println(); ! System.out.println("Syntax : java -jar htmlparser.jar <resourceLocn/website> [node_type]"); ! 
System.out.println(" <resourceLocn/website> the URL or file to be parsed"); ! System.out.println(" node_type an optional node name, for example:"); ! System.out.println(" A - Show only the link tags extracted from the document"); ! System.out.println(" IMG - Show only the image tags extracted from the document"); ! System.out.println(" TITLE - Extract the title from the document"); System.out.println(); System.out.println("Example : java -jar htmlparser.jar http://www.yahoo.com"); System.out.println(); ! System.out.println("For support, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page..."); ! System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net"); ! System.out.println(); System.exit(-1); } ! try ! { ! Parser parser = new Parser (args[0]); ! parser.registerScanners (); ! System.out.println ("Parsing " + parser.getURL ()); ! NodeFilter filter; ! if (1 < args.length) ! filter = new TagNameFilter (args[1]); ! else ! filter = null; ! parser.parse (filter); } catch (ParserException e) { *************** *** 993,1002 **** } ! public Node [] extractAllNodesThatAre(Class nodeType) throws ParserException { ! NodeList nodeList = new NodeList(); ! for (NodeIterator e = elements();e.hasMoreNodes();) { ! e.nextNode().collectInto(nodeList,nodeType); ! } ! return nodeList.toNodeArray(); } --- 912,942 ---- } ! /** ! * Extract all nodes matching the given filter. ! * @see Node#collectInto() ! */ ! public NodeList extractAllNodesThatMatch (NodeFilter filter) throws ParserException ! { ! NodeIterator e; ! NodeList ret; ! ! ret = new NodeList (); ! for (e = elements (); e.hasMoreNodes (); ) ! e.nextNode ().collectInto (ret, filter); ! ! return (ret); ! } ! ! /** ! * Convenience method to extract all nodes of a given class type. ! * @see Node#collectInto() ! */ ! public Node [] extractAllNodesThatAre (Class nodeType) throws ParserException ! { ! NodeList ret; ! ! 
ret = extractAllNodesThatMatch (new NodeClassFilter (nodeType)); ! ! return (ret.toNodeArray ()); } Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** RemarkNode.java 1 Nov 2003 01:36:56 -0000 1.35 --- RemarkNode.java 8 Nov 2003 21:30:56 -0000 1.36 *************** *** 91,98 **** } - public void collectInto(NodeList collectionList, String filter) { - if (filter==REMARK_NODE_FILTER) collectionList.add(this); - } - /** * Remark visiting code. --- 91,94 ---- Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** StringNode.java 1 Nov 2003 01:36:56 -0000 1.43 --- StringNode.java 8 Nov 2003 21:30:56 -0000 1.44 *************** *** 85,92 **** } - public void collectInto(NodeList collectionList, String filter) { - if (filter==STRING_FILTER) collectionList.add(this); - } - /** * String visiting code. --- 85,88 ---- |