[Htmlparser-cvs] htmlparser/src/org/htmlparser/tags BaseHrefTag.java,1.31,1.32 CompositeTag.java,1.64,1.65 LinkTag.java,1.42,1.43 MetaTag.java,1.30,1.31 ScriptTag.java,1.31,1.32 Tag.java,1.56,1.57
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv18855/src/org/htmlparser/tags Modified Files: BaseHrefTag.java CompositeTag.java LinkTag.java MetaTag.java ScriptTag.java Tag.java Log Message: Implement generic node filtering. Added the NodeFilter interface and the filter package. Sideline tag specific scanners; tags now use only one scanner of each type, TagScanner or CompositeTagScanner (except for ScriptScanner). Obviated PeekingIterator by moving the META tag semantics to doSemanticAction, much simpler, old IteratorImpl is now PeekingIteratorImpl but deprecated. Index: BaseHrefTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/BaseHrefTag.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** BaseHrefTag.java 6 Nov 2003 03:00:27 -0000 1.31 --- BaseHrefTag.java 8 Nov 2003 21:30:56 -0000 1.32 *************** *** 32,35 **** --- 32,36 ---- import org.htmlparser.lexer.Page; import org.htmlparser.util.LinkProcessor; + import org.htmlparser.util.ParserException; /** *************** *** 92,96 **** * This sets the base URL to use for the rest of the page. */ ! public void doSemanticAction () { Page page; --- 93,97 ---- * This sets the base URL to use for the rest of the page. */ ! 
public void doSemanticAction () throws ParserException { Page page; Index: CompositeTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v retrieving revision 1.64 retrieving revision 1.65 diff -C2 -d -r1.64 -r1.65 *** CompositeTag.java 6 Nov 2003 03:00:28 -0000 1.64 --- CompositeTag.java 8 Nov 2003 21:30:57 -0000 1.65 *************** *** 30,33 **** --- 30,34 ---- import org.htmlparser.Node; + import org.htmlparser.NodeFilter; import org.htmlparser.StringNode; import org.htmlparser.AbstractNode; *************** *** 289,314 **** } ! public void collectInto (NodeList collectionList, String filter) { ! Node node; ! ! super.collectInto (collectionList, filter); for (SimpleNodeIterator e = children(); e.hasMoreNodes ();) ! { ! node = e.nextNode (); ! node.collectInto (collectionList, filter); ! } ! } ! ! public void collectInto (NodeList collectionList, Class nodeType) ! { ! Node node; ! ! super.collectInto (collectionList,nodeType); ! for (SimpleNodeIterator e = children(); e.hasMoreNodes (); ) ! { ! node = e.nextNode (); ! node.collectInto (collectionList, nodeType); ! } } --- 290,333 ---- } ! /** ! * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node ! * satisfies the filtering criteria.<P> ! * ! * This mechanism allows powerful filtering code to be written very easily, ! * without bothering about collection of embedded tags separately. ! * e.g. when we try to get all the links on a page, it is not possible to ! * get it at the top-level, as many tags (like form tags), can contain ! * links embedded in them. We could get the links out by checking if the ! * current node is a {@link CompositeTag}, and going through its children. ! * So this method provides a convenient way to do this.<P> ! * ! * Using collectInto(), programs get a lot shorter. Now, the code to ! 
* extract all links from a page would look like: ! * <pre> ! * NodeList collectionList = new NodeList(); ! * NodeFilter filter = new TagNameFilter ("A"); ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) ! * e.nextNode().collectInto(collectionList, filter); ! * </pre> ! * Thus, collectionList will hold all the link nodes, irrespective of how ! * deep the links are embedded.<P> ! * ! * Another way to accomplish the same objective is: ! * <pre> ! * NodeList collectionList = new NodeList(); ! * NodeFilter filter = new TagClassFilter (LinkTag.class); ! * for (NodeIterator e = parser.elements(); e.hasMoreNodes();) ! * e.nextNode().collectInto(collectionList, filter); ! * </pre> ! * This is slightly less specific because the LinkTag class may be ! * registered for more than one node name, e.g. <LINK> tags too. ! */ ! public void collectInto (NodeList list, NodeFilter filter) { ! super.collectInto (list, filter); for (SimpleNodeIterator e = children(); e.hasMoreNodes ();) ! e.nextNode ().collectInto (list, filter); ! if (null != getEndTag ()) ! getEndTag ().collectInto (list, filter); } Index: LinkTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/LinkTag.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** LinkTag.java 6 Nov 2003 03:00:29 -0000 1.42 --- LinkTag.java 8 Nov 2003 21:30:57 -0000 1.43 *************** *** 51,60 **** * The set of tag names that indicate the end of this tag. */ ! private static final String[] mEnders = new String[] {"A", "TD", "TR", "FORM", "LI"}; /** * The set of end tag names that indicate the end of this tag. */ ! private static final String[] mEndTagEnders = new String[] {"TD", "TR", "FORM", "LI", "BODY", "HTML"}; /** --- 51,60 ---- * The set of tag names that indicate the end of this tag. */ ! 
private static final String[] mEnders = new String[] {"A", "P", "DIV", "TD", "TR", "FORM", "LI"}; /** * The set of end tag names that indicate the end of this tag. */ ! private static final String[] mEndTagEnders = new String[] {"P", "DIV", "TD", "TR", "FORM", "LI", "BODY", "HTML"}; /** *************** *** 92,107 **** * } * </pre> - * There is another mechanism available that allows for uniform extraction - * of images. You could do this to get all images from a web page : - * <pre> - * Node node; - * Vector imageCollectionVector = new Vector(); - * for (NodeIterator e = parser.elements();e.hasMoreNode();) { - * node = e.nextHTMLNode(); - * node.collectInto(imageCollectionVector,ImageTag.IMAGE_FILTER); - * } - * </pre> - * The link tag processes all its contents in collectInto(). - * @see #linkData() */ public LinkTag () --- 92,95 ---- Index: MetaTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/MetaTag.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** MetaTag.java 6 Nov 2003 03:00:29 -0000 1.30 --- MetaTag.java 8 Nov 2003 21:30:57 -0000 1.31 *************** *** 30,33 **** --- 30,34 ---- import org.htmlparser.lexer.nodes.Attribute; + import org.htmlparser.util.ParserException; /** *************** *** 100,103 **** --- 101,120 ---- else getAttributesEx ().add (new Attribute ("NAME", metaTagName)); + } + + /** + * Check for a charset directive, and if found, set the charset for the page. 
+ */ + public void doSemanticAction () throws ParserException + { + String httpEquiv; + String charset; + + httpEquiv = getHttpEquiv (); + if ("Content-Type".equalsIgnoreCase (httpEquiv)) + { + charset = getPage ().getCharset (getAttribute ("CONTENT")); + getPage ().setEncoding (charset); + } } Index: ScriptTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ScriptTag.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** ScriptTag.java 6 Nov 2003 03:00:31 -0000 1.31 --- ScriptTag.java 8 Nov 2003 21:30:57 -0000 1.32 *************** *** 29,32 **** --- 29,34 ---- package org.htmlparser.tags; + import org.htmlparser.scanners.ScriptScanner; + /** * A script tag. *************** *** 49,52 **** --- 51,55 ---- public ScriptTag () { + setThisScanner (new ScriptScanner ()); } Index: Tag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/Tag.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** Tag.java 6 Nov 2003 03:00:35 -0000 1.56 --- Tag.java 8 Nov 2003 21:30:57 -0000 1.57 *************** *** 137,153 **** /** - * This method verifies that the current tag matches the provided - * filter. The match is based on the string object and not its contents, - * so ensure that you are using static final filter strings provided - * in the tag classes. - * @see org.htmlparser.Node#collectInto(NodeList, String) - */ - public void collectInto(NodeList collectionList, String filter) - { - if (null != getThisScanner () && getThisScanner ().getFilter () == filter) - collectionList.add (this); - } - - /** * Handle a visitor. * <em>NOTE: This currently defers to accept(NodeVisitor). If --- 137,140 ---- |