[Htmlparser-cvs] htmlparser/src/org/htmlparser/util PeekingIteratorImpl.java,NONE,1.1 IteratorImpl.java,1.34,1.35 ParserUtils.java,1.33,1.34 PeekingIterator.java,1.17,1.18
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-11-08 21:31:01
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv18855/src/org/htmlparser/util Modified Files: IteratorImpl.java ParserUtils.java PeekingIterator.java Added Files: PeekingIteratorImpl.java Log Message: Implement generic node filtering. Added the NodeFilter interface and the filter package. Sideline tag specific scanners; tags now use only one scanner of each type, TagScanner or CompositeTagScanner (except for ScriptScanner). Obviated PeekingIterator by moving the META tag semantics to doSemanticAction, much simpler, old IteratorImpl is now PeekingIteratorImpl but deprecated. --- NEW FILE: PeekingIteratorImpl.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/PeekingIteratorImpl.java,v $ // $Author: derrickoswald $ // $Date: 2003/11/08 21:30:57 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.util; import java.util.Vector; import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; /** * @deprecated shouldn't need to pre-read tags. 
*/ public class PeekingIteratorImpl implements PeekingIterator { Lexer mLexer; Vector preRead; ParserFeedback feedback; public PeekingIteratorImpl (Lexer lexer, ParserFeedback fb) { mLexer = lexer; preRead = new Vector (25); feedback = fb; } public Node peek () throws ParserException { Node ret; if (null == mLexer) ret = null; else try { ret = mLexer.nextNode (); if (null != ret) { // kick off recursion for the top level node if (ret instanceof org.htmlparser.tags.Tag) { org.htmlparser.tags.Tag tag; String name; org.htmlparser.scanners.TagScanner scanner; tag = (org.htmlparser.tags.Tag)ret; if (!tag.isEndTag ()) { // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); if ((null != scanner) && scanner.evaluate (tag, null)) ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer); } } preRead.addElement (ret); } } catch (Exception e) { StringBuffer msgBuffer = new StringBuffer(); msgBuffer.append("Unexpected Exception occurred while reading "); msgBuffer.append(mLexer.getPage ().getUrl ()); msgBuffer.append(", in nextHTMLNode"); // reader.appendLineDetails(msgBuffer); ParserException ex = new ParserException(msgBuffer.toString(),e); feedback.error(msgBuffer.toString(),ex); throw ex; } return (ret); } /** * Makes <code>node</code> the next <code>Node</code> that will be returned. * @param node The node to return next. */ public void push (Node node) { preRead.insertElementAt (node, 0); } /** * Check if more nodes are available. * @return <code>true</code> if a call to <code>nextNode()</code> will succeed. */ public boolean hasMoreNodes() throws ParserException { boolean ret; if (null == mLexer) ret = false; else if (0 != preRead.size ()) ret = true; else ret = !(null == peek ()); return (ret); } /** * Get the next node. * @return The next node in the HTML stream, or null if there are no more nodes. 
*/ public Node nextNode() throws ParserException { Node ret; if (hasMoreNodes ()) ret = (Node)preRead.remove (0); else // should perhaps throw an exception? ret = null; return (ret); } } Index: IteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** IteratorImpl.java 6 Nov 2003 03:00:40 -0000 1.34 --- IteratorImpl.java 8 Nov 2003 21:30:57 -0000 1.35 *************** *** 29,118 **** package org.htmlparser.util; - import java.util.Vector; - import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; ! public class IteratorImpl implements PeekingIterator { Lexer mLexer; ! Vector preRead; ! ParserFeedback feedback; public IteratorImpl (Lexer lexer, ParserFeedback fb) { mLexer = lexer; ! preRead = new Vector (25); ! feedback = fb; ! } ! ! public Node peek () throws ParserException ! { ! Node ret; ! ! if (null == mLexer) ! ret = null; ! else ! try ! { ! ret = mLexer.nextNode (); ! if (null != ret) ! { ! // kick off recursion for the top level node ! if (ret instanceof org.htmlparser.tags.Tag) ! { ! org.htmlparser.tags.Tag tag; ! String name; ! org.htmlparser.scanners.TagScanner scanner; ! ! tag = (org.htmlparser.tags.Tag)ret; ! if (!tag.isEndTag ()) ! { ! // now recurse if there is a scanner for this type of tag ! scanner = tag.getThisScanner (); ! if ((null != scanner) && scanner.evaluate (tag, null)) ! ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer); ! } ! } ! ! preRead.addElement (ret); ! } ! } ! catch (Exception e) { ! StringBuffer msgBuffer = new StringBuffer(); ! msgBuffer.append("Unexpected Exception occurred while reading "); ! msgBuffer.append(mLexer.getPage ().getUrl ()); ! msgBuffer.append(", in nextHTMLNode"); ! // reader.appendLineDetails(msgBuffer); ! ParserException ex = new ParserException(msgBuffer.toString(),e); ! 
feedback.error(msgBuffer.toString(),ex); ! throw ex; ! } ! ! return (ret); ! } ! ! /** ! * Makes <code>node</code> the next <code>Node</code> that will be returned. ! * @param node The node to return next. ! */ ! public void push (Node node) ! { ! preRead.insertElementAt (node, 0); } /** * Check if more nodes are available. ! * @return <code>true</code> if a call to <code>nextHTMLNode()</code> will succeed. */ ! public boolean hasMoreNodes() throws ParserException { boolean ret; ! if (null == mLexer) ! ret = false; ! else if (0 != preRead.size ()) ! ret = true; ! else ! ret = !(null == peek ()); return (ret); --- 29,62 ---- package org.htmlparser.util; import org.htmlparser.Node; + import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; + import org.htmlparser.scanners.TagScanner; + import org.htmlparser.tags.Tag; + import org.htmlparser.util.NodeIterator; ! public class IteratorImpl implements NodeIterator { Lexer mLexer; ! ParserFeedback mFeedback; ! Cursor mCursor; public IteratorImpl (Lexer lexer, ParserFeedback fb) { mLexer = lexer; ! mFeedback = fb; ! mCursor = new Cursor (mLexer.getPage (), 0); } /** * Check if more nodes are available. ! * @return <code>true</code> if a call to <code>nextNode()</code> will succeed. */ ! public boolean hasMoreNodes() throws ParserException ! { boolean ret; ! mCursor.setPosition (mLexer.getPosition ()); ! ret = 0 != mLexer.getPage ().getCharacter (mCursor); // more characters? return (ret); *************** *** 123,135 **** * @return The next node in the HTML stream, or null if there are no more nodes. */ ! public Node nextNode() throws ParserException { Node ret; ! if (hasMoreNodes ()) ! ret = (Node)preRead.remove (0); ! else ! // should perhaps throw an exception? ! ret = null; return (ret); } --- 67,109 ---- * @return The next node in the HTML stream, or null if there are no more nodes. */ ! public Node nextNode() throws ParserException ! { Node ret; ! try ! { ! ret = mLexer.nextNode (); ! if (null != ret) ! 
{ ! // kick off recursion for the top level node ! if (ret instanceof Tag) ! { ! Tag tag; ! String name; ! TagScanner scanner; + tag = (Tag)ret; + if (!tag.isEndTag ()) + { + // now recurse if there is a scanner for this type of tag + scanner = tag.getThisScanner (); + if ((null != scanner) && scanner.evaluate (tag, null)) + ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer); + } + } + } + } + catch (Exception e) + { + StringBuffer msgBuffer = new StringBuffer(); + msgBuffer.append("Unexpected Exception occurred while reading "); + msgBuffer.append(mLexer.getPage ().getUrl ()); + msgBuffer.append(", in nextHTMLNode"); + // reader.appendLineDetails(msgBuffer); + ParserException ex = new ParserException(msgBuffer.toString(),e); + mFeedback.error(msgBuffer.toString(),ex); + throw ex; + } + return (ret); } Index: ParserUtils.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** ParserUtils.java 26 Oct 2003 19:46:28 -0000 1.33 --- ParserUtils.java 8 Nov 2003 21:30:57 -0000 1.34 *************** *** 34,38 **** --- 34,40 ---- import org.htmlparser.Node; + import org.htmlparser.NodeFilter; import org.htmlparser.Parser; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.Tag; *************** *** 105,119 **** /** ! * Search given node and pick up any objects of given type, return ! * Node array. ! * @param node ! * @param type ! * @return Node[] */ ! public static Node[] findTypeInNode(Node node, Class type) { ! NodeList nodeList = new NodeList(); ! node.collectInto(nodeList, type); ! Node spans[] = nodeList.toNodeArray(); ! return spans; } --- 107,125 ---- /** ! * Search given node and pick up any objects of given type. ! * @param node The node to search. ! * @param type The class to search for. ! * @return A node array with the matching nodes. */ ! 
public static Node[] findTypeInNode(Node node, Class type) ! { ! NodeFilter filter; ! NodeList ret; ! ! ret = new NodeList (); ! filter = new NodeClassFilter (type); ! node.collectInto (ret, filter); ! ! return (ret.toNodeArray ()); } Index: PeekingIterator.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/PeekingIterator.java,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** PeekingIterator.java 26 Oct 2003 19:46:28 -0000 1.17 --- PeekingIterator.java 8 Nov 2003 21:30:57 -0000 1.18 *************** *** 31,34 **** --- 31,37 ---- import org.htmlparser.Node; + /** + * @deprecated shouldn't need to pre-read tags. + */ public interface PeekingIterator extends NodeIterator{ /** |