[Htmlparser-cvs] htmlparser/src/org/htmlparser/util IteratorImpl.java,1.37,1.38 NodeList.java,1.50,1.51 PeekingIteratorImpl.java,1.1,1.2
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-12-20 23:47:58
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv12747/org/htmlparser/util Modified Files: IteratorImpl.java NodeList.java PeekingIteratorImpl.java Log Message: Reduce recursion on the JVM stack in CompositeTagScanner. Pass a stack of open tags to the scanner. Add smarter tag closing by walking up the stack on encountering an unopened end tag. Avoids a problem with bad HTML such as that found at http://scores.nba.com/games/20031029/scoreboard.html by Shaun Roach. Added testInvalidNesting to CompositeTagScanner Test based on the above. Index: IteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** IteratorImpl.java 8 Dec 2003 01:31:56 -0000 1.37 --- IteratorImpl.java 20 Dec 2003 23:47:55 -0000 1.38 *************** *** 32,36 **** import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; ! import org.htmlparser.scanners.TagScanner; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeIterator; --- 32,36 ---- import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; ! import org.htmlparser.scanners.Scanner; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeIterator; *************** *** 69,72 **** --- 69,76 ---- public Node nextNode() throws ParserException { + Tag tag; + String name; + Scanner scanner; + NodeList stack; Node ret; *************** *** 79,86 **** if (ret instanceof Tag) { - Tag tag; - String name; - TagScanner scanner; - tag = (Tag)ret; if (!tag.isEndTag ()) --- 83,86 ---- *************** *** 88,93 **** // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); ! if ((null != scanner) && scanner.evaluate (tag, null)) ! 
ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer); } } --- 88,96 ---- // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); ! if (null != scanner) ! { ! stack = new NodeList (); ! ret = scanner.scan (tag, mLexer, stack); ! } } } Index: NodeList.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/NodeList.java,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** NodeList.java 8 Dec 2003 01:31:56 -0000 1.50 --- NodeList.java 20 Dec 2003 23:47:55 -0000 1.51 *************** *** 158,165 **** } ! public void remove(int index) { System.arraycopy(nodeData, index+1, nodeData, index, size-index-1); nodeData[size-1] = null; size--; } --- 158,168 ---- } ! public Node remove(int index) { ! Node ret; ! ret = nodeData[index]; System.arraycopy(nodeData, index+1, nodeData, index, size-index-1); nodeData[size-1] = null; size--; + return (ret); } Index: PeekingIteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/PeekingIteratorImpl.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** PeekingIteratorImpl.java 8 Nov 2003 21:30:57 -0000 1.1 --- PeekingIteratorImpl.java 20 Dec 2003 23:47:55 -0000 1.2 *************** *** 31,34 **** --- 31,37 ---- import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; + import org.htmlparser.scanners.Scanner; + import org.htmlparser.tags.Tag; + import org.htmlparser.util.NodeList; /** *************** *** 50,53 **** --- 53,60 ---- public Node peek () throws ParserException { + Tag tag; + String name; + Scanner scanner; + NodeList stack; Node ret; *************** *** 63,69 **** if (ret instanceof org.htmlparser.tags.Tag) { - org.htmlparser.tags.Tag tag; - String name; - org.htmlparser.scanners.TagScanner scanner; tag = (org.htmlparser.tags.Tag)ret; --- 70,73 ---- 
*************** *** 72,77 **** // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); ! if ((null != scanner) && scanner.evaluate (tag, null)) ! ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer); } } --- 76,84 ---- // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); ! if (null != scanner) ! { ! stack = new NodeList (); ! ret = scanner.scan (tag, mLexer, stack); ! } } } |