[Htmlparser-cvs] htmlparser/src/org/htmlparser/util IteratorImpl.java,1.37,1.38 NodeList.java,1.50,1.51 PeekingIteratorImpl.java,1.1,1.2

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util
In directory sc8-pr-cvs1:/tmp/cvs-serv12747/org/htmlparser/util

Modified Files:
	IteratorImpl.java NodeList.java PeekingIteratorImpl.java 
Log Message:
Reduce recursion on the JVM stack in CompositeTagScanner.
Pass a stack of open tags to the scanner.
Add smarter tag closing by walking up the stack on encountering an unopened end tag.
Avoids a problem with bad HTML such as that found by Shaun Roach at
http://scores.nba.com/games/20031029/scoreboard.html.
Added testInvalidNesting to CompositeTagScannerTest, based on the above.


Index: IteratorImpl.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** IteratorImpl.java	8 Dec 2003 01:31:56 -0000	1.37
--- IteratorImpl.java	20 Dec 2003 23:47:55 -0000	1.38
***************
*** 32,36 ****
  import org.htmlparser.lexer.Cursor;
  import org.htmlparser.lexer.Lexer;
! import org.htmlparser.scanners.TagScanner;
  import org.htmlparser.tags.Tag;
  import org.htmlparser.util.NodeIterator;
--- 32,36 ----
  import org.htmlparser.lexer.Cursor;
  import org.htmlparser.lexer.Lexer;
! import org.htmlparser.scanners.Scanner;
  import org.htmlparser.tags.Tag;
  import org.htmlparser.util.NodeIterator;
***************
*** 69,72 ****
--- 69,76 ----
      public Node nextNode() throws ParserException
      {
+         Tag tag;
+         String name;
+         Scanner scanner;
+         NodeList stack;
          Node ret;
  
***************
*** 79,86 ****
                  if (ret instanceof Tag)
                  {
-                     Tag tag;
-                     String name;
-                     TagScanner scanner;
- 
                      tag = (Tag)ret;
                      if (!tag.isEndTag ())
--- 83,86 ----
***************
*** 88,93 ****
                          // now recurse if there is a scanner for this type of tag
                          scanner = tag.getThisScanner ();
!                         if ((null != scanner) && scanner.evaluate (tag, null))
!                             ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer);
                      }
                  }
--- 88,96 ----
                          // now recurse if there is a scanner for this type of tag
                          scanner = tag.getThisScanner ();
!                         if (null != scanner)
!                         {
!                             stack = new NodeList ();
!                             ret = scanner.scan (tag, mLexer, stack);
!                         }
                      }
                  }

Index: NodeList.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/NodeList.java,v
retrieving revision 1.50
retrieving revision 1.51
diff -C2 -d -r1.50 -r1.51
*** NodeList.java	8 Dec 2003 01:31:56 -0000	1.50
--- NodeList.java	20 Dec 2003 23:47:55 -0000	1.51
***************
*** 158,165 ****
      }
  
!     public void remove(int index) {
          System.arraycopy(nodeData, index+1, nodeData, index, size-index-1);
          nodeData[size-1] = null;
          size--;
      }
  
--- 158,168 ----
      }
  
!     public Node remove(int index) {
!         Node ret;
!         ret = nodeData[index];
          System.arraycopy(nodeData, index+1, nodeData, index, size-index-1);
          nodeData[size-1] = null;
          size--;
+         return (ret);
      }
  

Index: PeekingIteratorImpl.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/PeekingIteratorImpl.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** PeekingIteratorImpl.java	8 Nov 2003 21:30:57 -0000	1.1
--- PeekingIteratorImpl.java	20 Dec 2003 23:47:55 -0000	1.2
***************
*** 31,34 ****
--- 31,37 ----
  import org.htmlparser.Node;
  import org.htmlparser.lexer.Lexer;
+ import org.htmlparser.scanners.Scanner;
+ import org.htmlparser.tags.Tag;
+ import org.htmlparser.util.NodeList;
  
  /**
***************
*** 50,53 ****
--- 53,60 ----
      public Node peek () throws ParserException
      {
+         Tag tag;
+         String name;
+         Scanner scanner;
+         NodeList stack;
          Node ret;
  
***************
*** 63,69 ****
                      if (ret instanceof org.htmlparser.tags.Tag)
                      {
-                         org.htmlparser.tags.Tag tag;
-                         String name;
-                         org.htmlparser.scanners.TagScanner scanner;
  
                          tag = (org.htmlparser.tags.Tag)ret;
--- 70,73 ----
***************
*** 72,77 ****
                              // now recurse if there is a scanner for this type of tag
                              scanner = tag.getThisScanner ();
!                             if ((null != scanner) && scanner.evaluate (tag, null))
!                                 ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer);
                          }
                      }
--- 76,84 ----
                              // now recurse if there is a scanner for this type of tag
                              scanner = tag.getThisScanner ();
!                             if (null != scanner)
!                             {
!                                 stack = new NodeList ();
!                                 ret = scanner.scan (tag, mLexer, stack);
!                             }
                          }
                      }

[Htmlparser-cvs] htmlparser/src/org/htmlparser/util IteratorImpl.java,1.37,1.38 NodeList.java,1.50,1

[Htmlparser-cvs] htmlparser/src/org/htmlparser/util IteratorImpl.java,1.37,1.38 NodeList.java,1.50,1.51 PeekingIteratorImpl.java,1.1,1.2