[Htmlparser-cvs] htmlparser/src/org/htmlparser/tests/lexerTests LexerTests.java,1.12,1.13

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests
In directory sc8-pr-cvs1:/tmp/cvs-serv31101/tests/lexerTests

Modified Files:
	LexerTests.java 
Log Message:
Fix bug #789439 Japanese page causes OutOfMemory Exception
Modified the lexer to skip over JIS escape sequences.


Index: LexerTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** LexerTests.java	8 Dec 2003 01:31:53 -0000	1.12
--- LexerTests.java	31 Dec 2003 14:40:50 -0000	1.13
***************
*** 38,41 ****
--- 38,42 ----
  import java.net.URL;
  import java.net.URLConnection;
+ import java.util.HashSet;
  
  import org.htmlparser.Node;
***************
*** 51,54 ****
--- 52,57 ----
  import org.htmlparser.tags.Tag;
  import org.htmlparser.tests.ParserTestCase;
+ import org.htmlparser.util.NodeIterator;
+ import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;
  
***************
*** 592,595 ****
--- 595,699 ----
  //        tests.testSpeedStreamWithTags ();
  //    }
+ 
+     static final HashSet mAcceptable;
+     static
+     {
+         mAcceptable = new HashSet ();
+         mAcceptable.add ("A");
+         mAcceptable.add ("BODY");
+         mAcceptable.add ("BR");
+         mAcceptable.add ("CENTER");
+         mAcceptable.add ("FONT");
+         mAcceptable.add ("HEAD");
+         mAcceptable.add ("HR");
+         mAcceptable.add ("HTML");
+         mAcceptable.add ("IMG");
+         mAcceptable.add ("P");
+         mAcceptable.add ("TABLE");
+         mAcceptable.add ("TD");
+         mAcceptable.add ("TITLE");
+         mAcceptable.add ("TR");
+     }
+ 
+     /**
+      * Test case for bug #789439 Japanese page causes OutOfMemory Exception
+      * No exception is thrown in the current version of the parser,
+      * however, the problem is that ISO-2022-JP (aka JIS) encoding sometimes
+      * causes spurious tags.
+      * The root cause is characters bracketed by [esc]$B and [esc](J (contrary
+      * to what is indicated in the j_s_nightingale analysis of the problem) that
+      * sometimes have an angle bracket (&lt; or 0x3c) embedded in them. These
+      * are taken to be tags by the parser, instead of being considered strings.
+      * <p>
+      * The URL referenced has an ISO-8859-1 encoding (the default), but has
+      * Japanese characters intermixed on the page with English, using the JIS
+      * encoding. We detect failure by looking for weird tag names which were
+      * not correctly handled as string nodes.
+      * <p>
+      * Here is a partial dump of the page with escape sequences:
+      * <pre>
+      * 0002420 1b 24 42 3f 79 4a 42 25 47 25 38 25 2b 25 61 43
+      * 0002440 35 44 65 43 44 1b 28 4a 20 77 69 74 68 20 43 61
+      * ..
+      * 0002720 6c 22 3e 4a 53 6b 79 1b 24 42 42 50 31 7e 25 5a
+      * 0002740 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 3c
+      * ..
+      * 0003060 20 69 1b 24 42 25 62 21 3c 25 49 42 50 31 7e 25
+      * 0003100 5a 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a
+      * ..
+      * 0003220 1b 24 42 25 2d 25 3f 25 5e 25 2f 25 69 24 4e 25
+      * 0003240 5b 21 3c 25 60 25 5a 21 3c 25 38 1b 28 4a 3c 2f
+      * ..
+      * 0003320 6e 65 31 2e 70 6c 22 3e 1b 24 42 3d 60 48 77 43
+      * 0003340 66 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 2d 2d 2d 2d
+      * ..
+      * 0004400 46 6f 72 75 6d 20 30 30 39 20 28 1b 24 42 3e 21
+      * 0004420 3c 6a 24 4b 31 4a 4a 21 44 2e 24 4a 24 49 1b 28
+      * 0004440 4a 29 3c 2f 41 3e 3c 49 4d 47 20 53 52 43 3d 22
+      * </pre>
+      * <p>
+      * The fix proposed by j_s_nightingale is implemented to swallow JIS
+      * escape sequences in the string parser.
+      * Apparently the fix won't help EUC-JP and Shift-JIS though, so this may
+      * still be a problem.
+      * It's theoretically possible that JIS encoding, or another one,
+      * could be used as attribute names or values within tags as well,
+      * but this is considered improbable and is therefore not handled in
+      * the tag parser state machine.
+      */
+     public void testJIS ()
+         throws ParserException
+     {
+         Parser parser;
+         NodeIterator iterator;
+         
+         parser = new Parser ("http://www.009.com/");
+         iterator = parser.elements ();
+         while (iterator.hasMoreNodes ())
+             checkTagNames (iterator.nextNode ());
+     }
+ 
+     /**
+      * Check the tag name for one of the ones expected on the page.
+      * Recursively check the children.
+      */
+     public void checkTagNames (Node node)
+     {
+         Tag tag;
+         String name;
+         NodeList children;
+         
+         if (node instanceof Tag)
+         {
+             tag = (Tag)node;
+             name = tag.getTagName ();
+             if (!mAcceptable.contains (name))
+                 fail ("unrecognized tag name \"" + name + "\"");
+             children = tag.getChildren ();
+             if (null != children)
+                 for (int i = 0; i < children.size (); i++)
+                     checkTagNames (children.elementAt (i));
+         }
+     }
  
  }