[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.34,1.35 Page.java,1.46,1.47

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5186/lexer

Modified Files:
	Lexer.java Page.java 
Log Message:
Bug #1104627 Parser Crash reading javascript
Bug #1024045 StringBean crashes on an URL
Bug #1021925 StyleTag with missing linefeed prevents page from parsing
Corrected operation with script and style scanners to recognize the ETAGO
when parsing CDATA -- see http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data.
The original solution to bug #741769 (ScriptScanner doesn't handle quoted </script> tags)
was erroneous; such input should have been recognized as faulty HTML.
Several test cases changed to follow this advice:
   "Authors should therefore escape "</" within the content."


Index: Page.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v
retrieving revision 1.46
retrieving revision 1.47
diff -C2 -d -r1.46 -r1.47
*** Page.java	13 Feb 2005 22:45:47 -0000	1.46
--- Page.java	7 Mar 2005 02:18:37 -0000	1.47
***************
*** 446,450 ****
      public void close () throws IOException
      {
!         getSource ().destroy ();
      }
  
--- 446,451 ----
      public void close () throws IOException
      {
!         if (null != getSource ()) // nothing to destroy when no source is set
!             getSource ().destroy ();
      }
  

Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.34
retrieving revision 1.35
diff -C2 -d -r1.34 -r1.35
*** Lexer.java	13 Feb 2005 22:45:47 -0000	1.34
--- Lexer.java	7 Mar 2005 02:18:37 -0000	1.35
***************
*** 1054,1057 ****
--- 1054,1141 ----
      }
  
+     /**
+      * Return CDATA as a text node.
+      * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
+      * B.3.2 Specifying non-HTML data</a> of the
+      * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:
+      * <blockquote>
+      * <b>Element content</b><br>
+      * When script or style data is the content of an element (SCRIPT and STYLE),
+      * the data begins immediately after the element start tag and ends at the
+      * first ETAGO ("&lt;/") delimiter followed by a name start character ([a-zA-Z]);
+      * note that this may not be the element's end tag.
+      * Authors should therefore escape "&lt;/" within the content. Escape mechanisms
+      * are specific to each scripting or style sheet language.
+      * </blockquote>
+      * Scanning starts at the current cursor position. When an ETAGO followed by
+      * a name start character is found, the cursor is moved back onto its '&lt;';
+      * if the input ends first, everything up to the end is treated as CDATA.
+      * @return The node that <code>makeString()</code> builds for the scanned span.
+      * @exception ParserException Declared because the page is read while scanning.
+      */
+     public Node parseCDATA ()
+         throws
+             ParserException
+     {
+         int start;
+         int state;
+         boolean done;
+         char ch;
+ 
+         start = mCursor.getPosition ();
+         state = 0;
+         done = false;
+         while (!done)
+         {
+             ch = mPage.getCharacter (mCursor);
+             // NOTE(review): 0 is taken to mean end of input -- confirm against Page
+             if (0 == ch)
+                 break; // in every state the scan simply stops at end of input
+             switch (state)
+             {
+                 case 0: // prior to ETAGO
+                     if ('<' == ch)
+                         state = 1;
+                     break;
+                 case 1: // <
+                     switch (ch)
+                     {
+                         case '/':
+                             state = 2;
+                             break;
+                         case '<':
+                             // "<<": this '<' may itself start the ETAGO, so
+                             // stay here instead of discarding it
+                             break;
+                         default:
+                             state = 0;
+                             break;
+                     }
+                     break;
+                 case 2: // </
+                     // the specification quoted above only admits [a-zA-Z] here,
+                     // which is narrower than Character.isLetter()
+                     if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'))
+                     {
+                         done = true;
+                         // back up to the start of ETAGO: undo the three reads
+                         // ('<', '/' and the name start character)
+                         mCursor.retreat ();
+                         mCursor.retreat ();
+                         mCursor.retreat ();
+                     }
+                     else if ('<' == ch)
+                         state = 1; // "</<": a fresh ETAGO candidate starts here
+                     else
+                         state = 0;
+                     break;
+                 default:
+                     throw new IllegalStateException ("unexpected CDATA parse state " + state);
+             }
+         }
+ 
+         return (makeString (start, mCursor.getPosition ()));
+     }
+ 
      //
      // NodeFactory interface