[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.35,1.36

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25217/lexer

Modified Files:
	Lexer.java 
Log Message:
Add STRICT flag to ScriptScanner to revert to legacy handling of broken ETAGO (</).
If STRICT is true, scan according to the HTML specification; if false, scan with a
quote-smart state machine which heuristically yields the correct parse.


Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.35
retrieving revision 1.36
diff -C2 -d -r1.35 -r1.36
*** Lexer.java	7 Mar 2005 02:18:37 -0000	1.35
--- Lexer.java	12 Mar 2005 17:53:08 -0000	1.36
***************
*** 1058,1062 ****
       * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
       * B.3.2 Specifying non-HTML data</a> of the
!      * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:
       * <quote>
       * <b>Element content</b><br>
--- 1058,1062 ----
       * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
       * B.3.2 Specifying non-HTML data</a> of the
!      * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br>
       * <quote>
       * <b>Element content</b><br>
***************
*** 1074,1080 ****
--- 1074,1098 ----
              ParserException
      {
+         return (parseCDATA (false));
+     }
+ 
+     /**
+      * Return CDATA as a text node.
+      * Slightly less rigid than {@link #parseCDATA()}, this method provides for
+      * parsing CDATA that may contain quoted strings that have embedded
+      * ETAGO ("&lt;/") delimiters and skips single and multiline comments.
+      * @param quotesmart If <code>true</code> the strict definition of CDATA is
+      * extended to allow for single or double quoted ETAGO ("&lt;/") sequences.
+      * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none.
+      * @see #parseCDATA()
+      */
+     public Node parseCDATA (boolean quotesmart)
+         throws
+             ParserException
+     {
          int start;
          int state;
          boolean done;
+         char quote;
          char ch;
          int end;
***************
*** 1083,1086 ****
--- 1101,1105 ----
          state = 0;
          done = false;
+         quote = 0;
          while (!done)
          {
***************
*** 1094,1099 ****
                              done = true;
                              break;
                          case '<':
!                             state = 1;
                              break;
                          default:
--- 1113,1180 ----
                              done = true;
                              break;
+                         case '\'':
+                             if (quotesmart)
+                                 if (0 == quote)
+                                     quote = '\''; // enter quoted state
+                                 else if ('\'' == quote)
+                                     quote = 0; // exit quoted state
+                             break;
+                         case '"':
+                             if (quotesmart)
+                                 if (0 == quote)
+                                     quote = '"'; // enter quoted state
+                                 else if ('"' == quote)
+                                     quote = 0; // exit quoted state
+                             break;
+                         case '\\':
+                             if (quotesmart)
+                                 if (0 != quote)
+                                 {
+                                     ch = mPage.getCharacter (mCursor); // try to consume escaped character
+                                     if (0 == ch)
+                                         mCursor.retreat ();
+                                     else if (  (ch != '\\') && (ch != quote))
+                                         mCursor.retreat (); // unconsume char if character was not an escapable char.
+                                 }
+                             break;
+                         case '/':
+                             if (quotesmart)
+                                 if (0 == quote)
+                                 {
+                                     // handle multiline and double slash comments (with a quote)
+                                     ch = mPage.getCharacter (mCursor);
+                                     if (0 == ch)
+                                         mCursor.retreat ();
+                                     else if ('/' == ch)
+                                     {
+                                         do
+                                             ch = mPage.getCharacter (mCursor);
+                                         while ((ch != 0) && (ch != '\n'));
+                                     }
+                                     else if ('*' == ch)
+                                     {
+                                         do
+                                         {
+                                             do
+                                                 ch = mPage.getCharacter (mCursor);
+                                             while ((ch != 0) && (ch != '*'));
+                                             ch = mPage.getCharacter (mCursor);
+                                             if (ch == '*')
+                                                 mCursor.retreat ();
+                                         }
+                                         while ((ch != 0) && (ch != '/'));
+                                     }
+                                     else
+                                         mCursor.retreat ();
+                                 }
+                             break;
                          case '<':
!                             if (quotesmart)
!                             {
!                                 if (0 == quote)
!                                     state = 1;
!                             }
!                             else
!                                 state = 1;
                              break;
                          default: