[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.35,1.36
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2005-03-12 17:53:19
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25217/lexer

Modified Files:
    Lexer.java
Log Message:
Add STRICT flag to ScriptScanner to revert to legacy handling of broken ETAGO (</).
If STRICT is true, scan according to HTML specification, else if false,
scan with quote smart state machine which heuristically yields the correct parse.

Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.35
retrieving revision 1.36
diff -C2 -d -r1.35 -r1.36
*** Lexer.java 7 Mar 2005 02:18:37 -0000 1.35
--- Lexer.java 12 Mar 2005 17:53:08 -0000 1.36
***************
*** 1058,1062 ****
       * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
       * B.3.2 Specifying non-HTML data</a> of the
!      * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:
       * <quote>
       * <b>Element content</b><br>
--- 1058,1062 ----
       * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
       * B.3.2 Specifying non-HTML data</a> of the
!      * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br>
       * <quote>
       * <b>Element content</b><br>
***************
*** 1074,1080 ****
--- 1074,1098 ----
          ParserException
      {
+         return (parseCDATA (false));
+     }
+
+     /**
+      * Return CDATA as a text node.
+      * Slightly less rigid than {@link #parseCDATA()} this method provides for
+      * parsing CDATA that may contain quoted strings that have embedded
+      * ETAGO ("</") delimiters and skips single and multiline comments.
+      * @param quotesmart If <code>true</code> the strict definition of CDATA is
+      * extended to allow for single or double quoted ETAGO ("</") sequences.
+      * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none.
+      * @see #parseCDATA()
+      */
+     public Node parseCDATA (boolean quotesmart)
+         throws
+             ParserException
+     {
          int start;
          int state;
          boolean done;
+         char quote;
          char ch;
          int end;
***************
*** 1083,1086 ****
--- 1101,1105 ----
          state = 0;
          done = false;
+         quote = 0;
          while (!done)
          {
***************
*** 1094,1099 ****
                              done = true;
                              break;
                          case '<':
!                             state = 1;
                              break;
                          default:
--- 1113,1180 ----
                              done = true;
                              break;
+                         case '\'':
+                             if (quotesmart)
+                                 if (0 == quote)
+                                     quote = '\''; // enter quoted state
+                                 else if ('\'' == quote)
+                                     quote = 0; // exit quoted state
+                             break;
+                         case '"':
+                             if (quotesmart)
+                                 if (0 == quote)
+                                     quote = '"'; // enter quoted state
+                                 else if ('"' == quote)
+                                     quote = 0; // exit quoted state
+                             break;
+                         case '\\':
+                             if (quotesmart)
+                                 if (0 != quote)
+                                 {
+                                     ch = mPage.getCharacter (mCursor); // try to consume escaped character
+                                     if (0 == ch)
+                                         mCursor.retreat ();
+                                     else if ( (ch != '\\') && (ch != quote))
+                                         mCursor.retreat (); // unconsume char if character was not an escapable char.
+                                 }
+                             break;
+                         case '/':
+                             if (quotesmart)
+                                 if (0 == quote)
+                                 {
+                                     // handle multiline and double slash comments (with a quote)
+                                     ch = mPage.getCharacter (mCursor);
+                                     if (0 == ch)
+                                         mCursor.retreat ();
+                                     else if ('/' == ch)
+                                     {
+                                         do
+                                             ch = mPage.getCharacter (mCursor);
+                                         while ((ch != 0) && (ch != '\n'));
+                                     }
+                                     else if ('*' == ch)
+                                     {
+                                         do
+                                         {
+                                             do
+                                                 ch = mPage.getCharacter (mCursor);
+                                             while ((ch != 0) && (ch != '*'));
+                                             ch = mPage.getCharacter (mCursor);
+                                             if (ch == '*')
+                                                 mCursor.retreat ();
+                                         }
+                                         while ((ch != 0) && (ch != '/'));
+                                     }
+                                     else
+                                         mCursor.retreat ();
+                                 }
+                             break;
                          case '<':
!                             if (quotesmart)
!                             {
!                                 if (0 == quote)
!                                     state = 1;
!                             }
!                             else
!                                 state = 1;
                              break;
                          default:
|