Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25265/lexer
Modified Files:
Lexer.java
Log Message:
Fix bug #919738 Text has not been extracted correctly using StringBean
and (duplicate) bug #936392 ScriptTag visitor fails for comments with '
by handling single-line and multiline ECMAScript comments in the Lexer class
when called with quotesmart true.
Also added test cases for, but didn't fix, bug #923146 tag nesting rule
too strict for forms (org.htmlparser.tests.tagTests.InputTagTest.testTable)
and bug #922439 OutOfMemory on huge HTML files (4,7MB)
(org.htmlparser.tests.MemoryTest), which are thus currently failing.
Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** Lexer.java 18 Feb 2004 12:34:04 -0000 1.27
--- Lexer.java 22 May 2004 03:57:29 -0000 1.28
***************
*** 303,306 ****
--- 303,307 ----
break;
default:
+ probe.retreat (); // string needs to see leading foreslash
ret = parseString (probe, quotesmart);
break;
***************
*** 412,415 ****
--- 413,445 ----
else if (quotesmart && (ch == quote))
quote = 0; // exit quoted state
+ else if (quotesmart && (0 == quote) && (ch == '/'))
+ {
+ // handle multiline and double slash comments (with a quote) in script like:
+ // I can't handle single quotations.
+ ch = mPage.getCharacter (cursor);
+ if (0 == ch)
+ done = true;
+ else if ('/' == ch)
+ {
+ do
+ ch = mPage.getCharacter (cursor);
+ while ((ch != 0) && (ch != '\n'));
+ }
+ else if ('*' == ch)
+ {
+ do
+ {
+ do
+ ch = mPage.getCharacter (cursor);
+ while ((ch != 0) && (ch != '*'));
+ ch = mPage.getCharacter (cursor);
+ if (ch == '*')
+ cursor.retreat ();
+ }
+ while ((ch != 0) && (ch != '/'));
+ }
+ else
+ cursor.retreat ();
+ }
else if ((0 == quote) && ('<' == ch))
{
|