Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1:/tmp/cvs-serv31101/lexer
Modified Files:
Lexer.java
Log Message:
Fix bug #789439 Japanese page causes OutOfMemory Exception
Modified the lexer to skip over JIS escape sequences.
Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** Lexer.java 8 Dec 2003 01:31:51 -0000 1.22
--- Lexer.java 31 Dec 2003 14:40:49 -0000 1.23
***************
*** 312,315 ****
--- 312,359 ----
/**
+  * Advance the cursor through a run of JIS (ISO-2022-JP) encoded text.
+  * Characters are read from the page and discarded until an escape
+  * sequence that switches back to a single-byte character set has been
+  * consumed, i.e. <code>ESC ( J</code> (JIS X 0201 Roman) or
+  * <code>ESC ( B</code> (ASCII), or until the page hands back a zero
+  * character, which this method treats as the end of the input.
+  * On a successful match the cursor is left just past the final
+  * character of the terminating sequence.
+  * @param cursor A cursor positioned within the escape sequence.
+  * @exception ParserException Propagated from
+  * <code>mPage.getCharacter()</code> while fetching characters.
+  */
+ protected void scanJIS (Cursor cursor)
+     throws
+         ParserException
+ {
+     boolean done;
+     char ch;
+     int state;
+
+     done = false;
+     // state 0: looking for ESC, 1: ESC seen, 2: ESC ( seen
+     state = 0;
+     while (!done)
+     {
+         ch = mPage.getCharacter (cursor);
+         if (0 == ch)
+             done = true; // input exhausted before a terminating sequence
+         else
+             switch (state)
+             {
+                 case 0:
+                     if (0x1b == ch) // escape
+                         state = 1;
+                     break;
+                 case 1:
+                     if ('(' == ch)
+                         state = 2;
+                     else if (0x1b != ch) // a repeated escape stays in state 1
+                         state = 0;
+                     break;
+                 case 2:
+                     // both designators leave two-byte mode
+                     if (('J' == ch) || ('B' == ch))
+                         done = true;
+                     else if (0x1b == ch) // a fresh escape restarts the match
+                         state = 1;
+                     else
+                         state = 0;
+                     break;
+                 default:
+                     // unreachable unless a new state is added without a case
+                     throw new IllegalStateException ("unexpected JIS scanner state " + state);
+             }
+     }
+ }
+
+ /**
* Parse a string node.
* Scan characters until "</", "<%", "<!" or < followed by a
***************
*** 325,331 ****
boolean done;
char ch;
- int length;
- int begin;
- int end;
char quote;
Node ret;
--- 369,372 ----
***************
*** 338,341 ****
--- 379,403 ----
if (0 == ch)
done = true;
+ else if (0x1b == ch) // escape
+ {
+ ch = mPage.getCharacter (cursor);
+ if (0 == ch)
+ done = true;
+ else if ('$' == ch)
+ {
+ ch = mPage.getCharacter (cursor);
+ if (0 == ch)
+ done = true;
+ else if ('B' == ch)
+ scanJIS (cursor);
+ else
+ {
+ cursor.retreat ();
+ cursor.retreat ();
+ }
+ }
+ else
+ cursor.retreat ();
+ }
else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch)))
quote = ch; // enter quoted state
|