[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.22,1.23
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-12-31 14:40:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1:/tmp/cvs-serv31101/lexer

Modified Files:
    Lexer.java
Log Message:
Fix bug #789439 Japanese page causes OutOfMemory Exception
Modified the lexer to skip over JIS escape sequences.

Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** Lexer.java 8 Dec 2003 01:31:51 -0000 1.22
--- Lexer.java 31 Dec 2003 14:40:49 -0000 1.23
***************
*** 312,315 ****
--- 312,359 ----

      /**
+      * Advance the cursor through a JIS escape sequence.
+      * @param cursor A cursor positioned within the escape sequence.
+      */
+     protected void scanJIS (Cursor cursor)
+         throws
+             ParserException
+     {
+         boolean done;
+         char ch;
+         int state;
+
+         done = false;
+         state = 0;
+         while (!done)
+         {
+             ch = mPage.getCharacter (cursor);
+             if (0 == ch)
+                 done = true;
+             else
+                 switch (state)
+                 {
+                     case 0:
+                         if (0x1b == ch) // escape
+                             state = 1;
+                         break;
+                     case 1:
+                         if ('(' == ch)
+                             state = 2;
+                         else
+                             state = 0;
+                         break;
+                     case 2:
+                         if ('J' == ch)
+                             done = true;
+                         else
+                             state = 0;
+                         break;
+                     default:
+                         throw new IllegalStateException ("how the fuck did we get in state " + state);
+                 }
+         }
+     }
+
+     /**
       * Parse a string node.
       * Scan characters until "</", "<%", "<!" or < followed by a
***************
*** 325,331 ****
          boolean done;
          char ch;
-         int length;
-         int begin;
-         int end;
          char quote;
          Node ret;
--- 369,372 ----
***************
*** 338,341 ****
--- 379,403 ----
              if (0 == ch)
                  done = true;
+             else if (0x1b == ch) // escape
+             {
+                 ch = mPage.getCharacter (cursor);
+                 if (0 == ch)
+                     done = true;
+                 else if ('$' == ch)
+                 {
+                     ch = mPage.getCharacter (cursor);
+                     if (0 == ch)
+                         done = true;
+                     else if ('B' == ch)
+                         scanJIS (cursor);
+                     else
+                     {
+                         cursor.retreat ();
+                         cursor.retreat ();
+                     }
+                 }
+                 else
+                     cursor.retreat ();
+             }
              else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch)))
                  quote = ch; // enter quoted state
|