[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.15,1.16
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-10-25 15:56:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1:/tmp/cvs-serv3232/lexer

Modified Files:
    Lexer.java
Log Message:
Handle some broken end tags.
Handle some pathological remark nodes.

Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** Lexer.java 20 Oct 2003 01:28:02 -0000 1.15
--- Lexer.java 25 Oct 2003 15:46:02 -0000 1.16
***************
*** 270,280 ****
      else
      {
!         probe.retreat (); // remark and tag need this character
!         if ('-' == ch)
!             ret = parseRemark (probe, quotesmart);
          else
          {
!             probe.retreat (); // tag needs the previous one too
!             ret = parseTag (probe);
          }
      }
--- 270,285 ----
      else
      {
!         if ('>' == ch) // handle <!>
!             ret = makeRemark (probe);
          else
          {
!             probe.retreat (); // remark and tag need this character
!             if ('-' == ch)
!                 ret = parseRemark (probe, quotesmart);
!             else
!             {
!                 probe.retreat (); // tag needs the previous one too
!                 ret = parseTag (probe);
!             }
          }
      }
***************
*** 483,488 ****
      {
          case 0: // outside of any attribute
!             if ((0 == ch) || ('>' == ch))
              {
                  whitespace (attributes, bookmarks);
                  done = true;
--- 488,499 ----
      {
          case 0: // outside of any attribute
!             if ((0 == ch) || ('>' == ch) || ('<' == ch))
              {
+                 if ('<' == ch)
+                 {
+                     // don't consume the opening angle
+                     cursor.retreat ();
+                     bookmarks[state + 1] = cursor.getPosition ();
+                 }
                  whitespace (attributes, bookmarks);
                  done = true;
***************
*** 495,500 ****
              break;
          case 1: // within attribute name
!             if ((0 == ch) || ('>' == ch))
              {
                  standalone (attributes, bookmarks);
                  done = true;
--- 506,517 ----
              break;
          case 1: // within attribute name
!             if ((0 == ch) || ('>' == ch) || ('<' == ch))
              {
+                 if ('<' == ch)
+                 {
+                     // don't consume the opening angle
+                     cursor.retreat ();
+                     bookmarks[state + 1] = cursor.getPosition ();
+                 }
                  standalone (attributes, bookmarks);
                  done = true;
***************
*** 797,839 ****
      {
          ch = mPage.getCharacter (cursor);
!         switch (state)
!         {
!             case 0: // prior to the first open delimiter
!                 if ('-' == ch)
!                     state = 1;
!                 else
!                     return (parseString (cursor, quotesmart));
!                 break;
!             case 1: // prior to the second open delimiter
!                 if ('-' == ch)
!                     state = 2;
!                 else
!                     return (parseString (cursor, quotesmart));
!                 break;
!             case 2: // prior to the first closing delimiter
!                 if ('-' == ch)
!                     state = 3;
!                 else if (0 == ch)
!                     return (parseString (cursor, quotesmart)); // no terminator
!                 break;
!             case 3: // prior to the second closing delimiter
!                 if ('-' == ch)
!                     state = 4;
!                 else
!                     state = 2;
!                 break;
!             case 4: // prior to the terminating >
!                 if ('>' == ch)
!                     done = true;
!                 else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch))
!                 {
!                     // stay in state 4
!                 }
!                 else
!                     state = 2;
!                 break;
!             default:
!                 throw new IllegalStateException ("how the fuck did we get in state " + state);
!         }
      }
--- 814,873 ----
      {
          ch = mPage.getCharacter (cursor);
!         if (0 == ch)
!             done = true;
!         else
!             switch (state)
!             {
!                 case 0: // prior to the first open delimiter
!                     if ('>' == ch)
!                         done = true;
!                     if ('-' == ch)
!                         state = 1;
!                     else
!                         return (parseString (cursor, quotesmart));
!                     break;
!                 case 1: // prior to the second open delimiter
!                     if ('-' == ch)
!                     {
!                         // handle <!--> because netscape does
!                         ch = mPage.getCharacter (cursor);
!                         if (0 == ch)
!                             done = true;
!                         else if ('>' == ch)
!                             done = true;
!                         else
!                         {
!                             cursor.retreat ();
!                             state = 2;
!                         }
!                     }
!                     else
!                         return (parseString (cursor, quotesmart));
!                     break;
!                 case 2: // prior to the first closing delimiter
!                     if ('-' == ch)
!                         state = 3;
!                     else if (0 == ch)
!                         return (parseString (cursor, quotesmart)); // no terminator
!                     break;
!                 case 3: // prior to the second closing delimiter
!                     if ('-' == ch)
!                         state = 4;
!                     else
!                         state = 2;
!                     break;
!                 case 4: // prior to the terminating >
!                     if ('>' == ch)
!                         done = true;
!                     else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch))
!                     {
!                         // stay in state 4
!                     }
!                     else
!                         state = 2;
!                     break;
!                 default:
!                     throw new IllegalStateException ("how the fuck did we get in state " + state);
!             }
      }
|