[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.46,1.47 Page.java,1.55,1.56
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2006-05-27 17:06:37
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv6402/lexer Modified Files: Lexer.java Page.java Log Message: fix bug #1493884 Lexer returns a TagNode with a 'null' name Use a more careful cursor retreat - Page.ungetCharacter(). Index: Page.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.55 retrieving revision 1.56 diff -C2 -d -r1.55 -r1.56 *** Page.java 10 Apr 2006 21:38:41 -0000 1.55 --- Page.java 27 May 2006 17:06:28 -0000 1.56 *************** *** 680,684 **** * current source position. * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n, ! * and updates the end-of-line index accordingly * Advances the cursor position by one (or two in the \r\n case). * @param cursor The position to read at. --- 680,684 ---- * current source position. * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n, ! * and updates the end-of-line index accordingly. * Advances the cursor position by one (or two in the \r\n case). * @param cursor The position to read at. *************** *** 686,690 **** * prepare for the next read. If the source is exhausted a zero is returned. * @exception ParserException If an IOException on the underlying source ! * occurs, or an attemp is made to read characters in the future (the * cursor position is ahead of the underlying stream) */ --- 686,690 ---- * prepare for the next read. If the source is exhausted a zero is returned. * @exception ParserException If an IOException on the underlying source ! * occurs, or an attempt is made to read characters in the future (the * cursor position is ahead of the underlying stream) */ *************** *** 793,796 **** --- 793,832 ---- /** + * Return a character. + * Handles end of lines (EOL) specially, retreating the cursor twice for + * the '\r\n' case. + * The cursor position is moved back by one (or two in the \r\n case). + * @param cursor The position to 'unread' at. + * @exception ParserException If an IOException on the underlying source + * occurs. + */ + public void ungetCharacter (Cursor cursor) + throws + ParserException + { + int i; + char ch; + + cursor.retreat (); + i = cursor.getPosition (); + try + { + ch = mSource.getCharacter (i); + if (('\n' == ch) && (0 != i)) + { + ch = mSource.getCharacter (i - 1); + if ('\r' == ch) + cursor.retreat (); + } + } + catch (IOException ioe) + { + throw new ParserException ( + "can't read a character at position " + + cursor.getPosition (), ioe); + } + } + + /** * Get the current encoding being used. * @return The encoding used to convert characters. Index: Lexer.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** Lexer.java 27 May 2006 14:02:27 -0000 1.46 --- Lexer.java 27 May 2006 17:06:28 -0000 1.47 *************** *** 356,370 **** else if ('%' == ch) { ! mCursor.retreat (); ret = parseJsp (start); } else if ('?' == ch) { ! mCursor.retreat (); ret = parsePI (start); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { ! mCursor.retreat (); ret = parseTag (start); } --- 356,370 ---- else if ('%' == ch) { ! mPage.ungetCharacter (mCursor); ret = parseJsp (start); } else if ('?' == ch) { ! mPage.ungetCharacter (mCursor); ret = parsePI (start); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { ! mPage.ungetCharacter (mCursor); ret = parseTag (start); } *************** *** 380,389 **** else { ! mCursor.retreat (); // remark/tag need this char if ('-' == ch) ret = parseRemark (start, quotesmart); else { ! mCursor.retreat (); // tag needs prior one too ret = parseTag (start); } --- 380,389 ---- else { ! mPage.ungetCharacter (mCursor); // remark/tag need this char if ('-' == ch) ret = parseRemark (start, quotesmart); else { ! mPage.ungetCharacter (mCursor); // tag needs prior one too ret = parseTag (start); } *************** *** 395,399 **** break; default: ! mCursor.retreat (); // string needs to see leading foreslash ret = parseString (start, quotesmart); break; --- 395,399 ---- break; default: ! mPage.ungetCharacter (mCursor); // string needs to see leading foreslash ret = parseString (start, quotesmart); break; *************** *** 489,493 **** done = true; else if ( (ch != '\\') && (ch != quote)) ! mCursor.retreat (); // unconsume char if character was not an escapable char. } break; --- 489,494 ---- done = true; else if ( (ch != '\\') && (ch != quote)) ! // unconsume char if character was not an escapable char. ! mPage.ungetCharacter (mCursor); } break; *************** *** 511,520 **** ch = mPage.getCharacter (mCursor); if (ch == '*') ! mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mCursor.retreat (); } break; --- 512,521 ---- ch = mPage.getCharacter (mCursor); if (ch == '*') ! mPage.ungetCharacter (mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mPage.ungetCharacter (mCursor); } break; *************** *** 574,580 **** done = true; // back up to the start of ETAGO ! mCursor.retreat (); ! mCursor.retreat (); ! mCursor.retreat (); } else --- 575,581 ---- done = true; // back up to the start of ETAGO ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); } else *************** *** 599,608 **** else { ! mCursor.retreat (); ! mCursor.retreat (); } } else ! mCursor.retreat (); } break; --- 600,609 ---- else { ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); } } else ! mPage.ungetCharacter (mCursor); } break; *************** *** 749,758 **** else { ! mCursor.retreat (); ! mCursor.retreat (); } } else ! mCursor.retreat (); } else if (quotesmart && (0 == quote) --- 750,759 ---- else { ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); } } else ! mPage.ungetCharacter (mCursor); } else if (quotesmart && (0 == quote) *************** *** 767,771 **** && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) ! mCursor.retreat(); // unconsume char if char not an escape } else if (quotesmart && (ch == quote)) --- 768,772 ---- && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) ! mPage.ungetCharacter (mCursor); // unconsume char if char not an escape } else if (quotesmart && (ch == quote)) *************** *** 794,803 **** ch = mPage.getCharacter (mCursor); if (ch == '*') ! mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mCursor.retreat (); } else if ((0 == quote) && ('<' == ch)) --- 795,804 ---- ch = mPage.getCharacter (mCursor); if (ch == '*') ! mPage.ungetCharacter (mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mPage.ungetCharacter (mCursor); } else if ((0 == quote) && ('<' == ch)) *************** *** 811,821 **** { done = true; ! mCursor.retreat (); ! mCursor.retreat (); } else { // it's not a tag, so keep going, but check for quotes ! mCursor.retreat (); } } --- 812,822 ---- { done = true; ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); } else { // it's not a tag, so keep going, but check for quotes ! mPage.ungetCharacter (mCursor); } } *************** *** 1013,1017 **** { // don't consume the opening angle ! mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } --- 1014,1018 ---- { // don't consume the opening angle ! mPage.ungetCharacter (mCursor); bookmarks[state + 1] = mCursor.getPosition (); } *************** *** 1031,1035 **** { // don't consume the opening angle ! mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } --- 1032,1036 ---- { // don't consume the opening angle ! mPage.ungetCharacter (mCursor); bookmarks[state + 1] = mCursor.getPosition (); } *************** *** 1121,1125 **** standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mCursor.retreat(); state=0; } --- 1122,1126 ---- standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mPage.ungetCharacter (mCursor); state=0; } *************** *** 1143,1147 **** standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mCursor.retreat(); state=0; } --- 1144,1148 ---- standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mPage.ungetCharacter (mCursor); state=0; } *************** *** 1263,1267 **** else { ! mCursor.retreat (); state = 2; } --- 1264,1268 ---- else { ! mPage.ungetCharacter (mCursor); state = 2; } *************** *** 1442,1453 **** ch = mPage.getCharacter (mCursor); if (ch == '*') ! mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else ! { ! mCursor.retreat (); ! } break; default: // <%???x --- 1443,1452 ---- ch = mPage.getCharacter (mCursor); if (ch == '*') ! mPage.ungetCharacter (mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mPage.ungetCharacter (mCursor); break; default: // <%???x |