[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.36,1.37 Page.java,1.47,1.48 Source
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2005-03-13 14:52:29
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16205/lexer Modified Files: Lexer.java Page.java Source.java Log Message: Bug #1121401 No Parsing with yahoo! By default nio.charset.CharsetDecoder replaces characters it cannot represent in the current encoding with zero, which was also the value returned by the Page when the stream reached EOF, so the Lexer mistook such a character for the end of input. This changes the Page return value to (char)Source.EOF (Source.EOF is -1, so the char value is '\uffff') when the end of stream is encountered. Index: Source.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Source.java,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** Source.java 13 Feb 2005 22:45:47 -0000 1.17 --- Source.java 13 Mar 2005 14:51:44 -0000 1.18 *************** *** 110,114 **** * @param off Offset at which to start storing characters * @param len Maximum number of characters to read ! * @return The number of characters read, or {@link #EOF} if the esource is * exhausted. * @exception IOException If an I/O error occurs. --- 110,114 ---- * @param off Offset at which to start storing characters * @param len Maximum number of characters to read ! * @return The number of characters read, or {@link #EOF} if the source is * exhausted. * @exception IOException If an I/O error occurs. *************** *** 121,125 **** * or the source is exhausted. * @param cbuf Destination buffer. ! * @return The number of characters read, or {@link #EOF} if the esource is * exhausted. * @exception IOException If an I/O error occurs. --- 121,125 ---- * or the source is exhausted. * @param cbuf Destination buffer. ! * @return The number of characters read, or {@link #EOF} if the source is * exhausted. * @exception IOException If an I/O error occurs. 
Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** Page.java 7 Mar 2005 02:18:37 -0000 1.47 --- Page.java 13 Mar 2005 14:51:43 -0000 1.48 *************** *** 69,72 **** --- 69,78 ---- /** + * Character value when the page is exhausted. + * Has a value of {@value}. + */ + public static final char EOF = (char)Source.EOF; + + /** * The URL this page is coming from. * Cached value of <code>getConnection().toExternalForm()</code> or *************** *** 647,652 **** { i = mSource.read (); ! if (0 > i) ! ret = 0; else { --- 653,658 ---- { i = mSource.read (); ! if (Source.EOF == i) ! ret = EOF; else { *************** *** 687,691 **** { i = mSource.read (); ! if (-1 == i) { // do nothing --- 693,697 ---- { i = mSource.read (); ! if (Source.EOF == i) { // do nothing Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** Lexer.java 12 Mar 2005 17:53:08 -0000 1.36 --- Lexer.java 13 Mar 2005 14:51:43 -0000 1.37 *************** *** 261,270 **** switch (ch) { ! case 0: // end of input ret = null; break; case '<': ch = mPage.getCharacter (mCursor); ! if (0 == ch) ret = makeString (start, mCursor.getPosition ()); else if ('%' == ch) --- 261,270 ---- switch (ch) { ! case Page.EOF: ret = null; break; case '<': ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) ret = makeString (start, mCursor.getPosition ()); else if ('%' == ch) *************** *** 281,285 **** { ch = mPage.getCharacter (mCursor); ! if (0 == ch) ret = makeString (start, mCursor.getPosition ()); else --- 281,285 ---- { ch = mPage.getCharacter (mCursor); ! 
if (Page.EOF == ch) ret = makeString (start, mCursor.getPosition ()); else *************** *** 329,333 **** { ch = mPage.getCharacter (cursor); ! if (0 == ch) done = true; else --- 329,333 ---- { ch = mPage.getCharacter (cursor); ! if (Page.EOF == ch) done = true; else *************** *** 377,391 **** { ch = mPage.getCharacter (mCursor); ! if (0 == ch) done = true; else if (0x1b == ch) // escape { ch = mPage.getCharacter (mCursor); ! if (0 == ch) done = true; else if ('$' == ch) { ch = mPage.getCharacter (mCursor); ! if (0 == ch) done = true; else if ('B' == ch) --- 377,391 ---- { ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) done = true; else if (0x1b == ch) // escape { ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) done = true; else if ('$' == ch) { ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) done = true; else if ('B' == ch) *************** *** 406,410 **** { ch = mPage.getCharacter (mCursor); //try to consume escaped character ! if ( (ch != '\\') // escaped backslash && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) --- 406,411 ---- { ch = mPage.getCharacter (mCursor); //try to consume escaped character ! if ((Page.EOF != ch) ! && ('\\' != ch) // escaped backslash && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) *************** *** 418,422 **** // I can't handle single quotations. ch = mPage.getCharacter (mCursor); ! if (0 == ch) done = true; else if ('/' == ch) --- 419,423 ---- // I can't handle single quotations. ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) done = true; else if ('/' == ch) *************** *** 424,428 **** do ch = mPage.getCharacter (mCursor); ! while ((ch != 0) && (ch != '\n')); } else if ('*' == ch) --- 425,429 ---- do ch = mPage.getCharacter (mCursor); ! while ((Page.EOF != ch) && ('\n' != ch)); } else if ('*' == ch) *************** *** 432,441 **** do ch = mPage.getCharacter (mCursor); ! 
while ((ch != 0) && (ch != '*')); ch = mPage.getCharacter (mCursor); if (ch == '*') mCursor.retreat (); } ! while ((ch != 0) && (ch != '/')); } else --- 433,442 ---- do ch = mPage.getCharacter (mCursor); ! while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter (mCursor); if (ch == '*') mCursor.retreat (); } ! while ((Page.EOF != ch) && ('/' != ch)); } else *************** *** 445,449 **** { ch = mPage.getCharacter (mCursor); ! if (0 == ch) done = true; // the order of these tests might be optimized for speed: --- 446,450 ---- { ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) done = true; // the order of these tests might be optimized for speed: *************** *** 600,604 **** { case 0: // outside of any attribute ! if ((0 == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) --- 601,605 ---- { case 0: // outside of any attribute ! if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) *************** *** 618,622 **** break; case 1: // within attribute name ! if ((0 == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) --- 619,623 ---- break; case 1: // within attribute name ! if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) *************** *** 640,644 **** break; case 2: // equals hit ! if ((0 == ch) || ('>' == ch)) { empty (attributes, bookmarks); --- 641,645 ---- break; case 2: // equals hit ! if ((Page.EOF == ch) || ('>' == ch)) { empty (attributes, bookmarks); *************** *** 665,669 **** break; case 3: // within naked attribute value ! if ((0 == ch) || ('>' == ch)) { naked (attributes, bookmarks); --- 666,670 ---- break; case 3: // within naked attribute value ! if ((Page.EOF == ch) || ('>' == ch)) { naked (attributes, bookmarks); *************** *** 678,682 **** break; case 4: // within single quoted attribute value ! if (0 == ch) { single_quote (attributes, bookmarks); --- 679,683 ---- break; case 4: // within single quoted attribute value ! 
if (Page.EOF == ch) { single_quote (attributes, bookmarks); *************** *** 691,695 **** break; case 5: // within double quoted attribute value ! if (0 == ch) { double_quote (attributes, bookmarks); --- 692,696 ---- break; case 5: // within double quoted attribute value ! if (Page.EOF == ch) { double_quote (attributes, bookmarks); *************** *** 708,712 **** case 6: // undecided for state 0 or 2 // we have read white spaces after an attributte name ! if (0 == ch) { // same as last else clause --- 709,713 ---- case 6: // undecided for state 0 or 2 // we have read white spaces after an attributte name ! if (Page.EOF == ch) { // same as last else clause *************** *** 824,828 **** { ch = mPage.getCharacter (mCursor); ! if (0 == ch) done = true; else --- 825,829 ---- { ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) done = true; else *************** *** 842,846 **** // handle <!--> because netscape does ch = mPage.getCharacter (mCursor); ! if (0 == ch) done = true; else if ('>' == ch) --- 843,847 ---- // handle <!--> because netscape does ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) done = true; else if ('>' == ch) *************** *** 858,862 **** if ('-' == ch) state = 3; ! else if (0 == ch) return (parseString (start, quotesmart)); // no terminator break; --- 859,863 ---- if ('-' == ch) state = 3; ! else if (Page.EOF == ch) return (parseString (start, quotesmart)); // no terminator break; *************** *** 946,950 **** state = 1; break; ! // case 0: // <\0 // case '>': // <> default: --- 947,951 ---- state = 1; break; ! // case Page.EOF: // <\0 // case '>': // <> default: *************** *** 956,960 **** switch (ch) { ! case 0: // <%\0 case '>': // <%> done = true; --- 957,961 ---- switch (ch) { ! case Page.EOF: // <%\0 case '>': // <%> done = true; *************** *** 976,980 **** switch (ch) { ! case 0: // <%x\0 case '>': // <%x> done = true; --- 977,981 ---- switch (ch) { ! 
case Page.EOF: // <%x\0 case '>': // <%x> done = true; *************** *** 994,998 **** switch (ch) { ! case 0: // <%x??%\0 done = true; break; --- 995,999 ---- switch (ch) { ! case Page.EOF: // <%x??%\0 done = true; break; *************** *** 1009,1013 **** switch (ch) { ! case 0: // <%x??"\0 done = true; break; --- 1010,1014 ---- switch (ch) { ! case Page.EOF: // <%x??"\0 done = true; break; *************** *** 1022,1026 **** switch (ch) { ! case 0: // <%x??'\0 done = true; break; --- 1023,1027 ---- switch (ch) { ! case Page.EOF: // <%x??'\0 done = true; break; *************** *** 1110,1114 **** switch (ch) { ! case 0: // end of input done = true; break; --- 1111,1115 ---- switch (ch) { ! case Page.EOF: done = true; break; *************** *** 1132,1137 **** { ch = mPage.getCharacter (mCursor); // try to consume escaped character ! if (0 == ch) ! mCursor.retreat (); else if ( (ch != '\\') && (ch != quote)) mCursor.retreat (); // unconsume char if character was not an escapable char. --- 1133,1138 ---- { ch = mPage.getCharacter (mCursor); // try to consume escaped character ! if (Page.EOF == ch) ! done = true; else if ( (ch != '\\') && (ch != quote)) mCursor.retreat (); // unconsume char if character was not an escapable char. *************** *** 1144,1154 **** // handle multiline and double slash comments (with a quote) ch = mPage.getCharacter (mCursor); ! if (0 == ch) ! mCursor.retreat (); else if ('/' == ch) { do ch = mPage.getCharacter (mCursor); ! while ((ch != 0) && (ch != '\n')); } else if ('*' == ch) --- 1145,1155 ---- // handle multiline and double slash comments (with a quote) ch = mPage.getCharacter (mCursor); ! if (Page.EOF == ch) ! done = true; else if ('/' == ch) { do ch = mPage.getCharacter (mCursor); ! while ((Page.EOF != ch) && ('\n' != ch)); } else if ('*' == ch) *************** *** 1158,1167 **** do ch = mPage.getCharacter (mCursor); ! while ((ch != 0) && (ch != '*')); ch = mPage.getCharacter (mCursor); if (ch == '*') mCursor.retreat (); } ! 
while ((ch != 0) && (ch != '/')); } else --- 1159,1168 ---- do ch = mPage.getCharacter (mCursor); ! while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter (mCursor); if (ch == '*') mCursor.retreat (); } ! while ((Page.EOF != ch) && ('/' != ch)); } else *************** *** 1185,1189 **** switch (ch) { ! case 0: // end of input done = true; break; --- 1186,1190 ---- switch (ch) { ! case Page.EOF: done = true; break; *************** *** 1197,1201 **** break; case 2: // </ ! if (0 == ch) done = true; else if (Character.isLetter (ch)) --- 1198,1202 ---- break; case 2: // </ ! if (Page.EOF == ch) done = true; else if (Character.isLetter (ch)) |