Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.41,1.42
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2006-03-19 15:01:28
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24770/src/org/htmlparser/lexer

Modified Files:
    Lexer.java
Log Message:
Incorporated patch #1450095 Fix for Bug 1445309 from Trejkaz Xaoza.
Addition of code to parse XML processing instructions.

Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** Lexer.java 19 Sep 2005 02:35:05 -0000 1.41
--- Lexer.java 19 Mar 2006 15:01:25 -0000 1.42
***************
*** 288,291 ****
--- 288,296 ----
                      ret = parseJsp (start);
                  }
+                 else if ('?' == ch)
+                 {
+                     mCursor.retreat ();
+                     ret = parsePI (start);
+                 }
                  else if ('/' == ch || '%' == ch || Character.isLetter (ch))
                  {
***************
*** 470,474 ****
                      // the order of these tests might be optimized for speed:
                      else if ('/' == ch || Character.isLetter (ch)
!                         || '!' == ch || '%' == ch)
                      {
                          done = true;
--- 475,479 ----
                      // the order of these tests might be optimized for speed:
                      else if ('/' == ch || Character.isLetter (ch)
!                         || '!' == ch || '%' == ch || '?' == ch)
                      {
                          done = true;
***************
*** 1138,1141 ****
--- 1143,1271 ----
  
      /**
+      * Parse an XML processing instruction.
+      * Scan characters until "?>" is encountered, or the input stream is
+      * exhausted, in which case <code>null</code> is returned.
+      * @param start The position at which to start scanning.
+      * @return The parsed node.
+      * @exception ParserException If a problem occurs reading from the source.
+      */
+     protected Node parsePI (int start)
+         throws
+             ParserException
+     {
+         boolean done;
+         char ch;
+         int state;
+         Vector attributes;
+         int code;
+
+         done = false;
+         state = 0;
+         code = 0;
+         attributes = new Vector ();
+         // <?xyz?>
+         // 011112d
+         while (!done)
+         {
+             ch = mPage.getCharacter (mCursor);
+             switch (state)
+             {
+                 case 0: // prior to the question mark
+                     switch (ch)
+                     {
+                         case '?': // <?
+                             code = mCursor.getPosition ();
+                             attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0));
+                             state = 1;
+                             break;
+                         // case Page.EOF: // <\0
+                         // case '>': // <>
+                         default:
+                             done = true;
+                             break;
+                     }
+                     break;
+                 case 1: // prior to the closing question mark
+                     switch (ch)
+                     {
+                         case Page.EOF: // <?x\0
+                         case '>': // <?x>
+                             done = true;
+                             break;
+                         case '\'':
+                         case '"':// <?..."
+                             state = ch;
+                             break;
+                         case '?': // <?...?
+                             state = 2;
+                             break;
+                         default: // <?...x
+                             break;
+                     }
+                     break;
+                 case 2:
+                     switch (ch)
+                     {
+                         case Page.EOF: // <?x..?\0
+                             done = true;
+                             break;
+                         case '>':
+                             state = 3;
+                             done = true;
+                             break;
+                         default: // <?...?x
+                             state = 1;
+                             break;
+                     }
+                     break;
+                 case '"':
+                     switch (ch)
+                     {
+                         case Page.EOF: // <?x.."\0
+                             done = true;
+                             break;
+                         case '"':
+                             state = 1;
+                             break;
+                         default: // <?...'.x
+                             break;
+                     }
+                     break;
+                 case '\'':
+                     switch (ch)
+                     {
+                         case Page.EOF: // <?x..'\0
+                             done = true;
+                             break;
+                         case '\'':
+                             state = 1;
+                             break;
+                         default: // <?..."..x
+                             break;
+                     }
+                     break;
+                 default:
+                     throw new IllegalStateException ("how the fuck did we get in state " + state);
+             }
+         }
+
+         if (3 == state) // normal exit
+         {
+             if (0 != code)
+             {
+                 state = mCursor.getPosition () - 2; // reuse state
+                 attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0));
+                 attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0));
+             }
+             else
+                 throw new IllegalStateException ("processing instruction with no content");
+         }
+         else
+             return (parseString (start, true)); // hmmm, true?
+
+         return (makeTag (start, mCursor.getPosition (), attributes));
+     }
+
+     /**
       * Return CDATA as a text node.
       * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
|