Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.41,1.42

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24770/src/org/htmlparser/lexer

Modified Files:
	Lexer.java 
Log Message:
Incorporated patch #1450095 Fix for Bug 1445309 from Trejkaz Xaoza.
Addition of code to parse XML processing instructions.

Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** Lexer.java	19 Sep 2005 02:35:05 -0000	1.41
--- Lexer.java	19 Mar 2006 15:01:25 -0000	1.42
***************
*** 288,291 ****
--- 288,296 ----
                      ret = parseJsp (start);
                  }
+                 else if ('?' == ch)
+                 {
+                     mCursor.retreat ();
+                     ret = parsePI (start);
+                 }
                  else if ('/' == ch || '%' == ch || Character.isLetter (ch))
                  {
***************
*** 470,474 ****
                  // the order of these tests might be optimized for speed:
                  else if ('/' == ch || Character.isLetter (ch)
!                     || '!' == ch || '%' == ch)
                  {
                      done = true;
--- 475,479 ----
                  // the order of these tests might be optimized for speed:
                  else if ('/' == ch || Character.isLetter (ch)
!                     || '!' == ch || '%' == ch || '?' == ch)
                  {
                      done = true;
***************
*** 1138,1141 ****
--- 1143,1271 ----
  
      /**
+      * Parse an XML processing instruction.
+      * Scan characters until "?&gt;" is encountered. If that terminator is not
+      * found, the result of <code>parseString (start, true)</code> is returned.
+      * @param start The position at which to start scanning.
+      * @return The parsed node.
+      * @exception ParserException If a problem occurs reading from the source.
+      */
+     protected Node parsePI (int start)
+         throws
+             ParserException
+     {
+         boolean done;
+         char ch;
+         int state;
+         Vector attributes;
+         int code;
+ 
+         done = false;
+         state = 0;
+         code = 0;
+         attributes = new Vector ();
+         // <?xyz?>   sample input
+         // 011112d   state entered after each character (d = done)
+         while (!done)
+         {
+             ch = mPage.getCharacter (mCursor);
+             switch (state)
+             {
+                 case 0: // prior to the question mark
+                     switch (ch)
+                     {
+                         case '?': // <?
+                             code = mCursor.getPosition (); // where the content begins
+                             attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0));
+                             state = 1;
+                             break;
+                         // case Page.EOF: // <\0
+                         // case '>': // <>
+                         default:
+                             done = true;
+                             break;
+                     }
+                     break;
+                 case 1: // prior to the closing question mark
+                     switch (ch)
+                     {
+                         case Page.EOF: // <?x\0
+                         case '>': // <?x>
+                             done = true; // ends without "?>": state stays 1, so not a normal exit
+                             break;
+                         case '\'':
+                         case '"':// <?..."
+                             state = ch; // the quote character itself is used as the state
+                             break;
+                         case '?': // <?...?
+                             state = 2;
+                             break;
+                         default:  // <?...x
+                             break;
+                     }
+                     break;
+                 case 2: // just saw a question mark, a '>' now ends the instruction
+                     switch (ch)
+                     {
+                         case Page.EOF: // <?x..?\0
+                             done = true;
+                             break;
+                         case '>':
+                             state = 3;
+                             done = true;
+                             break;
+                         default:  // <?...?x, or a further '?' as in <?...??>
+                             state = ('?' == ch) ? 2 : 1; // a repeated '?' can still be closed by '>'
+                             break;
+                     }
+                     break;
+                 case '"': // inside a double quoted string
+                     switch (ch)
+                     {
+                         case Page.EOF: // <?x.."\0
+                             done = true;
+                             break;
+                         case '"':
+                             state = 1;
+                             break;
+                         default:  // <?..."..x
+                             break;
+                     }
+                     break;
+                 case '\'': // inside a single quoted string
+                     switch (ch)
+                     {
+                         case Page.EOF: // <?x..'\0
+                             done = true;
+                             break;
+                         case '\'':
+                             state = 1;
+                             break;
+                         default:  // <?...'.x
+                             break;
+                     }
+                     break;
+                 default:
+                     throw new IllegalStateException ("how the fuck did we get in state " + state);
+             }
+         }
+ 
+         if (3 == state) // normal exit
+         {
+             if (0 != code)
+             {
+                 state = mCursor.getPosition () - 2; // reuse state: two back from the cursor, the closing '?'
+                 attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0));
+                 attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0));
+             }
+             else
+                 throw new IllegalStateException ("processing instruction with no content");
+         }
+         else
+             return (parseString (start, true)); // no "?>" found: delegate to parseString (hmmm, true?)
+ 
+         return (makeTag (start, mCursor.getPosition (), attributes));
+     }
+ 
+     /**
       * Return CDATA as a text node.
       * According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">

Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.41,1.42

htmlparser-cvs