[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,1.46,1.47 Page.java,1.55,1.56

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv6402/lexer

Modified Files:
	Lexer.java Page.java 
Log Message:
Fix bug #1493884: Lexer returns a TagNode with a 'null' name.
Use a more careful cursor retreat - Page.ungetCharacter().

Index: Page.java
===================================================================
RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v
retrieving revision 1.55
retrieving revision 1.56
diff -C2 -d -r1.55 -r1.56
*** Page.java	10 Apr 2006 21:38:41 -0000	1.55
--- Page.java	27 May 2006 17:06:28 -0000	1.56
***************
*** 680,684 ****
       * current source position.
       * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n,
!      * and updates the end-of-line index accordingly
       * Advances the cursor position by one (or two in the \r\n case).
       * @param cursor The position to read at.
--- 680,684 ----
       * current source position.
       * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n,
!      * and updates the end-of-line index accordingly.
       * Advances the cursor position by one (or two in the \r\n case).
       * @param cursor The position to read at.
***************
*** 686,690 ****
       * prepare for the next read. If the source is exhausted a zero is returned.
       * @exception ParserException If an IOException on the underlying source
!      * occurs, or an attemp is made to read characters in the future (the
       * cursor position is ahead of the underlying stream)
       */
--- 686,690 ----
       * prepare for the next read. If the source is exhausted a zero is returned.
       * @exception ParserException If an IOException on the underlying source
!      * occurs, or an attempt is made to read characters in the future (the
       * cursor position is ahead of the underlying stream)
       */
***************
*** 793,796 ****
--- 793,832 ----
  
      /**
+      * Unread a character, i.e. back the cursor up over it (nothing is returned).
+      * Handles end of lines (EOL) specially, retreating the cursor twice for
+      * the '\r\n' case.
+      * The cursor position is moved back by one (or two in the \r\n case).
+      * @param cursor The position to 'unread' at.
+      * @exception ParserException If an IOException on the underlying source
+      * occurs.
+      */
+     public void ungetCharacter (Cursor cursor)
+         throws
+             ParserException
+     {
+         int i;
+         char ch;
+ 
+         cursor.retreat ();
+         i = cursor.getPosition ();
+         try
+         {
+             ch = mSource.getCharacter (i);
+             if (('\n' == ch) && (0 != i))
+             {
+                 ch = mSource.getCharacter (i - 1);
+                 if ('\r' == ch)
+                     cursor.retreat ();
+             }
+         }
+         catch (IOException ioe)
+         {
+             throw new ParserException (
+                 "can't read a character at position "
+                 + cursor.getPosition (), ioe);
+         }
+     }
+ 
+     /**
       * Get the current encoding being used.
       * @return The encoding used to convert characters.

Index: Lexer.java
===================================================================
RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.46
retrieving revision 1.47
diff -C2 -d -r1.46 -r1.47
*** Lexer.java	27 May 2006 14:02:27 -0000	1.46
--- Lexer.java	27 May 2006 17:06:28 -0000	1.47
***************
*** 356,370 ****
                  else if ('%' == ch)
                  {
!                     mCursor.retreat ();
                      ret = parseJsp (start);
                  }
                  else if ('?' == ch)
                  {
!                     mCursor.retreat ();
                      ret = parsePI (start);
                  }
                  else if ('/' == ch || '%' == ch || Character.isLetter (ch))
                  {
!                     mCursor.retreat ();
                      ret = parseTag (start);
                  }
--- 356,370 ----
                  else if ('%' == ch)
                  {
!                     mPage.ungetCharacter (mCursor);
                      ret = parseJsp (start);
                  }
                  else if ('?' == ch)
                  {
!                     mPage.ungetCharacter (mCursor);
                      ret = parsePI (start);
                  }
                  else if ('/' == ch || '%' == ch || Character.isLetter (ch))
                  {
!                     mPage.ungetCharacter (mCursor);
                      ret = parseTag (start);
                  }
***************
*** 380,389 ****
                          else
                          {
!                             mCursor.retreat (); // remark/tag need this char
                              if ('-' == ch)
                                  ret = parseRemark (start, quotesmart);
                              else
                              {
!                                 mCursor.retreat (); // tag needs prior one too
                                  ret = parseTag (start);
                              }
--- 380,389 ----
                          else
                          {
!                             mPage.ungetCharacter (mCursor); // remark/tag need this char
                              if ('-' == ch)
                                  ret = parseRemark (start, quotesmart);
                              else
                              {
!                                 mPage.ungetCharacter (mCursor); // tag needs prior one too
                                  ret = parseTag (start);
                              }
***************
*** 395,399 ****
                  break;
              default:
!                 mCursor.retreat (); // string needs to see leading foreslash
                  ret = parseString (start, quotesmart);
                  break;
--- 395,399 ----
                  break;
              default:
!                 mPage.ungetCharacter (mCursor); // string needs to see leading foreslash
                  ret = parseString (start, quotesmart);
                  break;
***************
*** 489,493 ****
                                          done = true;
                                      else if (  (ch != '\\') && (ch != quote))
!                                         mCursor.retreat (); // unconsume char if character was not an escapable char.
                                  }
                              break;
--- 489,494 ----
                                          done = true;
                                      else if (  (ch != '\\') && (ch != quote))
!                                         // unconsume char if character was not an escapable char.
!                                         mPage.ungetCharacter (mCursor);
                                  }
                              break;
***************
*** 511,520 ****
                                              ch = mPage.getCharacter (mCursor);
                                              if (ch == '*')
!                                                 mCursor.retreat ();
                                          }
                                          while ((Page.EOF != ch) && ('/' != ch));
                                      }
                                      else
!                                         mCursor.retreat ();
                                  }
                              break;
--- 512,521 ----
                                              ch = mPage.getCharacter (mCursor);
                                              if (ch == '*')
!                                                 mPage.ungetCharacter (mCursor);
                                          }
                                          while ((Page.EOF != ch) && ('/' != ch));
                                      }
                                      else
!                                         mPage.ungetCharacter (mCursor);
                                  }
                              break;
***************
*** 574,580 ****
                          done = true;
                          // back up to the start of ETAGO
!                         mCursor.retreat ();
!                         mCursor.retreat ();
!                         mCursor.retreat ();
                      }
                      else
--- 575,581 ----
                          done = true;
                          // back up to the start of ETAGO
!                         mPage.ungetCharacter (mCursor);
!                         mPage.ungetCharacter (mCursor);
!                         mPage.ungetCharacter (mCursor);
                      }
                      else
***************
*** 599,608 ****
                              else
                              {
!                                 mCursor.retreat ();
!                                 mCursor.retreat ();
                              }
                          }
                          else
!                             mCursor.retreat ();
                      }
                      break;
--- 600,609 ----
                              else
                              {
!                                 mPage.ungetCharacter (mCursor);
!                                 mPage.ungetCharacter (mCursor);
                              }
                          }
                          else
!                             mPage.ungetCharacter (mCursor);
                      }
                      break;
***************
*** 749,758 ****
                      else
                      {
!                         mCursor.retreat ();
!                         mCursor.retreat ();
                      }
                  }
                  else
!                     mCursor.retreat ();
              }
              else if (quotesmart && (0 == quote)
--- 750,759 ----
                      else
                      {
!                         mPage.ungetCharacter (mCursor);
!                         mPage.ungetCharacter (mCursor);
                      }
                  }
                  else
!                     mPage.ungetCharacter (mCursor);
              }
              else if (quotesmart && (0 == quote)
***************
*** 767,771 ****
                      && (ch != quote)) // escaped quote character
                         // ( reflects ["] or [']  whichever opened the quotation)
!                     mCursor.retreat(); // unconsume char if char not an escape
              }
              else if (quotesmart && (ch == quote))
--- 768,772 ----
                      && (ch != quote)) // escaped quote character
                         // ( reflects ["] or [']  whichever opened the quotation)
!                     mPage.ungetCharacter (mCursor); // unconsume char if char not an escape
              }
              else if (quotesmart && (ch == quote))
***************
*** 794,803 ****
                          ch = mPage.getCharacter (mCursor);
                          if (ch == '*')
!                             mCursor.retreat ();
                      }
                      while ((Page.EOF != ch) && ('/' != ch));
                  }
                  else
!                     mCursor.retreat ();
              }
              else if ((0 == quote) && ('<' == ch))
--- 795,804 ----
                          ch = mPage.getCharacter (mCursor);
                          if (ch == '*')
!                             mPage.ungetCharacter (mCursor);
                      }
                      while ((Page.EOF != ch) && ('/' != ch));
                  }
                  else
!                     mPage.ungetCharacter (mCursor);
              }
              else if ((0 == quote) && ('<' == ch))
***************
*** 811,821 ****
                  {
                      done = true;
!                     mCursor.retreat ();
!                     mCursor.retreat ();
                  }
                  else
                  {
                      // it's not a tag, so keep going, but check for quotes
!                     mCursor.retreat ();
                  }
              }
--- 812,822 ----
                  {
                      done = true;
!                     mPage.ungetCharacter (mCursor);
!                     mPage.ungetCharacter (mCursor);
                  }
                  else
                  {
                      // it's not a tag, so keep going, but check for quotes
!                     mPage.ungetCharacter (mCursor);
                  }
              }
***************
*** 1013,1017 ****
                          {
                              // don't consume the opening angle
!                             mCursor.retreat ();
                              bookmarks[state + 1] = mCursor.getPosition ();
                          }
--- 1014,1018 ----
                          {
                              // don't consume the opening angle
!                             mPage.ungetCharacter (mCursor);
                              bookmarks[state + 1] = mCursor.getPosition ();
                          }
***************
*** 1031,1035 ****
                          {
                              // don't consume the opening angle
!                             mCursor.retreat ();
                              bookmarks[state + 1] = mCursor.getPosition ();
                          }
--- 1032,1036 ----
                          {
                              // don't consume the opening angle
!                             mPage.ungetCharacter (mCursor);
                              bookmarks[state + 1] = mCursor.getPosition ();
                          }
***************
*** 1121,1125 ****
                          standalone (attributes, bookmarks);
                    	    bookmarks[0]=bookmarks[6];
!                   	    mCursor.retreat();
                    	    state=0;
                      }
--- 1122,1126 ----
                          standalone (attributes, bookmarks);
                    	    bookmarks[0]=bookmarks[6];
!                   	    mPage.ungetCharacter (mCursor);
                    	    state=0;
                      }
***************
*** 1143,1147 ****
                    	    standalone (attributes, bookmarks);
                    	    bookmarks[0]=bookmarks[6];
!                   	    mCursor.retreat();
                    	    state=0;
                     	}
--- 1144,1148 ----
                    	    standalone (attributes, bookmarks);
                    	    bookmarks[0]=bookmarks[6];
!                   	    mPage.ungetCharacter (mCursor);
                    	    state=0;
                     	}
***************
*** 1263,1267 ****
                              else
                              {
!                                 mCursor.retreat ();
                                  state = 2;
                              }                        
--- 1264,1268 ----
                              else
                              {
!                                 mPage.ungetCharacter (mCursor);
                                  state = 2;
                              }                        
***************
*** 1442,1453 ****
                                      ch = mPage.getCharacter (mCursor);
                                      if (ch == '*')
!                                         mCursor.retreat ();
                                  }
                                  while ((Page.EOF != ch) && ('/' != ch));
                              }
                              else
!                             {
!                                 mCursor.retreat ();
!                             }
                              break;
                          default:  // <%???x
--- 1443,1452 ----
                                      ch = mPage.getCharacter (mCursor);
                                      if (ch == '*')
!                                         mPage.ungetCharacter (mCursor);
                                  }
                                  while ((Page.EOF != ch) && ('/' != ch));
                              }
                              else
!                                 mPage.ungetCharacter (mCursor);
                              break;
                          default:  // <%???x