[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,NONE,1.1 Cursor.java,1.2,1.3 Page.ja

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1:/tmp/cvs-serv9123/lexer

Modified Files:
	Cursor.java Page.java PageIndex.java Source.java package.html 
Added Files:
	Lexer.java 
Log Message:
Third drop for new i/o subsystem.


--- NEW FILE: Lexer.java ---
// HTMLParser Library v1_4_20030810 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// For any questions or suggestions, you can write to me at :
// Email :so...@in...
// 
// Postal Address : 
// Somik Raha
// Extreme Programmer & Coach
// Industrial Logic Corporation
// 2583 Cedar Street, Berkeley, 
// CA 94708, USA
// Website : http://www.industriallogic.com

package org.htmlparser.lexer;

import java.io.UnsupportedEncodingException;
import java.net.URLConnection;
import java.util.Vector;

import org.htmlparser.Node;
import org.htmlparser.lexer.Stream;
import org.htmlparser.lexer.nodes.Attribute;
import org.htmlparser.lexer.nodes.RemarkNode;
import org.htmlparser.lexer.nodes.StringNode;
import org.htmlparser.lexer.nodes.TagNode;
import org.htmlparser.util.ParserException;

/**
 * This class parses the HTML stream into nodes.
 * There are three major types of nodes (lexemes):
 * <ul>
 * <li>RemarkNode</li>
 * <li>StringNode</li>
 * <li>TagNode</li>
 * </ul>
 * Each time <code>nextNode()</code> is called, another node is returned until
 * the stream is exhausted, and <code>null</code> is returned.
 */
public class Lexer
{
    /**
     * The page lexemes are retrieved from.
     */
    protected Page mPage;

    /**
     * The current position on the page.
     * Only replaced once a complete node has been recognised; the parse
     * methods work on duplicates so that a failed attempt leaves it untouched.
     */
    protected Cursor mCursor;

    /**
     * Creates a new instance of a Lexer.
     * @param page The page with HTML text.
     */
    public Lexer (Page page)
    {
        mPage = page;
        mCursor = new Cursor (page, 0);
    }

    /**
     * Creates a new instance of a Lexer.
     * @param text The text to parse.
     * @exception ParserException If a page cannot be built from the text.
     */
    public Lexer (String text) throws ParserException
    {
        this (new Page (text));
    }

    /**
     * Creates a new instance of a Lexer.
     * @param connection The url to parse.
     * @exception ParserException If a page cannot be built from the connection.
     */
    public Lexer (URLConnection connection) throws ParserException
    {
        this (new Page (connection));
    }

    /**
     * Get the page this lexer is working on.
     * @return The page that nodes are being read from.
     */
    public Page getPage ()
    {
        return (mPage);
    }

    /**
     * Get the next node from the source.
     * Looks ahead up to three characters with a scratch cursor to decide
     * which kind of lexeme starts at the current position, then delegates
     * to the matching parse method, which restarts from the current position.
     * @return A RemarkNode, StringNode or Tag, or <code>null</code> if no
     * more lexemes are present.
     * @exception ParserException If there is a problem with the underlying page.
     */
    public Node nextNode ()
        throws
            ParserException
    {
        Cursor probe;
        char ch;
        Node ret;

        // the probe is a copy, so peeking does not move mCursor
        probe = mCursor.dup ();
        ch = mPage.getCharacter (probe);
        switch (ch)
        {
            case 0: // end of input
                ret = null;
                break;
            case '<':
                ch = mPage.getCharacter (probe);
                if (0 == ch)
                    // a lone '<' at the very end is just text
                    ret = parseString ();
                else if ('/' == ch || '%' == ch || Character.isLetter (ch))
                    ret = parseTag ();
                else if ('!' == ch)
                {
                    ch = mPage.getCharacter (probe);
                    if ('-' == ch)
                        ret = parseRemark ();
                    else
                        ret = parseTag ();
                }
                else
                    ret = parseString ();
                break;
            default:
                ret = parseString ();
                break;
        }

        return (ret);
    }

    /**
     * Parse a string node.
     * Scan characters until "&lt;/", "&lt;%", "&lt;!" or &lt; followed by a
     * letter is encountered, or the input stream is exhausted.
     * A &lt; that is followed by anything else is treated as ordinary text.
     * @return The string node covering the characters scanned, or
     * <code>null</code> if no characters could be consumed, i.e. the current
     * position is the start of a tag or the input is exhausted.
     * @exception ParserException If there is a problem with the underlying page.
     */
    protected Node parseString ()
        throws
            ParserException
    {
        Cursor cursor;
        boolean done;
        char ch;
        int length;
        StringNode ret;

        cursor = mCursor.dup ();
        done = false;
        while (!done)
        {
            ch = mPage.getCharacter (cursor);
            if (0 == ch)
                done = true;
            else if ('<' == ch)
            {
                ch = mPage.getCharacter (cursor);
                if (0 == ch)
                    done = true;
                // the order of these tests might be optimized for speed:
                else if ('/' == ch || '%' == ch || Character.isLetter (ch) || '!' == ch)
                {
                    // a tag starts here, back up over both lookahead characters
                    done = true;
                    cursor.retreat ();
                    cursor.retreat ();
                }
                else if ('<' == ch)
                    // the first '<' is plain text, but the second one may
                    // itself open a tag (e.g. "<<b>"), so examine it again
                    cursor.retreat ();
                else
                {
                    // it's not a tag, so keep going,
                    // the extra characters consumed are in this string
                }
            }
        }
        length = cursor.getPosition () - mCursor.getPosition ();
        if (0 != length)
        {   // got some characters
            ret = new StringNode (mPage, mCursor.getPosition (), cursor.getPosition ());
            mCursor = cursor;
        }
        else
            ret = null;

        return (ret);
    }

    /**
     * Record the text between bookmarks 0 and 1 (the run seen while outside
     * any attribute) as an attribute whose first constructor argument is
     * <code>null</code>. Nothing is added if the run is empty.
     * @param attributes The list being built for the tag.
     * @param bookmarks The state start positions, see {@link #parseTag}.
     */
    private void whitespace (Vector attributes, int[] bookmarks)
    {
        if (bookmarks[1] > bookmarks[0])
            attributes.addElement (new Attribute (null, mPage.getText (bookmarks[0], bookmarks[1]), (char)0));
    }

    /**
     * Record an attribute that has a name (bookmarks 1 to 2) but no equals
     * sign; the second constructor argument is <code>null</code>.
     * @param attributes The list being built for the tag.
     * @param bookmarks The state start positions, see {@link #parseTag}.
     */
    private void standalone (Vector attributes, int[] bookmarks)
    {
        attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), null, (char)0));
    }

    /**
     * Record an attribute that has a name (bookmarks 1 to 2) and an equals
     * sign but nothing after it; the second constructor argument is the
     * empty string.
     * @param attributes The list being built for the tag.
     * @param bookmarks The state start positions, see {@link #parseTag}.
     */
    private void empty (Vector attributes, int[] bookmarks)
    {
        attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), "", (char)0));
    }

    /**
     * Record an attribute with a name (bookmarks 1 to 2) and an unquoted
     * value (bookmarks 3 to 4).
     * @param attributes The list being built for the tag.
     * @param bookmarks The state start positions, see {@link #parseTag}.
     */
    private void naked (Vector attributes, int[] bookmarks)
    {
        attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[3], bookmarks[4]), (char)0));
    }

    /**
     * Record an attribute with a name (bookmarks 1 to 2) and a single quoted
     * value. Bookmark 4 is the position of the opening quote, so the value
     * text starts one character later and runs up to bookmark 5.
     * @param attributes The list being built for the tag.
     * @param bookmarks The state start positions, see {@link #parseTag}.
     */
    private void single_quote (Vector attributes, int[] bookmarks)
    {
        attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[4] + 1, bookmarks[5]), '\''));
    }

    /**
     * Record an attribute with a name (bookmarks 1 to 2) and a double quoted
     * value. Bookmark 5 is the position of the opening quote, so the value
     * text starts one character later and runs up to bookmark 6.
     * @param attributes The list being built for the tag.
     * @param bookmarks The state start positions, see {@link #parseTag}.
     */
    private void double_quote (Vector attributes, int[] bookmarks)
    {
        attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[5] + 1, bookmarks[6]), '"'));
    }

    /**
     * Parse a tag.
     * Parse the name and attributes from a start tag.<p>
     * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">
     * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
     * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2<p>
     * <cite>
     * 3.2.2 Attributes<p>
     * Elements may have associated properties, called attributes, which may
     * have values (by default, or set by authors or scripts). Attribute/value
     * pairs appear before the final ">" of an element's start tag. Any number
     * of (legal) attribute value pairs, separated by spaces, may appear in an
     * element's start tag. They may appear in any order.<p>
     * In this example, the id attribute is set for an H1 element:
     * <code>
     * &lt;H1 id="section1"&gt;
     * </code>
     * This is an identified heading thanks to the id attribute
     * <code>
     * &lt;/H1&gt;
     * </code>
     * By default, SGML requires that all attribute values be delimited using
     * either double quotation marks (ASCII decimal 34) or single quotation
     * marks (ASCII decimal 39). Single quote marks can be included within the
     * attribute value when the value is delimited by double quote marks, and
     * vice versa. Authors may also use numeric character references to
     * represent double quotes (&amp;#34;) and single quotes (&amp;#39;).
     * For doublequotes authors can also use the character entity reference &amp;quot;.<p>
     * In certain cases, authors may specify the value of an attribute without
     * any quotation marks. The attribute value may only contain letters
     * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45),
     * periods (ASCII decimal 46), underscores (ASCII decimal 95),
     * and colons (ASCII decimal 58). We recommend using quotation marks even
     * when it is possible to eliminate them.<p>
     * Attribute names are always case-insensitive.<p>
     * Attribute values are generally case-insensitive. The definition of each
     * attribute in the reference manual indicates whether its value is case-insensitive.<p>
     * All the attributes defined by this specification are listed in the attribute index.<p>
     * </cite>
     * <p>
     * This method uses a state machine with the following states:
     * <ol>
     * <li>state 0 - outside of any attribute</li>
     * <li>state 1 - within attribute name</li>
     * <li>state 2 - equals hit</li>
     * <li>state 3 - within naked attribute value.</li>
     * <li>state 4 - within single quoted attribute value</li>
     * <li>state 5 - within double quoted attribute value</li>
     * </ol>
     * <p>
     * The starting point for the various components is stored in an array
     * of integers that match the initiation point for the states one-for-one,
     * i.e. bookmarks[0] is where state 0 began, bookmarks[1] is where state 1
     * began, etc.
     * Attributes are stored in a <code>Vector</code> having
     * one slot for each whitespace or attribute/value pair.
     * The first slot is for attribute name (kind of like a standalone attribute).
     * <p>
     * End of input terminates the tag in every state, so a truncated tag
     * such as <code>&lt;a href=foo</code> still yields a node.
     * @return The tag node, or the result of {@link #parseString} if the
     * current position does not hold at least a &lt; and one more character.
     * @exception ParserException If there is a problem with the underlying page.
     */
    protected Node parseTag ()
        throws
            ParserException
    {
        Cursor cursor;
        boolean done;
        char ch;
        int state;
        int[] bookmarks;
        Vector attributes;
        int length;
        TagNode ret;

        cursor = mCursor.dup ();
        // sanity check
        ch = mPage.getCharacter (cursor);
        if ('<' != ch)
            return (parseString ());
        done = false;
        attributes = new Vector ();
        state = 0;
        bookmarks = new int[7];
        bookmarks[0] = cursor.getPosition ();
        while (!done)
        {
            // remember where the character about to be read sits; while a
            // state persists this slot tracks the end of that state's text
            bookmarks[state + 1] = cursor.getPosition ();
            ch = mPage.getCharacter (cursor);
            switch (state)
            {
                case 0: // outside of any attribute
                    if ((0 == ch) || ('>' == ch))
                    {
                        whitespace (attributes, bookmarks);
                        done = true;
                    }
                    else if (!Character.isWhitespace (ch))
                    {
                        whitespace (attributes, bookmarks);
                        state = 1;
                    }
                    break;
                case 1: // within attribute name
                    if ((0 == ch) || ('>' == ch))
                    {
                        standalone (attributes, bookmarks);
                        done = true;
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        standalone (attributes, bookmarks);
                        bookmarks[0] = bookmarks[2];
                        state = 0;
                    }
                    else if ('=' == ch)
                        state = 2;
                    break;
                case 2: // equals hit
                    if ((0 == ch) || ('>' == ch))
                    {
                        empty (attributes, bookmarks);
                        done = true;
                    }
                    else if ('\'' == ch)
                    {
                        state = 4;
                        // position of the opening quote
                        bookmarks[4] = bookmarks[3];
                    }
                    else if ('"' == ch)
                    {
                        state = 5;
                        // position of the opening quote
                        bookmarks[5] = bookmarks[3];
                    }
                    else
                        state = 3;
                    break;
                case 3: // within naked attribute value
                    // end of input must stop the scan too: the page returns
                    // zero without advancing, so ignoring it loops forever
                    if ((0 == ch) || ('>' == ch))
                    {
                        naked (attributes, bookmarks);
                        done = true;
                    }
                    else if (Character.isWhitespace (ch))
                    {
                        naked (attributes, bookmarks);
                        bookmarks[0] = bookmarks[4];
                        state = 0;
                    }
                    break;
                case 4: // within single quoted attribute value
                    if (0 == ch)
                    {
                        single_quote (attributes, bookmarks);
                        done = true; // complain?
                    }
                    else if ('\'' == ch)
                    {
                        single_quote (attributes, bookmarks);
                        // resume just past the closing quote
                        bookmarks[0] = bookmarks[5] + 1;
                        state = 0;
                    }
                    break;
                case 5: // within double quoted attribute value
                    if (0 == ch)
                    {
                        double_quote (attributes, bookmarks);
                        done = true; // complain?
                    }
                    else if ('"' == ch)
                    {
                        double_quote (attributes, bookmarks);
                        // resume just past the closing quote
                        bookmarks[0] = bookmarks[6] + 1;
                        state = 0;
                    }
                    break;
                default:
                    throw new IllegalStateException ("how the fuck did we get in state " + state);
            }
        }
        // the leading '<' was consumed, so length is at least one here
        length = cursor.getPosition () - mCursor.getPosition ();
        if (2 > length)
            // a '<' with nothing after it is an error, hand it back as text
            return (parseString ());
        ret = new TagNode (mPage, mCursor.getPosition (), cursor.getPosition (), attributes);
        mCursor = cursor;

        return (ret);
    }

    /**
     * Parse a comment.
     * Parse a remark markup.<p>
     * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4">
     * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
     * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4<p>
     * <cite>
     * 3.2.4 Comments<p>
     * HTML comments have the following syntax:<p>
     * <code>
     * &lt;!-- this is a comment --&gt;<p>
     * &lt;!-- and so is this one,<p>
     *     which occupies more than one line --&gt;<p>
     * </code>
     * White space is not permitted between the markup declaration 
     * open delimiter("&lt;!") and the comment open delimiter ("--"),
     * but is permitted between the comment close delimiter ("--") and
     * the markup declaration close delimiter ("&gt;").
     * A common error is to include a string of hyphens ("---") within a comment.
     * Authors should avoid putting two or more adjacent hyphens inside comments.
     * Information that appears between comments has no special meaning
     * (e.g., character references are not interpreted as such).
     * Note that comments are markup.<p>
     * </cite>
     * <p>
     * This method uses a state machine with the following states:
     * <ol>
     * <li>state 0 - prior to the first open delimiter</li>
     * <li>state 1 - prior to the second open delimiter</li>
     * <li>state 2 - prior to the first closing delimiter</li>
     * <li>state 3 - prior to the second closing delimiter</li>
     * <li>state 4 - prior to the terminating &gt;</li>
     * </ol>
     * <p>
     * All comment text (everything excluding the &lt; and &gt;), is included
     * in the remark text.
     * We allow terminators like --!&gt; and ---&gt; even though this isn't
     * part of the spec.
     * A remark that is still open when the input ends extends to the end of
     * the input.
     * If the text turns out not to open with "&lt;!--" it is handed to
     * {@link #parseTag}, which is how {@link #nextNode} treats any other
     * "&lt;!" construct.
     * @return The remark node, or whatever the fallback parse produced.
     * @exception ParserException If there is a problem with the underlying page.
     */
    protected Node parseRemark ()
        throws
            ParserException
    {
        Cursor cursor;
        boolean done;
        char ch;
        int state;
        RemarkNode ret;

        cursor = mCursor.dup ();
        // sanity check
        ch = mPage.getCharacter (cursor);
        if ('<' != ch)
            return (parseString ());
        ch = mPage.getCharacter (cursor);
        if ('!' != ch)
            return (parseString ());
        done = false;
        state = 0;
        while (!done)
        {
            ch = mPage.getCharacter (cursor);
            if (0 == ch)
            {
                // the page returns zero without advancing at end of input,
                // so every state has to stop here or the loop never ends
                if (2 > state)
                    // "<!" or "<!-" and nothing more: not a remark
                    return (parseTag ());
                // unterminated remark, take everything that is left
                done = true;
            }
            else
                switch (state)
                {
                    case 0: // prior to the first open delimiter
                        if ('-' == ch)
                            state = 1;
                        else
                            // parseString() would stop dead on the "<!" and
                            // return null, which callers read as end of input
                            return (parseTag ());
                        break;
                    case 1: // prior to the second open delimiter
                        if ('-' == ch)
                            state = 2;
                        else
                            return (parseTag ());
                        break;
                    case 2: // prior to the first closing delimiter
                        if ('-' == ch)
                            state = 3;
                        break;
                    case 3: // prior to the second closing delimiter
                        if ('-' == ch)
                            state = 4;
                        else
                            state = 2;
                        break;
                    case 4: // prior to the terminating >
                        if ('>' == ch)
                            done = true;
                        else if (('-' == ch) || ('!' == ch) || Character.isWhitespace (ch))
                        {
                            // still between "--" and ">": extra hyphens, the
                            // non-standard '!' and white space are tolerated
                        }
                        else
                            // false alarm, the "--" was part of the text
                            state = 2;
                        break;
                    default:
                        throw new IllegalStateException ("how the fuck did we get in state " + state);
                }
        }
        // at least "<!--" has been consumed when the loop ends normally
        ret = new RemarkNode (mPage, mCursor.getPosition (), cursor.getPosition ());
        mCursor = cursor;

        return (ret);
    }

}

Index: Cursor.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Cursor.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** Cursor.java	11 Aug 2003 00:18:28 -0000	1.2
--- Cursor.java	17 Aug 2003 16:09:27 -0000	1.3
***************
*** 79,83 ****
--- 79,130 ----
          return (mPosition);
      }
+ 
+     /**
+      * Move the cursor position ahead one character.
+      */
+     public void advance ()
+     {
+         mPosition++;
+     }
+ 
+     /**
+      * Move the cursor position back one character.
+      */
+     public void retreat ()
+     {
+         mPosition--;
+         if (0 > mPosition)
+             mPosition = 0;
+     }
+ 
+     /**
+      * Make a new cursor just like this one.
+      * @return The new cursor positioned where <code>this</code> one is,
+      * and referring to the same page.
+      */
+     public Cursor dup ()
+     {
+         return (new Cursor (getPage (), getPosition ()));
+     }
+     
+     public String toString ()
+     {
+         int row;
+         int column;
+         StringBuffer ret;
          
+         ret = new StringBuffer (9 * 3 + 3); // three ints and delimiters
+         ret.append (getPosition ());
+         row = mPage.row (this);
+         column = mPage.column (this);
+         ret.append ("[");
+         ret.append (row);
+         ret.append (",");
+         ret.append (column);
+         ret.append ("]");
+         
+         return (ret.toString ());
+     }
+ 
      //
      // Ordered interface

Index: Page.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** Page.java	11 Aug 2003 00:18:28 -0000	1.4
--- Page.java	17 Aug 2003 16:09:27 -0000	1.5
***************
*** 29,32 ****
--- 29,33 ----
  package org.htmlparser.lexer;
  
+ import java.io.ByteArrayInputStream;
  import java.io.IOException;
  import java.io.Reader;
***************
*** 44,51 ****
  /**
   * Represents the contents of an HTML page.
!  * Contains a character array of the page downloaded so far,
!  * a String with those characters in it,
!  * and an index of positions of line separators (actually the first
!  * character position on the next line).
   */
  public class Page
--- 45,50 ----
  /**
   * Represents the contents of an HTML page.
!  * Contains the source of characters and an index of positions of line
!  * separators (actually the first character position on the next line).
   */
  public class Page
***************
*** 70,83 ****
  
      /**
-      * The characters read so far from the source.
-      */
-     protected char[] mCharacters;
-     
-     /**
-      * The string representation of the source.
-      */
-     protected String mString;
- 
-     /**
       * Character positions of the first character in each line.
       */
--- 69,72 ----
***************
*** 102,106 ****
  
      /**
!      * Construct a page reading from a URL.
       * @param connection A fully conditioned connection. The connect()
       * method will be called so it need not be connected yet.
--- 91,95 ----
  
      /**
!      * Construct a page reading from a URL connection.
       * @param connection A fully conditioned connection. The connect()
       * method will be called so it need not be connected yet.
***************
*** 113,119 ****
       */
      public Page (URLConnection connection) throws ParserException
- //        throws
- //            IOException,
- //            UnsupportedEncodingException
      {
          if (null == connection)
--- 102,105 ----
***************
*** 139,148 ****
              throw new ParserException ("oops2", ioe);
          }
-         mCharacters = null;
-         mString = null;
          mIndex = new PageIndex (this);
      }
  
      /**
       * Try and extract the character set from the HTTP header.
       * @param connection The connection with the charset info.
--- 125,270 ----
              throw new ParserException ("oops2", ioe);
          }
          mIndex = new PageIndex (this);
      }
  
      /**
+      * Construct a page from a stream encoded with the given charset.
+      * @param stream The source of bytes.
+      * @param charset The encoding used.
+      * If null, defaults to the <code>DEFAULT_CHARSET</code>.
+      * @exception UnsupportedEncodingException If the given charset is not supported.
+      */
+     public Page (Stream stream, String charset)
+         throws
+             UnsupportedEncodingException
+     {
+         if (null == stream)
+             throw new IllegalArgumentException ("stream cannot be null");
+         if (null == charset)
+             charset = DEFAULT_CHARSET;
+         mSource = new Source (stream, charset);
+         mIndex = new PageIndex (this);
+     }
+ 
+     public Page (String text) throws ParserException
+     {
+         Stream stream;
+         Page ret;
+ 
+         if (null == text)
+             throw new IllegalArgumentException ("text cannot be null");
+         try
+         {
+             stream = new Stream (new ByteArrayInputStream (text.getBytes (Page.DEFAULT_CHARSET)));
+             mSource = new Source (stream, Page.DEFAULT_CHARSET);
+             mIndex = new PageIndex (this);
+         }
+         catch (UnsupportedEncodingException uee)
+         {
+             throw new ParserException ("problem making a page", uee);
+         }
+     }
+ 
+     /**
+      * Get the source this page is reading from.
+      */
+     public Source getSource ()
+     {
+         return (mSource);
+     }
+ 
+     /**
+      * Read the character at the cursor position.
+      * The cursor position can be behind or equal to the current source position.
+      * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n,
+      * and updates the end-of-line index accordingly.
+      * Advances the cursor position by one (or two in the \r\n case).
+      * @param cursor The position to read at.
+      * @return The character at that position, and modifies the cursor to
+      * prepare for the next read. If the source is exhausted a zero is returned.
+      * @exception ParserException If an IOException on the underlying source
+      * occurs, or an attempt is made to read characters in the future (the
+      * cursor position is ahead of the underlying stream)
+      */
+     public char getCharacter (Cursor cursor)
+         throws
+             ParserException
+     {
+         int i;
+         char ret;
+         
+         if (mSource.mOffset < cursor.getPosition ())
+             // hmmm, we could skip ahead, but then what about the EOL index
+             throw new ParserException ("attempt to read future characters from source");
+         else if (mSource.mOffset == cursor.getPosition ())
+             try
+             {
+                 i = mSource.read ();
+                 if (-1 == i)
+                     ret = 0;
+                 else
+                 {
+                     ret = (char)i;
+                     cursor.advance ();
+                 }
+             }
+             catch (IOException ioe)
+             {
+                 throw new ParserException (
+                     "problem reading a character at position "
+                     + cursor.getPosition (), ioe);
+             }
+         else
+         {
+             // historic read
+             ret = mSource.mBuffer[cursor.getPosition ()];
+             cursor.advance ();
+         }
+ 
+         // handle \r
+         if ('\r' == ret)
+         {   // switch to single character EOL
+             ret = '\n';
+ 
+             // check for a \n in the next position
+             if (mSource.mOffset == cursor.getPosition ())
+                 try
+                 {
+                     i = mSource.read ();
+                     if (-1 == i)
+                     { 
+                         // do nothing
+                     }
+                     else if ('\n' == (char)i)
+                         cursor.advance ();
+                     else
+                         try
+                         {
+                             mSource.unread ();
+                         }
+                         catch (IOException ioe)
+                         {
+                             throw new ParserException (
+                                 "can't unread a character at position "
+                                 + cursor.getPosition (), ioe);
+                         }
+                 }
+                 catch (IOException ioe)
+                 {
+                     throw new ParserException (
+                         "problem reading a character at position "
+                         + cursor.getPosition (), ioe);
+                 }
+             else if ('\n' == mSource.mBuffer[cursor.getPosition ()])
+                 cursor.advance ();
+         }
+         if ('\n' == ret)
+             // update the EOL index in any case
+             mIndex.add (cursor);
+ 
+         return (ret);
+     }
+ 
+     /**
       * Try and extract the character set from the HTTP header.
       * @param connection The connection with the charset info.
***************
*** 294,297 ****
--- 416,483 ----
  	}
  
+     /**
+      * Get the line number for a cursor.
+      * @param cursor The character offset into the page.
+      * @return The line number the character is in.
+      */
+     public int row (Cursor cursor)
+     {
+         return (mIndex.row (cursor));
+     }
+ 
+     /**
+      * Get the column number for a cursor.
+      * @param cursor The character offset into the page.
+      * @return The character offset into the line this cursor is on.
+      */
+     public int column (Cursor cursor)
+     {
+         return (mIndex.column (cursor));
+     }
+ 
+     /**
+      * Get the text identified by the given limits.
+      * @param start The starting position, zero based.
+      * @param end The ending position
+      * (exclusive, i.e. the character at the ending position is not included),
+      * zero based.
+      * @return The text from <code>start</code> to <code>end</code>.
+      * @see #getText(StringBuffer, int, int)
+      */
+     public String getText (int start, int end)
+     {
+         StringBuffer ret;
+         
+         ret = new StringBuffer (Math.abs (end - start));
+         getText (ret, start, end);
+         
+         return (ret.toString ());
+     }
+ 
+     /**
+      * Put the text identified by the given limits into the given buffer.
+      * @param buffer The accumulator for the characters.
+      * @param start The starting position, zero based.
+      * @param end The ending position
+      * (exclusive, i.e. the character at the ending position is not included),
+      * zero based.
+      */
+     public void getText (StringBuffer buffer, int start, int end)
+     {
+         int length;
+         StringBuffer ret;
+ 
+         if ((mSource.mOffset < start) || (mSource.mOffset < end))
+             throw new IllegalArgumentException ("attempt to extract future characters from source");
+         if (end < start)
+         {
+             length = end;
+             end = start;
+             start = length;
+         }
+         length = end - start;
+         buffer.append (mSource.mBuffer, start, length);
+     }
+ 
      //
      // Bean patterns
***************
*** 307,309 ****
--- 493,496 ----
          return (mLog);
      }
+ 
  }

Index: PageIndex.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/PageIndex.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** PageIndex.java	11 Aug 2003 00:18:28 -0000	1.2
--- PageIndex.java	17 Aug 2003 16:09:27 -0000	1.3
***************
*** 206,210 ****
      public int row (Cursor cursor)
      {
!         return (Sort.bsearch (this, cursor));
      }
  
--- 206,220 ----
      public int row (Cursor cursor)
      {
!         int ret;
!         
!         ret = Sort.bsearch (this, cursor);
!         // handle line transition, the search returns the index if it matches
!         // exactly one of the line end positions, so we advance one line if
!         // it's equal to the offset at the row index, since that position is
!         // actually the beginning of the next line
!         if ((ret < mCount) && (cursor.getPosition () == mIndices[ret]))
!             ret++;
!         
!         return (ret);
      }
  
***************
*** 229,238 ****
          int previous;
  
!         row = Sort.bsearch (this, cursor);
!         // note, this shouldn't be zero if the first element of each index is offset zero
          if (0 != row)
              previous = this.elementAt (row - 1);
          else
!             previous = this.elementAt (0);
          
          return (cursor.getPosition () - previous);
--- 239,247 ----
          int previous;
  
!         row = row (cursor);
          if (0 != row)
              previous = this.elementAt (row - 1);
          else
!             previous = 0;
          
          return (cursor.getPosition () - previous);

Index: Source.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Source.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** Source.java	11 Aug 2003 00:18:28 -0000	1.3
--- Source.java	17 Aug 2003 16:09:27 -0000	1.4
***************
*** 80,84 ****
       * The offset of the next byte returned by read().
       */
!     protected int mOffset;
  
      /**
--- 80,84 ----
       * The offset of the next byte returned by read().
       */
!     public volatile int mOffset;
  
      /**
***************
*** 175,195 ****
  
      /**
!      * Close the stream.  Once a stream has been closed, further read(),
!      * ready(), mark(), or reset() invocations will throw an IOException.
!      * Closing a previously-closed stream, however, has no effect.
!      * @exception IOException  If an I/O error occurs
       */
      public void close () throws IOException
      {
-         mStream = null;
-         if (null != mReader)
-             mReader.close ();
-         mReader = null;
-         mBuffer = null;
-         mLevel = 0;
-         mOffset = 0;
-         mMark = -1;
      }
!     
      /**
       * Read a single character.
--- 175,186 ----
  
      /**
!      * Does nothing.
!      * It's supposed to close the stream, but use destroy() instead.
!      * @see #destroy
       */
      public void close () throws IOException
      {
      }
! 
      /**
       * Read a single character.
***************
*** 342,345 ****
--- 333,370 ----
          
          return (ret);
+     }
+     
+     //
+     // Methods not in your Daddy's Reader
+     //
+ 
+     /**
+      * Undo the read of a single character.
+      * @exception IOException If no characters have been read.
+      */
+     public void unread () throws IOException
+     {
+         if (0 < mOffset)
+             mOffset--;
+         else
+             throw new IOException ("can't unread no characters");
+     }
+ 
+     /**
+      * Close the stream.  Once a stream has been closed, further read(),
+      * ready(), mark(), or reset() invocations will throw an IOException.
+      * Closing a previously-closed stream, however, has no effect.
+      * @exception IOException  If an I/O error occurs
+      */
+     public void destroy () throws IOException
+     {
+         mStream = null;
+         if (null != mReader)
+             mReader.close ();
+         mReader = null;
+         mBuffer = null;
+         mLevel = 0;
+         mOffset = 0;
+         mMark = -1;
      }
  }

Index: package.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/package.html,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** package.html	11 Aug 2003 00:18:28 -0000	1.2
--- package.html	17 Aug 2003 16:09:27 -0000	1.3
***************
*** 75,78 ****
--- 75,94 ----
  <LI>line 11, offset 7, to line 11, offset 8, string node "\n"</LI>
  </OL>
+ <p>Stream, Source, Page and Lexer
+ <p>The package is arranged in four levels, <CODE>Stream</CODE>,
+ <CODE>Source</CODE>, <CODE>Page</CODE> and <CODE>Lexer</CODE> in the order of lowest to
+ highest.
+ A <CODE>Stream</CODE> is raw bytes from the URLConnection or file. It has no
+ intelligence. A <CODE>Source</CODE> is raw characters, hence it knows about the
+ encoding scheme used and can be reset if a different encoding is detected after
+ partially reading in the text. A <CODE>Page</CODE> provides characters from the
+ source while maintaining the index of line numbers, and hence can be thought of
+ as an array of strings corresponding to source file lines, but it doesn't
+ actually store any text, relying on the buffering within the
+ <CODE>Source</CODE> instead. The <CODE>Lexer</CODE> contains the actual lexeme parsing
+ code. It reads characters from the page, keeping track of where it is with a
+ <CODE>Cursor</CODE> and creates the array of nodes using various state
+ machines.
+ <p>
  The following are some design goals and 'invariants' within the package, if you
  are attempting to understand or modify it. Things that differ substantially from
***************
*** 88,94 ****
  <DD>Besides complete coverage, the <B>nodes do not contain copies of the text</B>,
  but instead simply contain offsets into a single large buffer that contains the
! text read from the HTML source. Thus there is no lost whitespace or text
! formatting elements either outside or within tags. Upper and lower case text is
! preserved.
  <DT>Line Endings
  <DD><B>End of line characters are just whitespace.</B> There is no distinction
--- 104,110 ----
  <DD>Besides complete coverage, the <B>nodes do not contain copies of the text</B>,
  but instead simply contain offsets into a single large buffer that contains the
! text read from the HTML source. Even within tags, the attribute list can
! contain whitespace, so no whitespace or text formatting is lost,
! either outside or within tags. Upper and lower case text is preserved.
  <DT>Line Endings
  <DD><B>End of line characters are just whitespace.</B> There is no distinction
***************
*** 97,121 ****
  multiple lines with no special processing. Line endings are not transformed
  between platforms, i.e. Unix line endings are not converted to Windows line
! endings by this level.  Each node will have a starting and ending
! <CODE>Cursor</CODE>, from which you can get the line number and offset within
! the HTML source, for error messages for example, but in general ignore line
  breaks in the source if at all possible.
- <DT>Stream, Source and Page
- <DD>The package is arranged in three levels, <CODE>Stream</CODE>,
- <CODE>Source</CODE> and <CODE>Page</CODE> in the order of lowest to highest.
- A <CODE>Stream</CODE> is raw bytes from the URLConnection or file. It has no
- intelligence. A <CODE>Source</CODE> is raw characters, hence it knows about the
- encoding scheme used and can be reset if a different encoding is detected after
- partially reading in the text. A <CODE>Page</CODE> is the highest level and
- contains the actual lexeme parsing code. It reads from the source and creates
- the array of nodes (<CODE>NodeList</CODE>) using a state machine. 
  <DT>One Parser, One Scan
! <DD>The major lexeme state machine has the following minor state machines corresponding
  (roughly) to the <B>four parsers it replaces</B> (StringParser, RemarkNodeParser,
! AttributeParser. TagParser):
! <LI>in text</LI>
! <LI>in comment</LI>
! <LI>in quote</LI>
! <LI>in tag</LI>
  By integrating the four state machines into one, a single pass over the text is
  all that's needed for a low level parse of the HTML source. In previous
--- 113,127 ----
  multiple lines with no special processing. Line endings are not transformed
  between platforms, i.e. Unix line endings are not converted to Windows line
! endings by this level.  Each node has a starting and ending location, which
! the page can use to extract the text. To facilitate formatting error and log messages,
! the page can turn these offsets into row and column numbers. In general, ignore line
  breaks in the source if at all possible.
  <DT>One Parser, One Scan
! <DD>The Lexer has the following state machines corresponding
  (roughly) to the <B>four parsers it replaces</B> (StringParser, RemarkNodeParser,
! TagParser &amp; AttributeParser):
! <LI>in text - parseString()</LI>
! <LI>in comment - parseRemark()</LI>
! <LI>in tag - parseTag()</LI>
  By integrating the four state machines into one, a single pass over the text is
  all that's needed for a low level parse of the HTML source. In previous

[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,NONE,1.1 Cursor.java,1.2,1.3 Page.ja

[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Lexer.java,NONE,1.1 Cursor.java,1.2,1.3 Page.java,1.4,1.5 PageIndex.java,1.2,1.3 Source.java,1.3,1.4 package.html,1.2,1.3