[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Cursor.java,NONE,1.1 PageIndex.java,NONE,1.1 pa
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-08-10 23:33:40
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv1528/lexer Modified Files: Page.java Added Files: Cursor.java PageIndex.java package.html Log Message: Second drop for new io subsystem. --- NEW FILE: Cursor.java --- // HTMLParser Library v1_4_20030727 - A java-based parser for HTML // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // For any questions or suggestions, you can write to me at : // Email :so...@in... // // Postal Address : // Somik Raha // Extreme Programmer & Coach // Industrial Logic Corporation // 2583 Cedar Street, Berkeley, // CA 94708, USA // Website : http://www.industriallogic.com // // This class was contributed by // Derrick Oswald // package org.htmlparser.lexer; import org.htmlparser.util.sort.Ordered; /** * A bookmark in a page. * This class remembers the page it came from and its position within the page. */ public class Cursor implements Ordered { /** * This cursor's position. */ protected int mPosition; /** * This cursor's page. */ protected Page mPage; /** * Construct a <code>Cursor</code> from the page and position given. * @param page The page this cursor is on. * @param offset The character offset within the page. 
*/ public Cursor (Page page, int offset) { mPage = page; mPosition = offset; } /** * Get this cursor's page. * @return The page associated with this cursor. */ public Page getPage () { return (mPage); } /** * Get the position of this cursor. * @return The cursor position. */ public int getPosition () { return (mPosition); } // // Ordered interface // /** * Compare one reference to another. * @see org.htmlparser.util.sort.Ordered */ public int compare (Object that) { Cursor r = (Cursor)that; return (getPosition () - r.getPosition ()); } } --- NEW FILE: PageIndex.java --- // HTMLParser Library v1_4_20030727 - A java-based parser for HTML // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // For any questions or suggestions, you can write to me at : // Email :so...@in... // // Postal Address : // Somik Raha // Extreme Programmer & Coach // Industrial Logic Corporation // 2583 Cedar Street, Berkeley, // CA 94708, USA // Website : http://www.industriallogic.com // // This class was contributed by // Derrick Oswald // package org.htmlparser.lexer; import org.htmlparser.util.sort.Ordered; import org.htmlparser.util.sort.Sort; import org.htmlparser.util.sort.Sortable; /** * A sorted array of integers which are the positions of end of line characters. 
* Maintains a list of integers which are (the positions of the first * characters of each line. * To facilitate processing the first element should be maintained at position 0. * Facilities to add, remove, search and determine row and column are provided. * This class provides similar functionality to a Vector but * does not incur the overhead of an <code>Integer</code> object per element. */ public class PageIndex implements Sortable { /** * Increment for allocations. */ protected static final int mIncrement = 10; /** * The number of valid elements. */ protected int mCount; /** * The elements. */ protected int[] mIndices; /** * The page associated with this index. */ protected Page mPage; /** * Create an empty index. * @param page The page associated with this index. */ public PageIndex (Page page) { mPage = page; mIndices = new int[mIncrement]; mCount = 0; } /** * Create an index with the one element given. * @param page The page associated with this index. * @param cursor The single element for the new index. */ public PageIndex (Page page, int cursor) { this (page); mIndices[0] = cursor; mCount = 1; } /** * Create an index with the elements given. * @param page The page associated with this index. * @param cursors The initial elements of the index. * NOTE: The list must be sorted in ascending order. */ public PageIndex (Page page, int[] cursors) { mPage = page; mIndices = cursors; mCount = cursors.length; } /** * Get this index's page. * @return The page associated with this index. */ public Page getPage () { return (mPage); } /** * Get the count of elements. * @return The number of valid elements. */ public int size () { return (mCount); } /** * Get the capacity for elements without reallocation. * @return The number of spaces for elements. */ public int capacity () { return (mIndices.length); } /** * Add an element to the list * @param cursor The element to add. 
* @return The position at which the element was inserted or * the index of the existing element if it is a duplicate. */ public int add (Cursor cursor) { int position; int ret; // find where it goes ret = Sort.bsearch (this, cursor); // insert, but not twice position = cursor.getPosition (); if (!((ret < size ()) && (position == mIndices[ret]))) insertElementAt (position, ret); return (ret); } /** * Add an element to the list * @param cursor The element to add. * @return The position at which the element was inserted or * the index of the existing element if it is a duplicate. */ public int add (int cursor) { return (add (new Cursor (getPage (), cursor))); } /** * Remove an element from the list * @param cursor The element to remove. */ public void remove (Cursor cursor) { int i; // find it i = Sort.bsearch (this, cursor); // remove if ((i < size ()) && (cursor.getPosition () == mIndices[i])) removeElementAt (i); } /** * Remove an element from the list * @param cursor The element to remove. */ public void remove (int cursor) { remove (new Cursor (getPage (), cursor)); } /** * Get an element from the list. * @param index The index of the element to get. * @return The element. */ public int elementAt (int index) { return (mIndices[index]); } /** * Get the line number for a cursor. * @param cursor The character offset into the page. * @return The line number the character is in. */ public int row (Cursor cursor) { return (Sort.bsearch (this, cursor)); } /** * Get the line number for a position. * @param cursor The character offset into the page. * @return The line number the character is in. */ public int row (int cursor) { return (row (new Cursor (getPage (), cursor))); } /** * Get the column number for a cursor. * @param cursor The character offset into the page. * @return The character offset into the line this cursor is on. 
*/ public int column (Cursor cursor) { int row; int previous; row = Sort.bsearch (this, cursor); // note, this shouldn't be zero if the first element of each index is offset zero if (0 != row) previous = this.elementAt (row - 1); else previous = this.elementAt (0); return (cursor.getPosition () - previous); } /** * Get the column number for a position. * @param cursor The character offset into the page. * @return The character offset into the line this cursor is on. */ public int column (int cursor) { return (column (new Cursor (getPage (), cursor))); } /** * Get the elements as an array of int. * @return A new array containing the elements, * i.e. a snapshot of the index. */ public int[] get () { int[] ret = new int[size ()]; System.arraycopy (mIndices, 0, ret, 0, size ()); return (ret); } /** * Binary search for the element. * @param cursor The element to search for. * @return The index at which the element was found or is to be inserted. */ protected int bsearch (int cursor) { return (Sort.bsearch (this, new Cursor (getPage (), cursor))); } /** * Binary search for the element. * @param cursor The element to search for. * @param first The index to start at. * @param last The index to stop at. * @return The index at which the element was found or is to be inserted. */ protected int bsearch (int cursor, int first, int last) { return (Sort.bsearch (this, new Cursor (getPage (), cursor), first, last)); } /** * Inserts an element into the list. * The index must be a value greater than or equal to 0 and less than * or equal to the current size of the array. * @param cursor The element to insert. * @param index The index in the list to insert it at. 
*/ protected void insertElementAt (int cursor, int index) { if ((index >= capacity ()) || (size () == capacity ())) { // allocate more space int new_values[] = new int[Math.max (capacity () + mIncrement, index + 1)]; if (index < capacity ()) { // copy and shift up in two pieces System.arraycopy (mIndices, 0, new_values, 0, index); System.arraycopy (mIndices, index, new_values, index + 1, capacity () - index); } else System.arraycopy (mIndices, 0, new_values, 0, capacity ()); mIndices = new_values; } else if (index < size ()) // shift up System.arraycopy (mIndices, index, mIndices, index + 1, capacity () - (index + 1)); mIndices[index] = cursor; mCount++; } /** * Remove an element from the list. * @param index The index of the item to remove. */ protected void removeElementAt (int index) { // shift System.arraycopy (mIndices, index + 1, mIndices, index, capacity () - (index + 1)); mIndices[capacity() - 1] = 0; mCount--; } // // Sortable interface // /** * Returns the first index of the Sortable. * @return The index of the first element. */ public int first () { return (0); } /** * Returns the last index of the Sortable. * @return The index of the last element. * If this were an array object this would be (object.length - 1). */ public int last () { return (mCount - 1); } /** * Fetch the object at the given index. * @param index The item number to get. * @param reuse If this argument is not null, it is an object * acquired from a previous fetch that is no longer needed and * may be returned as the result if it makes mores sense to alter * and return it than to fetch or create a new element. That is, the * reuse object is garbage and may be used to avoid allocating a new * object if that would normally be the strategy. * @return The Ordered object at that index. 
*/ public Ordered fetch (int index, Ordered reuse) { Cursor ret; if (null != reuse) { ret = (Cursor)reuse; ret.mPosition = mIndices[index]; ret.mPage = getPage (); // redundant } else ret = new Cursor (getPage (), mIndices[index]); return (ret); } /** * Swaps the elements at the given indicies. * @param i One index. * @param j The other index. */ public void swap (int i, int j) { int temp = mIndices[i]; mIndices[i] = mIndices[j]; mIndices[j] = temp; } } --- NEW FILE: package.html --- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <HTML> <HEAD> <!-- @(#)package.html 1.60 98/01/27 HTMLParser Library v1_4_20030727 - A java-based parser for HTML Copyright (C) Dec 31, 2000 Somik Raha This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA For any questions or suggestions, you can write to me at : Email :so...@in... Postal Address : Somik Raha Extreme Programmer & Coach Industrial Logic Corporation 2583 Cedar Street, Berkeley, CA 94708, USA Website : http://www.industriallogic.com --> <TITLE>Lexer Package</TITLE> </HEAD> <BODY> The lexer package will eventually be the base level I/O subsystem. <EM>It is currently under development.</EM> <P>The lexer package is responsible for reading characters from the HTML source and identifying the node lexemes. 
For example, the HTML code below would return the list of nodes shown:</P> <PRE> <html><head><title>Humoresque</title></head> <body bgcolor='silver'> Passengers will please refrain from flushing toilets while the train is standing in the station. I love you! <p> We encourage constipation while the train is in the station If the train can't go then why should you. </body> </html> </PRE> <OL> <LI>line 0, offset 0, to line 0, offset 6, html tag</LI> <LI>line 0, offset 6, to line 0, offset 12, head tag</LI> <LI>line 0, offset 12, to line 0, offset 19, title tag</LI> <LI>line 0, offset 19, to line 0, offset 29, string node "Humoresque"</LI> <LI>line 0, offset 29, to line 0, offset 37, end title tag</LI> <LI>line 0, offset 37, to line 0, offset 44, end head tag</LI> <LI>line 0, offset 44, to line 0, offset 45, string node "\n"</LI> <LI>line 1, offset 0, to line 1, offset 23, body tag</LI> <LI>line 1, offset 23, to line 4, offset 40, string node "\nPassengers...you!\n"</LI> <LI>line 5, offset 0, to line 5, offset 3, paragraph tag</LI> <LI>line 5, offset 3, to line 9, offset 21, string node "\nWe...you.\n"</LI> <LI>line 10, offset 0, to line 10, offset 7, end body tag</LI> <LI>line 10, offset 7, to line 10, offset 8, string node "\n"</LI> <LI>line 11, offset 0, to line 11, offset 7, end html tag</LI> <LI>line 11, offset 7, to line 11, offset 8, string node "\n"</LI> </OL> The following are some design goals and 'invariants' within the package, if you are attempting to understand or modify it. Things that differ substantially from previous implementations are highlighted in <B>bold</B>. <DL> <DT>Contiguous Nodes <DD><B>Adjacent nodes have no characters between them.</B> The list of nodes forms an uninterrupted chain that, by start and end definitions, completely covers the characters that were read from the HTML source. Despite this, the nodes are not stored in a linked list, but rather an array to ease any editing tasks that may be performed. 
<DT>Text Fidelity <DD>Besides complete coverage, the <B>nodes do not contain copies of the text</B>, but instead simply contain offsets into a single large buffer that contains the text read from the HTML source. Thus no whitespace or text formatting elements are lost, either outside or within tags. Upper and lower case text is preserved. <DT>Line Endings <DD><B>End of line characters are just whitespace.</B> There is no distinction made between end of line characters (or pairs of characters on Windows) and other whitespace. The text is not read in line by line so nodes (tags) can easily span multiple lines with no special processing. Line endings are not transformed between platforms, i.e. Unix line endings are not converted to Windows line endings by this level. Each node will have a starting and ending <CODE>Cursor</CODE>, from which you can get the line number and offset within the HTML source, for error messages for example, but in general ignore line breaks in the source if at all possible. <DT>Stream, Source and Page <DD>The package is arranged in three levels, <CODE>Stream</CODE>, <CODE>Source</CODE> and <CODE>Page</CODE> in the order of lowest to highest. A <CODE>Stream</CODE> is raw bytes from the URLConnection or file. It has no intelligence. A <CODE>Source</CODE> is raw characters, hence it knows about the encoding scheme used and can be reset if a different encoding is detected after partially reading in the text. A <CODE>Page</CODE> is the highest level and contains the actual lexeme parsing code. It reads from the source and creates the array of nodes (<CODE>NodeList</CODE>) using a state machine. <DT>One Parser, One Scan <DD>The major lexeme state machine has the following minor state machines corresponding (roughly) to the <B>four parsers it replaces</B> (StringParser, RemarkNodeParser, AttributeParser, 
TagParser): <LI>in text</LI> <LI>in comment</LI> <LI>in quote</LI> <LI>in tag</LI> By integrating the four state machines into one, a single pass over the text is all that's needed for a low level parse of the HTML source. In previous implementations, the attributes were parsed on a second scan after the initial tag was extracted. <DT>Two Jars <DD>For elementary operations at the node level, a minimalist jar file containing <B>only the lexer and base tag classes</B> is split out from the larger <CODE>htmlparser.jar</CODE>. In this way, simple parsing and output is handled with a jar file that is under 40 kilobytes, but anything beyond peephole manipulation, i.e. closing tag detection and other semantic reasoning will need the full set of scanners, nodes and ancillary classes, which now stands at 160 kilobytes. </DL> </BODY> </HTML> Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** Page.java 27 Jul 2003 19:19:17 -0000 1.2 --- Page.java 10 Aug 2003 23:33:36 -0000 1.3 *************** *** 35,42 **** --- 35,45 ---- import java.lang.reflect.Method; import java.net.URLConnection; + import java.net.UnknownHostException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; + import org.htmlparser.util.ParserException; + /** * Represents the contents of an HTML page. *************** *** 79,88 **** * Character positions of the first character in each line. */ ! protected int mIndex[]; ! ! /** ! * The index position to be used next. ! */ ! protected int mIndexLength; /** --- 82,86 ---- * Character positions of the first character in each line. */ ! protected PageIndex mIndex; /** *************** *** 109,128 **** * @exception IOException If an i/o exception occurs creating the * source. ! 
* @exception UnsupportedEncodingException if the character set specified in the * HTTP header is not supported. */ ! public Page (URLConnection connection) ! throws ! IOException, ! UnsupportedEncodingException { if (null == connection) throw new IllegalArgumentException ("connection cannot be null"); ! connection.connect (); ! mSource = new Source (new Stream (connection.getInputStream ()), getCharacterSet (connection)); mCharacters = null; mString = null; ! mIndex = null; ! mIndexLength = 0; } --- 107,145 ---- * @exception IOException If an i/o exception occurs creating the * source. ! * @exception ParserException An exception object wrapping a number of ! * possible error conditions, some of which are outlined below. ! * UnsupportedEncodingException if the character set specified in the * HTTP header is not supported. */ ! public Page (URLConnection connection) throws ParserException ! // throws ! // IOException, ! // UnsupportedEncodingException { if (null == connection) throw new IllegalArgumentException ("connection cannot be null"); ! try ! { ! connection.connect (); ! } ! catch (UnknownHostException uhe) ! { ! throw new ParserException ("the host (" + connection.getURL ().getHost () + ") was not found", uhe); ! } ! catch (IOException ioe) ! { ! throw new ParserException ("oops", ioe); ! } ! try ! { ! mSource = new Source (new Stream (connection.getInputStream ()), getCharacterSet (connection)); ! } ! catch (IOException ioe) ! { ! throw new ParserException ("oops2", ioe); ! } mCharacters = null; mString = null; ! mIndex = new PageIndex (this); } *************** *** 168,172 **** * If the charset parameter is not found in the given string, the default * character set is returned. ! * @see ParserHelper#findCharset * @see #DEFAULT_CHARSET */ --- 185,189 ---- * If the charset parameter is not found in the given string, the default * character set is returned. ! * @see #findCharset * @see #DEFAULT_CHARSET */ |