[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer InputStreamSource.java,NONE,1.1 StringSource.java,NONE,1.1 …
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2004-07-03 13:56:31
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14744/lexer Modified Files: Page.java Source.java Added Files: InputStreamSource.java StringSource.java Log Message: Further fix to bug #973137 Double-bytes characters are messed after parsing. Created a proper String based source with the encoding only optionally specified. A string is no longer converted to a byte array and then back to characters. Index: Source.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Source.java,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** Source.java 2 Jan 2004 16:24:53 -0000 1.15 --- Source.java 3 Jul 2004 13:56:08 -0000 1.16 *************** *** 27,52 **** package org.htmlparser.lexer; - import java.io.ByteArrayInputStream; import java.io.IOException; - import java.io.InputStream; - import java.io.InputStreamReader; - import java.io.ObjectInputStream; - import java.io.ObjectOutputStream; import java.io.Reader; import java.io.Serializable; ! import java.io.UnsupportedEncodingException; /** * A buffered source of characters. ! * A Source is very similar to a the following construct: * <pre> ! * new InputStreamReader (new BufferedInputStream (connection.getInputStream ()), charset) * </pre> ! * It differs from the above, in two ways: ! * <li>the fetching of bytes from the connection's input stream may be asynchronous</li> * <li>the character set may be changed, which resets the input stream</li> ! * */ ! public class Source extends Reader --- 27,50 ---- package org.htmlparser.lexer; import java.io.IOException; import java.io.Reader; import java.io.Serializable; ! ! import org.htmlparser.util.ParserException; /** * A buffered source of characters. ! * A Source is very similar to a Reader, like: * <pre> ! * new InputStreamReader (connection.getInputStream (), charset) * </pre> ! 
* It differs from the above, in three ways: ! * <ul> ! * <li>the fetching of bytes may be asynchronous</li> * <li>the character set may be changed, which resets the input stream</li> ! * <li>characters may be requested more than once, so in general they will be buffered</li> ! * </ul> */ ! public abstract class Source extends Reader *************** *** 55,258 **** { /** ! * An initial buffer size. ! */ ! public static int BUFFER_SIZE = 16384; ! ! /** ! * Return value when no more characters are left. */ public static final int EOF = -1; /** - * The stream of bytes. - */ - protected transient InputStream mStream; - - /** - * The character set in use. - */ - protected String mEncoding; - - /** - * The converter from bytes to characters. - */ - protected transient InputStreamReader mReader; - - /** - * The characters read so far. - */ - public /*volatile*/ char[] mBuffer; - - /** - * The number of valid bytes in the buffer. - */ - public /*volatile*/ int mLevel; - - /** - * The offset of the next byte returned by read(). - */ - public /*volatile*/ int mOffset; - - /** - * The bookmark. - */ - protected int mMark; - - /** - * Create a source of characters using the default character set. - * @param stream The stream of bytes to use. - */ - public Source (InputStream stream) - throws - UnsupportedEncodingException - { - this (stream, null, BUFFER_SIZE); - } - - /** - * Create a source of characters. - * @param stream The stream of bytes to use. - * @param charset The character set used in encoding the stream. - */ - public Source (InputStream stream, String charset) - throws - UnsupportedEncodingException - { - this (stream, charset, BUFFER_SIZE); - } - /** - * Create a source of characters. - * @param stream The stream of bytes to use. - * @param charset The character set used in encoding the stream. 
- */ - public Source (InputStream stream, String charset, int buffer_size) - throws - UnsupportedEncodingException - { - if (null == stream) - stream = new Stream (null); - mStream = stream; - if (null == charset) - { - mReader = new InputStreamReader (stream); - mEncoding = mReader.getEncoding (); - } - else - { - mEncoding = charset; - mReader = new InputStreamReader (stream, charset); - } - mBuffer = new char[buffer_size]; - mLevel = 0; - mOffset = 0; - mMark = -1; - } - - // - // Serialization support - // - - private void writeObject (ObjectOutputStream out) - throws - IOException - { - int offset; - char[] buffer; - - if (null != mStream) - { - // remember the offset, drain the input stream, restore the offset - offset = mOffset; - buffer = new char[4096]; - while (-1 != read (buffer)) - ; - mOffset = offset; - } - - out.defaultWriteObject (); - } - - private void readObject (ObjectInputStream in) - throws - IOException, - ClassNotFoundException - { - in.defaultReadObject (); - if (null != mBuffer) // buffer is null when destroy's been called - // pretend we're open, mStream goes null when exhausted - mStream = new ByteArrayInputStream (new byte[0]); - } - - /** - * Get the input stream being used. - * @return The current input stream. - */ - public InputStream getStream () - { - return (mStream); - } - - /** * Get the encoding being used to convert characters. * @return The current encoding. */ ! public String getEncoding () ! { ! return (mEncoding); ! } /** ! * Fetch more characters from the underlying reader. ! * Has no effect if the underlying reader has been drained. ! * @param min The minimum to read. ! * @exception IOException If the underlying reader read() throws one. */ ! protected void fill (int min) throws ! IOException ! { ! char[] buffer; ! int size; ! int read; ! ! if (null != mReader) // mReader goes null when it's been sucked dry ! { ! size = mBuffer.length - mLevel; // available space ! if (size < min) // oops, better get some buffer space ! 
{ ! // unknown length... keep doubling ! size = mBuffer.length * 2; ! read = mLevel + min; ! if (size < read) // or satisfy min, whichever is greater ! size = read; ! else ! min = size - mLevel; // read the max ! buffer = new char[size]; ! } ! else ! { ! buffer = mBuffer; ! min = size; ! } ! ! // read into the end of the 'new' buffer ! read = mReader.read (buffer, mLevel, min); ! if (-1 == read) ! { ! mReader.close (); ! mReader = null; ! } ! else ! { ! if (mBuffer != buffer) ! { // copy the bytes previously read ! System.arraycopy (mBuffer, 0, buffer, 0, mLevel); ! mBuffer = buffer; ! } ! mLevel += read; ! } ! // todo, should repeat on read shorter than original min ! } ! } // --- 53,84 ---- { /** ! * Return value when the source is exhausted. ! * Has a value of {@value}. */ public static final int EOF = -1; /** * Get the encoding being used to convert characters. * @return The current encoding. */ ! public abstract String getEncoding (); /** ! * Set the encoding to the given character set. ! * If the current encoding is the same as the requested encoding, ! * this method is a no-op. Otherwise any subsequent characters read from ! * this source will have been decoded using the given character set.<p> ! * If characters have already been consumed from this source, it is expected ! * that an exception will be thrown if the characters read so far would ! * be different if the encoding being set was used from the start. ! * @param character_set The character set to use to convert characters. ! * @exception ParserException If a character mismatch occurs between ! * characters already provided and those that would have been returned ! * had the new character set been in effect from the beginning. An ! * exception is also thrown if the character set is not recognized. */ ! public abstract void setEncoding (String character_set) throws ! ParserException; // *************** *** 262,350 **** /** * Does nothing. ! 
* It's supposed to close the stream, but use destroy() instead. * @see #destroy */ ! public void close () throws IOException ! { ! } /** * Read a single character. * This method will block until a character is available, ! * an I/O error occurs, or the end of the stream is reached. * @return The character read, as an integer in the range 0 to 65535 ! * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has ! * been reached * @exception IOException If an I/O error occurs. */ ! public int read () throws IOException ! { ! int ret; ! ! if (mLevel - mOffset < 1) ! { ! if (null == mStream) // mStream goes null on close() ! throw new IOException ("reader is closed"); ! fill (1); ! if (mOffset >= mLevel) ! ret = EOF; ! else ! ret = mBuffer[mOffset++]; ! } ! else ! ret = mBuffer[mOffset++]; ! ! return (ret); ! } /** * Read characters into a portion of an array. This method will block ! * until some input is available, an I/O error occurs, or the end of the ! * stream is reached. * @param cbuf Destination buffer * @param off Offset at which to start storing characters * @param len Maximum number of characters to read ! * @return The number of characters read, or -1 if the end of the ! * stream has been reached * @exception IOException If an I/O error occurs. */ ! public int read (char[] cbuf, int off, int len) throws IOException ! { ! int ret; ! ! if (null == mStream) // mStream goes null on close() ! throw new IOException ("reader is closed"); ! if ((null == cbuf) || (0 > off) || (0 > len)) ! throw new IOException ("illegal argument read (" ! + ((null == cbuf) ? "null" : "cbuf") ! + ", " + off + ", " + len + ")"); ! if (mLevel - mOffset < len) ! fill (len - (mLevel - mOffset)); // minimum to satisfy this request ! if (mOffset >= mLevel) ! ret = EOF; ! else ! { ! ret = Math.min (mLevel - mOffset, len); ! System.arraycopy (mBuffer, mOffset, cbuf, off, ret); ! mOffset += ret; ! } ! ! return (ret); ! } /** * Read characters into an array. 
* This method will block until some input is available, an I/O error occurs, ! * or the end of the stream is reached. * @param cbuf Destination buffer. ! * @return The number of characters read, or -1 if the end of the stream has ! * been reached. * @exception IOException If an I/O error occurs. */ ! public int read (char[] cbuf) throws IOException ! { ! return (read (cbuf, 0, cbuf.length)); ! } /** --- 88,138 ---- /** * Does nothing. ! * It's supposed to close the source, but use {@link #destroy} instead. * @see #destroy */ ! public abstract void close () throws IOException; /** * Read a single character. * This method will block until a character is available, ! * an I/O error occurs, or the source is exhausted. * @return The character read, as an integer in the range 0 to 65535 ! * (<tt>0x00-0xffff</tt>), or {@link #EOF} if the source is exhausted. * @exception IOException If an I/O error occurs. */ ! public abstract int read () throws IOException; /** * Read characters into a portion of an array. This method will block ! * until some input is available, an I/O error occurs, or the source is ! * exhausted. * @param cbuf Destination buffer * @param off Offset at which to start storing characters * @param len Maximum number of characters to read ! * @return The number of characters read, or {@link #EOF} if the esource is ! * exhausted. * @exception IOException If an I/O error occurs. */ ! public abstract int read (char[] cbuf, int off, int len) throws IOException; /** * Read characters into an array. * This method will block until some input is available, an I/O error occurs, ! * or the source is exhausted. * @param cbuf Destination buffer. ! * @return The number of characters read, or {@link #EOF} if the esource is ! * exhausted. * @exception IOException If an I/O error occurs. */ + public abstract int read (char[] cbuf) throws IOException; ! /** ! * Tell whether this source is ready to be read. ! 
* @return <code>true</code> if the next read() is guaranteed not to block ! * for input, <code>false</code> otherwise. ! * Note that returning false does not guarantee that the next read will block. ! * @exception IOException If an I/O error occurs. ! */ ! public abstract boolean ready () throws IOException; /** *************** *** 353,408 **** * @exception IllegalStateException If the source has been closed. */ ! public void reset () ! { ! if (null == mStream) // mStream goes null on close() ! throw new IllegalStateException ("source is closed"); ! if (-1 != mMark) ! mOffset = mMark; ! else ! mOffset = 0; ! } ! ! /** ! * Tell whether this stream supports the mark() operation. ! * @return <code>true</code> if and only if this stream supports the mark operation. ! */ ! public boolean markSupported () ! { ! return (true); ! } /** ! * Mark the present position in the stream. Subsequent calls to reset() ! * will attempt to reposition the stream to this point. Not all ! * character-input streams support the mark() operation. ! * @param readAheadLimit <em>Not used.</em> ! * @exception IOException <em>Never thrown</em>. ! * */ ! public void mark (int readAheadLimit) throws IOException ! { ! if (null == mStream) // mStream goes null on close() ! throw new IOException ("reader is closed"); ! mMark = mOffset; ! } /** ! * Tell whether this stream is ready to be read. ! * @return <code>true</code> if the next read() is guaranteed not to block ! * for input, <code>false</code> otherwise. ! * Note that returning false does not guarantee that the next read will block. ! * @exception IOException <em>Never thrown</em>. */ ! public boolean ready () throws IOException ! { ! if (null == mStream) // mStream goes null on close() ! throw new IOException ("reader is closed"); ! return (mOffset < mLevel); ! } /** * Skip characters. * This method will block until some characters are available, ! * an I/O error occurs, or the end of the stream is reached. 
* <em>Note: n is treated as an int</em> * @param n The number of characters to skip. --- 141,168 ---- * @exception IllegalStateException If the source has been closed. */ ! public abstract void reset (); /** ! * Tell whether this source supports the mark() operation. ! * @return <code>true</code> if and only if this source supports the mark ! * operation. */ ! public abstract boolean markSupported (); /** ! * Mark the present position. ! * Subsequent calls to {@link #reset} ! * will attempt to reposition the source to this point. Not all ! * sources support the mark() operation. ! * @param readAheadLimit The minimum number of characters that can be read ! * before this mark becomes invalid. ! * @exception IOException If an I/O error occurs. */ ! public abstract void mark (int readAheadLimit) throws IOException; /** * Skip characters. * This method will block until some characters are available, ! * an I/O error occurs, or the source is exhausted. * <em>Note: n is treated as an int</em> * @param n The number of characters to skip. *************** *** 411,432 **** * @exception IOException If an I/O error occurs. */ ! public long skip (long n) throws IOException ! { ! long ret; ! ! if (null == mStream) // mStream goes null on close() ! throw new IOException ("reader is closed"); ! if (mLevel - mOffset < n) ! fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request ! if (mOffset >= mLevel) ! ret = EOF; ! else ! { ! ret = Math.min (mLevel - mOffset, n); ! mOffset += ret; ! } ! ! return (ret); ! } // --- 171,175 ---- * @exception IOException If an I/O error occurs. */ ! public abstract long skip (long n) throws IOException; // *************** *** 436,475 **** /** * Undo the read of a single character. ! * @exception IOException If no characters have been read. */ ! public void unread () throws IOException ! { ! if (0 < mOffset) ! mOffset--; ! else ! throw new IOException ("can't unread no characters"); ! } /** ! * Close the stream. 
Once a stream has been closed, further read(), ! * ready(), mark(), or reset() invocations will throw an IOException. ! * Closing a previously-closed stream, however, has no effect. ! * @exception IOException If an I/O error occurs */ ! public void destroy () throws IOException ! { ! mStream = null; ! if (null != mReader) ! mReader.close (); ! mReader = null; ! mBuffer = null; ! mLevel = 0; ! mOffset = 0; ! mMark = -1; ! } /** * Get the position (in characters). ! * @return The number of characters that have been read. */ ! public int offset () ! { ! return (mOffset); ! } /** --- 179,247 ---- /** * Undo the read of a single character. ! * @exception IOException If the source is closed or no characters have ! * been read. */ ! public abstract void unread () throws IOException; /** ! * Retrieve a character again. ! * @param offset The offset of the character. ! * @return The character at <code>offset</code>. ! * @exception IOException If the source is closed or the offset is beyond ! * {@link #offset()}. */ ! public abstract char getCharacter (int offset) throws IOException; ! ! /** ! * Retrieve characters again. ! * @param array The array of characters. ! * @param offset The starting position in the array where characters are to be placed. ! * @param start The starting position, zero based. ! * @param end The ending position ! * (exclusive, i.e. the character at the ending position is not included), ! * zero based. ! * @exception IOException If the source is closed or the start or end is ! * beyond {@link #offset()}. ! */ ! public abstract void getCharacters (char[] array, int offset, int start, int end) throws IOException; ! ! /** ! * Retrieve a string comprised of characters already read. ! * @param offset The offset of the first character. ! * @param length The number of characters to retrieve. ! * @return A string containing the <code>length</code> characters at <code>offset</code>. ! * @exception IOException If the source is closed. ! */ ! 
public abstract String getString (int offset, int length) throws IOException; ! ! /** ! * Append characters already read into a <code>StringBuffer</code>. ! * @param buffer The buffer to append to. ! * @param offset The offset of the first character. ! * @param length The number of characters to retrieve. ! * @return A string containing the <code>length</code> characters at <code>offset</code>. ! * @exception IOException If the source is closed or the offset or ! * (offset + length) is beyond {@link #offset()}. ! */ ! public abstract void getCharacters (StringBuffer buffer, int offset, int length) throws IOException; ! ! /** ! * Close the source. ! * Once a source has been closed, further {@link #read() read}, ! * {@link #ready ready}, {@link #mark mark}, {@link #reset reset}, ! * {@link #skip skip}, {@link #unread unread}, ! * {@link #getCharacter getCharacter} or {@link #getString getString} ! * invocations will throw an IOException. ! * Closing a previously-closed source, however, has no effect. ! * @exception IOException If an I/O error occurs. ! */ ! public abstract void destroy () throws IOException; /** * Get the position (in characters). ! * @return The number of characters that have already been read, or ! * {@link #EOF} if the source is closed. */ ! public abstract int offset (); /** *************** *** 477,483 **** * @return The number of characters that can be read without blocking. */ ! public int available () ! { ! return (mLevel - mOffset); ! } } --- 249,252 ---- * @return The number of characters that can be read without blocking. */ ! 
public abstract int available (); } --- NEW FILE: InputStreamSource.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/InputStreamSource.java,v $ // $Author: derrickoswald $ // $Date: 2004/07/03 13:56:08 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.lexer; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Reader; import java.io.Serializable; import java.io.UnsupportedEncodingException; import org.htmlparser.util.EncodingChangeException; import org.htmlparser.util.ParserException; /** * A source of characters based on an InputStream such as from a URLConnection. */ public class InputStreamSource extends Source { /** * An initial buffer size. * Has a default value of 16384. */ public static int BUFFER_SIZE = 16384; /** * The stream of bytes. * Set to <code>null</code> when the source is closed. */ protected transient InputStream mStream; /** * The character set in use. 
*/ protected String mEncoding; /** * The converter from bytes to characters. */ protected transient InputStreamReader mReader; /** * The characters read so far. */ public /*volatile*/ char[] mBuffer; /** * The number of valid bytes in the buffer. */ public /*volatile*/ int mLevel; /** * The offset of the next byte returned by read(). */ public /*volatile*/ int mOffset; /** * The bookmark. */ protected int mMark; /** * Create a source of characters using the default character set. * @param stream The stream of bytes to use. * @exception UnsupportedEncodingException If the default character set is unsupported. */ public InputStreamSource (InputStream stream) throws UnsupportedEncodingException { this (stream, null, BUFFER_SIZE); } /** * Create a source of characters. * @param stream The stream of bytes to use. * @param charset The character set used in encoding the stream. * @exception UnsupportedEncodingException If the character set is unsupported. */ public InputStreamSource (InputStream stream, String charset) throws UnsupportedEncodingException { this (stream, charset, BUFFER_SIZE); } /** * Create a source of characters. * @param stream The stream of bytes to use. * @param charset The character set used in encoding the stream. * @param buffer_size The initial character buffer size. * @exception UnsupportedEncodingException If the character set is unsupported. 
*/ public InputStreamSource (InputStream stream, String charset, int buffer_size) throws UnsupportedEncodingException { if (null == stream) stream = new Stream (null); mStream = stream; if (null == charset) { mReader = new InputStreamReader (stream); mEncoding = mReader.getEncoding (); } else { mEncoding = charset; mReader = new InputStreamReader (stream, charset); } mBuffer = new char[buffer_size]; mLevel = 0; mOffset = 0; mMark = -1; } // // Serialization support // private void writeObject (ObjectOutputStream out) throws IOException { int offset; char[] buffer; if (null != mStream) { // remember the offset, drain the input stream, restore the offset offset = mOffset; buffer = new char[4096]; while (EOF != read (buffer)) ; mOffset = offset; } out.defaultWriteObject (); } private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject (); if (null != mBuffer) // buffer is null when destroy's been called // pretend we're open, mStream goes null when exhausted mStream = new ByteArrayInputStream (new byte[0]); } /** * Get the input stream being used. * @return The current input stream. */ public InputStream getStream () { return (mStream); } /** * Get the encoding being used to convert characters. * @return The current encoding. */ public String getEncoding () { return (mEncoding); } /** * Begins reading from the source with the given character set. * If the current encoding is the same as the requested encoding, * this method is a no-op. Otherwise any subsequent characters read from * this page will have been decoded using the given character set.<p> * Some magic happens here to obtain this result if characters have already * been consumed from this source. * Since a Reader cannot be dynamically altered to use a different character * set, the underlying stream is reset, a new Source is constructed * and a comparison made of the characters read so far with the newly * read characters up to the current position. 
* If a difference is encountered, or some other problem occurs, * an exception is thrown. * @param character_set The character set to use to convert bytes into * characters. * @exception ParserException If a character mismatch occurs between * characters already provided and those that would have been returned * had the new character set been in effect from the beginning. An * exception is also thrown if the underlying stream won't put up with * these shenanigans. */ public void setEncoding (String character_set) throws ParserException { String encoding; InputStream stream; char[] buffer; int offset; char[] new_chars; encoding = getEncoding (); if (!encoding.equalsIgnoreCase (character_set)) { stream = getStream (); try { buffer = mBuffer; offset = mOffset; stream.reset (); mEncoding = character_set; mReader = new InputStreamReader (stream, character_set); mBuffer = new char[mBuffer.length]; mLevel = 0; mOffset = 0; mMark = -1; if (0 != offset) { new_chars = new char[offset]; if (offset != read (new_chars)) throw new ParserException ("reset stream failed"); for (int i = 0; i < offset; i++) if (new_chars[i] != buffer[i]) throw new EncodingChangeException ("character mismatch (new: " + new_chars[i] + " != old: " + buffer[i] + ") for encoding change from " + encoding + " to " + character_set + " at character offset " + offset); } } catch (IOException ioe) { throw new ParserException (ioe.getMessage (), ioe); } } } /** * Fetch more characters from the underlying reader. * Has no effect if the underlying reader has been drained. * @param min The minimum to read. * @exception IOException If the underlying reader read() throws one. */ protected void fill (int min) throws IOException { char[] buffer; int size; int read; if (null != mReader) // mReader goes null when it's been sucked dry { size = mBuffer.length - mLevel; // available space if (size < min) // oops, better get some buffer space { // unknown length... 
keep doubling size = mBuffer.length * 2; read = mLevel + min; if (size < read) // or satisfy min, whichever is greater size = read; else min = size - mLevel; // read the max buffer = new char[size]; } else { buffer = mBuffer; min = size; } // read into the end of the 'new' buffer read = mReader.read (buffer, mLevel, min); if (EOF == read) { mReader.close (); mReader = null; } else { if (mBuffer != buffer) { // copy the bytes previously read System.arraycopy (mBuffer, 0, buffer, 0, mLevel); mBuffer = buffer; } mLevel += read; } // todo, should repeat on read shorter than original min } } // // Reader overrides // /** * Does nothing. * It's supposed to close the source, but use destroy() instead. * @see #destroy */ public void close () throws IOException { } /** * Read a single character. * This method will block until a character is available, * an I/O error occurs, or the end of the stream is reached. * @return The character read, as an integer in the range 0 to 65535 * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the stream has * been reached * @exception IOException If an I/O error occurs. */ public int read () throws IOException { int ret; if (mLevel - mOffset < 1) { if (null == mStream) throw new IOException ("source is closed"); fill (1); if (mOffset >= mLevel) ret = EOF; else ret = mBuffer[mOffset++]; } else ret = mBuffer[mOffset++]; return (ret); } /** * Read characters into a portion of an array. This method will block * until some input is available, an I/O error occurs, or the end of the * stream is reached. * @param cbuf Destination buffer * @param off Offset at which to start storing characters * @param len Maximum number of characters to read * @return The number of characters read, or {@link #EOF EOF} if the end of * the stream has been reached * @exception IOException If an I/O error occurs. 
*/ public int read (char[] cbuf, int off, int len) throws IOException { int ret; if (null == mStream) throw new IOException ("source is closed"); if ((null == cbuf) || (0 > off) || (0 > len)) throw new IOException ("illegal argument read (" + ((null == cbuf) ? "null" : "cbuf") + ", " + off + ", " + len + ")"); if (mLevel - mOffset < len) fill (len - (mLevel - mOffset)); // minimum to satisfy this request if (mOffset >= mLevel) ret = EOF; else { ret = Math.min (mLevel - mOffset, len); System.arraycopy (mBuffer, mOffset, cbuf, off, ret); mOffset += ret; } return (ret); } /** * Read characters into an array. * This method will block until some input is available, an I/O error occurs, * or the end of the stream is reached. * @param cbuf Destination buffer. * @return The number of characters read, or {@link #EOF EOF} if the end of * the stream has been reached. * @exception IOException If an I/O error occurs. */ public int read (char[] cbuf) throws IOException { return (read (cbuf, 0, cbuf.length)); } /** * Reset the source. * Repositions the read point to begin at zero. * @exception IllegalStateException If the source has been closed. */ public void reset () { if (null == mStream) throw new IllegalStateException ("source is closed"); if (-1 != mMark) mOffset = mMark; else mOffset = 0; } /** * Tell whether this source supports the mark() operation. * @return <code>true</code>. */ public boolean markSupported () { return (true); } /** * Mark the present position in the source. * Subsequent calls to {@link #reset()} * will attempt to reposition the source to this point. * @param readAheadLimit <em>Not used.</em> * @exception IOException If the source is closed. * */ public void mark (int readAheadLimit) throws IOException { if (null == mStream) throw new IOException ("source is closed"); mMark = mOffset; } /** * Tell whether this source is ready to be read. * @return <code>true</code> if the next read() is guaranteed not to block * for input, <code>false</code> otherwise. 
* Note that returning false does not guarantee that the next read will block. * @exception IOException If the source is closed. */ public boolean ready () throws IOException { if (null == mStream) throw new IOException ("source is closed"); return (mOffset < mLevel); } /** * Skip characters. * This method will block until some characters are available, * an I/O error occurs, or the end of the stream is reached. * <em>Note: n is treated as an int</em> * @param n The number of characters to skip. * @return The number of characters actually skipped * @exception IllegalArgumentException If <code>n</code> is negative. * @exception IOException If an I/O error occurs. */ public long skip (long n) throws IOException { long ret; if (null == mStream) throw new IOException ("source is closed"); if (mLevel - mOffset < n) fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request if (mOffset >= mLevel) ret = EOF; else { ret = Math.min (mLevel - mOffset, n); mOffset += ret; } return (ret); } // // Methods not in your Daddy's Reader // /** * Undo the read of a single character. * @exception IOException If the source is closed or no characters have * been read. */ public void unread () throws IOException { if (null == mStream) throw new IOException ("source is closed"); if (0 < mOffset) mOffset--; else throw new IOException ("can't unread no characters"); } /** * Retrieve a character again. * @param offset The offset of the character. * @return The character at <code>offset</code>. * @exception IOException If the offset is beyond {@link #offset()} or the * source is closed. */ public char getCharacter (int offset) throws IOException { char ret; if (null == mStream) throw new IOException ("source is closed"); if (offset >= mBuffer.length) throw new IOException ("illegal read ahead"); else ret = mBuffer[offset]; return (ret); } /** * Retrieve characters again. * @param array The array of characters. 
* @param offset The starting position in the array where characters are to be placed. * @param start The starting position, zero based. * @param end The ending position * (exclusive, i.e. the character at the ending position is not included), * zero based. * @exception IOException If the start or end is beyond {@link #offset()} * or the source is closed. */ public void getCharacters (char[] array, int offset, int start, int end) throws IOException { if (null == mStream) throw new IOException ("source is closed"); System.arraycopy (mBuffer, start, array, offset, end - start); } /** * Retrieve a string. * @param offset The offset of the first character. * @param length The number of characters to retrieve. * @return A string containing the <code>length</code> characters at <code>offset</code>. * @exception IOException If the offset or (offset + length) is beyond * {@link #offset()} or the source is closed. */ public String getString (int offset, int length) throws IOException { String ret; if (null == mStream) throw new IOException ("source is closed"); if (offset + length >= mBuffer.length) throw new IOException ("illegal read ahead"); else ret = new String (mBuffer, offset, length); return (ret); } /** * Append characters already read into a <code>StringBuffer</code>. * @param buffer The buffer to append to. * @param offset The offset of the first character. * @param length The number of characters to retrieve. * @return A string containing the <code>length</code> characters at <code>offset</code>. * @exception IOException If the offset or (offset + length) is beyond * {@link #offset()} or the source is closed. */ public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException { if (null == mStream) throw new IOException ("source is closed"); buffer.append (mBuffer, offset, length); } /** * Close the source. 
* Once a source has been closed, further {@link #read() read}, * {@link #ready ready}, {@link #mark mark}, {@link #reset reset}, * {@link #skip skip}, {@link #unread unread}, * {@link #getCharacter getCharacter} or {@link #getString getString} * invocations will throw an IOException. * Closing a previously-closed source, however, has no effect. * @exception IOException If an I/O error occurs */ public void destroy () throws IOException { mStream = null; if (null != mReader) mReader.close (); mReader = null; mBuffer = null; mLevel = 0; mOffset = 0; mMark = -1; } /** * Get the position (in characters). * @return The number of characters that have already been read, or * {@link #EOF EOF} if the source is closed. */ public int offset () { int ret; if (null == mStream) ret = EOF; else ret = mOffset; return (ret); } /** * Get the number of available characters. * @return The number of characters that can be read without blocking or * zero if the source is closed. */ public int available () { int ret; if (null == mStream) ret = 0; else ret = mLevel - mOffset; return (ret); } } --- NEW FILE: StringSource.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/StringSource.java,v $ // $Author: derrickoswald $ // $Date: 2004/07/03 13:56:08 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.lexer; import java.io.IOException; import org.htmlparser.util.ParserException; /** * A source of characters based on a String. */ public class StringSource extends Source { /** * The source of characters. */ protected String mString; /** * The current offset into the string. */ protected int mOffset; /** * The encoding to report. * Only used by {@link #getEncoding}. */ protected String mEncoding; /** * The bookmark. */ protected int mMark; /** * Construct a source using the provided string. * Until it is set, the encoding will be reported as ISO-8859-1. * @param string The source of characters. */ public StringSource (String string) { this (string, "ISO-8859-1"); } /** * Construct a source using the provided string and encoding. * The encoding is only used by {@link #getEncoding}. * @param string The source of characters. * @param character_set The encoding to report. */ public StringSource (String string, String character_set) { mString = (null == string) ? "" : string; mOffset = 0; mEncoding = character_set; mMark = -1; } /** * Get the encoding being used to convert characters. * @return The current encoding. */ public String getEncoding () { return (mEncoding); } /** * Set the encoding to the given character set. * This simply sets the encoding reported by {@link #getEncoding}. * @param character_set The character set to use to convert characters. * @exception ParserException <em>Not thrown</em>. */ public void setEncoding (String character_set) throws ParserException { mEncoding = character_set; } // // Reader overrides // /** * Does nothing. * It's supposed to close the source, but use destroy() instead. * @see #destroy */ public void close () throws IOException { } /** * Read a single character. 
* @return The character read, as an integer in the range 0 to 65535 * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the source is exhausted. * @exception IOException If an I/O error occurs. */ public int read () throws IOException { int ret; if (null == mString) throw new IOException ("source is closed"); else if (mOffset >= mString.length ()) ret = EOF; else { ret = mString.charAt (mOffset); mOffset++; } return (ret); } /** * Read characters into a portion of an array. * @param cbuf Destination buffer * @param off Offset at which to start storing characters * @param len Maximum number of characters to read * @return The number of characters read, or {@link #EOF EOF} if the source * is exhausted. * @exception IOException If an I/O error occurs. */ public int read (char[] cbuf, int off, int len) throws IOException { int length; int ret; if (null == mString) throw new IOException ("source is closed"); else { length = mString.length (); if (mOffset >= length) ret = EOF; else { if (len > length - mOffset) len = length - mOffset; mString.getChars (mOffset, mOffset + len, cbuf, off); mOffset += len; ret = len; } } return (ret); } /** * Read characters into an array. * @param cbuf Destination buffer. * @return The number of characters read, or {@link #EOF EOF} if the source * is exhausted. * @exception IOException If an I/O error occurs. */ public int read (char[] cbuf) throws IOException { return (read (cbuf, 0, cbuf.length)); } /** * Tell whether this source is ready to be read. * @return Equivalent to a non-zero {@link #available()}, i.e. there are * still more characters to read. * @exception IOException Thrown if the source is closed. */ public boolean ready () throws IOException { if (null == mString) throw new IOException ("source is closed"); return (mOffset < mString.length ()); } /** * Reset the source. * Repositions the read point to begin at zero. * @exception IllegalStateException If the source has been closed. 
*/ public void reset () { if (null == mString) throw new IllegalStateException ("source is closed"); else if (-1 != mMark) mOffset = mMark; else mOffset = 0; } /** * Tell whether this source supports the mark() operation. * @return <code>true</code>. */ public boolean markSupported () { return (true); } /** * Mark the present position in the source. * Subsequent calls to {@link #reset()} * will attempt to reposition the source to this point. * @param readAheadLimit <em>Not used.</em> * @exception IOException Thrown if the source is closed. * */ public void mark (int readAheadLimit) throws IOException { if (null == mString) throw new IOException ("source is closed"); mMark = mOffset; } /** * Skip characters. * <em>Note: n is treated as an int</em> * @param n The number of characters to skip. * @return The number of characters actually skipped * @exception IllegalArgumentException If <code>n</code> is negative. * @exception IOException If the source is closed. */ public long skip (long n) throws IOException { int length; long ret; if (null == mString) throw new IOException ("source is closed"); if (n < 0) throw new IllegalArgumentException ("cannot skip backwards"); else { length = mString.length (); if (mOffset >= length) n = 0L; else if (n > length - mOffset) n = length - mOffset; mOffset += n; ret = n; } return (ret); } // // Methods not in your Daddy's Reader // /** * Undo the read of a single character. * @exception IOException If no characters have been read or the source is closed. */ public void unread () throws IOException { if (null == mString) throw new IOException ("source is closed"); else if (mOffset <= 0) throw new IOException ("can't unread no characters"); else mOffset--; } /** * Retrieve a character again. * @param offset The offset of the character. * @return The character at <code>offset</code>. * @exception IOException If the source is closed or an attempt is made to * read beyond {@link #offset()}. 
*/ public char getCharacter (int offset) throws IOException { char ret; if (null == mString) throw new IOException ("source is closed"); else if (offset >= mOffset) throw new IOException ("read beyond current offset"); else ret = mString.charAt (offset); return (ret); } /** * Retrieve characters again. * @param array The array of characters. * @param offset The starting position in the array where characters are to be placed. * @param start The starting position, zero based. * @param end The ending position * (exclusive, i.e. the character at the ending position is not included), * zero based. * @exception IOException If the source is closed or an attempt is made to * read beyond {@link #offset()}. */ public void getCharacters (char[] array, int offset, int start, int end) throws IOException { if (null == mString) throw new IOException ("source is closed"); else { if (end > mOffset) throw new IOException ("read beyond current offset"); else mString.getChars (start, end, array, offset); } } /** * Retrieve a string comprised of characters already read. * Asking for characters ahead of {@link #offset()} will throw an exception. * @param offset The offset of the first character. * @param length The number of characters to retrieve. * @return A string containing the <code>length</code> characters at <code>offset</code>. * @exception IOException If the source is closed or an attempt is made to * read beyond {@link #offset()}. */ public String getString (int offset, int length) throws IOException { String ret; if (null == mString) throw new IOException ("source is closed"); else { if (offset + length > mOffset) throw new IOException ("read beyond end of string"); else ret = mString.substring (offset, offset + length); } return (ret); } /** * Append characters already read into a <code>StringBuffer</code>. * Asking for characters ahead of {@link #offset()} will throw an exception. * @param buffer The buffer to append to. * @param offset The offset of the first character. 
* @param length The number of characters to retrieve. * @return A string containing the <code>length</code> characters at <code>offset</code>. * @exception IOException If the source is closed or an attempt is made to * read beyond {@link #offset()}. */ public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException { if (null == mString) throw new IOException ("source is closed"); else { if (offset + length > mOffset) throw new IOException ("read beyond end of string"); else buffer.append (mString.substring (offset, offset + length)); } } /** * Close the source. * Once a source has been closed, further {@link #read() read}, * {@link #ready ready}, {@link #mark mark}, {@link #reset reset}, * {@link #skip skip}, {@link #unread unread}, * {@link #getCharacter getCharacter} or {@link #getString getString} * invocations will throw an IOException. * Closing a previously-closed source, however, has no effect. * @exception IOException <em>Not thrown</em> */ public void destroy () throws IOException { mString = null; } /** * Get the position (in characters). * @return The number of characters that have already been read, or * {@link #EOF EOF} if the source is closed. */ public int offset () { int ret; if (null == mString) ret = EOF; else ret = mOffset; return (ret); } /** * Get the number of available characters. * @return The number of characters that can be read or zero if the source * is closed. 
*/
    public int available ()
    {
        // A closed source has nothing available; otherwise report what is left of the string.
        return ((null == mString) ? 0 : (mString.length () - mOffset));
    }
}

Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** Page.java 8 Jun 2004 10:20:18 -0000 1.37 --- Page.java 3 Jul 2004 13:56:08 -0000 1.38 *************** *** 27,31 **** package org.htmlparser.lexer; - import java.io.ByteArrayInputStream; import java.io.InputStream; import java.io.IOException; --- 27,30 ---- *************** *** 43,47 **** import java.util.zip.InflaterInputStream; - import org.htmlparser.util.EncodingChangeException; import org.htmlparser.util.ParserException; --- 42,45 ---- *************** *** 57,61 **** /** * The default charset. ! * This should be <code>ISO-8859-1</code>, * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1 * Another alias is "8859_1". --- 55,59 ---- /** * The default charset. ! * This should be <code>{@value}</code>, * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1 * Another alias is "8859_1". *************** *** 65,69 **** /** * The default content type. ! * In the absence of alternate information, assume html content. */ public static final String DEFAULT_CONTENT_TYPE = "text/html"; --- 63,67 ---- /** * The default content type. ! * In the absence of alternate information, assume html content ({@value}). */ public static final String DEFAULT_CONTENT_TYPE = "text/html"; *************** *** 155,159 **** if (null == charset) charset = DEFAULT_CHARSET; ! mSource = new Source (stream, charset); mIndex = new PageIndex (this); mConnection = null; --- 153,157 ---- if (null == charset) charset = DEFAULT_CHARSET; ! mSource = new InputStreamSource (stream, charset); mIndex = new PageIndex (this); mConnection = null; *************** *** 162,166 **** } !
public Page (String text) { InputStream stream; --- 160,171 ---- } ! /** ! * Construct a page from the given string. ! * @param text The HTML text. ! * @param charset <em>Optional</em>. The character set encoding that will ! * be reported by {@link #getEncoding}. If charset is <code>null</code> ! * the default character set is used. ! */ ! public Page (String text, String charset) { InputStream stream; *************** *** 168,182 **** if (null == text) throw new IllegalArgumentException ("text cannot be null"); ! try ! { ! stream = new ByteArrayInputStream (text.getBytes (Page.DEFAULT_CHARSET)); ! mSource = new Source (stream, Page.DEFAULT_CHARSET, text.length () + 1); ! mIndex = new PageIndex (this); ! } ! catch (UnsupportedEncodingException uee) ! { ! // this is unlikely, so we cover it up with a runtime exception ! throw new IllegalStateException (uee.getMessage ()); ! } mConnection = null; mUrl = null; --- 173,180 ---- if (null == text) throw new IllegalArgumentException ("text cannot be null"); ! if (null == charset) ! charset = DEFAULT_CHARSET; ! mSource = new StringSource (text, charset); ! mIndex = new PageIndex (this); mConnection = null; mUrl = null; *************** *** 184,187 **** --- 182,196 ---- } + /** + * Construct a page from the given string. + * The page will report that it is using an encoding of + * {@link #DEFAULT_CHARSET}. + * @param text The HTML text. + */ + public Page (String text) + { + this (text, null); + } + // // Serialization support *************** *** 369,373 **** try { ! mSource = new Source (stream, charset); } catch (UnsupportedEncodingException uee) --- 378,382 ---- try { ! mSource = new InputStreamSource (stream, charset); } catch (UnsupportedEncodingException uee) *************** *** 383,387 **** ... [truncated message content] |