[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Page.java,NONE,1.1 Source.java,NONE,1.1 Stream.
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-07-17 01:42:21
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv31631/org/htmlparser/lexer Added Files: Page.java Source.java Stream.java Log Message: Initial drop of new io subsystem. --- NEW FILE: Page.java --- // HTMLParser Library v1_4_20030525 - A java-based parser for HTML // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // For any questions or suggestions, you can write to me at : // Email :so...@in... // // Postal Address : // Somik Raha // Extreme Programmer & Coach // Industrial Logic Corporation // 2583 Cedar Street, Berkeley, // CA 94708, USA // Website : http://www.industriallogic.com package org.htmlparser.lexer; import java.io.IOException; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.net.URLConnection; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Represents the contents of an HTML page. * Contains a character array of the page downloaded so far, * a String with those characters in it, * and an index of positions of line separators (actually the first * character position on the next line). 
*/
public class Page
{
    /**
     * The default charset.
     * This should be <code>ISO-8859-1</code>,
     * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1
     * Another alias is "8859_1".
     */
    public static final String DEFAULT_CHARSET = "ISO-8859-1";

    /**
     * The logging object.
     * Created on first use by {@link #getLog}.
     */
    protected static Log mLog = null;

    /**
     * The source of characters.
     */
    protected Source mSource;

    /**
     * The characters read so far from the source.
     */
    protected char[] mCharacters;

    /**
     * The string representation of the source.
     */
    protected String mString;

    /**
     * Character positions of the first character in each line.
     */
    protected int mIndex[];

    /**
     * The index position to be used next.
     */
    protected int mIndexLength;

    /**
     * Messages for page not there (404).
     */
    private String[] mFourOhFour =
    {
        "The web site you seek cannot be located, but countless more exist",
        "You step in the stream, but the water has moved on. This page is not here.",
        "Yesterday the page existed. Today it does not. The internet is like that.",
        "That page was so big. It might have been very useful. But now it is gone.",
        "Three things are certain: death, taxes and broken links. Guess which has occured.",
        "Chaos reigns within. Reflect, repent and enter the correct URL. Order shall return.",
        "Stay the patient course. Of little worth is your ire. The page is not found.",
        "A non-existant URL reduces your expensive computer to a simple stone.",
        "Many people have visited that page. Today, you are not one of the lucky ones.",
        "Cutting the wind with a knife. Bookmarking a URL. Both are ephemeral.",
    };

    /**
     * Construct a page reading from a URL.
     * @param connection A fully conditioned connection. The connect()
     * method will be called so it need not be connected yet.
     * @exception IllegalArgumentException If <code>connection</code> is
     * <code>null</code>.
     * @exception IOException If an i/o exception occurs creating the
     * source.
     * @exception UnsupportedEncodingException if the character set specified in the
     * HTTP header is not supported.
     */
    public Page (URLConnection connection)
        throws
            IOException,
            UnsupportedEncodingException
    {
        if (null == connection)
            throw new IllegalArgumentException ("connection cannot be null");
        connection.connect ();
        mSource = new Source (new Stream (connection.getInputStream ()), getCharacterSet (connection));
        mCharacters = null;
        mString = null;
        mIndex = null;
        mIndexLength = 0;
    }

    /**
     * Try and extract the character set from the HTTP header.
     * @param connection The connection with the charset info.
     * @return The character set name to use for this HTML page.
     * This is {@link #DEFAULT_CHARSET} when the connection has no
     * <code>Content-Type</code> header field.
     */
    protected String getCharacterSet (URLConnection connection)
    {
        final String CONTENT_TYPE_STRING = "Content-Type";
        String string;
        String ret;

        ret = DEFAULT_CHARSET;
        string = connection.getHeaderField (CONTENT_TYPE_STRING);
        if (null != string)
            ret = getCharset (string);

        return (ret);
    }

    /**
     * Get a CharacterSet name corresponding to a charset parameter.
     * @param content A text line of the form:
     * <pre>
     * text/html; charset=Shift_JIS
     * </pre>
     * which is applicable both to the HTTP header field Content-Type and
     * the meta tag http-equiv="Content-Type".
     * The parameter name is matched without regard to case, so
     * <code>Charset=</code> and <code>CHARSET=</code> are recognized too.
     * Note this method also handles non-compliant quoted charset directives such as:
     * <pre>
     * text/html; charset="UTF-8"
     * </pre>
     * and
     * <pre>
     * text/html; charset='UTF-8'
     * </pre>
     * @return The character set name to use when reading the input stream.
     * For JDKs that have the Charset class this is qualified by passing
     * the name to findCharset() to render it into canonical form.
     * If the charset parameter is not found in the given string, the default
     * character set is returned.
     * @see #findCharset
     * @see #DEFAULT_CHARSET
     */
    protected String getCharset (String content)
    {
        final String CHARSET_STRING = "charset";
        int index;
        String ret;

        ret = DEFAULT_CHARSET;
        if (null != content)
        {
            // parameter attribute names are case-insensitive
            index = indexOfIgnoreCase (content, CHARSET_STRING);
            if (index != -1)
            {
                content = content.substring (index + CHARSET_STRING.length ()).trim ();
                if (content.startsWith ("="))
                {
                    content = content.substring (1).trim ();
                    index = content.indexOf (";");
                    if (index != -1)
                        // trim again, whitespace may precede the ';'
                        content = content.substring (0, index).trim ();

                    //remove any double quotes from around charset string
                    if (content.startsWith ("\"") && content.endsWith ("\"") && (1 < content.length ()))
                        content = content.substring (1, content.length () - 1);

                    //remove any single quote from around charset string
                    if (content.startsWith ("'") && content.endsWith ("'") && (1 < content.length ()))
                        content = content.substring (1, content.length () - 1);

                    ret = findCharset (content, ret);
                    // Charset names are not case-sensitive;
                    // that is, case is always ignored when comparing charset names.
                    if (!ret.equalsIgnoreCase (content))
                    {
                        getLog ().info (
                            "detected charset \"" + content + "\", using \"" + ret + "\"");
                    }
                }
            }
        }

        return (ret);
    }

    /**
     * Find the first occurrence of a token in a string, ignoring case.
     * Uses regionMatches() rather than lower-casing the whole string so
     * that the returned index is always valid for the original text.
     * @param text The string to search.
     * @param token The token to look for.
     * @return The index of the first match, or -1 if there is none.
     */
    private static int indexOfIgnoreCase (String text, String token)
    {
        int length;
        int last;

        length = token.length ();
        last = text.length () - length;
        for (int i = 0; i <= last; i++)
            if (text.regionMatches (true, i, token, 0, length))
                return (i);

        return (-1);
    }

    /**
     * Lookup a character set name.
     * <em>Vacuous for JVM's without <code>java.nio.charset</code>.</em>
     * This uses reflection so the code will still run under prior JDK's but
     * in that case the given name is returned unchanged.
     * @param name The name to look up. One of the aliases for a character set.
     * @param _default The name to return if the lookup fails.
     * @return The canonical name reported by
     * <code>java.nio.charset.Charset</code>, the unchanged <code>name</code>
     * if the reflective lookup itself is not possible, or
     * <code>_default</code> if the Charset lookup throws.
     */
    public String findCharset (String name, String _default)
    {
        String ret;

        try
        {
            Class cls;
            Method method;
            Object object;

            // equivalent to: Charset.forName (name).name ()
            cls = Class.forName ("java.nio.charset.Charset");
            method = cls.getMethod ("forName", new Class[] { String.class });
            object = method.invoke (null, new Object[] { name });
            method = cls.getMethod ("name", new Class[] { });
            object = method.invoke (object, new Object[] { });
            ret = (String)object;
        }
        catch (ClassNotFoundException cnfe)
        {
            // for reflection exceptions, assume the name is correct
            ret = name;
        }
        catch (NoSuchMethodException nsme)
        {
            // for reflection exceptions, assume the name is correct
            ret = name;
        }
        catch (IllegalAccessException ia)
        {
            // for reflection exceptions, assume the name is correct
            ret = name;
        }
        catch (InvocationTargetException ita)
        {
            // java.nio.charset.IllegalCharsetNameException
            // and java.nio.charset.UnsupportedCharsetException
            // return the default
            ret = _default;
            getLog ().debug (
                "unable to determine cannonical charset name for "
                + name
                + " - using "
                + _default,
                ita);
        }

        return (ret);
    }

    //
    // Bean patterns
    //

    /**
     * Get the logging object, creating it on first use.
     * @return The log shared by all pages.
     */
    public Log getLog ()
    {
        if (null == mLog)
            mLog = LogFactory.getLog (this.getClass ());
        return (mLog);
    }
}

--- NEW FILE: Source.java ---
// HTMLParser Library v1_4_20030525 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // For any questions or suggestions, you can write to me at : // Email :so...@in... // // Postal Address : // Somik Raha // Extreme Programmer & Coach // Industrial Logic Corporation // 2583 Cedar Street, Berkeley, // CA 94708, USA // Website : http://www.industriallogic.com package org.htmlparser.lexer; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; /** * A buffered source of characters. * A Source is very similar to a the following construct: * <pre> * new InputStreamReader (new BufferedInputStream (connection.getInputStream ()), charset) * </pre> * It differs from the above, in two ways: * <li>the fetching of bytes from the connection's input stream may be asynchronous</li> * <li>the character set may be changed, which resets the input stream</li> * */ public class Source extends Reader { /** * An initial buffer size. */ protected static final int BUFFER_SIZE = 4096; /** * Return value when no more characters are left. */ public static final int EOF = -1; /** * The stream of bytes. */ protected Stream mStream; /** * The converter from bytes to characters. */ protected InputStreamReader mReader; /** * The characters read so far. */ public volatile char[] mBuffer; /** * The number of valid bytes in the buffer. */ public volatile int mLevel; /** * The offset of the next byte returned by read(). */ protected int mOffset; /** * The bookmark. */ protected int mMark; /** * Create a source of characters using the default character set. * @param stream The stream of bytes to use. */ public Source (Stream stream) throws UnsupportedEncodingException { this (stream, null); } /** * Create a source of characters. 
* @param stream The stream of bytes to use. * @param charset The character set used in encoding the stream. */ public Source (Stream stream, String charset) throws UnsupportedEncodingException { if (null == stream) stream = new Stream (null); mStream = stream; if (null == charset) mReader = new InputStreamReader (stream); else mReader = new InputStreamReader (stream, charset); mBuffer = null; mLevel = 0; mOffset = 0; mMark = -1; } /** * Fetch more characters from the underlying reader. * Has no effect if the underlying reader has been drained. * @param min The minimum to read. * @exception IOException If the underlying reader read() throws one. */ protected void fill (int min) throws IOException { char[] buffer; int read; if (null != mReader) // mReader goes null when it's been sucked dry { // get some buffer space // unknown length... keep doubling if (null == mBuffer) { mBuffer = new char[Math.max (BUFFER_SIZE, min)]; buffer = mBuffer; } else { read = Math.max (BUFFER_SIZE / 2, min); if (mBuffer.length - mLevel < read) buffer = new char[Math.max (mBuffer.length * 2, mBuffer.length + min)]; else buffer = mBuffer; } // read into the end of the 'new' buffer read = mReader.read (buffer, mLevel, buffer.length - mLevel); if (-1 == read) { mReader.close (); mReader = null; } else { if (mBuffer != buffer) { // copy the bytes previously read System.arraycopy (mBuffer, 0, buffer, 0, mLevel); mBuffer = buffer; } mLevel += read; } } } // // Reader overrides // /** * Close the stream. Once a stream has been closed, further read(), * ready(), mark(), or reset() invocations will throw an IOException. * Closing a previously-closed stream, however, has no effect. * @exception IOException If an I/O error occurs */ public void close () throws IOException { mStream = null; if (null != mReader) mReader.close (); mReader = null; mBuffer = null; mLevel = 0; mOffset = 0; mMark = -1; } /** * Read a single character. 
* This method will block until a character is available, * an I/O error occurs, or the end of the stream is reached. * @return The character read, as an integer in the range 0 to 65535 * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has * been reached * @exception IOException If an I/O error occurs. */ public int read () throws IOException { int ret; if (null == mStream) // mStream goes null on close() throw new IOException ("reader is closed"); if (mLevel - mOffset < 1) fill (1); if (mOffset >= mLevel) ret = EOF; else { ret = mBuffer[mOffset]; mOffset++; } return (ret); } /** * Read characters into a portion of an array. This method will block * until some input is available, an I/O error occurs, or the end of the * stream is reached. * @param cbuf Destination buffer * @param off Offset at which to start storing characters * @param len Maximum number of characters to read * @return The number of characters read, or -1 if the end of the * stream has been reached * @exception IOException If an I/O error occurs. */ public int read (char[] cbuf, int off, int len) throws IOException { int ret; if (null == mStream) // mStream goes null on close() throw new IOException ("reader is closed"); if ((null == cbuf) || (0 > off) || (0 > len)) throw new IOException ("illegal argument read (" + ((null == cbuf) ? "null" : "cbuf") + ", " + off + ", " + len + ")"); if (mLevel - mOffset < len) fill (len - (mLevel - mOffset)); // minimum to satisfy this request if (mOffset >= mLevel) ret = EOF; else { ret = Math.min (mLevel - mOffset, len); System.arraycopy (mBuffer, mOffset, cbuf, off, ret); mOffset += ret; } return (ret); } /** * Reset the stream. If the stream has been marked, then attempt to * reposition it at the mark. If the stream has not been marked, then * attempt to reset it in some way appropriate to the particular stream, * for example by repositioning it to its starting point. 
Not all * character-input streams support the reset() operation, and some support * reset() without supporting mark(). * @exception IOException If the stream has not been marked, * or if the mark has been invalidated, * or if the stream does not support reset(), * or if some other I/O error occurs */ public void reset () throws IOException { if (null == mStream) // mStream goes null on close() throw new IOException ("reader is closed"); if (-1 != mMark) mOffset = mMark; else mOffset = 0; } /** * Tell whether this stream supports the mark() operation. * @return <code>true</code> if and only if this stream supports the mark operation. */ public boolean markSupported () { return (true); } /** * Mark the present position in the stream. Subsequent calls to reset() * will attempt to reposition the stream to this point. Not all * character-input streams support the mark() operation. * @param readAheadLimit <em>Not used.</em> * @exception IOException <em>Never thrown</em>. * */ public void mark (int readAheadLimit) throws IOException { if (null == mStream) // mStream goes null on close() throw new IOException ("reader is closed"); mMark = mOffset; } /** * Tell whether this stream is ready to be read. * @return <code>true</code> if the next read() is guaranteed not to block * for input, <code>false</code> otherwise. * Note that returning false does not guarantee that the next read will block. * @exception IOException <em>Never thrown</em>. */ public boolean ready () throws IOException { if (null == mStream) // mStream goes null on close() throw new IOException ("reader is closed"); return (mOffset < mLevel); } /** * Skip characters. * This method will block until some characters are available, * an I/O error occurs, or the end of the stream is reached. * <em>Note: n is treated as an int</em> * @param n The number of characters to skip. * @return The number of characters actually skipped * @exception IllegalArgumentException If <code>n</code> is negative. 
* @exception IOException If an I/O error occurs. */ public long skip (long n) throws IOException { long ret; if (null == mStream) // mStream goes null on close() throw new IOException ("reader is closed"); if (mLevel - mOffset < n) fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request if (mOffset >= mLevel) ret = EOF; else { ret = Math.min (mLevel - mOffset, n); mOffset += ret; } return (ret); } } --- NEW FILE: Stream.java --- // HTMLParser Library v1_4_20030525 - A java-based parser for HTML // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // For any questions or suggestions, you can write to me at : // Email :so...@in... // // Postal Address : // Somik Raha // Extreme Programmer & Coach // Industrial Logic Corporation // 2583 Cedar Street, Berkeley, // CA 94708, USA // Website : http://www.industriallogic.com package org.htmlparser.lexer; import java.io.IOException; import java.io.InputStream; import java.lang.Runnable; /** * Provides for asynchronous fetching from a stream. * */ public class Stream extends InputStream implements Runnable { public int fills = 0; public int reallocations = 0; public int synchronous = 0; /** * An initial buffer size. 
*/
    protected static final int BUFFER_SIZE = 4096;

    /**
     * Return value when no more characters are left.
     */
    protected static final int EOF = -1;

    /**
     * The underlying stream.
     * Set to <code>null</code> once it has been read to its end or closed.
     */
    protected volatile InputStream mIn;

    /**
     * The bytes read so far.
     */
    public volatile byte[] mBuffer;

    /**
     * The number of valid bytes in the buffer.
     */
    public volatile int mLevel;

    /**
     * The offset of the next byte returned by read().
     */
    protected int mOffset;

    /**
     * The content length from the HTTP header.
     * Zero means the length is unknown.
     */
    protected int mContentLength;

    /**
     * The bookmark, or -1 if no mark has been set.
     */
    protected int mMark;

    /**
     * Construct a stream with no assumptions about the number of bytes available.
     * @param in The input stream to use.
     */
    public Stream (InputStream in)
    {
        this (in, 0);
    }

    /**
     * Construct a stream to read the given number of bytes.
     * @param in The input stream to use.
     * @param bytes The maximum number of bytes to read.
     * This should be set to the ContentLength from the HTTP header.
     * A negative or zero value indicates an unknown number of bytes.
     */
    public Stream (InputStream in, int bytes)
    {
        mIn = in;
        mBuffer = null;
        mLevel = 0;
        mOffset = 0;
        mContentLength = bytes < 0 ? 0 : bytes;
        mMark = -1;
    }

    /**
     * Fetch more bytes from the underlying stream.
     * Has no effect if the underlying stream has been drained.
     * @param force If <code>true</code>, an attempt is made to read from the
     * underlying stream, even if bytes are available, If <code>false</code>,
     * a read of the underlying stream will not occur if there are already
     * bytes available.
     * @return <code>true</code> if not at the end of the input stream.
     * @exception IOException If the underlying stream read() or available() throws one.
     */
    protected synchronized boolean fill (boolean force)
        throws
            IOException
    {
        int size;
        byte[] buffer;
        int read;
        boolean ret;

        ret = false;

        if (null != mIn) // mIn goes null when it's been sucked dry
        {
            if (!force)
            {   // check for change of state while waiting on the monitor in a synchronous call
                if (0 != available ())
                    return (true);
                synchronous++;
            }

            // get some buffer space
            if (0 == mContentLength)
            {   // unknown content length... keep doubling
                if (null == mBuffer)
                {
                    mBuffer = new byte[Math.max (BUFFER_SIZE, mIn.available ())];
                    buffer = mBuffer;
                }
                else
                {
                    if (mBuffer.length - mLevel < BUFFER_SIZE / 2)
                        buffer = new byte[Math.max (mBuffer.length * 2, mBuffer.length + mIn.available ())];
                    else
                        buffer = mBuffer;
                }
                size = buffer.length - mLevel;
            }
            else
            {   // known content length... allocate once
                size = mContentLength - mLevel;
                if (null == mBuffer)
                    mBuffer = new byte[size];
                buffer = mBuffer;
            }

            // read into the end of the 'new' buffer
            read = mIn.read (buffer, mLevel, size);
            if (-1 == read)
            {
                mIn.close ();
                mIn = null;
            }
            else
            {
                if (mBuffer != buffer)
                {   // copy the bytes previously read
                    System.arraycopy (mBuffer, 0, buffer, 0, mLevel);
                    // publish the bigger buffer before the level is raised,
                    // the unsynchronized read() relies on this order
                    mBuffer = buffer;
                    reallocations++;
                }
                mLevel += read;
                if ((0 != mContentLength) && (mLevel == mContentLength))
                {
                    mIn.close ();
                    mIn = null;
                }
                ret = true;
                fills++;
            }
        }

        return (ret);
    }

    //
    // Runnable interface
    //

    /**
     * Continually read the underlying stream until exhausted.
     * @see java.lang.Thread#run()
     */
    public void run ()
    {
        boolean filled;

        do
        {   // keep hammering the socket with no delay, it's metered upstream
            try
            {
                filled = fill (true);
            }
            catch (IOException ioe)
            {
                ioe.printStackTrace ();
                // exit the thread if there is a problem,
                // let the synchronous reader find out about it
                filled = false;
            }
        }
        while (filled);
    }

    //
    // InputStream overrides
    //

    /**
     * Reads the next byte of data from the input stream. The value byte is
     * returned as an <code>int</code> in the range <code>0</code> to
     * <code>255</code>. If no byte is available because the end of the stream
     * has been reached, the value <code>-1</code> is returned. This method
     * blocks until input data is available, the end of the stream is detected,
     * or an exception is thrown.
     * @return The next byte of data, or <code>-1</code> if the end of the
     * stream is reached.
     * @exception IOException If an I/O error occurs.
     */
    public int read ()
        throws
            IOException
    {
        int ret;

        // The following is unsynchronized code.
        // Some would argue that unsynchronized access isn't thread safe
        // but I think I can rationalize it in this case...
        // The two volatile members are mLevel and mBuffer (besides mIn).
        // If (mOffset >= mLevel) turns false after the test, fill is
        // superfluously called, but it's synchronized and figures it out.
        // (mOffset < mLevel) only goes more true by the operation of the
        // background thread, it increases the value of mLevel
        // and volatile int access is atomic.
        // If mBuffer changes by the operation of the background thread,
        // the array pointed to can only be bigger than the previous buffer,
        // and hence no array bounds exception can be raised.
        if (0 == available ())
            fill (false);
        if (0 != available ())
            // mask off the sign extension, otherwise bytes of 0x80 and above
            // come back negative and 0xff is indistinguishable from EOF
            ret = mBuffer[mOffset++] & 0xff;
        else
            ret = EOF;

        return (ret);
    }

    /**
     * Returns the number of bytes that can be read (or skipped over) from
     * this input stream without blocking by the next caller of a method for
     * this input stream. The next caller might be the same thread or
     * another thread.
     * @return The number of bytes that can be read from this input stream
     * without blocking.
     * @exception IOException <em>Never thrown. Just for subclassers.</em>
     */
    public int available ()
        throws
            IOException
    {
        return (mLevel - mOffset);
    }

    /**
     * Closes this input stream and releases any system resources associated
     * with the stream.
     * The underlying stream is closed if it has not been already,
     * and the buffered bytes are discarded.
     * @exception IOException If an I/O error occurs.
     */
    public synchronized void close ()
        throws
            IOException
    {
        if (null != mIn)
        {
            mIn.close ();
            mIn = null;
        }
        mBuffer = null;
        mLevel = 0;
        mOffset = 0;
        mContentLength = 0;
        mMark = -1;
    }

    /**
     * Repositions this stream to the position at the time the
     * <code>mark</code> method was last called on this input stream.
     * If <code>mark</code> has not been called, the stream is repositioned
     * to its start. In either case the bytes read since that position
     * are resupplied to subsequent callers of the <code>read</code> method,
     * followed by any bytes that otherwise would have been the next input.
     *
     * @exception IOException <em>Never thrown. Just for subclassers.</em>
     * @see java.io.InputStream#mark(int)
     * @see java.io.IOException
     *
     */
    public void reset ()
        throws
            IOException
    {
        if (-1 != mMark)
            mOffset = mMark;
        else
            mOffset = 0;
    }

    /**
     * Tests if this input stream supports the <code>mark</code> and
     * <code>reset</code> methods. The <code>markSupported</code> method
     * of <code>InputStream</code> returns <code>false</code>,
     * this override reports that both are supported.
     *
     * @return <code>true</code>.
     * @see java.io.InputStream#mark(int)
     * @see java.io.InputStream#reset()
     *
     */
    public boolean markSupported ()
    {
        return (true);
    }

    /**
     * Marks the current position in this input stream. A subsequent call to
     * the <code>reset</code> method repositions this stream at the last marked
     * position so that subsequent reads re-read the same bytes.
     * The <code>readlimit</code> argument is ignored by this method.
     *
     * @param readlimit <em>Not used.</em>
     * @see java.io.InputStream#reset()
     *
     */
    public void mark (int readlimit)
    {
        mMark = mOffset;
    }
}
|