[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Cursor.java,1.3,1.4 Lexer.java,1.1,1.2 Page.java,1.5,1.6 PageIndex.java,1.3,1.4 Source.java,1.4,1.5
Brought to you by:
derrickoswald
|
From: <der...@us...> - 2003-08-22 03:37:46
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1:/tmp/cvs-serv6515/lexer
Modified Files:
Cursor.java Lexer.java Page.java PageIndex.java Source.java
Log Message:
Fourth drop for new i/o subsystem.
Index: Cursor.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Cursor.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** Cursor.java 17 Aug 2003 16:09:27 -0000 1.3
--- Cursor.java 21 Aug 2003 01:52:23 -0000 1.4
***************
*** 39,43 ****
* This class remembers the page it came from and its position within the page.
*/
! public class Cursor implements Ordered
{
/**
--- 39,43 ----
* This class remembers the page it came from and its position within the page.
*/
! public class Cursor implements Ordered, Cloneable
{
/**
***************
*** 105,109 ****
public Cursor dup ()
{
! return (new Cursor (getPage (), getPosition ()));
}
--- 105,116 ----
public Cursor dup ()
{
! try
! {
! return ((Cursor)clone ());
! }
! catch (CloneNotSupportedException cnse)
! {
! return (new Cursor (getPage (), getPosition ()));
! }
}
Index: Lexer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** Lexer.java 17 Aug 2003 16:09:27 -0000 1.1
--- Lexer.java 21 Aug 2003 01:52:23 -0000 1.2
***************
*** 159,162 ****
--- 159,164 ----
char ch;
int length;
+ int begin;
+ int end;
StringNode ret;
***************
*** 174,178 ****
done = true;
// the order of these tests might be optimized for speed:
! else if ('/' == ch || '%' == ch || Character.isLetter (ch) || '!' == ch)
{
done = true;
--- 176,180 ----
done = true;
// the order of these tests might be optimized for speed:
! else if ('/' == ch || Character.isLetter (ch) || '!' == ch || '%' == ch)
{
done = true;
***************
*** 187,194 ****
}
}
! length = cursor.getPosition () - mCursor.getPosition ();
if (0 != length)
{ // got some characters
! ret = new StringNode (mPage, mCursor.getPosition (), cursor.getPosition ());
mCursor = cursor;
}
--- 189,198 ----
}
}
! begin = mCursor.getPosition ();
! end = cursor.getPosition ();
! length = end - begin;
if (0 != length)
{ // got some characters
! ret = new StringNode (mPage, begin, end);
mCursor = cursor;
}
***************
*** 202,231 ****
{
if (bookmarks[1] > bookmarks[0])
! attributes.addElement (new Attribute (null, mPage.getText (bookmarks[0], bookmarks[1]), (char)0));
}
private void standalone (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), null, (char)0));
}
private void empty (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), "", (char)0));
}
private void naked (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[3], bookmarks[4]), (char)0));
}
private void single_quote (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[4] + 1, bookmarks[5]), '\''));
}
private void double_quote (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[5] + 1, bookmarks[6]), '"'));
}
--- 206,241 ----
{
if (bookmarks[1] > bookmarks[0])
! attributes.addElement (new Attribute (mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0));
! //attributes.addElement (new Attribute (null, mPage.getText (bookmarks[0], bookmarks[1]), (char)0));
}
private void standalone (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0));
! //attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), null, (char)0));
}
private void empty (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, bookmarks[2] + 1, (char)0));
! //attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), "", (char)0));
}
private void naked (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage, bookmarks[1], bookmarks[2], bookmarks[3], bookmarks[4], (char)0));
! //attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[3], bookmarks[4]), (char)0));
}
private void single_quote (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1, bookmarks[5], '\''));
! //attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[4] + 1, bookmarks[5]), '\''));
}
private void double_quote (Vector attributes, int[] bookmarks)
{
! attributes.addElement (new Attribute (mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1, bookmarks[6], '"'));
! //attributes.addElement (new Attribute (mPage.getText (bookmarks[1], bookmarks[2]), mPage.getText (bookmarks[5] + 1, bookmarks[6]), '"'));
}
***************
*** 510,514 ****
if ('>' == ch)
done = true;
! else if (!Character.isWhitespace (ch) || ('!' == ch))
state = 2;
break;
--- 520,528 ----
if ('>' == ch)
done = true;
! else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch))
! {
! // stay in state 4
! }
! else
state = 2;
break;
Index: Page.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** Page.java 17 Aug 2003 16:09:27 -0000 1.5
--- Page.java 21 Aug 2003 01:52:23 -0000 1.6
***************
*** 31,34 ****
--- 31,36 ----
import java.io.ByteArrayInputStream;
import java.io.IOException;
+ import java.io.InputStream;
+ import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
***************
*** 76,80 ****
* Messages for page not there (404).
*/
! private String[] mFourOhFour =
{
"The web site you seek cannot be located, but countless more exist",
--- 78,82 ----
* Messages for page not there (404).
*/
! static private String[] mFourOhFour =
{
"The web site you seek cannot be located, but countless more exist",
***************
*** 135,139 ****
* @exception UnsupportedEncodingException If the given charset is not supported.
*/
! public Page (Stream stream, String charset)
throws
UnsupportedEncodingException
--- 137,141 ----
* @exception UnsupportedEncodingException If the given charset is not supported.
*/
! public Page (InputStream stream, String charset)
throws
UnsupportedEncodingException
***************
*** 149,153 ****
public Page (String text) throws ParserException
{
! Stream stream;
Page ret;
--- 151,155 ----
public Page (String text) throws ParserException
{
! InputStream stream;
Page ret;
***************
*** 156,161 ****
try
{
! stream = new Stream (new ByteArrayInputStream (text.getBytes (Page.DEFAULT_CHARSET)));
! mSource = new Source (stream, Page.DEFAULT_CHARSET);
mIndex = new PageIndex (this);
}
--- 158,163 ----
try
{
! stream = new ByteArrayInputStream (text.getBytes (Page.DEFAULT_CHARSET));
! mSource = new Source (stream, Page.DEFAULT_CHARSET, text.length () + 1);
mIndex = new PageIndex (this);
}
***************
*** 193,205 ****
int i;
char ret;
!
! if (mSource.mOffset < cursor.getPosition ())
// hmmm, we could skip ahead, but then what about the EOL index
throw new ParserException ("attempt to read future characters from source");
! else if (mSource.mOffset == cursor.getPosition ())
try
{
i = mSource.read ();
! if (-1 == i)
ret = 0;
else
--- 195,208 ----
int i;
char ret;
!
! i = cursor.getPosition ();
! if (mSource.mOffset < i)
// hmmm, we could skip ahead, but then what about the EOL index
throw new ParserException ("attempt to read future characters from source");
! else if (mSource.mOffset == i)
try
{
i = mSource.read ();
! if (0 > i)
ret = 0;
else
***************
*** 218,222 ****
{
// historic read
! ret = mSource.mBuffer[cursor.getPosition ()];
cursor.advance ();
}
--- 221,225 ----
{
// historic read
! ret = mSource.mBuffer[i];
cursor.advance ();
}
***************
*** 466,470 ****
{
int length;
- StringBuffer ret;
if ((mSource.mOffset < start) || (mSource.mOffset < end))
--- 469,472 ----
***************
*** 478,481 ****
--- 480,508 ----
length = end - start;
buffer.append (mSource.mBuffer, start, length);
+ }
+
+ /**
+ * Get all text read so far from the source.
+ * @return The text from the source.
+ * @see #getText(StringBuffer)
+ */
+ public String getText ()
+ {
+ StringBuffer ret;
+
+ ret = new StringBuffer (mSource.mOffset);
+ getText (ret);
+
+ return (ret.toString ());
+ }
+
+ /**
+ * Put all text read so far from the source into the given buffer.
+ * @param buffer The accumulator for the characters.
+ * @see #getText(StringBuffer,int,int)
+ */
+ public void getText (StringBuffer buffer)
+ {
+ getText (buffer, 0, mSource.mOffset);
}
Index: PageIndex.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/PageIndex.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** PageIndex.java 17 Aug 2003 16:09:27 -0000 1.3
--- PageIndex.java 21 Aug 2003 01:52:23 -0000 1.4
***************
*** 51,55 ****
* Increment for allocations.
*/
! protected static final int mIncrement = 10;
/**
--- 51,55 ----
* Increment for allocations.
*/
! protected static final int mIncrement = 100;
/**
Index: Source.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Source.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** Source.java 17 Aug 2003 16:09:27 -0000 1.4
--- Source.java 21 Aug 2003 01:52:23 -0000 1.5
***************
*** 30,33 ****
--- 30,34 ----
import java.io.IOException;
+ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
***************
*** 50,55 ****
* An initial buffer size.
*/
! protected static final int BUFFER_SIZE = 4096;
!
/**
* Return value when no more characters are left.
--- 51,56 ----
* An initial buffer size.
*/
! public static int BUFFER_SIZE = 16384;
!
/**
* Return value when no more characters are left.
***************
*** 60,64 ****
* The stream of bytes.
*/
! protected Stream mStream;
/**
--- 61,65 ----
* The stream of bytes.
*/
! protected InputStream mStream;
/**
***************
*** 70,84 ****
* The characters read so far.
*/
! public volatile char[] mBuffer;
/**
* The number of valid bytes in the buffer.
*/
! public volatile int mLevel;
/**
* The offset of the next byte returned by read().
*/
! public volatile int mOffset;
/**
--- 71,85 ----
* The characters read so far.
*/
! public /*volatile*/ char[] mBuffer;
/**
* The number of valid bytes in the buffer.
*/
! public /*volatile*/ int mLevel;
/**
* The offset of the next byte returned by read().
*/
! public /*volatile*/ int mOffset;
/**
***************
*** 91,99 ****
* @param stream The stream of bytes to use.
*/
! public Source (Stream stream)
throws
UnsupportedEncodingException
{
! this (stream, null);
}
--- 92,100 ----
* @param stream The stream of bytes to use.
*/
! public Source (InputStream stream)
throws
UnsupportedEncodingException
{
! this (stream, null, BUFFER_SIZE);
}
***************
*** 103,107 ****
* @param charset The character set used in encoding the stream.
*/
! public Source (Stream stream, String charset)
throws
UnsupportedEncodingException
--- 104,119 ----
* @param charset The character set used in encoding the stream.
*/
! public Source (InputStream stream, String charset)
! throws
! UnsupportedEncodingException
! {
! this (stream, charset, BUFFER_SIZE);
! }
! /**
! * Create a source of characters.
! * @param stream The stream of bytes to use.
! * @param charset The character set used in encoding the stream.
! */
! public Source (InputStream stream, String charset, int buffer_size)
throws
UnsupportedEncodingException
***************
*** 114,118 ****
else
mReader = new InputStreamReader (stream, charset);
! mBuffer = null;
mLevel = 0;
mOffset = 0;
--- 126,130 ----
else
mReader = new InputStreamReader (stream, charset);
! mBuffer = new char[buffer_size];
mLevel = 0;
mOffset = 0;
***************
*** 131,156 ****
{
char[] buffer;
int read;
if (null != mReader) // mReader goes null when it's been sucked dry
{
! // get some buffer space
! // unknown length... keep doubling
! if (null == mBuffer)
{
! mBuffer = new char[Math.max (BUFFER_SIZE, min)];
! buffer = mBuffer;
}
else
{
! read = Math.max (BUFFER_SIZE / 2, min);
! if (mBuffer.length - mLevel < read)
! buffer = new char[Math.max (mBuffer.length * 2, mBuffer.length + min)];
! else
! buffer = mBuffer;
}
// read into the end of the 'new' buffer
! read = mReader.read (buffer, mLevel, buffer.length - mLevel);
if (-1 == read)
{
--- 143,171 ----
{
char[] buffer;
+ int size;
int read;
if (null != mReader) // mReader goes null when it's been sucked dry
{
! size = mBuffer.length - mLevel; // available space
! if (size < min) // oops, better get some buffer space
{
! // unknown length... keep doubling
! size = mBuffer.length * 2;
! read = mLevel + min;
! if (size < read) // or satisfy min, whichever is greater
! size = read;
! else
! min = size - mLevel; // read the max
! buffer = new char[size];
}
else
{
! buffer = mBuffer;
! min = size;
}
// read into the end of the 'new' buffer
! read = mReader.read (buffer, mLevel, min);
if (-1 == read)
{
***************
*** 167,170 ****
--- 182,186 ----
mLevel += read;
}
+ // todo, should repeat on read shorter than original min
}
}
***************
*** 196,211 ****
int ret;
- if (null == mStream) // mStream goes null on close()
- throw new IOException ("reader is closed");
if (mLevel - mOffset < 1)
- fill (1);
- if (mOffset >= mLevel)
- ret = EOF;
- else
{
! ret = mBuffer[mOffset];
! mOffset++;
}
!
return (ret);
}
--- 212,228 ----
int ret;
if (mLevel - mOffset < 1)
{
! if (null == mStream) // mStream goes null on close()
! throw new IOException ("reader is closed");
! fill (1);
! if (mOffset >= mLevel)
! ret = EOF;
! else
! ret = mBuffer[mOffset++];
}
! else
! ret = mBuffer[mOffset++];
!
return (ret);
}
***************
*** 245,249 ****
--- 262,281 ----
return (ret);
}
+
+ /**
+ * Read characters into an array.
+ * This method will block until some input is available, an I/O error occurs,
+ * or the end of the stream is reached.
+ * @param cbuf Destination buffer.
+ * @return The number of characters read, or -1 if the end of the stream has
+ * been reached.
+ * @exception IOException If an I/O error occurs.
+ */
+ public int read (char[] cbuf) throws IOException
+ {
+ return (read (cbuf, 0, cbuf.length));
+ }
+
/**
* Reset the stream. If the stream has been marked, then attempt to
***************
*** 367,370 ****
--- 399,411 ----
mOffset = 0;
mMark = -1;
+ }
+
+ /**
+ * Get the number of available characters.
+ * @return The number of characters that can be read without blocking.
+ */
+ public int available ()
+ {
+ return (mLevel - mOffset);
}
}
|