[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Cursor.java,1.8,1.9 Lexer.java,1.9,1.10 Page.ja
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-09-28 15:34:59
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv30684/lexer Modified Files: Cursor.java Lexer.java Page.java Source.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a Hashtable. Index: Cursor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Cursor.java,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** Cursor.java 22 Sep 2003 02:39:59 -0000 1.8 --- Cursor.java 28 Sep 2003 15:33:57 -0000 1.9 *************** *** 81,84 **** --- 81,93 ---- /** + * Set the position of this cursor. + * @param The new cursor position. + */ + public void setPosition (int position) + { + mPosition = position; + } + + /** * Move the cursor position ahead one character. 
*/ Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** Lexer.java 22 Sep 2003 02:39:59 -0000 1.9 --- Lexer.java 28 Sep 2003 15:33:57 -0000 1.10 *************** *** 33,45 **** package org.htmlparser.lexer; ! import java.io.*; ! import java.net.*; ! import java.util.*; ! import org.htmlparser.*; ! import org.htmlparser.lexer.nodes.*; import org.htmlparser.lexer.nodes.RemarkNode; import org.htmlparser.lexer.nodes.StringNode; ! import org.htmlparser.util.*; /** --- 33,50 ---- package org.htmlparser.lexer; ! import java.io.IOException; ! import java.net.MalformedURLException; ! import java.net.URL; ! import java.net.URLConnection; ! import java.util.Vector; ! import org.htmlparser.Node; ! import org.htmlparser.lexer.nodes.AbstractNode; ! import org.htmlparser.lexer.nodes.Attribute; ! import org.htmlparser.lexer.nodes.NodeFactory; import org.htmlparser.lexer.nodes.RemarkNode; import org.htmlparser.lexer.nodes.StringNode; ! import org.htmlparser.lexer.nodes.TagNode; ! import org.htmlparser.util.ParserException; /** *************** *** 53,56 **** --- 58,63 ---- */ public class Lexer + implements + NodeFactory { /** *************** *** 65,68 **** --- 72,80 ---- /** + * The factory for new nodes. + */ + protected NodeFactory mFactory; + + /** * Creates a new instance of a Lexer. * @param page The page with HTML text. *************** *** 70,75 **** public Lexer (Page page) { ! mPage = page; ! mCursor = new Cursor (page, 0); } --- 82,88 ---- public Lexer (Page page) { ! setPage (page); ! setCursor (new Cursor (page, 0)); ! setNodeFactory (this); } *************** *** 78,82 **** * @param text The text to parse. */ ! public Lexer (String text) throws ParserException { this (new Page (text)); --- 91,95 ---- * @param text The text to parse. */ ! 
public Lexer (String text) { this (new Page (text)); *************** *** 93,96 **** --- 106,120 ---- /** + * Reset the lexer to start parsing from the beginning again. + * The underlying components are reset such that the next call to + * <code>nextNode()</code> will return the first lexeme on the page. + */ + public void reset () + { + getPage ().reset (); + setCursor (new Cursor (getPage (), 0)); + } + + /** * Get the page this lexer is working on. * @return The page that nodes are being read from. *************** *** 102,105 **** --- 126,211 ---- /** + * Set the page this lexer is working on. + * @return The page that nodes will be read from. + */ + public void setPage (Page page) + { + if (null == page) + throw new IllegalArgumentException ("page cannot be null"); + // todo: sanity checks + mPage = page; + } + + /** + * Get the current scanning position. + * @return The lexer's cursor position. + */ + public Cursor getCursor () + { + return (mCursor); + } + + /** + * Set the current scanning position. + * @param cursor The lexer's new cursor position. + */ + public void setCursor (Cursor cursor) + { + if (null == cursor) + throw new IllegalArgumentException ("cursor cannot be null"); + // todo: sanity checks + mCursor = cursor; + } + + /** + * Get the current node factory. + * @return The lexer's node factory. + */ + public NodeFactory getNodeFactory () + { + return (mFactory); + } + + /** + * Get the current node factory. + * @return The lexer's cursor position. + */ + public void setNodeFactory (NodeFactory factory) + { + if (null == factory) + throw new IllegalArgumentException ("node factory cannot be null"); + mFactory = factory; + } + + public int getPosition () + { + return (getCursor ().getPosition ()); + } + + public void setPosition (int position) + { + // todo: sanity checks + getCursor ().setPosition (position); + } + + /** + * Get the current line number. + * @return The line number the lexer's working on. 
+ */ + public int getCurrentLineNumber () + { + return (getPage ().row (getCursor ())); + } + + /** + * Get the current line. + * @return The string the lexer's working on. + */ + public String getCurrentLine () + { + return (getPage ().getLine (getCursor ())); + } + + /** * Get the next node from the source. * @return A RemarkNode, StringNode or TagNode, or <code>null</code> if no *************** *** 125,144 **** ch = mPage.getCharacter (probe); if (0 == ch) ! ret = parseString (); else if ('/' == ch || '%' == ch || Character.isLetter (ch)) ! ret = parseTag (); else if ('!' == ch) { ch = mPage.getCharacter (probe); ! if ('-' == ch) ! ret = parseRemark (); else ! ret = parseTag (); } else ! ret = parseString (); break; default: ! ret = parseString (); break; } --- 231,262 ---- ch = mPage.getCharacter (probe); if (0 == ch) ! ret = makeString (probe); else if ('/' == ch || '%' == ch || Character.isLetter (ch)) ! { ! probe.retreat (); ! ret = parseTag (probe); ! } else if ('!' == ch) { ch = mPage.getCharacter (probe); ! if (0 == ch) ! ret = makeString (probe); else ! { ! probe.retreat (); // remark and tag need this character ! if ('-' == ch) ! ret = parseRemark (probe); ! else ! { ! probe.retreat (); // tag needs the previous one too ! ret = parseTag (probe); ! } ! } } else ! ret = parseString (probe); break; default: ! ret = parseString (probe); break; } *************** *** 153,161 **** * case <code>null</code> is returned. */ ! protected Node parseString () throws ParserException { - Cursor cursor; boolean done; char ch; --- 271,278 ---- * case <code>null</code> is returned. */ ! protected Node parseString (Cursor cursor) throws ParserException { boolean done; char ch; *************** *** 163,169 **** int begin; int end; ! StringNode ret; - cursor = mCursor.dup (); done = false; while (!done) --- 280,285 ---- int begin; int end; ! 
Node ret; done = false; while (!done) *************** *** 191,194 **** --- 307,326 ---- } } + + return (makeString (cursor)); + } + + /** + * Create a string node based on the current cursor and the one provided. + */ + protected Node makeString (Cursor cursor) + throws + ParserException + { + int length; + int begin; + int end; + Node ret; + begin = mCursor.getPosition (); end = cursor.getPosition (); *************** *** 196,205 **** if (0 != length) { // got some characters - ret = new StringNode (mPage, begin, end); mCursor = cursor; } else ret = null; ! return (ret); } --- 328,337 ---- if (0 != length) { // got some characters mCursor = cursor; + ret = getNodeFactory ().createStringNode (this, begin, end); } else ret = null; ! return (ret); } *************** *** 300,308 **** * The first slot is for attribute name (kind of like a standalone attribute). */ ! protected Node parseTag () throws ParserException { - Cursor cursor; boolean done; char ch; --- 432,439 ---- * The first slot is for attribute name (kind of like a standalone attribute). */ ! protected Node parseTag (Cursor cursor) throws ParserException { boolean done; char ch; *************** *** 310,321 **** int[] bookmarks; Vector attributes; - int length; - TagNode ret; - cursor = mCursor.dup (); - // sanity check - ch = mPage.getCharacter (cursor); - if ('<' != ch) - return (parseString ()); done = false; attributes = new Vector (); --- 441,445 ---- *************** *** 418,429 **** } } ! length = cursor.getPosition () - mCursor.getPosition (); if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (parseString ()); ! ret = new TagNode (mPage, mCursor.getPosition (), cursor.getPosition (), attributes); mCursor = cursor; } else --- 542,571 ---- } } ! ! return (makeTag (cursor, attributes)); ! } ! ! /** ! * Create a tag node based on the current cursor and the one provided. ! */ ! 
protected Node makeTag (Cursor cursor, Vector attributes) ! throws ! ParserException ! { ! int length; ! int begin; ! int end; ! Node ret; ! ! begin = mCursor.getPosition (); ! end = cursor.getPosition (); ! length = end - begin; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (makeString (cursor)); mCursor = cursor; + ret = getNodeFactory ().createTagNode (this, begin, end, attributes); } else *************** *** 471,493 **** * We allow terminators like --!> even though this isn't part of the spec. */ ! protected Node parseRemark () throws ParserException { - Cursor cursor; boolean done; char ch; int state; - int length; - RemarkNode ret; - cursor = mCursor.dup (); - // sanity check - ch = mPage.getCharacter (cursor); - if ('<' != ch) - return (parseString ()); - ch = mPage.getCharacter (cursor); - if ('!' != ch) - return (parseString ()); done = false; state = 0; --- 613,624 ---- * We allow terminators like --!> even though this isn't part of the spec. */ ! protected Node parseRemark (Cursor cursor) throws ParserException { boolean done; char ch; int state; done = false; state = 0; *************** *** 501,505 **** state = 1; else ! return (parseString ()); break; case 1: // prior to the second open delimiter --- 632,636 ---- state = 1; else ! return (parseString (cursor)); break; case 1: // prior to the second open delimiter *************** *** 507,515 **** state = 2; else ! return (parseString ()); break; case 2: // prior to the first closing delimiter if ('-' == ch) state = 3; break; case 3: // prior to the second closing delimiter --- 638,648 ---- state = 2; else ! return (parseString (cursor)); break; case 2: // prior to the first closing delimiter if ('-' == ch) state = 3; + else if (0 == ch) + return (parseString (cursor)); // no terminator break; case 3: // prior to the second closing delimiter *************** *** 533,555 **** } } ! 
length = cursor.getPosition () - mCursor.getPosition (); if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (parseString ()); ! ret = new RemarkNode (mPage, mCursor.getPosition (), cursor.getPosition ()); mCursor = cursor; } else ret = null; ! return (ret); } /** * Mainline for command line operation */ ! public static void main (String[] args) throws IOException, ParserException { URL url; --- 666,748 ---- } } ! ! return (makeRemark (cursor)); ! } ! ! /** ! * Create a remark node based on the current cursor and the one provided. ! */ ! protected Node makeRemark (Cursor cursor) ! throws ! ParserException ! { ! int length; ! int begin; ! int end; ! Node ret; ! ! begin = mCursor.getPosition (); ! end = cursor.getPosition (); ! length = end - begin; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (makeString (cursor)); mCursor = cursor; + ret = getNodeFactory ().createRemarkNode (this, begin, end); } else ret = null; ! return (ret); } + // + // NodeFactory interface + // + + /** + * Create a new string node. + * @param lexer The lexer parsing this string. + * @param start The beginning position of the string. + * @param end The ending positiong of the string. + */ + public Node createStringNode (Lexer lexer, int start, int end) + { + return (new StringNode (lexer.getPage (), start, end)); + } + + /** + * Create a new remark node. + * @param lexer The lexer parsing this remark. + * @param start The beginning position of the remark. + * @param end The ending positiong of the remark. + */ + public Node createRemarkNode (Lexer lexer, int start, int end) + { + return (new RemarkNode (lexer.getPage (), start, end)); + } + + /** + * Create a new tag node. + * @param lexer The lexer parsing this tag. + * @param start The beginning position of the tag. + * @param end The ending positiong of the tag. 
+ * @param attributes The attributes contained in this tag. + */ + public Node createTagNode (Lexer lexer, int start, int end, Vector attributes) + { + return (new TagNode (lexer.getPage (), start, end, attributes)); + } + /** * Mainline for command line operation */ ! public static void main (String[] args) ! throws ! MalformedURLException, ! IOException, ! ParserException { URL url; Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** Page.java 22 Sep 2003 02:39:59 -0000 1.15 --- Page.java 28 Sep 2003 15:33:57 -0000 1.16 *************** *** 34,37 **** --- 34,38 ---- import java.io.*; + import java.io.IOException; import java.lang.reflect.*; import java.net.*; *************** *** 55,58 **** --- 56,66 ---- /** + * The URL this page is coming from. + * Cached value of <code>getConnection().toExternalForm()</code> or + * <code>setUrl()</code>. + */ + protected String mUrl; + + /** * The source of characters. */ *************** *** 63,71 **** */ protected PageIndex mIndex; /** * Messages for page not there (404). */ ! static private String[] mFourOhFour = { "The web site you seek cannot be located, but countless more exist", --- 71,84 ---- */ protected PageIndex mIndex; + + /** + * The connection this page is coming from or <code>null</code>. + */ + protected URLConnection mConnection; /** * Messages for page not there (404). */ ! static private final String[] mFourOhFour = { "The web site you seek cannot be located, but countless more exist", *************** *** 96,121 **** if (null == connection) throw new IllegalArgumentException ("connection cannot be null"); ! try ! { ! connection.connect (); ! } ! catch (UnknownHostException uhe) ! { ! int message = (int)(Math.random () * mFourOhFour.length); ! throw new ParserException (mFourOhFour[message], uhe); ! } ! 
catch (IOException ioe) ! { ! throw new ParserException (ioe.getMessage (), ioe); ! } ! try ! { ! mSource = new Source (new Stream (connection.getInputStream ()), getCharacterSet (connection)); ! } ! catch (IOException ioe) ! { ! throw new ParserException (ioe.getMessage (), ioe); ! } ! mIndex = new PageIndex (this); } --- 109,113 ---- if (null == connection) throw new IllegalArgumentException ("connection cannot be null"); ! setConnection (connection); } *************** *** 137,143 **** mSource = new Source (stream, charset); mIndex = new PageIndex (this); } ! public Page (String text) throws ParserException { InputStream stream; --- 129,137 ---- mSource = new Source (stream, charset); mIndex = new PageIndex (this); + mConnection = null; + mUrl = null; } ! public Page (String text) { InputStream stream; *************** *** 153,158 **** catch (UnsupportedEncodingException uee) { ! throw new ParserException ("problem making a page", uee); } } --- 147,264 ---- catch (UnsupportedEncodingException uee) { ! // this is unlikely, so we cover it up with a runtime exception ! throw new IllegalStateException (uee.getMessage ()); } + mConnection = null; + mUrl = null; + } + + /** + * Reset the page by resetting the source of characters. + */ + public void reset () + { + getSource ().reset (); + mIndex = new PageIndex (this); // todo: is this really necessary? + } + + /** + * Get the connection, if any. + * @return The connection object for this page, or null if this page + * is built from a stream or a string. + */ + public URLConnection getConnection () + { + return (mConnection); + } + + /** + * Set the URLConnection to be used by this page. + * @param connection The connection to use. + * It will be connected by this method. + * @exception ParserException If the <code>connect()</code> method fails, + * or an I/O error occurs opening the input stream or the character set + * designated in the HTTP header is unsupported. 
+ */ + public void setConnection (URLConnection connection) + throws + ParserException + { + Stream stream; + String charset; + + + mUrl = null; + mConnection = connection; + try + { + getConnection ().connect (); + } + catch (UnknownHostException uhe) + { + int message = (int)(Math.random () * mFourOhFour.length); + throw new ParserException (mFourOhFour[message], uhe); + } + catch (IOException ioe) + { + throw new ParserException (ioe.getMessage (), ioe); + } + charset = getCharacterSet (); + try + { + stream = new Stream (getConnection ().getInputStream ()); + try + { + mSource = new Source (stream, charset); + } + catch (UnsupportedEncodingException uee) + { + StringBuffer msg; + String message; + + msg = new StringBuffer (1024); + msg.append (getConnection ().getURL ().toExternalForm ()); + msg.append (" has an encoding ("); + msg.append (charset); + msg.append (") which is not supported, using "); + msg.append (DEFAULT_CHARSET); + System.out.println (msg.toString ()); + charset = DEFAULT_CHARSET; + mSource = new Source (stream, charset); + } + } + catch (IOException ioe) + { + throw new ParserException (ioe.getMessage (), ioe); + } + mIndex = new PageIndex (this); + } + + /** + * Get the URL for this page. + * @return The url for the connection, or <code>null</code> if there is none. + */ + public String getUrl () + { + URLConnection connection; + if (null == mUrl) + { + connection = getConnection (); + if (null != connection) + mUrl = connection.getURL ().toExternalForm (); + } + + return (mUrl); + } + + /** + * Set the URL for this page. + * This doesn't affect the contents of the page, just the interpretation + * of relative links from this point forward. + * @param url The new URL. + */ + public void setUrl (String url) + { + mUrl = url; } *************** *** 260,277 **** /** * Try and extract the character set from the HTTP header. - * @param connection The connection with the charset info. * @return The character set name to use for this HTML page. */ ! 
protected String getCharacterSet (URLConnection connection) { final String CONTENT_TYPE_STRING = "Content-Type"; ! String string; String ret; ret = DEFAULT_CHARSET; ! string = connection.getHeaderField (CONTENT_TYPE_STRING); ! if (null != string) ! ret = getCharset (string); return (ret); --- 366,386 ---- /** * Try and extract the character set from the HTTP header. * @return The character set name to use for this HTML page. */ ! public String getCharacterSet () { final String CONTENT_TYPE_STRING = "Content-Type"; ! URLConnection connection; String string; String ret; ret = DEFAULT_CHARSET; ! connection = getConnection (); ! if (null != connection) ! { ! string = connection.getHeaderField (CONTENT_TYPE_STRING); ! if (null != string) ! ret = getCharset (string); ! } return (ret); *************** *** 302,306 **** * @see #DEFAULT_CHARSET */ ! protected String getCharset (String content) { final String CHARSET_STRING = "charset"; --- 411,415 ---- * @see #DEFAULT_CHARSET */ ! public String getCharset (String content) { final String CHARSET_STRING = "charset"; *************** *** 408,411 **** --- 517,598 ---- /** + * Get the current encoding being used. + * @return The encoding used to convert characters. + */ + public String getEncoding () + { + return (mSource.getEncoding ()); + } + + /** + * Try and extract the character set from the HTTP header. + * @param connection The connection with the charset info. + * @return The character set name to use for this HTML page. 
+ */ + public void setEncoding (String character_set) + throws + ParserException + { + InputStream stream; + + stream = getSource ().getStream (); + try + { + stream.reset (); + mIndex = new PageIndex (this); + mSource = new Source (stream, character_set); + } + catch (IOException ioe) + { + throw new ParserException (ioe.getMessage (), ioe); + } + + // code from Parser: + + // /* If there is no connection (getConnection() returns null) it simply sets + // * the character set name stored in the parser (Note: the lexer object + // * which must have been set in the constructor or by <code>setLexer()</code>, + // * may or may not be using this character set). + //// * Otherwise (getConnection() doesn't return null) it does this by reopening the + //// * input stream of the connection and creating a reader that uses this + //// * character set. In this case, this method sets two of the fields in the + //// * parser object; <code>character_set</code> and <code>reader</code>. + //// * It does not adjust <code>resourceLocn</code>, <code>url_conn</code>, + //// * <code>scanners</code> or <code>feedback</code>. The two fields are set + //// * atomicly by this method, either they are both set or none of them is set. + //// * Trying to set the encoding to null or an empty string is a noop. 
+ //// * @exception ParserException If the opening of the reader + // */ + // String chs; + // BufferedInputStream in; + // + // if ((null != encoding) && !"".equals (encoding)) + // if (null == getConnection ()) + // character_set = encoding; + // else + // { + // chs = getEncoding (); + // in = input; + // try + // { + // character_set = encoding; + // if (null != getLexer ()) + // getLexer ().getPage ().setCharset (encoding); + // } + // catch (IOException ioe) + // { + // String msg = "setEncoding() : Error in opening a connection to " + getConnection ().getURL ().toExternalForm (); + // ParserException ex = new ParserException (msg, ioe); + // feedback.error (msg, ex); + // character_set = chs; + // input = in; + // throw ex; + // } + // } + // } + // + } + + /** * Get the line number for a cursor. * @param cursor The character offset into the page. *************** *** 418,421 **** --- 605,618 ---- /** + * Get the line number for a cursor. + * @param position The character offset into the page. + * @return The line number the character is in. + */ + public int row (int position) + { + return (mIndex.row (position)); + } + + /** * Get the column number for a cursor. * @param cursor The character offset into the page. *************** *** 428,431 **** --- 625,638 ---- /** + * Get the column number for a cursor. + * @param position The character offset into the page. + * @return The character offset into the line this cursor is on. + */ + public int column (int position) + { + return (mIndex.column (position)); + } + + /** * Get the text identified by the given limits. * @param start The starting position, zero based. *************** *** 494,496 **** --- 701,858 ---- getText (buffer, 0, mSource.mOffset); } + + /** + * Get the text line the position of the cursor lies on. + * @param cursor The position to calculate for. + * @return The contents of the URL or file corresponding to the line number + * containg the cursor position. 
+ */ + public String getLine (Cursor cursor) + { + int line; + int start; + int end; + + line = row (cursor); + start = mIndex.elementAt (line); + line++; + end = mIndex.last (); + if (end <= line) + end = mIndex.elementAt (end); + else + end = mSource.mOffset; + return (getText (start, end)); + } + + // todo refactor into common code method: + + /** + * Get the text line the position of the cursor lies on. + * @param cursor The position to calculate for. + * @return The contents of the URL or file corresponding to the line number + * containg the cursor position. + */ + public String getLine (int position) + { + int line; + int start; + int end; + + line = row (position); + start = mIndex.elementAt (line); + line++; + end = mIndex.last (); + if (end <= line) + end = mIndex.elementAt (end); + else + end = mSource.mOffset; + return (getText (start, end)); + } } + + // /** + // * The default charset. + // * This should be <code>ISO-8859-1</code>, + // * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1 + // * Another alias is "8859_1". + // */ + // protected static final String DEFAULT_CHARSET = "ISO-8859-1"; + // + // /** + // * Trigger for charset detection. + // */ + // protected static final String CHARSET_STRING = "charset"; + // + // + // /** + // * Try and extract the character set from the HTTP header. + // * @param connection The connection with the charset info. + // * @return The character set name to use for this HTML page. + // */ + // protected String getCharacterSet (URLConnection connection) + // { + // final String field = "Content-Type"; + // + // String string; + // String ret; + // + // ret = DEFAULT_CHARSET; + // string = connection.getHeaderField (field); + // if (null != string) + // ret = getCharset (string); + // + // return (ret); + // } + // + // /** + // * Get a CharacterSet name corresponding to a charset parameter. 
+ // * @param content A text line of the form: + // * <pre> + // * text/html; charset=Shift_JIS + // * </pre> + // * which is applicable both to the HTTP header field Content-Type and + // * the meta tag http-equiv="Content-Type". + // * Note this method also handles non-compliant quoted charset directives such as: + // * <pre> + // * text/html; charset="UTF-8" + // * </pre> + // * and + // * <pre> + // * text/html; charset='UTF-8' + // * </pre> + // * @return The character set name to use when reading the input stream. + // * For JDKs that have the Charset class this is qualified by passing + // * the name to findCharset() to render it into canonical form. + // * If the charset parameter is not found in the given string, the default + // * character set is returned. + // * @see ParserHelper#findCharset + // * @see #DEFAULT_CHARSET + // */ + // protected String getCharset(String content) + // { + // int index; + // String ret; + // + // ret = DEFAULT_CHARSET; + // if (null != content) + // { + // index = content.indexOf(CHARSET_STRING); + // + // if (index != -1) + // { + // content = content.substring(index + CHARSET_STRING.length()).trim(); + // if (content.startsWith("=")) + // { + // content = content.substring(1).trim(); + // index = content.indexOf(";"); + // if (index != -1) + // content = content.substring(0, index); + // + // //remove any double quotes from around charset string + // if (content.startsWith ("\"") && content.endsWith ("\"") && (1 < content.length ())) + // content = content.substring (1, content.length () - 1); + // + // //remove any single quote from around charset string + // if (content.startsWith ("'") && content.endsWith ("'") && (1 < content.length ())) + // content = content.substring (1, content.length () - 1); + // + // ret = ParserHelper.findCharset(content, ret); + // // Charset names are not case-sensitive; + // // that is, case is always ignored when comparing charset names. 
+ // if (!ret.equalsIgnoreCase(content)) + // { + // feedback.info ( + // "detected charset \"" + // + content + // + "\", using \"" + // + ret + // + "\""); + // } + // } + // } + // } + // + // return (ret); + // } + // + Index: Source.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Source.java,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** Source.java 22 Sep 2003 02:39:59 -0000 1.9 --- Source.java 28 Sep 2003 15:33:57 -0000 1.10 *************** *** 64,67 **** --- 64,72 ---- /** + * The character set in use. + */ + protected String mEncoding; + + /** * The converter from bytes to characters. */ *************** *** 123,129 **** --- 128,140 ---- mStream = stream; if (null == charset) + { mReader = new InputStreamReader (stream); + mEncoding = mReader.getEncoding (); + } else + { + mEncoding = charset; mReader = new InputStreamReader (stream, charset); + } mBuffer = new char[buffer_size]; mLevel = 0; *************** *** 133,136 **** --- 144,165 ---- /** + * Get the input stream being used. + * @return The current input stream. + */ + public InputStream getStream () + { + return (mStream); + } + + /** + * Get the encoding being used to convert characters. + * @return The current encoding. + */ + public String getEncoding () + { + return (mEncoding); + } + + /** * Fetch more characters from the underlying reader. * Has no effect if the underlying reader has been drained. *************** *** 279,297 **** /** ! * Reset the stream. If the stream has been marked, then attempt to ! * reposition it at the mark. If the stream has not been marked, then ! * attempt to reset it in some way appropriate to the particular stream, ! * for example by repositioning it to its starting point. Not all ! * character-input streams support the reset() operation, and some support ! * reset() without supporting mark(). ! 
* @exception IOException If the stream has not been marked, ! * or if the mark has been invalidated, ! * or if the stream does not support reset(), ! * or if some other I/O error occurs */ ! public void reset () throws IOException { if (null == mStream) // mStream goes null on close() ! throw new IOException ("reader is closed"); if (-1 != mMark) mOffset = mMark; --- 308,319 ---- /** ! * Reset the source. ! * Repositions the read point to begin at zero. ! * @exception IllegalStateException If the source has been closed. */ ! public void reset () { if (null == mStream) // mStream goes null on close() ! throw new IllegalStateException ("source is closed"); if (-1 != mMark) mOffset = mMark; |