[Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Page.java,1.29,1.30
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-12-29 14:18:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1:/tmp/cvs-serv29167/lexer

Modified Files:
	Page.java
Log Message:
Add simplistic web site capture example application.
Demonstration of using custom tags in the NodeFactory.
Fixed various issues with URL rewriting.

Index: Page.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** Page.java 16 Dec 2003 02:29:55 -0000 1.29
--- Page.java 29 Dec 2003 14:18:09 -0000 1.30
***************
*** 60,63 ****
--- 60,69 ----
  
      /**
+      * The default content type.
+      * In the absence of alternate information, assume html content.
+      */
+     public static final String DEFAULT_CONTENT_TYPE = "text/html";
+ 
+     /**
       * The URL this page is coming from.
       * Cached value of <code>getConnection().toExternalForm()</code> or
***************
*** 310,315 ****
      {
          Stream stream;
          String charset;
- 
  
          mConnection = connection;
--- 316,321 ----
      {
          Stream stream;
+         String type;
          String charset;
  
          mConnection = connection;
***************
*** 327,331 ****
              throw new ParserException (ioe.getMessage (), ioe);
          }
!         charset = getCharacterSet ();
          try
          {
--- 333,343 ----
              throw new ParserException (ioe.getMessage (), ioe);
          }
!         type = getContentType ();
!         if (!type.startsWith ("text"))
!             throw new ParserException (
!                 "URL "
!                 + connection.getURL ().toExternalForm ()
!                 + " does not contain text");
!         charset = getCharset (type);
          try
          {
***************
*** 391,394 ****
--- 403,423 ----
  
      /**
+      * Try and extract the content type from the HTTP header.
+      * @return The content type.
+      */
+     public String getContentType ()
+     {
+         URLConnection connection;
+         String ret;
+ 
+         ret = DEFAULT_CONTENT_TYPE;
+         connection = getConnection ();
+         if (null != connection)
+             ret = connection.getContentType ();
+ 
+         return (ret);
+     }
+ 
+     /**
       * Read the character at the cursor position.
       * The cursor position can be behind or equal to the current source position.
***************
*** 479,505 ****
              // update the EOL index in any case
              mIndex.add (cursor);
- 
-         return (ret);
-     }
- 
-     /**
-      * Try and extract the character set from the HTTP header.
-      * @return The character set name to use for this HTML page.
-      */
-     public String getCharacterSet ()
-     {
-         final String CONTENT_TYPE_STRING = "Content-Type";
-         URLConnection connection;
-         String string;
-         String ret;
- 
-         ret = DEFAULT_CHARSET;
-         connection = getConnection ();
-         if (null != connection)
-         {
-             string = connection.getHeaderField (CONTENT_TYPE_STRING);
-             if (null != string)
-                 ret = getCharset (string);
-         }
  
          return (ret);
--- 508,511 ----
|