Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1:/tmp/cvs-serv29167/lexer
Modified Files:
Page.java
Log Message:
Add simplistic web site capture example application.
Demonstration of using custom tags in the NodeFactory.
Fixed various issues with URL rewriting.
Index: Page.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** Page.java 16 Dec 2003 02:29:55 -0000 1.29
--- Page.java 29 Dec 2003 14:18:09 -0000 1.30
***************
*** 60,63 ****
--- 60,69 ----
/**
+ * The default content type.
+ * In the absence of alternate information, assume html content.
+ */
+ public static final String DEFAULT_CONTENT_TYPE = "text/html";
+
+ /**
* The URL this page is coming from.
* Cached value of <code>getConnection().toExternalForm()</code> or
***************
*** 310,315 ****
{
Stream stream;
String charset;
-
mConnection = connection;
--- 316,321 ----
{
Stream stream;
+ String type;
String charset;
mConnection = connection;
***************
*** 327,331 ****
throw new ParserException (ioe.getMessage (), ioe);
}
! charset = getCharacterSet ();
try
{
--- 333,343 ----
throw new ParserException (ioe.getMessage (), ioe);
}
! type = getContentType ();
! if (!type.startsWith ("text"))
! throw new ParserException (
! "URL "
! + connection.getURL ().toExternalForm ()
! + " does not contain text");
! charset = getCharset (type);
try
{
***************
*** 391,394 ****
--- 403,423 ----
/**
+ * Try and extract the content type from the HTTP header.
+ * @return The content type.
+ */
+ public String getContentType ()
+ {
+ URLConnection connection;
+ String ret;
+
+ ret = DEFAULT_CONTENT_TYPE;
+ connection = getConnection ();
+ if (null != connection)
+ ret = connection.getContentType ();
+
+ return (ret);
+ }
+
+ /**
* Read the character at the cursor position.
* The cursor position can be behind or equal to the current source position.
***************
*** 479,505 ****
// update the EOL index in any case
mIndex.add (cursor);
-
- return (ret);
- }
-
- /**
- * Try and extract the character set from the HTTP header.
- * @return The character set name to use for this HTML page.
- */
- public String getCharacterSet ()
- {
- final String CONTENT_TYPE_STRING = "Content-Type";
- URLConnection connection;
- String string;
- String ret;
-
- ret = DEFAULT_CHARSET;
- connection = getConnection ();
- if (null != connection)
- {
- string = connection.getHeaderField (CONTENT_TYPE_STRING);
- if (null != string)
- ret = getCharset (string);
- }
return (ret);
--- 508,511 ----
|