Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Page.java,1.29,1.30

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer
In directory sc8-pr-cvs1:/tmp/cvs-serv29167/lexer

Modified Files:
	Page.java 
Log Message:
Add simplistic web site capture example application.
Demonstration of using custom tags in the NodeFactory.
Fixed various issues with URL rewriting.


Index: Page.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** Page.java	16 Dec 2003 02:29:55 -0000	1.29
--- Page.java	29 Dec 2003 14:18:09 -0000	1.30
***************
*** 60,63 ****
--- 60,69 ----
  
      /**
+      * The default content type.
+      * In the absence of alternate information, assume html content.
+      */
+     public static final String DEFAULT_CONTENT_TYPE = "text/html";
+ 
+     /**
       * The URL this page is coming from.
       * Cached value of <code>getConnection().toExternalForm()</code> or
***************
*** 310,315 ****
      {
          Stream stream;
          String charset;
-         
  
          mConnection = connection;
--- 316,321 ----
      {
          Stream stream;
+         String type;
          String charset;
  
          mConnection = connection;
***************
*** 327,331 ****
              throw new ParserException (ioe.getMessage (), ioe);
          }
!         charset = getCharacterSet ();
          try
          {
--- 333,343 ----
              throw new ParserException (ioe.getMessage (), ioe);
          }
!         type = getContentType ();
!         if (!type.startsWith ("text"))
!             throw new ParserException (
!                 "URL "
!                 + connection.getURL ().toExternalForm ()
!                 + " does not contain text");
!         charset = getCharset (type);
          try
          {
***************
*** 391,394 ****
--- 403,423 ----
  
      /**
+      * Try and extract the content type from the HTTP header.
+      * @return The content type.
+      */
+     public String getContentType ()
+     {
+         URLConnection connection;
+         String ret;
+ 
+         ret = DEFAULT_CONTENT_TYPE;
+         connection = getConnection ();
+         if (null != connection)
+             ret = connection.getContentType ();
+ 
+         return (ret);
+     }
+ 
+     /**
       * Read the character at the cursor position.
       * The cursor position can be behind or equal to the current source position.
***************
*** 479,505 ****
              // update the EOL index in any case
              mIndex.add (cursor);
- 
-         return (ret);
-     }
- 
-     /**
-      * Try and extract the character set from the HTTP header.
-      * @return The character set name to use for this HTML page.
-      */
-     public String getCharacterSet ()
-     {
-         final String CONTENT_TYPE_STRING = "Content-Type";
-         URLConnection connection;
-         String string;
-         String ret;
- 
-         ret = DEFAULT_CHARSET;
-         connection = getConnection ();
-         if (null != connection)
-         {
-             string = connection.getHeaderField (CONTENT_TYPE_STRING);
-             if (null != string)
-                 ret = getCharset (string);
-         }
  
          return (ret);
--- 508,511 ----

Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/lexer Page.java,1.29,1.30

htmlparser-cvs