Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser Parser.java,1.98,1.99
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2004-09-02 02:28:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv29769/src/org/htmlparser Modified Files: Parser.java Log Message: Implemented: RFE #1017249 HTML Client Doesn't Support Cookies but will follow redirect, RFE #1010586 Add support for password protected URL, and RFE #1000739 Add support for proxy scenario. A new http package is added, the primary class being ConnectionManager, which handles proxies, passwords and cookies. Some testing still needed. Also removed some line separator cruft. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.98 retrieving revision 1.99 diff -C2 -d -r1.98 -r1.99 *** Parser.java 29 Jul 2004 02:01:02 -0000 1.98 --- Parser.java 2 Sep 2004 02:28:08 -0000 1.99 *************** *** 27,42 **** package org.htmlparser; - import java.io.File; - import java.io.IOException; import java.io.Serializable; ! import java.net.MalformedURLException; ! import java.net.URL; import java.net.URLConnection; - import java.util.HashMap; - import java.util.Iterator; - import java.util.Map; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; --- 27,38 ---- package org.htmlparser; import java.io.Serializable; ! import java.net.HttpURLConnection; import java.net.URLConnection; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.filters.NodeClassFilter; + import org.htmlparser.http.ConnectionManager; + import org.htmlparser.http.ConnectionMonitor; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; *************** *** 59,67 **** * This is a thread-safe way, and you only get the control back after a * particular element is parsed and returned, which could be the entire body. ! * @see Parser#elements() */ public class Parser implements ! 
Serializable { // Please don't change the formatting of the version variables below. --- 55,64 ---- * This is a thread-safe way, and you only get the control back after a * particular element is parsed and returned, which could be the entire body. ! * @see Parser#elements() */ public class Parser implements ! Serializable, ! ConnectionMonitor { // Please don't change the formatting of the version variables below. *************** *** 99,112 **** /** - * Default Request header fields. - * So far this is just "User-Agent". - */ - protected static Map mDefaultRequestProperties = new HashMap (); - static - { - mDefaultRequestProperties.put ("User-Agent", "HTMLParser/" + VERSION_NUMBER); - } - - /** * Feedback object. */ --- 96,99 ---- *************** *** 119,130 **** /** - * Variable to store lineSeparator. - * This is setup to read <code>line.separator</code> from the System property. - * However it can also be changed using the mutator methods. - * This will be used in the toHTML() methods in all the sub-classes of Node. - */ - protected static String lineSeparator = System.getProperty("line.separator", "\n"); - - /** * A quiet message sink. * Use this for no feedback. --- 106,109 ---- *************** *** 143,154 **** /** - * @param lineSeparatorString New Line separator to be used - */ - public static void setLineSeparator(String lineSeparatorString) - { - lineSeparator = lineSeparatorString; - } - - /** * Return the version string of this parser. * @return A string of the form: --- 122,125 ---- *************** *** 173,237 **** /** ! * Get the current default request header properties. ! * A String-to-String map of header keys and values. ! * These fields are set by the parser when creating a connection. */ ! public static Map getDefaultRequestProperties () { ! return (mDefaultRequestProperties); } /** ! * Set the default request header properties. ! * A String-to-String map of header keys and values. ! * These fields are set by the parser when creating a connection. 
! * Some of these can be set directly on a <code>URLConnection</code>, ! * i.e. If-Modified-Since is set with setIfModifiedSince(long), ! * but since the parser transparently opens the connection on behalf ! * of the developer, these properties are not available before the ! * connection is fetched. Setting these request header fields affects all ! * subsequent connections opened by the parser. For more direct control ! * create a <code>URLConnection</code> and set it on the parser.<p> ! * From <a href="http://www.ietf.org/rfc/rfc2616.txt">RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1</a>: ! * <pre> ! * 5.3 Request Header Fields ! * ! * The request-header fields allow the client to pass additional ! * information about the request, and about the client itself, to the ! * server. These fields act as request modifiers, with semantics ! * equivalent to the parameters on a programming language method ! * invocation. ! * ! * request-header = Accept ; Section 14.1 ! * | Accept-Charset ; Section 14.2 ! * | Accept-Encoding ; Section 14.3 ! * | Accept-Language ; Section 14.4 ! * | Authorization ; Section 14.8 ! * | Expect ; Section 14.20 ! * | From ; Section 14.22 ! * | Host ; Section 14.23 ! * | If-Match ; Section 14.24 ! * | If-Modified-Since ; Section 14.25 ! * | If-None-Match ; Section 14.26 ! * | If-Range ; Section 14.27 ! * | If-Unmodified-Since ; Section 14.28 ! * | Max-Forwards ; Section 14.31 ! * | Proxy-Authorization ; Section 14.34 ! * | Range ; Section 14.35 ! * | Referer ; Section 14.36 ! * | TE ; Section 14.39 ! * | User-Agent ; Section 14.43 ! * ! * Request-header field names can be extended reliably only in ! * combination with a change in the protocol version. However, new or ! * experimental header fields MAY be given the semantics of request- ! * header fields if all parties in the communication recognize them to ! * be request-header fields. Unrecognized header fields are treated as ! * entity-header fields. ! * </pre> */ ! 
public static void setDefaultRequestProperties (Map properties) { ! mDefaultRequestProperties = properties; } --- 144,181 ---- /** ! * Get the connection manager all Parsers use. ! * @return The connection manager. */ ! public static ConnectionManager getConnectionManager () { ! return (Page.getConnectionManager ()); } /** ! * Set the connection manager all Parsers use. ! * @return The connection manager. */ ! public static void setConnectionManager (ConnectionManager manager) { ! Page.setConnectionManager (manager); ! } ! ! /** ! * Creates the parser on an input string. ! * @param html The string containing HTML. ! * @param charset <em>Optional</em>. The character set encoding that will ! * be reported by {@link #getEncoding}. If charset is <code>null</code> ! * the default character set is used. ! * @return A parser with the <code>html</code> string as input. ! */ ! public static Parser createParser (String html, String charset) ! { ! Parser ret; ! ! if (null == html) ! throw new IllegalArgumentException ("html cannot be null"); ! ret = new Parser (new Lexer (new Page (html, charset))); ! ! return (ret); } *************** *** 271,275 **** * is provided. */ ! public Parser(Lexer lexer, ParserFeedback fb) { setFeedback (fb); --- 215,219 ---- * is provided. */ ! public Parser (Lexer lexer, ParserFeedback fb) { setFeedback (fb); *************** *** 303,309 **** * @see #Parser(URLConnection,ParserFeedback) */ ! public Parser(String resourceLocn, ParserFeedback feedback) throws ParserException { ! this (openConnection (resourceLocn, feedback), feedback); } --- 247,253 ---- * @see #Parser(URLConnection,ParserFeedback) */ ! public Parser (String resourceLocn, ParserFeedback feedback) throws ParserException { ! this (getConnectionManager ().openConnection (resourceLocn), feedback); } *************** *** 313,317 **** * @param resourceLocn Either the URL or the filename (autodetects). */ ! 
public Parser(String resourceLocn) throws ParserException { this (resourceLocn, stdout); --- 257,261 ---- * @param resourceLocn Either the URL or the filename (autodetects). */ ! public Parser (String resourceLocn) throws ParserException { this (resourceLocn, stdout); *************** *** 395,399 **** { if ((null != url) && !"".equals (url)) ! setConnection (openConnection (url, getFeedback ())); } --- 339,343 ---- { if ((null != url) && !"".equals (url)) ! setConnection (Page.getConnectionManager ().openConnection (url)); } *************** *** 573,748 **** } - /** - * Opens a connection using the given url. - * @param url The url to open. - * @param feedback The ibject to use for messages or <code>null</code>. - * @exception ParserException if an i/o exception occurs accessing the url. - */ - public static URLConnection openConnection (URL url, ParserFeedback feedback) - throws - ParserException - { - Map properties; - String key; - String value; - URLConnection ret; - - try - { - ret = url.openConnection (); - properties = getDefaultRequestProperties (); - if (null != properties) - for (Iterator iterator = properties.keySet ().iterator (); iterator.hasNext (); ) - { - key = (String)iterator.next (); - value = (String)properties.get (key); - ret.setRequestProperty (key, value); - } - } - catch (IOException ioe) - { - String msg = "HTMLParser.openConnection() : Error in opening a connection to " + url.toExternalForm (); - ParserException ex = new ParserException (msg, ioe); - if (null != feedback) - feedback.error (msg, ex); - throw ex; - } - - return (ret); - } - - /** - * Turn spaces into %20. - * @param url The url containing spaces. - * @return The URL with spaces as %20 sequences. 
- */ - public static String fixSpaces (String url) - { - int index; - int length; - char ch; - StringBuffer returnURL; - - index = url.indexOf (' '); - if (-1 != index) - { - length = url.length (); - returnURL = new StringBuffer (length * 3); - returnURL.append (url.substring (0, index)); - for (int i = index; i < length; i++) - { - ch = url.charAt (i); - if (ch==' ') - returnURL.append ("%20"); - else - returnURL.append (ch); - } - url = returnURL.toString (); - } - - return (url); - } - - /** - * Opens a connection based on a given string. - * The string is either a file, in which case <code>file://localhost</code> - * is prepended to a canonical path derived from the string, or a url that - * begins with one of the known protocol strings, i.e. <code>http://</code>. - * Embedded spaces are silently converted to %20 sequences. - * @param string The name of a file or a url. - * @param feedback The object to use for messages or <code>null</code> for no feedback. - * @exception ParserException if the string is not a valid url or file. 
- */ - public static URLConnection openConnection (String string, ParserFeedback feedback) - throws - ParserException - { - final String prefix = "file://localhost"; - String resource; - URL url; - StringBuffer buffer; - URLConnection ret; - - try - { - url = new URL (fixSpaces (string)); - ret = openConnection (url, feedback); - } - catch (MalformedURLException murle) - { // try it as a file - try - { - File file = new File (string); - resource = file.getCanonicalPath (); - buffer = new StringBuffer (prefix.length () + resource.length ()); - buffer.append (prefix); - if (!resource.startsWith ("/")) - buffer.append ("/"); - buffer.append (resource); - url = new URL (fixSpaces (buffer.toString ())); - ret = openConnection (url, feedback); - if (null != feedback) - feedback.info (url.toExternalForm ()); - } - catch (MalformedURLException murle2) - { - String msg = "HTMLParser.openConnection() : Error in opening a connection to " + string; - ParserException ex = new ParserException (msg, murle2); - if (null != feedback) - feedback.error (msg, ex); - throw ex; - } - catch (IOException ioe) - { - String msg = "HTMLParser.openConnection() : Error in opening a connection to " + string; - ParserException ex = new ParserException (msg, ioe); - if (null != feedback) - feedback.error (msg, ex); - throw ex; - } - } - - return (ret); - } - - /** - * The main program, which can be executed from the command line - */ - public static void main(String [] args) - { - System.out.println("HTMLParser v"+VERSION_STRING); - if (args.length<1 || args[0].equals("-help")) - { - System.out.println(); - System.out.println("Syntax : java -jar htmlparser.jar <resourceLocn/website> [node_type]"); - System.out.println(" <resourceLocn/website> the URL or file to be parsed"); - System.out.println(" node_type an optional node name, for example:"); - System.out.println(" A - Show only the link tags extracted from the document"); - System.out.println(" IMG - Show only the image tags extracted from the 
document"); - System.out.println(" TITLE - Extract the title from the document"); - System.out.println(); - System.out.println("Example : java -jar htmlparser.jar http://www.yahoo.com"); - System.out.println(); - System.out.println("For support, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page..."); - System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net"); - System.out.println(); - System.exit(-1); - } - try - { - Parser parser = new Parser (args[0]); - System.out.println ("Parsing " + parser.getURL ()); - NodeFilter filter; - if (1 < args.length) - filter = new TagNameFilter (args[1]); - else - filter = null; - parser.parse (filter); - } - catch (ParserException e) { - e.printStackTrace(); - } - } - public void visitAllNodesWith(NodeVisitor visitor) throws ParserException { Node node; --- 517,520 ---- *************** *** 798,825 **** } /** ! * Creates the parser on an input string. ! * @param html The string containing HTML. ! * @param charset <em>Optional</em>. The character set encoding that will ! * be reported by {@link #getEncoding}. If charset is <code>null</code> ! * the default character set is used. ! * @return A parser with the <code>html</code> string as input. */ ! public static Parser createParser (String html, String charset) ! { ! Parser ret; ! ! if (null == html) ! throw new IllegalArgumentException ("html cannot be null"); ! ret = new Parser (new Lexer (new Page (html, charset))); ! return (ret); } /** ! * @return String lineSeparator that will be used in toHTML() */ ! public static String getLineSeparator() { ! return lineSeparator; } } --- 570,652 ---- } + // + // ConnectionMonitor interface + // + /** ! * Called just prior to calling connect. ! * The connection has been conditioned with proxy, URL user/password, ! * and cookie information. It is still possible to adjust the ! * connection to alter the request method for example. ! 
* @param connection The connection which is about to be connected. ! * @exception This exception is thrown if the connection monitor ! * wants the ConnectionManager to bail out. */ ! public void preConnect (HttpURLConnection connection) ! throws ! ParserException ! { ! if (null != getFeedback ()) ! getFeedback ().info (ConnectionManager.getRequestHeader (connection)); ! } ! /** Called just after calling connect. ! * The response code and header fields can be examined. ! * @param connection The connection that was just connected. ! * @exception This exception is thrown if the connection monitor ! * wants the ConnectionManager to bail out. ! */ ! public void postConnect (HttpURLConnection connection) ! throws ! ParserException ! { ! if (null != getFeedback ()) ! getFeedback ().info (ConnectionManager.getResponseHeader (connection)); } /** ! * The main program, which can be executed from the command line */ ! public static void main (String [] args) ! { ! Parser parser; ! NodeFilter filter; ! ! if (args.length < 1 || args[0].equals ("-help")) ! { ! System.out.println ("HTML Parser v" + VERSION_STRING + "\n"); ! System.out.println (); ! System.out.println ("Syntax : java -jar htmlparser.jar <resourceLocn/website> [node_type]"); ! System.out.println (" <resourceLocn/website> the URL or file to be parsed"); ! System.out.println (" node_type an optional node name, for example:"); ! System.out.println (" A - Show only the link tags extracted from the document"); ! System.out.println (" IMG - Show only the image tags extracted from the document"); ! System.out.println (" TITLE - Extract the title from the document"); ! System.out.println (); ! System.out.println ("Example : java -jar htmlparser.jar http://www.yahoo.com"); ! System.out.println (); ! System.out.println ("For support, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page..."); ! System.out.println ("HTML Parser home page : http://htmlparser.org"); ! System.out.println (); ! 
} ! else ! try ! { ! parser = new Parser (); ! if (1 < args.length) ! filter = new TagNameFilter (args[1]); ! else ! { // for a simple dump, use more verbose settings ! filter = null; ! parser.setFeedback (Parser.stdout); ! getConnectionManager ().setMonitor (parser); ! } ! parser.setURL (args[0]); ! parser.parse (filter); ! } ! catch (ParserException e) ! { ! e.printStackTrace (); ! } } } |