[Htmlparser-cvs] htmlparser/src/org/htmlparser Parser.java, 1.116, 1.117
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2006-06-02 01:48:48
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv32515 Modified Files: Parser.java Log Message: implement RFE #1436082 Follow redirections with cookie processing Use ConnectionManager.setRedirectionProcessingEnabled(true). Probably only useful if combined with ConnectionManager.setCookieProcessingEnabled(true). Index: Parser.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.116 retrieving revision 1.117 diff -C2 -d -r1.116 -r1.117 *** Parser.java 30 May 2006 03:11:02 -0000 1.116 --- Parser.java 2 Jun 2006 01:48:44 -0000 1.117 *************** *** 230,233 **** --- 230,234 ---- * the default character set is used. * @return A parser with the <code>html</code> string as input. + * @exception IllegalArgumentException if <code>html</code> is <code>null</code>. */ public static Parser createParser (String html, String charset) *************** *** 271,276 **** { setFeedback (fb); - if (null == lexer) - throw new IllegalArgumentException ("lexer cannot be null"); setLexer (lexer); setNodeFactory (new PrototypicalNodeFactory ()); --- 272,275 ---- *************** *** 315,341 **** ParserException { - int length; - boolean html; - char ch; - - if (null == resource) - throw new IllegalArgumentException ("resource cannot be null"); setFeedback (feedback); ! length = resource.length (); ! html = false; ! for (int i = 0; i < length; i++) ! { ! ch = resource.charAt (i); ! if (!Character.isWhitespace (ch)) ! { ! if ('<' == ch) ! html = true; ! break; ! } ! } ! if (html) ! setLexer (new Lexer (new Page (resource))); ! else ! setLexer (new Lexer (getConnectionManager ().openConnection (resource))); setNodeFactory (new PrototypicalNodeFactory ()); } --- 314,319 ---- ParserException { setFeedback (feedback); ! setResource (resource); setNodeFactory (new PrototypicalNodeFactory ()); } *************** *** 388,391 **** --- 366,403 ---- /** + * Set the html, a url, or a file. + * @param resource The resource to use. + * @exception IllegalArgumentException if <code>resource</code> is <code>null</code>. + * @exception ParserException if a problem occurs in connecting. + */ + public void setResource (String resource) + throws + ParserException + { + int length; + boolean html; + char ch; + + if (null == resource) + throw new IllegalArgumentException ("resource cannot be null"); + length = resource.length (); + html = false; + for (int i = 0; i < length; i++) + { + ch = resource.charAt (i); + if (!Character.isWhitespace (ch)) + { + if ('<' == ch) + html = true; + break; + } + } + if (html) + setLexer (new Lexer (new Page (resource))); + else + setLexer (new Lexer (getConnectionManager ().openConnection (resource))); + } + + /** * Set the connection for this parser. * This method creates a new <code>Lexer</code> reading from the connection. *************** *** 397,400 **** --- 409,414 ---- * @see #setLexer * @see #getConnection + * @exception IllegalArgumentException if <code>connection</code> is <code>null</code>. + * @exception ParserException if a problem occurs in connecting. */ public void setConnection (URLConnection connection) *************** *** 425,428 **** --- 439,443 ---- * @throws ParserException If the url is invalid or creation of the * underlying Lexer cannot be performed. + * @exception ParserException if a problem occurs in connecting. * @see #getURL */ *************** *** 481,488 **** * since the lexer owns the node factory object. * It does not adjust the <code>feedback</code> object. - * Trying to set the lexer to <code>null</code> is a no-op. * @param lexer The lexer object to use. * @see #setNodeFactory * @see #getLexer */ public void setLexer (Lexer lexer) --- 496,503 ---- * since the lexer owns the node factory object. * It does not adjust the <code>feedback</code> object. * @param lexer The lexer object to use. * @see #setNodeFactory * @see #getLexer + * @exception IllegalArgumentException if <code>lexer</code> is <code>null</code>. */ public void setLexer (Lexer lexer) *************** *** 491,510 **** String type; ! if (null != lexer) ! { // move a node factory that's been set to the new lexer ! factory = null; ! if (null != getLexer ()) ! factory = getLexer ().getNodeFactory (); ! if (null != factory) ! lexer.setNodeFactory (factory); ! mLexer = lexer; ! // warn about content that's not likely text ! type = mLexer.getPage ().getContentType (); ! if (type != null && !type.startsWith ("text")) ! getFeedback ().warning ( ! "URL " ! + mLexer.getPage ().getUrl () ! + " does not contain text"); ! } } --- 506,525 ---- String type; ! if (null == lexer) ! throw new IllegalArgumentException ("lexer cannot be null"); ! // move a node factory that's been set to the new lexer ! factory = null; ! if (null != getLexer ()) ! factory = getLexer ().getNodeFactory (); ! if (null != factory) ! lexer.setNodeFactory (factory); ! mLexer = lexer; ! // warn about content that's not likely text ! type = mLexer.getPage ().getContentType (); ! if (type != null && !type.startsWith ("text")) ! getFeedback ().warning ( ! "URL " ! + mLexer.getPage ().getUrl () ! + " does not contain text"); } *************** *** 533,536 **** --- 548,552 ---- * @param factory The new node factory for the current lexer. * @see #getNodeFactory + * @exception IllegalArgumentException if <code>factory</code> is <code>null</code>. */ public void setNodeFactory (NodeFactory factory) *************** *** 720,723 **** --- 736,740 ---- * @throws ParserException If a error occurs in setting up the * underlying Lexer. + * @exception IllegalArgumentException if <code>inputHTML</code> is <code>null</code>. */ public void setInputHTML (String inputHTML) *************** *** 838,852 **** try { if (1 < args.length) filter = new TagNameFilter (args[1]); else - filter = null; - parser = new Parser (args[0]); - if (1 < args.length) { // for a simple dump, use more verbose settings parser.setFeedback (Parser.STDOUT); getConnectionManager ().setMonitor (parser); } System.out.println (parser.parse (filter)); } --- 855,871 ---- try { + parser = new Parser (); if (1 < args.length) filter = new TagNameFilter (args[1]); else { + filter = null; // for a simple dump, use more verbose settings parser.setFeedback (Parser.STDOUT); getConnectionManager ().setMonitor (parser); } + getConnectionManager ().setRedirectionProcessingEnabled (true); + getConnectionManager ().setCookieProcessingEnabled (true); + parser.setResource (args[0]); System.out.println (parser.parse (filter)); } |