htmlparser-cvs Mailing List for HTML Parser
Brought to you by:
derrickoswald
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(141) |
Jun
(108) |
Jul
(66) |
Aug
(127) |
Sep
(155) |
Oct
(149) |
Nov
(72) |
Dec
(72) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(100) |
Feb
(36) |
Mar
(21) |
Apr
(3) |
May
(87) |
Jun
(28) |
Jul
(84) |
Aug
(5) |
Sep
(14) |
Oct
|
Nov
|
Dec
|
2005 |
Jan
(1) |
Feb
(39) |
Mar
(26) |
Apr
(38) |
May
(14) |
Jun
(10) |
Jul
|
Aug
|
Sep
(13) |
Oct
(8) |
Nov
(10) |
Dec
|
2006 |
Jan
|
Feb
(1) |
Mar
(17) |
Apr
(20) |
May
(28) |
Jun
(24) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2015 |
Jan
|
Feb
|
Mar
(1) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: EBS-Printer c. s. <sa...@eb...> - 2015-03-14 16:22:36
|
<html> <style>body {font-family:Arial, Verdana; font-size:11px; background-image: url( http://www.ebsenterprise.com/Files/NewsLetterBGImage/1425954355263741. ); background-repeat: no-repeat; } </style><body> <p> <span style="font-size: 14px">The following product is now available for sale:</span></p> <p> <span style="font-size: 16px"><strong><strong><span style="font-size: 16px"><strong><strong><span style="font-size: 16px"><strong><strong>Product: <strong><font size="3">Genuine 73/73N Inkjet Cartridges</font></strong></strong></strong></span><span style="font-size: 16px"><strong><strong><strong><strong><strong><font size="3"><span style="font-size: 16px"><strong><strong><strong><strong><font size="3"><span style="font-size: 16px"><strong><strong><strong><strong><br /> </strong></strong></strong></strong></span></font></strong></strong></strong></strong></span></font></strong></strong></strong></strong></strong></span></strong></strong></span></strong></strong></span><span style="font-size: 16px"><strong><strong><span style="font-size: 16px"><strong><strong><span style="font-size: 16px"><strong><strong><strong><strong><strong><font size="3"><font size="3"><strong><strong><font size="3"><span style="font-size: 16px"><strong><strong><span style="font-size: 16px"><strong><strong><span style="font-size: 16px"><strong><strong><strong><strong><strong><font size="3"><font size="3"><strong><strong><font size="3">Model#: T0731/T0731N - T0734/T0734N (CMYK)</font></strong><strong><font size="3"><strong><font size="3"><br /> </font></strong></font></strong></strong></font></font></strong></strong></strong></strong></strong></span></strong></strong></span></strong></strong></span>Quantities Available: 4000 sets</font></strong><strong><font size="3"><strong><font size="3"><br /> </font></strong></font></strong></strong></font></font></strong></strong></strong></strong></strong></span></strong></strong></span><span style="font-size: 16px"><strong><strong><span style="font-size: 
16px"><strong><strong><strong><strong><font size="3">Packing Condition : Genuine bulk packaging</font></strong></strong></strong></strong></span></strong></strong></span></strong></strong></span></p> <table border="6" cellpadding="0" cellspacing="0" style="height: 29px; width: 61px"> <tbody> <tr> <td> <img alt="" src="cid:269551.jpg" style="height: 269px; width: 196px" /></td> <td> <img alt="" src="cid:269552.jpg" style="height: 269px; width: 250px" /></td> </tr> </tbody> </table> <p> <span class="Apple-style-span" style="white-space: normal; word-spacing: 0px; border-collapse: separate; text-transform: none; color: rgb(0,0,0); font: medium 'times new roman'; orphans: 2; widows: 2; letter-spacing: normal; text-indent: 0px; -webkit-text-stroke-width: 0px; -webkit-border-horizontal-spacing: 0px; -webkit-border-vertical-spacing: 0px; -webkit-text-decorations-in-effect: none; -webkit-text-size-adjust: auto">Please let us know if interested.<span class="Apple-converted-space"> </span><br /> </span><span class="Apple-style-span" style="white-space: normal; word-spacing: 0px; border-collapse: separate; text-transform: none; color: rgb(0,0,0); font: medium 'times new roman'; orphans: 2; widows: 2; letter-spacing: normal; text-indent: 0px; -webkit-text-stroke-width: 0px; -webkit-border-horizontal-spacing: 0px; -webkit-border-vertical-spacing: 0px; -webkit-text-decorations-in-effect: none; -webkit-text-size-adjust: auto">Thank you for your attention</span></p> <div> <span class="Apple-style-span" style="white-space: normal; word-spacing: 0px; border-collapse: separate; text-transform: none; color: rgb(0,0,0); font: medium 'times new roman'; orphans: 2; widows: 2; letter-spacing: normal; text-indent: 0px; -webkit-text-stroke-width: 0px; -webkit-border-horizontal-spacing: 0px; -webkit-border-vertical-spacing: 0px; -webkit-text-decorations-in-effect: none; -webkit-text-size-adjust: auto">Regards,</span></div> <div> <span class="Apple-style-span" style="white-space: normal; 
word-spacing: 0px; border-collapse: separate; text-transform: none; color: rgb(0,0,0); font: medium 'times new roman'; orphans: 2; widows: 2; letter-spacing: normal; text-indent: 0px; -webkit-text-stroke-width: 0px; -webkit-border-horizontal-spacing: 0px; -webkit-border-vertical-spacing: 0px; -webkit-text-decorations-in-effect: none; -webkit-text-size-adjust: auto">EBS - Your international business partner </span></div> <div> <span class="Apple-style-span" style="white-space: normal; word-spacing: 0px; border-collapse: separate; text-transform: none; color: rgb(0,0,0); font: medium 'times new roman'; orphans: 2; widows: 2; letter-spacing: normal; text-indent: 0px; -webkit-text-stroke-width: 0px; -webkit-border-horizontal-spacing: 0px; -webkit-border-vertical-spacing: 0px; -webkit-text-decorations-in-effect: none; -webkit-text-size-adjust: auto">See our website for more products: <a href="http://www.ebsenterprise.com/">http://www.ebsenterprise.com/</a></span></div> <div> <span style="font-size: 10px"><span class="Apple-style-span" style="font-family: 'times new roman'; font-variant: normal; white-space: normal; word-spacing: 0px; border-collapse: separate; text-transform: none; font-weight: normal; color: rgb(0,0,0); font-style: normal; orphans: 2; widows: 2; letter-spacing: normal; line-height: normal; text-indent: 0px; -webkit-text-stroke-width: 0px; -webkit-border-horizontal-spacing: 0px; -webkit-border-vertical-spacing: 0px; -webkit-text-decorations-in-effect: none; -webkit-text-size-adjust: auto"><br /> Hua Chiao Commercial Center, 678 Nathan Road, Kowloon, Hong Kong<br /> Tel: +852-9721-9560<a _itemref="101" border="none #404040" c="NaN" href="javascript:ElClick(getId('Btn_Save'))" id="mmlink27" name="mM1" onclick="return $K(this._itemRef)" onmouseover="_p1(this);_mot=$P(_mot)" style="overflow: hidden; font-size: 999px; text-decoration: none; height: 30px; border-top-color: #404040; width: 69px; background: none transparent scroll repeat 0% 0%; position: 
absolute; border-left-color: #404040; outline-width: medium; border-bottom-color: #404040; outline-style: none; left: 68px; z-index: 1; border-right-color: #404040; outline-color: invert; line-height: normal; top: 2px; visibility: visible" target="_self" title=""> </a> Fax: +852-3007-3255<br /> Website: <a href="http://www.ebsenterprise.com/" rel="nofollow" target="_blank">http://www.ebsenterprise.com</a> Email: <a href="http://us.mc1617.mail.yahoo.com/mc/compose?to=sa...@eb..." rel="nofollow" target="_blank" ymailto="mailto:sa...@eb...">sa...@eb...</a><br /> <br /> </span></span></div> <img src="http://www.ebsenterprise.com/cgi-bin/index.cgi?Action=FrontEndHome.SubscribeRead&IDs=142595467026723&Email=htm...@li..." alt="line" width='300' height='1'> <br><br> If you do not wish to receive further messages, please click <a href="http://unsubscribed-thank-you.com/cgi-bin/index.cgi?Action=FrontEndHome.UnSubscribe&IDs=142595467026723&Email=htm...@li...">here</a> to unsubscribe.<br> </body> </html> |
From: Derrick O. <der...@us...> - 2006-06-10 15:11:48
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv28061/htmlparser/src/org/htmlparser/lexer Modified Files: Lexer.java Log Message: Update version to 1.6-20060610. Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.49 retrieving revision 1.50 diff -C2 -d -r1.49 -r1.50 *** Lexer.java 2 Jun 2006 03:14:20 -0000 1.49 --- Lexer.java 10 Jun 2006 15:11:32 -0000 1.50 *************** *** 73,77 **** */ public static final String ! VERSION_TYPE = "Integration Build" ; --- 73,77 ---- */ public static final String ! VERSION_TYPE = "Release Build" ; *************** *** 80,84 **** */ public static final String ! VERSION_DATE = "May 27, 2006" ; --- 80,84 ---- */ public static final String ! VERSION_DATE = "Jun 10, 2006" ; |
From: Derrick O. <der...@us...> - 2006-06-10 15:11:47
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv28061/htmlparser/docs Modified Files: changes.txt release.txt Log Message: Update version to 1.6-20060610. Index: release.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/release.txt,v retrieving revision 1.74 retrieving revision 1.75 diff -C2 -d -r1.74 -r1.75 *** release.txt 27 May 2006 18:43:24 -0000 1.74 --- release.txt 10 Jun 2006 15:11:32 -0000 1.75 *************** *** 1,3 **** ! HTMLParser Version 1.6 (Integration Build May 27, 2006) ********************************************* --- 1,3 ---- ! HTMLParser Version 1.6 (Release Build Jun 10, 2006) ********************************************* *************** *** 33,37 **** Definition list tags (dl, dt, dd), are also now included in the standard set of tags recognized by the parser. ! The node interface has been augmented with get first/last child and get previous/next sibling methods to ease traversing the HTML document. The TextNode class has an added isWhiteSpace method that returns true --- 33,40 ---- Definition list tags (dl, dt, dd), are also now included in the standard set of tags recognized by the parser. ! The FilterBean now has a 'recursive' property to control descent through ! children when applying filters. ! The NodeList class is a little more standard now with a remove(node) method. ! The Node interface has been augmented with get first/last child and get previous/next sibling methods to ease traversing the HTML document. The TextNode class has an added isWhiteSpace method that returns true *************** *** 42,57 **** along with new constructors to OrFilter/AndFilter that take an array of NodeFilter's. Refactoring ----------- - The FilterBean now has a 'recursive' property to control descent through - children when applying filters. - The NodeList class is a little more standard now with a remove(node) method. 
Some refactoring to allow the htmllexer jar file to be compiled by gcj. Moved non-JUnit test code to Request For Enhancement (RFE) as attachments, so all the code in the tests package should now compile. Bug Fixes --------- #1488951 RemarkNode.toPlainTextString() incorrect behaviour #1467712 Page#getCharset never works --- 45,63 ---- along with new constructors to OrFilter/AndFilter that take an array of NodeFilter's. + Deflate encoding is now handled correctly and there is now an option to + have the ConnectionManager follow redirections manually so that cookie + processing can occur between redirections. + There is a new override for toHtml() that avoids issuing generated end tags. Refactoring ----------- Some refactoring to allow the htmllexer jar file to be compiled by gcj. Moved non-JUnit test code to Request For Enhancement (RFE) as attachments, so all the code in the tests package should now compile. + Removed all deprecated classes and methods. Bug Fixes --------- + #1496863 StringBean collapse() adds extra whitespace #1488951 RemarkNode.toPlainTextString() incorrect behaviour #1467712 Page#getCharset never works *************** *** 70,75 **** --- 76,86 ---- Patches ------- + #1436082 Follow redirections with cookie processing #1338534 Support get first/last child, previous/next sibling + Requests For Enhancements + ------------ + #1394144 handle deflate encoding + Changes since Version 1.4 ------------------------- Index: changes.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/changes.txt,v retrieving revision 1.210 retrieving revision 1.211 diff -C2 -d -r1.210 -r1.211 *** changes.txt 27 May 2006 18:43:24 -0000 1.210 --- changes.txt 10 Jun 2006 15:11:31 -0000 1.211 *************** *** 12,19 **** * * * This file has been trimmed of changes prior to version 1.4. For access to * ! * earlier change histories, please consult cvs.htmlparser.sorceforge.net. 
* * * ******************************************************************************* Integration Build 1.6 - 20060527 -------------------------------- --- 12,120 ---- * * * This file has been trimmed of changes prior to version 1.4. For access to * ! * earlier change histories, please consult htmlparser.cvs.sorceforge.net. * * * ******************************************************************************* + Release Build 1.6 - 20060610 + -------------------------------- + + 2006-06-10 10:39 derrickoswald + + * docs/faq.html: + + add faq to docs + + 2006-06-05 19:53 derrickoswald + + * src/org/htmlparser/tests/InstanceofPerformanceTest.java: + + Remove InstanceofPerformanceTest, no longer needed. + + 2006-06-04 15:17 derrickoswald + + * src/org/htmlparser/tests/AllTests.java, + src/org/htmlparser/tests/ParserTest.java, + src/org/htmlparser/tests/tagTests/BodyTagTest.java, + src/org/htmlparser/tests/tagTests/FormTagTest.java, + src/org/htmlparser/tests/tagTests/LabelTagTest.java, + src/org/htmlparser/tests/tagTests/LinkTagTest.java, + src/org/htmlparser/tests/tagTests/ObjectCollectionTest.java, + build.xml, src/org/htmlparser/Parser.java, + src/org/htmlparser/StringNodeFactory.java, + src/org/htmlparser/Tag.java, + src/org/htmlparser/tests/lexerTests/TagTests.java, + src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java, + src/org/htmlparser/util/LinkProcessor.java, + src/org/htmlparser/util/SpecialHashtable.java, + src/org/htmlparser/util/Translate.java, + src/org/htmlparser/nodes/TagNode.java, + src/org/htmlparser/tags/LinkTag.java: + + Eliminate deprecated classes and methods. + Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, + and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. + + 2006-06-01 23:14 derrickoswald + + * src/org/htmlparser/: Parser.java, lexer/Lexer.java, + util/NodeTreeWalker.java: + + Fix Javadoc warnings. 
+ + 2006-06-01 22:43 derrickoswald + + * src/org/htmlparser/: http/ConnectionManager.java, + lexer/Page.java: + + implement RFE #1394144 handle deflate encoding + InflaterInputStream needed an additional Inflater argument. + + 2006-06-01 21:48 derrickoswald + + * src/org/htmlparser/: http/ConnectionManager.java, + http/HttpHeader.java, Parser.java: + + implement RFE #1436082 Follow redirections with cookie processing + Use ConnectionManager.setRedirectionProcessingEnabled(true). + Probably only useful if combined with ConnectionManager.setCookieProcessingEnabled(true). + + 2006-05-30 22:10 derrickoswald + + * src/org/htmlparser/: tests/utilTests/NodeListTest.java, + Node.java, nodes/AbstractNode.java, nodes/RemarkNode.java, + nodes/TagNode.java, nodes/TextNode.java, tags/CompositeTag.java, + tags/ScriptTag.java, util/NodeList.java: + + implement task #93148 toHtml(boolean verbatim) + To avoid printing generated end tags use toHtml(true). + + 2006-05-29 23:11 derrickoswald + + * src/org/htmlparser/Parser.java: + + Update javadoc for new Parser constructor behaviour. + + 2006-05-29 22:53 derrickoswald + + * src/org/htmlparser/Parser.java: + + Allow passing HTML in the Parser constructor. + So now it allows HTML, a URL or a file name. + + 2006-05-29 21:30 derrickoswald + + * src/org/htmlparser/http/ConnectionManager.java: + + Handle bad cookie names. + Traps cookie name problems, but ignores any following cookies. + + 2006-05-29 21:07 derrickoswald + + * src/org/htmlparser/: beans/StringBean.java, + tests/utilTests/BeanTest.java: + + fix bug#1496863 StringBean collapse() adds extra whitespace + Keep collapsing state machine state as member variable. + Integration Build 1.6 - 20060527 -------------------------------- |
From: Derrick O. <der...@us...> - 2006-06-10 15:11:47
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv28061/htmlparser/src/org/htmlparser Modified Files: Parser.java Log Message: Update version to 1.6-20060610. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.119 retrieving revision 1.120 diff -C2 -d -r1.119 -r1.120 *** Parser.java 4 Jun 2006 19:17:21 -0000 1.119 --- Parser.java 10 Jun 2006 15:11:32 -0000 1.120 *************** *** 128,132 **** */ public static final String ! VERSION_TYPE = "Integration Build" ; --- 128,132 ---- */ public static final String ! VERSION_TYPE = "Release Build" ; *************** *** 135,139 **** */ public static final String ! VERSION_DATE = "May 27, 2006" ; --- 135,139 ---- */ public static final String ! VERSION_DATE = "Jun 10, 2006" ; |
From: Derrick O. <der...@us...> - 2006-06-10 14:40:07
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv14011 Added Files: faq.html Log Message: add faq to docs --- NEW FILE: faq.html --- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html> <head> <title>HTML Parser Frequently Asked Questions</title> <meta name="author" content=" Derrick Oswald" /> <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" /> <link REL ="stylesheet" TYPE="text/css" HREF="javadoc/stylesheet.css" TITLE="Style"> </head> <body class="composite"> <div id="bodyColumn"> <div id="contentBox"> <head> <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></meta> <meta name="KeyWords" content="faq,htmlparser,java"></meta> <link rel="stylesheet" type="text/css" href="javadoc/stylesheet.css" title="Style"></link> </head> <div class="section"><h2>Frequently Asked Questions</h2> <ul> <li><a href="#encodingchangeexception">Why am I getting an EncodingChangeException?</a></li> <li><a href="#post">How can I use POST to fetch a page?</a></li> <li><a href="#timeout">Is there a way to force a timeout for delinquent pages?</a></li> <li><a href="#composite">Why aren't <P>, <B>, <I> etc. tags fully nested?</a></li> <li><a href="#quiet">How can I block parser messages from appearing on stdout?</a></li> <li><a href="#empty">How does the parser deal with tags like <tag/>?</a></li> <li><a href="#jsp">How is JSP parsed using the parser?</a></li> <li><a href="#byte">How do you find the byte offset from the beginning of a document for a tag?</a></li> </ul> <a name="encodingchangeexception"></a> <div class="section"><h3>Why am I getting an EncodingChangeException?</h3> An EncodingChangeException is thrown to let you, the user, know that some nodes already handed out by the parser are incorrect according to an encoding directive in a <META> tag. 
<p>When a <META> tag with an encoding directive is encountered, the parser rescans the input up to the current position using the new encoding. If a different character results from interpreting the bytes with the new encoding, the exception is thrown. </p> <p> If you are supplying the parser with your own input, as from a file, be sure to set the encoding if it is not the default (ISO-8859-1). You can do this on the Page, Lexer, or Parser objects. </p> <p> If the parser is fetching the data for you, the problem is with the HTTP server, which should have sent the correct encoding as part of the Content-Type header string. Given that you have no control over the server, the only solution is to reattempt the parse with the new encoding. </p> <p>After the exception is thrown, the parser has set its encoding to the new value, so you should be able to just reset and reparse; see, for example, the handling in StringBean: <div class="source"><pre> try { ... parser.parse (...) throws an EncodingChangeException... } catch (EncodingChangeException ece) { ... do whatever necessary to reset your state here try { // reset the parser parser.reset (); // try again with the encoding now in force parser.parse (...); } catch (ParserException pe) { } } catch (ParserException pe) { } </pre></div> </p> </div> <a name="post"></a> <div class="section"><h3>How can I use POST to fetch a page?</h3> <p>The standard HTTP request submitted by the parser is a GET. The usual request submitted by a form is a POST. </p> <p>To illustrate how to use POST with the parser, we'll submit a form to the WHOIS database of the American Registry for Internet Numbers (ARIN).<br></br> <i>Note: there is an equivalent GET form at http://ws.arin.net/whois</i>.<br></br> <i>See also:</i>.
<ul> <li>RIPE http://www.ripe.net/perl/whois</li> <li>APNIC http://www.apnic.net/apnic-bin/whois.pl</li> <li>LACNIC http://lacnic.net/cgi-bin/lacnic/whois</li> </ul> <p>On the ARIN web site, the page <a href="http://ws.arin.net/cgi-bin/whois.pl">http://ws.arin.net/cgi-bin/whois.pl</a> has the following FORM that asks for an IP address and returns the registry details: </p> <div class="source"><pre> <form name="thisform" method="POST" action="/cgi-bin/whois.pl"> <font face="arial,verdana,helvetica" size="2"> Search for : </font> <input type="text" Name="queryinput" size="20"> <input type="submit"><br> </form> </pre></div> <p>From this we determine that the <tt>METHOD</tt> is <tt>POST</tt> and the form should be submitted to <tt>/cgi-bin/whois.pl</tt>. This absolute URL is relative to the page it is found on, so the form should be submitted to <tt>http://ws.arin.net/cgi-bin/whois.pl</tt> when the <tt>Submit</tt> input is clicked. The only <tt>INPUT</tt> element other than the <tt>Submit</tt> is a single <tt>text</tt> field named <tt>queryinput</tt> that takes 20 or fewer characters. Other types of input element are described in <a href="http://www.w3.org/TR/html4/interact/forms.html">http://www.w3.org/TR/html4/interact/forms.html</a>. </p> <p>The basic operation is to pass a fully prepared <tt>HttpURLConnection</tt> connected to the <tt>POST</tt> target URL into the <tt>Parser</tt>, either in the constructor or via the <tt>setConnection()</tt> method. To condition the connection, use the <tt>setRequestMethod()</tt> method to set the <tt>POST</tt> operation, and the <tt>setRequestProperty()</tt> and other explicit method calls. Then write the input field(s) as an ampersand concatenation (<tt>"input1=value1&input2=value2&..."</tt>) into the <tt>PrintWriter</tt> obtained by a call to <tt>getOutputStream()</tt>. 
</p> <p>The following sample program illustrates the principles using a <tt>StringBean</tt>, but the same code could be used with a <tt>Parser</tt> by replacing the last three lines in the <tt>try</tt> block with: </p> <div class="source"><pre> parser = new Parser (); parser.setConnection (connection); // ... do parser operations </pre></div> <p></p> <div class="source"><pre> import java.io.PrintWriter; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import org.htmlparser.beans.StringBean; /** * WhoIs.java * Use POST to get information about an IP address from ws.arin.net. * Created on April 29, 2006, 11:06 PM */ public class WhoIs { String mText; // text extracted from the response to the POST request /** * Creates a new instance of WhoIs. */ public WhoIs (String ipaddress) { URL url; HttpURLConnection connection; StringBuffer buffer; PrintWriter out; StringBean bean; try { // from the 'action' (relative to the refering page) url = new URL ("http://ws.arin.net/cgi-bin/whois.pl"); connection = (HttpURLConnection)url.openConnection (); connection.setRequestMethod ("POST"); connection.setDoOutput (true); connection.setDoInput (true); connection.setUseCaches (false); // more or less of these may be required // see Request Header Definitions: http://www.ietf.org/rfc/rfc2616.txt connection.setRequestProperty ("Accept-Charset", "*"); connection.setRequestProperty ("Referer", "http://ws.arin.net/cgi-bin/whois.pl"); connection.setRequestProperty ("User-Agent", "WhoIs.java/1.0"); buffer = new StringBuffer (1024); // 'input' fields separated by ampersands (&) buffer.append ("queryinput="); buffer.append (ipaddress); // etc. out = new PrintWriter (connection.getOutputStream ()); out.print (buffer); out.close (); bean = new StringBean (); bean.setConnection (connection); mText = bean.getStrings (); } catch (Exception e) { mText = e.getMessage (); } } public String getText () { return (mText); } /** * Program mainline. 
* @param args The ip address (dot notation) to look up. */ public static void main (String[] args) { if (0 >= args.length) System.out.println ("Usage: java WhoIs <ipaddress>"); else System.out.println (new WhoIs (args[0]).getText ()); } } </pre></div> </div> <a name="timeout"></a> <div class="section"><h3>Is there a way to force a timeout for delinquent pages?</h3> <p>If you are using the Sun jvm, try using: <div class="source"><pre> System.setProperty ("sun.net.client.defaultReadTimeout", "7000"); System.setProperty ("sun.net.client.defaultConnectTimeout", "7000"); </pre></div> in the mainline before starting your main application processing. </p> <p>This sets the socket timeouts to 7 seconds, but you will need to catch the I/O exceptions. </p> </div> <a name="composite"></a> <div class="section"><h3>Why aren't <P>, <B>, <I> etc. tags fully nested?</h3> <p>Authors are sometimes lazy and often fail to close some tags as required by the HTML standard. This causes some problems for the parser. </p> <p>For this heuristic reason, not all possible tags are registered as composite tags, which is what generates the 'parent/child' nesting relationship. It is considered better to have a valid, less nested parse than a possibly invalid parse. </p> <p>You are free to add whatever nodes you like as composite nodes using the prototypical node factory paradigm. 
First create your class that derives from <tt>CompositeTag</tt> (copy and modify one of the existing tags that is most like your desired tag): </p> <div class="source"><pre> public class BoldTag extends CompositeTag { private static final String[] mIds = new String[] {"B"}; public BoldTag () { } public String[] getIds () { return (mIds); } public String[] getEnders () { return (mIds); } public String[] getEndTagEnders () { return (new String[0]); } } </pre></div> <p>Then, register an instance of your node with a PrototypicalNodeFactory: </p> <div class="source"><pre> PrototypicalNodeFactory factory = new PrototypicalNodeFactory (); factory.registerTag (new BoldTag ()); parser.setNodeFactory (factory); </pre></div> <p>The problem becomes detecting when the tag doesn't have a </B> like it should, so getEnders() and getEndTagEnders() should probably have a longer list of tag names. Enders are the tag names that force an end tag to be generated, while EndTagEnders are the end tags (</xxx>) that force an end tag to be generated. </p> </div> <a name="quiet"></a> <div class="section"><h3>How can I block parser messages from appearing on stdout?</h3> <p>The parser sends warning and error messages to standard output by default. You might want to block these messages. To achieve this, use a different feedback object: </p> <div class="source"><pre> Parser parser = new Parser ("http://...", new DefaultParserFeedback (DefaultParserFeedback.QUIET)); </pre></div> <p>The <tt>Parser</tt> class has a static member with just such a construction: </p> <div class="source"><pre> Parser parser = new Parser ("http://...", Parser.DEVNULL); </pre></div> <p>You can also switch the feedback to DEBUG mode, to get extra details.
</p> <div class="source"><pre> Parser parser = new Parser ("http://...", new DefaultParserFeedback (DefaultParserFeedback.DEBUG)); </pre></div> <p> To handle the feedback yourself, implement the <tt>ParserFeedback</tt> interface by implementing <tt>info()</tt>, <tt>warning()</tt> and <tt>error()</tt>. </p> </div> <a name="empty"></a> <div class="section"><h3>How does the parser deal with tags like <tag/>?</h3> <p> The parser handles tags ending with a slash as a normal <tt>Tag</tt> object. The <tt>Tag</tt> interface has a method - <tt>isEmptyXmlTag()</tt> which returns <tt>true</tt> if this is such an empty XML tag (has no end tag). </p> </div> <a name="jsp"></a> <div class="section"><h3>How is JSP parsed using the parser?</h3> <p>There is a <tt>JspTag</tt> class that handles "%", "%=" and "%@" tags, <em>but not within tags or remarks</em>. So, the Jsp tag within the tag <tt><input type='<%= MyType %>'></tt> would not be returned as a tag, but would instead be part of the text of the 'type' attribute, but the same tag within the text of the page would be returned as a <tt>JspTag</tt> tag. </p> </div> <a name="byte"></a> <div class="section"><h3>How do you find the byte offset from the beginning of a document for a tag?</h3> <p>Character positions are much easier to obtain than byte positions. Each tag returned by the parser or lexer has methods <tt>getStartPosition()</tt> and <tt>getEndPosition()</tt> which return the starting and ending character positions. </p> <p>These can be converted to line and column numbers in a hypothetical text file using <tt>row()</tt> and <tt>column()</tt> methods on the <tt>Page</tt> object: </p> <div class="source"><pre> Page page = parser.getLexer ().getPage (); int row = page.row (tag.getStartPosition ()); // note: zero based int column = page.column (tag.getStartPosition ()); </pre></div> <p>Converting a character position into a byte position is dependent on the character encoding used.
For the ISO-8859-1 encoding, the correspondence is one byte per character, but for other encodings, often more than one byte is used per character. Perhaps the only safe way is to write all the characters, up to the character position of interest, to a suitably encoded writer on a stream, flush the writer and then examine the byte position of the underlying stream. </p> </div> </div> </div> </div> <div class="clear"> <hr/> </div> <div id="footer"> <div class="xright">© 2001-2006 </div> <div class="clear"> <hr/> </div> </div> </body> </html> |
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/nodeDecoratorTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/tests/nodeDecoratorTests Removed Files: EscapeCharacterRemovingNodeTest.java NonBreakingSpaceConvertingNodeTest.java AllTests.java DecodingNodeTest.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. --- EscapeCharacterRemovingNodeTest.java DELETED --- --- NonBreakingSpaceConvertingNodeTest.java DELETED --- --- DecodingNodeTest.java DELETED --- --- AllTests.java DELETED --- |
From: Derrick O. <der...@us...> - 2006-06-06 02:45:57
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/tags Modified Files: LinkTag.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. Index: LinkTag.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/LinkTag.java,v retrieving revision 1.54 retrieving revision 1.55 diff -C2 -d -r1.54 -r1.55 *** LinkTag.java 10 Apr 2005 23:20:45 -0000 1.54 --- LinkTag.java 4 Jun 2006 19:17:22 -0000 1.55 *************** *** 70,74 **** * where the link points to, and the text it contains. * <p> ! * In order to get the contents of the link tag, use the method linkData(), * which returns an enumeration of nodes encapsulated within the link. * <p> --- 70,74 ---- * where the link points to, and the text it contains. * <p> ! * In order to get the contents of the link tag, use the method children(), * which returns an enumeration of nodes encapsulated within the link. * <p> *************** *** 77,81 **** * Node node ; * ImageTag imageTag; ! * for (Enumeration e=linkTag.linkData();e.hasMoreElements();) { * node = (Node)e.nextElement(); * if (node instanceof ImageTag) { --- 77,81 ---- * Node node ; * ImageTag imageTag; ! 
* for (Enumeration e=linkTag.children();e.hasMoreElements();) { * node = (Node)e.nextElement(); * if (node instanceof ImageTag) { *************** *** 276,282 **** if (null != getChildren ()) { - sb.append(" "+"LinkData\n"); - sb.append(" "+"--------\n"); - Node node; int i = 0; --- 276,279 ---- *************** *** 288,292 **** } } - sb.append(" "+"*** END of LinkData ***\n"); return sb.toString(); } --- 285,288 ---- *************** *** 303,315 **** /** - * This method returns an enumeration of data that it contains - * @return Enumeration - * @deprecated Use children() instead. - */ - public SimpleNodeIterator linkData() { - return children(); - } - - /** * Extract the link from the HREF attribute. * @return The URL from the HREF attibute. This is absolute if the tag has --- 299,302 ---- |
From: Derrick O. <der...@us...> - 2006-06-06 02:45:56
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodeDecorators In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/nodeDecorators Removed Files: DecodingNode.java EscapeCharacterRemovingNode.java AbstractNodeDecorator.java NonBreakingSpaceConvertingNode.java package.html Log Message: Eliminate deprecated classes and methods. Removed nodeDecorators package, StringNodeFactory, LinkProcessor, SpecialHashtable, and methods for linkData, non-Ex Attributes and extractAllNodesThatAre. --- NonBreakingSpaceConvertingNode.java DELETED --- --- package.html DELETED --- --- DecodingNode.java DELETED --- --- AbstractNodeDecorator.java DELETED --- --- EscapeCharacterRemovingNode.java DELETED --- |
From: Derrick O. <der...@us...> - 2006-06-06 02:45:55
|
Update of //cvsroot/htmlparser/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487 Modified Files: build.xml Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. Index: build.xml =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/build.xml,v retrieving revision 1.84 retrieving revision 1.85 diff -C2 -d -r1.84 -r1.85 *** build.xml 27 May 2006 18:43:25 -0000 1.84 --- build.xml 4 Jun 2006 19:17:21 -0000 1.85 *************** *** 286,290 **** <include name="org/htmlparser/util/sort/**/*.java"/> <include name="org/htmlparser/visitors/NodeVisitor.java"/> - <include name="org/htmlparser/parserHelper/SpecialHashtable.class"/> </javac> </target> --- 286,289 ---- *************** *** 326,330 **** <include name="org/htmlparser/util/NodeIterator.class"/> <include name="org/htmlparser/util/SimpleNodeIterator.class"/> - <include name="org/htmlparser/util/SpecialHashtable.class"/> <include name="org/htmlparser/util/EncodingChangeException.class"/> <include name="org/htmlparser/util/sort/**/*.class"/> --- 325,328 ---- |
From: Derrick O. <der...@us...> - 2006-06-06 02:45:53
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/tests/tagTests Modified Files: LabelTagTest.java FormTagTest.java LinkTagTest.java ObjectCollectionTest.java BodyTagTest.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. Index: FormTagTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/FormTagTest.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** FormTagTest.java 27 May 2006 14:03:52 -0000 1.48 --- FormTagTest.java 4 Jun 2006 19:17:20 -0000 1.49 *************** *** 297,308 **** ); ((PrototypicalNodeFactory)parser.getNodeFactory ()).unregisterTag (new FormTag ()); ! Node [] nodes = ! parser.extractAllNodesThatAre( FormTag.class ! ); assertEquals( ! "shouldnt have found form tag", 0, ! nodes.length ); } --- 297,308 ---- ); ((PrototypicalNodeFactory)parser.getNodeFactory ()).unregisterTag (new FormTag ()); ! NodeList nodes = ! parser.extractAllNodesThatMatch (new NodeClassFilter ( FormTag.class ! )); assertEquals( ! "shouldn't have found form tag", 0, ! nodes.size () ); } *************** *** 436,442 **** ); FormTag formTag = ! (FormTag)(parser.extractAllNodesThatAre( FormTag.class ! )[0]); assertNotNull("Should have found a form tag",formTag); assertStringEquals("name","form0",formTag.getFormName()); --- 436,442 ---- ); FormTag formTag = ! (FormTag)(parser.extractAllNodesThatMatch (new NodeClassFilter ( FormTag.class ! )).elementAt (0)); assertNotNull("Should have found a form tag",formTag); assertStringEquals("name","form0",formTag.getFormName()); *************** *** 503,509 **** createParser (html); formTag = ! (FormTag)(parser.extractAllNodesThatAre ( FormTag.class ! 
)[0]); assertNotNull ("Should have found a form tag",formTag); assertStringEquals ("name", "searchForm", formTag.getFormName ()); --- 503,509 ---- createParser (html); formTag = ! (FormTag)(parser.extractAllNodesThatMatch (new NodeClassFilter ( FormTag.class ! )).elementAt (0)); assertNotNull ("Should have found a form tag",formTag); assertStringEquals ("name", "searchForm", formTag.getFormName ()); *************** *** 554,560 **** createParser (html); formTag = ! (FormTag)(parser.extractAllNodesThatAre ( FormTag.class ! )[0]); assertNotNull ("Should have found a form tag",formTag); nl = formTag.getFormInputs (); --- 554,560 ---- createParser (html); formTag = ! (FormTag)(parser.extractAllNodesThatMatch (new NodeClassFilter ( FormTag.class ! )).elementAt (0)); assertNotNull ("Should have found a form tag",formTag); nl = formTag.getFormInputs (); Index: LabelTagTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/LabelTagTest.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** LabelTagTest.java 7 Dec 2003 23:41:43 -0000 1.1 --- LabelTagTest.java 4 Jun 2006 19:17:20 -0000 1.2 *************** *** 101,106 **** LabelTag labelTag = (LabelTag) node[0]; assertStringEquals("Label", html, labelTag.toHtml()); ! Hashtable attr = labelTag.getAttributes(); ! assertNull("ID",attr.get("id")); } --- 101,105 ---- LabelTag labelTag = (LabelTag) node[0]; assertStringEquals("Label", html, labelTag.toHtml()); ! assertNull("ID",labelTag.getAttribute("id")); } *************** *** 117,122 **** labelTag = (LabelTag) node[1]; assertStringEquals("Label", label2 + "</label>",labelTag.toHtml()); ! Hashtable attr = labelTag.getAttributes(); ! assertNull("ID",attr.get("id")); } --- 116,120 ---- labelTag = (LabelTag) node[1]; assertStringEquals("Label", label2 + "</label>",labelTag.toHtml()); ! 
assertNull("ID",labelTag.getAttribute("id")); } Index: LinkTagTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/LinkTagTest.java,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** LinkTagTest.java 12 Nov 2005 16:44:54 -0000 1.53 --- LinkTagTest.java 4 Jun 2006 19:17:20 -0000 1.54 *************** *** 31,34 **** --- 31,35 ---- import org.htmlparser.Tag; import org.htmlparser.Text; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.HeadTag; import org.htmlparser.tags.Html; *************** *** 36,39 **** --- 37,41 ---- import org.htmlparser.tags.LinkTag; import org.htmlparser.tests.ParserTestCase; + import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.SimpleNodeIterator; *************** *** 412,419 **** "<a href=s/7509><b>Yahoo! Movies</b></a>" + "</td>","http://www.yahoo.com"); ! Node linkNodes [] = parser.extractAllNodesThatAre(LinkTag.class); ! assertEquals("number of links",2,linkNodes.length); ! LinkTag linkTag = (LinkTag)linkNodes[0]; assertStringEquals("Link","http://www.yahoo.com/s/8741",linkTag.getLink()); // Verify the link data --- 414,421 ---- "<a href=s/7509><b>Yahoo! Movies</b></a>" + "</td>","http://www.yahoo.com"); ! NodeList linkNodes = parser.extractAllNodesThatMatch (new NodeClassFilter (LinkTag.class)); ! assertEquals("number of links", 2, linkNodes.size ()); ! 
LinkTag linkTag = (LinkTag)linkNodes.elementAt (0); assertStringEquals("Link","http://www.yahoo.com/s/8741",linkTag.getLink()); // Verify the link data Index: ObjectCollectionTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/ObjectCollectionTest.java,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** ObjectCollectionTest.java 31 Jul 2004 16:42:31 -0000 1.22 --- ObjectCollectionTest.java 4 Jun 2006 19:17:20 -0000 1.23 *************** *** 30,33 **** --- 30,34 ---- import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.Tag; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.Div; import org.htmlparser.tags.Span; *************** *** 70,74 **** ); parser.setNodeFactory (new PrototypicalNodeFactory (new Span ())); ! assertSpanContent(parser.extractAllNodesThatAre(Span.class)); } --- 71,75 ---- ); parser.setNodeFactory (new PrototypicalNodeFactory (new Span ())); ! assertSpanContent(parser.extractAllNodesThatMatch (new NodeClassFilter (Span.class)).toNodeArray ()); } Index: BodyTagTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/BodyTagTest.java,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** BodyTagTest.java 20 Mar 2006 00:02:50 -0000 1.22 --- BodyTagTest.java 4 Jun 2006 19:17:20 -0000 1.23 *************** *** 78,82 **** NodeIterator iterator; Node node; - Hashtable attributes; try --- 78,81 ---- *************** *** 91,97 **** if (node instanceof BodyTag) { ! attributes = ((BodyTag)node).getAttributes (); ! assertTrue ("no style attribute", attributes.containsKey ("STYLE")); ! assertTrue ("no title attribute", attributes.containsKey ("TITLE")); } else --- 90,95 ---- if (node instanceof BodyTag) { ! 
assertNotNull ("no style attribute", ((BodyTag)node).getAttribute ("STYLE")); ! assertNotNull ("no title attribute", ((BodyTag)node).getAttribute ("TITLE")); } else |
From: Derrick O. <der...@us...> - 2006-06-06 02:23:10
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/nodes Modified Files: TagNode.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. Index: TagNode.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/TagNode.java,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** TagNode.java 31 May 2006 02:10:15 -0000 1.7 --- TagNode.java 4 Jun 2006 19:17:22 -0000 1.8 *************** *** 40,44 **** import org.htmlparser.scanners.TagScanner; import org.htmlparser.util.ParserException; - import org.htmlparser.util.SpecialHashtable; import org.htmlparser.visitors.NodeVisitor; --- 40,43 ---- *************** *** 174,185 **** ret = null; ! if (name.equalsIgnoreCase (SpecialHashtable.TAGNAME)) ! ret = ((Attribute)getAttributesEx ().elementAt (0)).getName (); ! else ! { ! attribute = getAttributeEx (name); ! if (null != attribute) ! ret = attribute.getValue (); ! } return (ret); --- 173,179 ---- ret = null; ! attribute = getAttributeEx (name); ! if (null != attribute) ! ret = attribute.getValue (); return (ret); *************** *** 383,435 **** /** - * Gets the attributes in the tag. - * This is not the preferred method to get attributes, see {@link - * #getAttributesEx getAttributesEx} which returns a list of {@link - * Attribute} objects, which offer more information than the simple - * <code>String</code> objects available from this <code>Hashtable</code>. - * @return Returns a list of name/value pairs representing the attributes. - * These are not in order, the keys (names) are converted to uppercase and the values - * are not quoted, even if they need to be. 
The table <em>will</em> return - * <code>null</code> if there was no value for an attribute (no equals - * sign or nothing to the right of the equals sign). A special entry with - * a key of SpecialHashtable.TAGNAME ("$<TAGNAME>$") holds the tag name. - * The conversion to uppercase is performed with an ENGLISH locale. - */ - public Hashtable getAttributes () - { - Vector attributes; - Attribute attribute; - String value; - Hashtable ret; - - ret = new SpecialHashtable (); - attributes = getAttributesEx (); - if (0 < attributes.size ()) - { - // special handling for the node name - attribute = (Attribute)attributes.elementAt (0); - ret.put (SpecialHashtable.TAGNAME, attribute.getName ().toUpperCase (Locale.ENGLISH)); - // the rest - for (int i = 1; i < attributes.size (); i++) - { - attribute = (Attribute)attributes.elementAt (i); - if (!attribute.isWhitespace ()) - { - value = attribute.getValue (); - if (attribute.isEmpty ()) - value = SpecialHashtable.NOTHING; - if (null == value) - value = SpecialHashtable.NULLVALUE; - ret.put (attribute.getName ().toUpperCase (Locale.ENGLISH), value); - } - } - } - else - ret.put (SpecialHashtable.TAGNAME, ""); - - return (ret); - } - - /** * Return the name of this tag. * <p> --- 377,380 ---- *************** *** 529,580 **** /** * Sets the attributes. - * A special entry with a key of SpecialHashtable.TAGNAME ("$<TAGNAME>$") - * sets the tag name. - * @param attributes The attribute collection to set. 
- */ - public void setAttributes (Hashtable attributes) - { - Vector att; - String key; - String value; - char quote; - Attribute attribute; - - att = new Vector (); - for (Enumeration e = attributes.keys (); e.hasMoreElements (); ) - { - key = (String)e.nextElement (); - value = (String)attributes.get (key); - if (value.startsWith ("'") && value.endsWith ("'") && (2 <= value.length ())) - { - quote = '\''; - value = value.substring (1, value.length () - 1); - } - else if (value.startsWith ("\"") && value.endsWith ("\"") && (2 <= value.length ())) - { - quote = '"'; - value = value.substring (1, value.length () - 1); - } - else - quote = (char)0; - if (key.equals (SpecialHashtable.TAGNAME)) - { - attribute = new Attribute (value, null, quote); - att.insertElementAt (attribute, 0); - } - else - { - // add whitespace between attributes - attribute = new Attribute (" "); - att.addElement (attribute); - attribute = new Attribute (key, value, quote); - att.addElement (attribute); - } - } - this.mAttributes = att; - } - - /** - * Sets the attributes. * NOTE: Values of the extended hashtable are two element arrays of String, * with the first element being the original name (not uppercased), --- 474,477 ---- *************** *** 742,755 **** /** - * Returns table of attributes in the tag - * @return Hashtable - * @deprecated This method is deprecated. Use getAttributes() instead. - */ - public Hashtable getParsed () - { - return getAttributes (); - } - - /** * Default tag visiting code. * Based on <code>isEndTag()</code>, calls either <code>visitTag()</code> or --- 639,642 ---- |
From: Derrick O. <der...@us...> - 2006-06-06 02:18:47
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/tests Modified Files: ParserTest.java AllTests.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. Index: ParserTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTest.java,v retrieving revision 1.66 retrieving revision 1.67 diff -C2 -d -r1.66 -r1.67 *** ParserTest.java 17 Apr 2006 13:53:12 -0000 1.66 --- ParserTest.java 4 Jun 2006 19:17:20 -0000 1.67 *************** *** 773,777 **** File file; PrintWriter out; ! Node[] nodes; html = "<html></html>"; --- 773,777 ---- File file; PrintWriter out; ! NodeList nodes; html = "<html></html>"; *************** *** 840,851 **** { parser.setInputHTML (html); ! nodes = parser.extractAllNodesThatAre (LinkTag.class); } catch (ParserException e) { e.printStackTrace (); ! nodes = new Node[0]; } ! assertTrue ("node count", 3 == nodes.length); } catch (Exception e) --- 840,851 ---- { parser.setInputHTML (html); ! nodes = parser.extractAllNodesThatMatch (new NodeClassFilter (LinkTag.class)); } catch (ParserException e) { e.printStackTrace (); ! nodes = new NodeList (); } ! 
assertTrue ("node count", 3 == nodes.size ()); } catch (Exception e) Index: AllTests.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/AllTests.java,v retrieving revision 1.61 retrieving revision 1.62 diff -C2 -d -r1.61 -r1.62 *** AllTests.java 11 Apr 2006 12:03:07 -0000 1.61 --- AllTests.java 4 Jun 2006 19:17:20 -0000 1.62 *************** *** 60,64 **** suite.addTest (org.htmlparser.tests.visitorsTests.AllTests.suite ()); suite.addTest (org.htmlparser.tests.parserHelperTests.AllTests.suite ()); - suite.addTest (org.htmlparser.tests.nodeDecoratorTests.AllTests.suite ()); suite.addTestSuite (org.htmlparser.tests.filterTests.FilterTest.class); --- 60,63 ---- |
From: Derrick O. <der...@us...> - 2006-06-06 02:18:47
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/tests/scannersTests Modified Files: ScriptScannerTest.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. Index: ScriptScannerTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java,v retrieving revision 1.60 retrieving revision 1.61 diff -C2 -d -r1.60 -r1.61 *** ScriptScannerTest.java 27 May 2006 14:36:46 -0000 1.60 --- ScriptScannerTest.java 4 Jun 2006 19:17:21 -0000 1.61 *************** *** 31,34 **** --- 31,35 ---- import org.htmlparser.Node; import org.htmlparser.Parser; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.lexer.Lexer; *************** *** 80,85 **** // Check the data in the applet tag ScriptTag scriptTag = (ScriptTag)node[0]; ! Hashtable table = scriptTag.getAttributes(); ! String srcExpected = (String)table.get("SRC"); assertEquals("Expected SRC value",src,srcExpected); } --- 81,85 ---- // Check the data in the applet tag ScriptTag scriptTag = (ScriptTag)node[0]; ! String srcExpected = scriptTag.getAttribute ("SRC");; assertEquals("Expected SRC value",src,srcExpected); } *************** *** 215,219 **** ); Node scriptNodes [] = ! parser.extractAllNodesThatAre(ScriptTag.class); assertType( "scriptnode", --- 215,219 ---- ); Node scriptNodes [] = ! parser.extractAllNodesThatMatch (new NodeClassFilter (ScriptTag.class)).toNodeArray (); assertType( "scriptnode", |
From: Derrick O. <der...@us...> - 2006-06-06 02:18:46
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser Modified Files: Parser.java Tag.java Removed Files: StringNodeFactory.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. --- StringNodeFactory.java DELETED --- Index: Parser.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.118 retrieving revision 1.119 diff -C2 -d -r1.118 -r1.119 *** Parser.java 2 Jun 2006 03:14:20 -0000 1.118 --- Parser.java 4 Jun 2006 19:17:21 -0000 1.119 *************** *** 771,795 **** } - /** - * Convenience method to extract all nodes of a given class type. - * Equivalent to - * <code>extractAllNodesThatMatch (new NodeClassFilter (nodeType))</code>. - * @param nodeType The class of the nodes to collect. - * @throws ParserException If a parse error occurs. - * @return A list of nodes which have the class specified. - * @deprecated Use extractAllNodesThatMatch (new NodeClassFilter (cls)). - * @see #extractAllNodesThatAre - */ - public Node [] extractAllNodesThatAre (Class nodeType) - throws - ParserException - { - NodeList ret; - - ret = extractAllNodesThatMatch (new NodeClassFilter (nodeType)); - - return (ret.toNodeArray ()); - } - // // ConnectionMonitor interface --- 771,774 ---- Index: Tag.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/Tag.java,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** Tag.java 15 Nov 2005 02:09:10 -0000 1.7 --- Tag.java 4 Jun 2006 19:17:21 -0000 1.8 *************** *** 109,141 **** /** - * Gets the attributes in the tag. 
- * This is not the preferred method to get attributes, see {@link - * #getAttributesEx getAttributesEx} which returns a list of {@link - * Attribute} objects, which offer more information than the simple - * <code>String</code> objects available from this <code>Hashtable</code>. - * @return Returns a list of name/value pairs representing the attributes. - * These are not in order, the keys (names) are converted to uppercase - * and the values are not quoted, even if they need to be. - * The table <em>will</em> return <code>null</code> if there was no value - * for an attribute (either no equals sign or nothing to the right of the - * equals sign). A special entry with a key of - * SpecialHashtable.TAGNAME ("$<TAGNAME>$") holds the tag name. - * The conversion to uppercase is performed with an ENGLISH locale. - * @deprecated Use getAttributesEx() instead. - * @see #setAttributes - */ - Hashtable getAttributes (); - - /** - * Sets the attributes. - * A special entry with a key of SpecialHashtable.TAGNAME ("$<TAGNAME>$") - * sets the tag name. - * @param attributes The attribute collection to set. - * @deprecated Use setAttributesEx() instead. - * @see #getAttributes - */ - void setAttributes (Hashtable attributes); - - /** * Return the name of this tag. * <p> --- 109,112 ---- |
From: Derrick O. <der...@us...> - 2006-06-06 02:18:45
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/util Modified Files: Translate.java Removed Files: LinkProcessor.java SpecialHashtable.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. --- LinkProcessor.java DELETED --- --- SpecialHashtable.java DELETED --- Index: Translate.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Translate.java,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** Translate.java 31 Jul 2004 16:42:33 -0000 1.46 --- Translate.java 4 Jun 2006 19:17:21 -0000 1.47 *************** *** 727,762 **** /** - * Convert a reference to a unicode character. - * Convert a single numeric character reference or character entity reference - * to a unicode character. - * @param string The string to convert. Of the form &xxxx; or &#xxxx; with - * or without the leading ampersand or trailing semi-colon. - * @param start The starting pooint in the string to look for a character reference. - * @param end The ending point in the string to stop looking for a character reference. - * @return The converted character or ' |
From: Derrick O. <der...@us...> - 2006-06-06 02:18:44
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv19487/src/org/htmlparser/tests/lexerTests Modified Files: TagTests.java Log Message: Eliminate deprecated classes and methods. Removed nodeDecorator package, StringNodeFactory, LinkProcesor, SpecialHashTable, and methods for linkData, non-Ex Attributes and FindAllNodesThatAre. Index: TagTests.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/TagTests.java,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** TagTests.java 15 May 2005 11:49:05 -0000 1.14 --- TagTests.java 4 Jun 2006 19:17:21 -0000 1.15 *************** *** 31,34 **** --- 31,35 ---- import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.Tag; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.MetaTag; *************** *** 364,368 **** try { mResult = false; ! Node linkTag [] = mParser.extractAllNodesThatAre(LinkTag.class); mLink1 = (LinkTag)linkTag[0]; mLink2 = (LinkTag)linkTag[1]; --- 365,369 ---- try { mResult = false; ! Node linkTag [] = mParser.extractAllNodesThatMatch (new NodeClassFilter (LinkTag.class)).toNodeArray (); mLink1 = (LinkTag)linkTag[0]; mLink2 = (LinkTag)linkTag[1]; |
From: Derrick O. <der...@us...> - 2006-06-06 02:18:41
|
Update of //cvsroot/htmlparser/htmlparser/docs/samples In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv17004 Removed Files: crawler.html text.html custom.html links.html ripper.html index.html feedback.html exception.html linksEmbedded.html imageslinks.html Log Message: Remove obsolete samples directory. --- index.html DELETED --- --- linksEmbedded.html DELETED --- --- crawler.html DELETED --- --- links.html DELETED --- --- imageslinks.html DELETED --- --- ripper.html DELETED --- --- exception.html DELETED --- --- text.html DELETED --- --- custom.html DELETED --- --- feedback.html DELETED --- |
From: Derrick O. <der...@us...> - 2006-06-06 01:43:06
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv15840 Removed Files: InstanceofPerformanceTest.java Log Message: Remove InstanceofPerformanceTest, no longer needed. --- InstanceofPerformanceTest.java DELETED --- |
From: Derrick O. <der...@us...> - 2006-06-02 03:14:24
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv10423/util Modified Files: NodeTreeWalker.java Log Message: Fix Javadoc warnings. Index: NodeTreeWalker.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/NodeTreeWalker.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** NodeTreeWalker.java 13 Feb 2006 14:50:35 -0000 1.1 --- NodeTreeWalker.java 2 Jun 2006 03:14:21 -0000 1.2 *************** *** 78,82 **** /** * Creates a new instance of NodeTreeWalker using depth-first tree traversal, without limits on how deep it may traverse. ! * @param root Node The Node to set as the root of the tree. * @throws NullPointerException if root Node is null. */ --- 78,82 ---- /** * Creates a new instance of NodeTreeWalker using depth-first tree traversal, without limits on how deep it may traverse. ! * @param rootNode Node The Node to set as the root of the tree. * @throws NullPointerException if root Node is null. */ |
From: Derrick O. <der...@us...> - 2006-06-02 03:14:24
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv10423 Modified Files: Parser.java Log Message: Fix Javadoc warnings. Index: Parser.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.117 retrieving revision 1.118 diff -C2 -d -r1.117 -r1.118 *** Parser.java 2 Jun 2006 01:48:44 -0000 1.117 --- Parser.java 2 Jun 2006 03:14:20 -0000 1.118 *************** *** 325,329 **** * @throws ParserException If the resourceLocn argument does not resolve * to a valid page or file. ! * @see #Parser(string,ParserFeedback) */ public Parser (String resource) throws ParserException --- 325,329 ---- * @throws ParserException If the resourceLocn argument does not resolve * to a valid page or file. ! * @see #Parser(String,ParserFeedback) */ public Parser (String resource) throws ParserException |
From: Derrick O. <der...@us...> - 2006-06-02 03:14:24
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv10423/lexer Modified Files: Lexer.java Log Message: Fix Javadoc warnings. Index: Lexer.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** Lexer.java 27 May 2006 18:43:24 -0000 1.48 --- Lexer.java 2 Jun 2006 03:14:20 -0000 1.49 *************** *** 98,102 **** * a more lax (and closer to typical browser handling) remark parsing * is used. ! * Default <code>{@value}</code>. */ public static boolean STRICT_REMARKS = true; --- 98,102 ---- * a more lax (and closer to typical browser handling) remark parsing * is used. ! * Default <code>true</code>. */ public static boolean STRICT_REMARKS = true; |
From: Derrick O. <der...@us...> - 2006-06-02 02:43:29
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv26721/lexer Modified Files: Page.java Log Message: implement RFE #1394144 handle deflate encoding InflaterInputStream needed an additional Inflater argument. Index: Page.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** Page.java 27 May 2006 17:06:28 -0000 1.56 --- Page.java 2 Jun 2006 02:43:25 -0000 1.57 *************** *** 40,43 **** --- 40,44 ---- import java.net.UnknownHostException; import java.util.zip.GZIPInputStream; + import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; *************** *** 564,568 **** { stream = new Stream (new InflaterInputStream ( ! getConnection ().getInputStream ())); } else --- 565,569 ---- { stream = new Stream (new InflaterInputStream ( ! getConnection ().getInputStream (), new Inflater (true))); } else |
From: Derrick O. <der...@us...> - 2006-06-02 02:43:29
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/http In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv26721/http Modified Files: ConnectionManager.java Log Message: implement RFE #1394144 handle deflate encoding InflaterInputStream needed an additional Inflater argument. Index: ConnectionManager.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/http/ConnectionManager.java,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** ConnectionManager.java 2 Jun 2006 01:48:43 -0000 1.12 --- ConnectionManager.java 2 Jun 2006 02:43:24 -0000 1.13 *************** *** 60,64 **** mDefaultRequestProperties.put ("User-Agent", "HTMLParser/" + org.htmlparser.lexer.Lexer.VERSION_NUMBER); ! mDefaultRequestProperties.put ("Accept-Encoding", "gzip"); } --- 60,64 ---- mDefaultRequestProperties.put ("User-Agent", "HTMLParser/" + org.htmlparser.lexer.Lexer.VERSION_NUMBER); ! mDefaultRequestProperties.put ("Accept-Encoding", "gzip, deflate"); } |
From: Derrick O. <der...@us...> - 2006-06-02 01:48:48
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv32515 Modified Files: Parser.java Log Message: implement RFE #1436082 Follow redirections with cookie processing Use ConnectionManager.setRedirectionProcessingEnabled(true). Probably only useful if combined with ConnectionManager.setCookieProcessingEnabled(true). Index: Parser.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.116 retrieving revision 1.117 diff -C2 -d -r1.116 -r1.117 *** Parser.java 30 May 2006 03:11:02 -0000 1.116 --- Parser.java 2 Jun 2006 01:48:44 -0000 1.117 *************** *** 230,233 **** --- 230,234 ---- * the default character set is used. * @return A parser with the <code>html</code> string as input. + * @exception IllegalArgumentException if <code>html</code> is <code>null</code>. */ public static Parser createParser (String html, String charset) *************** *** 271,276 **** { setFeedback (fb); - if (null == lexer) - throw new IllegalArgumentException ("lexer cannot be null"); setLexer (lexer); setNodeFactory (new PrototypicalNodeFactory ()); --- 272,275 ---- *************** *** 315,341 **** ParserException { - int length; - boolean html; - char ch; - - if (null == resource) - throw new IllegalArgumentException ("resource cannot be null"); setFeedback (feedback); ! length = resource.length (); ! html = false; ! for (int i = 0; i < length; i++) ! { ! ch = resource.charAt (i); ! if (!Character.isWhitespace (ch)) ! { ! if ('<' == ch) ! html = true; ! break; ! } ! } ! if (html) ! setLexer (new Lexer (new Page (resource))); ! else ! setLexer (new Lexer (getConnectionManager ().openConnection (resource))); setNodeFactory (new PrototypicalNodeFactory ()); } --- 314,319 ---- ParserException { setFeedback (feedback); ! 
setResource (resource); setNodeFactory (new PrototypicalNodeFactory ()); } *************** *** 388,391 **** --- 366,403 ---- /** + * Set the html, a url, or a file. + * @param resource The resource to use. + * @exception IllegalArgumentException if <code>resource</code> is <code>null</code>. + * @exception ParserException if a problem occurs in connecting. + */ + public void setResource (String resource) + throws + ParserException + { + int length; + boolean html; + char ch; + + if (null == resource) + throw new IllegalArgumentException ("resource cannot be null"); + length = resource.length (); + html = false; + for (int i = 0; i < length; i++) + { + ch = resource.charAt (i); + if (!Character.isWhitespace (ch)) + { + if ('<' == ch) + html = true; + break; + } + } + if (html) + setLexer (new Lexer (new Page (resource))); + else + setLexer (new Lexer (getConnectionManager ().openConnection (resource))); + } + + /** * Set the connection for this parser. * This method creates a new <code>Lexer</code> reading from the connection. *************** *** 397,400 **** --- 409,414 ---- * @see #setLexer * @see #getConnection + * @exception IllegalArgumentException if <code>connection</code> is <code>null</code>. + * @exception ParserException if a problem occurs in connecting. */ public void setConnection (URLConnection connection) *************** *** 425,428 **** --- 439,443 ---- * @throws ParserException If the url is invalid or creation of the * underlying Lexer cannot be performed. + * @exception ParserException if a problem occurs in connecting. * @see #getURL */ *************** *** 481,488 **** * since the lexer owns the node factory object. * It does not adjust the <code>feedback</code> object. - * Trying to set the lexer to <code>null</code> is a no-op. * @param lexer The lexer object to use. * @see #setNodeFactory * @see #getLexer */ public void setLexer (Lexer lexer) --- 496,503 ---- * since the lexer owns the node factory object. 
* It does not adjust the <code>feedback</code> object. * @param lexer The lexer object to use. * @see #setNodeFactory * @see #getLexer + * @exception IllegalArgumentException if <code>lexer</code> is <code>null</code>. */ public void setLexer (Lexer lexer) *************** *** 491,510 **** String type; ! if (null != lexer) ! { // move a node factory that's been set to the new lexer ! factory = null; ! if (null != getLexer ()) ! factory = getLexer ().getNodeFactory (); ! if (null != factory) ! lexer.setNodeFactory (factory); ! mLexer = lexer; ! // warn about content that's not likely text ! type = mLexer.getPage ().getContentType (); ! if (type != null && !type.startsWith ("text")) ! getFeedback ().warning ( ! "URL " ! + mLexer.getPage ().getUrl () ! + " does not contain text"); ! } } --- 506,525 ---- String type; ! if (null == lexer) ! throw new IllegalArgumentException ("lexer cannot be null"); ! // move a node factory that's been set to the new lexer ! factory = null; ! if (null != getLexer ()) ! factory = getLexer ().getNodeFactory (); ! if (null != factory) ! lexer.setNodeFactory (factory); ! mLexer = lexer; ! // warn about content that's not likely text ! type = mLexer.getPage ().getContentType (); ! if (type != null && !type.startsWith ("text")) ! getFeedback ().warning ( ! "URL " ! + mLexer.getPage ().getUrl () ! + " does not contain text"); } *************** *** 533,536 **** --- 548,552 ---- * @param factory The new node factory for the current lexer. * @see #getNodeFactory + * @exception IllegalArgumentException if <code>factory</code> is <code>null</code>. */ public void setNodeFactory (NodeFactory factory) *************** *** 720,723 **** --- 736,740 ---- * @throws ParserException If a error occurs in setting up the * underlying Lexer. + * @exception IllegalArgumentException if <code>inputHTML</code> is <code>null</code>. 
*/ public void setInputHTML (String inputHTML) *************** *** 838,852 **** try { if (1 < args.length) filter = new TagNameFilter (args[1]); else - filter = null; - parser = new Parser (args[0]); - if (1 < args.length) { // for a simple dump, use more verbose settings parser.setFeedback (Parser.STDOUT); getConnectionManager ().setMonitor (parser); } System.out.println (parser.parse (filter)); } --- 855,871 ---- try { + parser = new Parser (); if (1 < args.length) filter = new TagNameFilter (args[1]); else { + filter = null; // for a simple dump, use more verbose settings parser.setFeedback (Parser.STDOUT); getConnectionManager ().setMonitor (parser); } + getConnectionManager ().setRedirectionProcessingEnabled (true); + getConnectionManager ().setCookieProcessingEnabled (true); + parser.setResource (args[0]); System.out.println (parser.parse (filter)); } |
From: Derrick O. <der...@us...> - 2006-06-02 01:48:47
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/http In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv32515/http Modified Files: HttpHeader.java ConnectionManager.java Log Message: implement RFE #1436082 Follow redirections with cookie processing Use ConnectionManager.setRedirectionProcessingEnabled(true). Probably only useful if combined with ConnectionManager.setCookieProcessingEnabled(true). Index: HttpHeader.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/http/HttpHeader.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** HttpHeader.java 19 Jun 2005 12:01:13 -0000 1.1 --- HttpHeader.java 2 Jun 2006 01:48:43 -0000 1.2 *************** *** 112,127 **** { message = conn.getResponseMessage (); - buffer.append ("HTTP/1.1 "); - buffer.append (code); - buffer.append (" "); - buffer.append (message); - buffer.append ("\n"); for (int i = 0; null != (value = conn.getHeaderField (i)); i++) { key = conn.getHeaderFieldKey (i); ! if (null != key) { ! buffer.append (key); ! buffer.append (": "); buffer.append (value); buffer.append ("\n"); --- 112,133 ---- { message = conn.getResponseMessage (); for (int i = 0; null != (value = conn.getHeaderField (i)); i++) { key = conn.getHeaderFieldKey (i); ! if ((null == key) && (0 == i)) { ! buffer.append ("HTTP/1.1 "); ! buffer.append (code); ! buffer.append (" "); ! buffer.append (message); ! buffer.append ("\n"); ! } ! else ! { ! if (null != key) ! { ! buffer.append (key); ! buffer.append (": "); ! 
} buffer.append (value); buffer.append ("\n"); Index: ConnectionManager.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/http/ConnectionManager.java,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** ConnectionManager.java 30 May 2006 01:30:16 -0000 1.11 --- ConnectionManager.java 2 Jun 2006 01:48:43 -0000 1.12 *************** *** 143,146 **** --- 143,151 ---- /** + * Flag determining if redirection processing is being handled manually. + */ + protected boolean mRedirectionProcessingEnabled; + + /** * Cookie expiry date format for parsing. */ *************** *** 171,174 **** --- 176,180 ---- mCookieJar = null; mMonitor = null; + mRedirectionProcessingEnabled = false; } *************** *** 488,491 **** --- 494,541 ---- /** + * Predicate to determine if url redirection processing is currently enabled. + * @return <code>true</code> if redirection is being processed manually. + * @see #setRedirectionProcessingEnabled + */ + public boolean getRedirectionProcessingEnabled () + { + return (mRedirectionProcessingEnabled); + } + + /** + * Enables or disables manual redirection handling. + * Normally the <code>HttpURLConnection</code> follows redirections + * (HTTP response code 3xx) automatically if the + * <code>followRedirects</code> property is <code>true</code>. + * With this flag set the <code>ConnectionMonitor</code> performs the + * redirection processing; The advantage being that cookies (if enabled) + * are passed in subsequent requests. + * @param enabled The new state of the redirectionProcessingEnabled property. + */ + public void setRedirectionProcessingEnabled (boolean enabled) + { + mRedirectionProcessingEnabled = enabled; + } + + /** + * Get the Location field if any. + * @param http The connection to get the location from. 
+ */ + protected String getLocation (HttpURLConnection http) + { + String key; + String value; + String ret; + + ret = null; + + for (int i = 0; ((null == ret) && (null != (value = http.getHeaderField (i)))); i++) + if ((null != (key = http.getHeaderFieldKey (i))) && (key.equalsIgnoreCase ("Location"))) + ret = value; + + return (ret); + } + + /** * Opens a connection using the given url. * @param url The url to open. *************** *** 497,500 **** --- 547,552 ---- ParserException { + boolean repeat; + int repeated; Properties sysprops; Hashtable properties; *************** *** 510,638 **** String auth; String encoded; URLConnection ret; ! try { try { ! // set up for proxy ! if ((null != getProxyHost ()) && (0 != getProxyPort ())) ! { ! sysprops = System.getProperties (); ! set = (String)sysprops.put ("proxySet", "true"); ! host = (String)sysprops.put ("proxyHost", getProxyHost ()); ! port = (String)sysprops.put ("proxyPort", ! Integer.toString (getProxyPort ())); ! // see http://java.sun.com/j2se/1.4.2/docs/guide/net/properties.html ! host2 = (String)sysprops.put ("http.proxyHost", ! getProxyHost ()); ! port2 = (String)sysprops.put ("http.proxyPort", ! Integer.toString (getProxyPort ())); ! System.setProperties (sysprops); ! ! } ! ! // open the connection... but don't connect yet ! ret = url.openConnection (); ! if (ret instanceof HttpURLConnection) { ! http = (HttpURLConnection)ret; ! ! // set the fixed request properties ! properties = getRequestProperties (); ! if (null != properties) ! for (enumeration = properties.keys (); ! enumeration.hasMoreElements ();) { ! key = (String)enumeration.nextElement (); ! value = (String)properties.get (key); ! ret.setRequestProperty (key, value); } ! // set the proxy name and password ! if ((null != getProxyUser ()) ! && (null != getProxyPassword ())) ! { ! auth = getProxyUser () + ":" + getProxyPassword (); ! encoded = encode (auth.getBytes("ISO-8859-1")); ! ret.setRequestProperty ("Proxy-Authorization", encoded); } ! ! 
// set the URL name and password ! if ((null != getUser ()) && (null != getPassword ())) { ! auth = getUser () + ":" + getPassword (); ! encoded = encode (auth.getBytes("ISO-8859-1")); ! ret.setRequestProperty ("Authorization", ! "Basic " + encoded); ! } ! ! // set the cookies based on the url ! addCookies (ret); ! if (null != getMonitor ()) ! getMonitor ().preConnect (http); ! } ! else ! http = null; ! try ! { ! ret.connect (); ! ! if (null != http) { ! if (null != getMonitor ()) ! getMonitor ().postConnect (http); ! ! parseCookies (ret); } } ! catch (UnknownHostException uhe) ! { ! int message = (int)(Math.random () * FOUR_OH_FOUR.length); ! throw new ParserException (FOUR_OH_FOUR[message], uhe); ! } ! catch (IOException ioe) { ! throw new ParserException (ioe.getMessage (), ioe); } } ! finally { ! if ((null != getProxyHost ()) && (0 != getProxyPort ())) ! { ! sysprops = System.getProperties (); ! if (null != set) ! sysprops.put ("proxySet", set); ! else ! sysprops.remove ("proxySet"); ! if (null != host) ! sysprops.put ("proxyHost", host); ! else ! sysprops.remove ("proxyHost"); ! if (null != port) ! sysprops.put ("proxyPort", port); ! else ! sysprops.remove ("proxyPort"); ! if (null != host2) ! sysprops.put ("http.proxyHost", host2); ! else ! sysprops.remove ("http.proxyHost"); ! if (null != port2) ! sysprops.put ("http.proxyPort", port2); ! else ! sysprops.remove ("http.proxyPort"); ! System.setProperties (sysprops); ! } } } ! catch (IOException ioe) ! { ! String msg = "Error in opening a connection to " ! + url.toExternalForm (); ! ParserException ex = new ParserException (msg, ioe); ! throw ex; ! } return (ret); --- 562,712 ---- String auth; String encoded; + int code; + String uri; URLConnection ret; ! repeated = 0; ! do { + repeat = false; try { ! try { ! // set up for proxy ! if ((null != getProxyHost ()) && (0 != getProxyPort ())) ! { ! sysprops = System.getProperties (); ! set = (String)sysprops.put ("proxySet", "true"); ! 
host = (String)sysprops.put ("proxyHost", getProxyHost ()); ! port = (String)sysprops.put ("proxyPort", ! Integer.toString (getProxyPort ())); ! // see http://java.sun.com/j2se/1.4.2/docs/guide/net/properties.html ! host2 = (String)sysprops.put ("http.proxyHost", ! getProxyHost ()); ! port2 = (String)sysprops.put ("http.proxyPort", ! Integer.toString (getProxyPort ())); ! System.setProperties (sysprops); ! ! } ! ! // open the connection... but don't connect yet ! ret = url.openConnection (); ! if (ret instanceof HttpURLConnection) ! { ! http = (HttpURLConnection)ret; ! ! if (getRedirectionProcessingEnabled ()) ! http.setInstanceFollowRedirects (false); ! ! // set the fixed request properties ! properties = getRequestProperties (); ! if (null != properties) ! for (enumeration = properties.keys (); ! enumeration.hasMoreElements ();) ! { ! key = (String)enumeration.nextElement (); ! value = (String)properties.get (key); ! ret.setRequestProperty (key, value); ! } ! ! // set the proxy name and password ! if ((null != getProxyUser ()) ! && (null != getProxyPassword ())) { ! auth = getProxyUser () + ":" + getProxyPassword (); ! encoded = encode (auth.getBytes("ISO-8859-1")); ! ret.setRequestProperty ("Proxy-Authorization", encoded); } ! // set the URL name and password ! if ((null != getUser ()) && (null != getPassword ())) ! { ! auth = getUser () + ":" + getPassword (); ! encoded = encode (auth.getBytes("ISO-8859-1")); ! ret.setRequestProperty ("Authorization", ! "Basic " + encoded); ! } ! ! if (getCookieProcessingEnabled ()) ! // set the cookies based on the url ! addCookies (ret); ! ! if (null != getMonitor ()) ! getMonitor ().preConnect (http); } ! else ! http = null; ! ! try { ! ret.connect (); ! if (null != http) ! { ! if (null != getMonitor ()) ! getMonitor ().postConnect (http); ! if (getCookieProcessingEnabled ()) ! parseCookies (ret); ! ! code = http.getResponseCode (); ! if ((3 == (code / 100)) && (repeated < 20)) ! if (null != (uri = getLocation (http))) ! { ! 
url = new URL (uri); ! repeat = true; ! repeated++; ! } ! } ! } ! catch (UnknownHostException uhe) { ! int message = (int)(Math.random () * FOUR_OH_FOUR.length); ! throw new ParserException (FOUR_OH_FOUR[message], uhe); ! } ! catch (IOException ioe) ! { ! throw new ParserException (ioe.getMessage (), ioe); } } ! finally { ! if ((null != getProxyHost ()) && (0 != getProxyPort ())) ! { ! sysprops = System.getProperties (); ! if (null != set) ! sysprops.put ("proxySet", set); ! else ! sysprops.remove ("proxySet"); ! if (null != host) ! sysprops.put ("proxyHost", host); ! else ! sysprops.remove ("proxyHost"); ! if (null != port) ! sysprops.put ("proxyPort", port); ! else ! sysprops.remove ("proxyPort"); ! if (null != host2) ! sysprops.put ("http.proxyHost", host2); ! else ! sysprops.remove ("http.proxyHost"); ! if (null != port2) ! sysprops.put ("http.proxyPort", port2); ! else ! sysprops.remove ("http.proxyPort"); ! System.setProperties (sysprops); ! } } } ! catch (IOException ioe) { ! String msg = "Error in opening a connection to " ! + url.toExternalForm (); ! ParserException ex = new ParserException (msg, ioe); ! throw ex; } } ! while (repeat); return (ret); |