[Htmlparser-cvs] htmlparser/docs/docs RSSFeeds.html,NONE,1.1 SamplePrograms.html,1.7,1.8 UsingCookiesWithParser.html,1.5,1.6
Brought to you by:
derrickoswald
From: <der...@us...> - 2004-02-29 17:06:56
|
Update of /cvsroot/htmlparser/htmlparser/docs/docs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3933/docs/docs Modified Files: SamplePrograms.html UsingCookiesWithParser.html Added Files: RSSFeeds.html Log Message: Update version to 1.4-20040229 --- NEW FILE: RSSFeeds.html --- <html><head><title>RSSFeeds</title></head><body> <div class="wikitext"> <p><b>Parsing RSS Feeds <p>Out of the box, the parser only understands XML tags that have the same name as HTML tags. So this example: <pre> import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.TitleTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /* * RSS (RDF Site Summary - formerly called Rich Site Summary) is a method of * describing news or other Web content that is available for "feeding" * (distribution or syndication) from an online publisher to Web users. * RSS is an application of the Extensible Markup Language (XML) that adheres * to the World Wide Web Consortium's Resource Description Framework (RDF). * Originally developed by Netscape for its browser's Netcenter channels, * the RSS specification is now available for anyone to use. 
*/ public class ResourceDescriptionFrameworkSiteSummary { public static void main (String[] args) throws ParserException { Parser parser; NodeList list; parser = new Parser ("http://sourceforge.net/export/rss2_sftopstats.php?feed=mostactive_weekly"); list = parser.extractAllNodesThatMatch (new NodeClassFilter (TitleTag.class)); for (NodeIterator iterator = list.elements (); iterator.hasMoreNodes (); ) System.out.println (iterator.nextNode ().toPlainTextString ()); } } <p>Will only find the TITLE tags, which may be what we want: <pre> Rank 1: Gaim (100% activity) Rank 2: Azureus - BitTorrent Client (99.9934% activity) Rank 3: eGroupWare: Enterprise Collaboration (99.9867% activity) Rank 4: WinMerge (99.9801% activity) Rank 5: phpMyAdmin (99.9735% activity) Rank 6: guliverkli (99.9668% activity) Rank 7: phpGedView (99.9602% activity) Rank 8: AMSN (99.9536% activity) Rank 9: dotproject (99.9469% activity) Rank 10: ScummVM (99.9403% activity) <p>However, with some custom tags defined, it can handle the hierarchy of the XML: <pre> import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.CompositeTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.TitleTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /* * RSS (RDF Site Summary - formerly called Rich Site Summary) is a method of * describing news or other Web content that is available for "feeding" * (distribution or syndication) from an online publisher to Web users. * RSS is an application of the Extensible Markup Language (XML) that adheres * to the World Wide Web Consortium's Resource Description Framework (RDF). * Originally developed by Netscape for its browser's Netcenter channels, * the RSS specification is now available for anyone to use. 
*/ class Item extends CompositeTag { public String[] getIds () { return (new String[] { "ITEM" }); } } class Title extends CompositeTag { public String[] getIds () { return (new String[] { "TITLE" }); } } class Description extends CompositeTag { public String[] getIds () { return (new String[] { "DESCRIPTION" }); } } class Link extends CompositeTag { public String[] getIds () { return (new String[] { "LINK" }); } } class Guid extends CompositeTag { public String[] getIds () { return (new String[] { "GUID" }); } } class PubDate extends CompositeTag { public String[] getIds () { return (new String[] { "PUBDATE" }); } } public class ResourceDescriptionFrameworkSiteSummary { public static void main (String[] args) throws ParserException { Parser parser; PrototypicalNodeFactory factory; NodeList list; Item item; NodeList kids; Node node; Tag tag; String name; parser = new Parser ("http://sourceforge.net/export/rss2_projsummary.php?group_id=24399"); factory = new PrototypicalNodeFactory (true); // empty factory.registerTag (new Item ()); factory.registerTag (new Title ()); factory.registerTag (new Description ()); factory.registerTag (new Link ()); factory.registerTag (new Guid ()); factory.registerTag (new PubDate ()); parser.setNodeFactory (factory); list = parser.extractAllNodesThatMatch (new NodeClassFilter (Item.class)); for (NodeIterator iterator = list.elements (); iterator.hasMoreNodes (); ) { item = (Item)iterator.nextNode (); kids = item.getChildren (); if (null != kids) for (int i = 0; i < kids.size (); i++) { node = kids.elementAt (i); if (node instanceof Tag) { tag = (Tag)node; name = tag.getTagName (); if (name.equals ("TITLE") || name.equals ("DESCRIPTION")) System.out.println (tag.toPlainTextString ()); } } } } } <p>This isn't as pretty as it could be, but you get the idea: <pre> Project name: HTML Parser Project description: HTML Parser is a library, written in Java, which allows you to parse HTML (HTML 4.0 supported). 
It has been used by people on live projects. Developers appreciate how easy it is to use. The architecture is flexible, allowing you to extend it easily. Developers on project: 16 Project administrators: &#60;a href=&#34;http://sourceforge.net/users/derrickoswald/&#34;&#62;derrickoswald&#60;/a&#62;, &#60;a href=&#34;http://sourceforge.net/users/somik/&#34;&#62;somik&#60;/a&#62; Activity percentile (last week): 98.3413% Most recent daily statistics (24 Jan 2004): Ranking: 251, Activity percentile: 98.34%, Downloadable files: 25615 total downloads to date Most recent daily statistics (24 Jan 2004): Download count: 19 Mailing lists (public): 4 Public mailing lists: htmlparser-developer, htmlparser-announce, htmlparser-user, htmlparser-cvs Discussion forums (public): 2, containing 110 messages Public discussion forums: Open Discussion, Help, htmlparser-user, htmlparser-developer Tracker: Bugs (1 open/158 total) Tracker description: Bug Tracking System Tracker: Support Requests (1 open/20 total) Tracker description: Tech Support Tracking System Tracker: Patches (0 open/0 total) Tracker description: Patch Tracking System Tracker: Feature Requests (2 open/10 total) Tracker description: Feature Request Tracking System CVS (8169 commits/809 adds) Most recent daily statistics (24 Jan 2004): Commit count: 0; Add count: 0 &#60;br&#62;&#60;a href=&#34;http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/htmlparser/&#34;&#62;[Web-based access to repository]&#60;/a&#62; <div id="actionbar" class="toolbar"> <hr class="printer" noshade="noshade" /> <p class="editdate">Last edited on Tuesday, January 27, 2004 6:04:21 pm. 
<hr class="toolbar" noshade="noshade" /> </body></html> Index: SamplePrograms.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/SamplePrograms.html,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** SamplePrograms.html 26 Jan 2004 01:02:09 -0000 1.7 --- SamplePrograms.html 29 Feb 2004 16:48:43 -0000 1.8 *************** *** 22,25 **** --- 22,27 ---- <li><a href="JavaBeans.html" class="wiki">JavaBeans</a> + <li><a href="RSSFeeds.html" class="named-wiki" title="RSSFeeds">Parsing RSS Feeds</a> + <li><a href="WebCrawler.html" class="wiki">WebCrawler</a> - ignore this, it's old *************** *** 33,37 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, January 7, 2004 6:12:30 pm. <hr class="toolbar" noshade="noshade" /> --- 35,39 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Tuesday, January 27, 2004 5:25:45 pm. <hr class="toolbar" noshade="noshade" /> Index: UsingCookiesWithParser.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/UsingCookiesWithParser.html,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** UsingCookiesWithParser.html 26 Oct 2003 19:46:17 -0000 1.5 --- UsingCookiesWithParser.html 29 Feb 2004 16:48:43 -0000 1.6 *************** *** 6,10 **** <p><b>Using Cookies with the Parser ! <p><b>Problem: (by <span class="wikiunknown"><u>ShanSivakolundhu) <br /> In order to access a particular site I neet to have --- 6,10 ---- <p><b>Using Cookies with the Parser ! <p><b>Problem: (by ShanSivakolundhu) <br /> In order to access a particular site I neet to have *************** *** 16,20 **** URLConnection.connect(); ! <p><b>Solution: (by <span class="wikiunknown"><u>BobLewis) <br /> In order to send cookies in your Http requests, all --- 16,20 ---- URLConnection.connect(); ! 
<p><b>Solution: (by BobLewis) <br /> In order to send cookies in your Http requests, all *************** *** 22,92 **** URL Connection. ! <p>Generally what I've done is first create a ! HttpURLConnection, create some Cookie objects that are ! needed, and set the HTTP Header using those objects ! (See below for code to format the header value). ! ! <p>Then I'll create the Parser using the URLConnection ! something like this: <pre> ! DefaultHTMLParserFeedback feedback = ! new DefaultHTMLParserFeedback(DefaultHTMLParserFeedback.DEBUG); ! ! HTMLReader reader = null; ! HTMLParser parser = null; ! String charset = HttpUtil.getCharacterSet(urlConn); ! ! InputStreamReader isr = ! new InputStreamReader(urlConn.getInputStream(), charset); ! reader = new HTMLReader(isr, 8192); ! parser = new HTMLParser(reader, feedback); ! <p>The <span class="wikiunknown"><u>HttpUtil.getCharacterSet method used above is ! basically just taken from the method of the same name ! in the HTMLParser class. That method is protected, so ! I had to duplicate it elsewhere. ! <pre> /** ! * set cookies to send in a HttpURLConnection<br> ! * This method should only be called before any ! * parameters are posted ! * and before the connection is made. ! * @param urlConn the HttpURLConnection to send ! * the cookies through ! * @param cookies the cookies to send */ ! public static void postCookies(HttpURLConnection urlConn, Cookie[] cookies) { ! if ((cookies == null) || (cookies.length ==0)) { ! return; ! } ! String[] cookieHeaders = new String[cookies.length]; ! urlConn.setRequestProperty("cookie",generateCookieHeader(cookies)); ! } /** ! * generate a HTTP cookie header value string ! * from an array of cookies ! * @param cookies the cookies which should be set ! * in the header value ! * @return A string containing the HTTP Cookie ! * Header value */ ! private static String generateCookieHeader(Cookie[] cookies) { ! StringBuffer buf = new StringBuffer(); ! for (int i=0; i < cookies.length;i++) { ! 
buf.append(cookies[i].getName()); ! buf.append("="); ! buf.append(cookies[i].getValue()); ! if (i+1 != cookies.length) { ! buf.append("; "); } - else buf.append(" "); } ! return buf.toString(); } --- 22,138 ---- URL Connection. ! <p>Create the URL and open the connection, but before passing ! the connection to the parser, set the "Cookie" request property: <pre> ! import java.net.URL; ! import java.net.URLConnection; ! import javax.servlet.http.Cookie; ! import org.htmlparser.Parser; ! import org.htmlparser.util.NodeIterator; ! /** ! * Demonstrate cookie usage with the HTML Parser. ! */ ! public class CookieDemo ! { /** ! * The cookies. ! * You'll need to get these from your browser's cookie jar or somewhere. ! * Only the cookies that apply to the URL you are using and haven't expired ! * are supposed to be passed in the request. ! * This is only part of a real cookie, much longer than shown. */ ! public static Cookie[] cookies = ! { ! new Cookie ("user", "%2536%2535%2538%2531%2539%2530%253a etc."), ! }; /** ! * Generate a HTTP cookie header value string from an array of cookies. ! * <pre> ! * The syntax for the header is: ! * ! * cookie = "Cookie:" cookie-version ! * 1*((";" | ",") cookie-value) ! * cookie-value = NAME "=" VALUE [";" path] [";" domain] ! * cookie-version = "$Version" "=" value ! * NAME = attr ! * VALUE = value ! * path = "$Path" "=" value ! * domain = "$Domain" "=" value ! * ! * </pre> ! * @param cookies The cookies which should be set in the header value. ! * @return A string containing the HTTP Cookie Header value. ! * @see <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> */ ! public static String generateCookieHeader (Cookie[] cookies) ! { ! int version; ! boolean quote; ! StringBuffer ret; ! ret = new StringBuffer (); ! ! version = 0; ! for (int i = 0; i < cookies.length; i++) ! version = Math.max (version, cookies[i].getVersion ()); ! if (0 != version) ! { ! ret.append ("$Version=\""); ! ret.append (version); ! ret.append ("\""); ! 
} ! for (int i = 0; i < cookies.length; i++) ! { ! if (0 != ret.length ()) ! ret.append ("; "); ! ret.append (cookies[i].getName ()); ! ret.append ("="); ! if (0 != version) ! ret.append ("\""); ! ret.append (cookies[i].getValue ()); ! if (0 != version) ! ret.append ("\""); ! if (0 != version) ! { ! if ((null != cookies[i].getPath ()) ! && (0 != cookies[i].getPath ().length ())) ! { ! ret.append ("; $Path=\""); ! ret.append (cookies[i].getPath ()); ! ret.append ("\""); ! } ! if ((null != cookies[i].getDomain ()) ! && (0 != cookies[i].getDomain ().length ())) ! { ! ret.append ("; $Domain=\""); ! ret.append (cookies[i].getDomain ()); ! ret.append ("\""); ! } } } ! ! return (ret.toString ()); } + public static void main (String[] args) throws Exception + { + Parser parser; + URL url; + URLConnection connection; + + parser = new Parser (); + url = new URL ("http://slashdot.org"); + connection = url.openConnection (); + connection.setRequestProperty ("Cookie", generateCookieHeader (cookies)); + parser.setConnection (connection); + for (NodeIterator iterator = parser.elements (); iterator.hasMoreNodes (); ) + System.out.println (iterator.nextNode ()); + } + } + *************** *** 95,99 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, April 2, 2003 3:04:24 pm. <hr class="toolbar" noshade="noshade" /> --- 141,145 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Monday, January 26, 2004 7:26:47 pm. <hr class="toolbar" noshade="noshade" /> |