[Htmlparser-cvs] htmlparser/docs/docs RSSFeeds.html,NONE,1.1 SamplePrograms.html,1.7,1.8 UsingCookie

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/docs/docs
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3933/docs/docs

Modified Files:
	SamplePrograms.html UsingCookiesWithParser.html 
Added Files:
	RSSFeeds.html 
Log Message:
Update version to 1.4-20040229

--- NEW FILE: RSSFeeds.html ---
<html><head><title>RSSFeeds</title></head><body>

<div class="wikitext">
<p><b>Parsing RSS Feeds</b></p>

Out of the box, the parser only understands XML tags that have the same name as
HTML tags. So this example:

<pre>
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/*
 *  RSS (RDF Site Summary - formerly called Rich Site Summary) is a method of
 * describing news or other Web content that is available for "feeding"
 * (distribution or syndication) from an online publisher to Web users.
 * RSS is an application of the Extensible Markup Language (XML) that adheres
 * to the World Wide Web Consortium's Resource Description Framework (RDF).
 * Originally developed by Netscape for its browser's Netcenter channels,
 * the RSS specification is now available for anyone to use.
 */
public class ResourceDescriptionFrameworkSiteSummary
{
    /**
     * Parse the SourceForge "most active" RSS resource and print the plain
     * text of every node matched by the TitleTag class filter, one per line.
     * @param args Command line arguments (not used).
     * @exception ParserException Propagated from the parser and iterator calls.
     */
    public static void main (String[] args) throws ParserException
    {
        // no custom tags are registered here, so nodes are selected
        // with a tag class the parser already provides (TitleTag)
        Parser parser = new Parser ("http://sourceforge.net/export/rss2_sftopstats.php?feed=mostactive_weekly");
        NodeList titles = parser.extractAllNodesThatMatch (new NodeClassFilter (TitleTag.class));

        NodeIterator iterator = titles.elements ();
        while (iterator.hasMoreNodes ())
        {
            System.out.println (iterator.nextNode ().toPlainTextString ());
        }
    }
}

</pre>
<p>Will only find the TITLE tags, which may be what we want:

<pre>
Rank 1: Gaim (100% activity)
Rank 2: Azureus - BitTorrent Client (99.9934% activity)
Rank 3: eGroupWare: Enterprise Collaboration (99.9867% activity)
Rank 4: WinMerge (99.9801% activity)
Rank 5: phpMyAdmin (99.9735% activity)
Rank 6: guliverkli (99.9668% activity)
Rank 7: phpGedView (99.9602% activity)
Rank 8: AMSN (99.9536% activity)
Rank 9: dotproject (99.9469% activity)
Rank 10: ScummVM (99.9403% activity)

</pre>
<p>However, with some custom tags defined, it can handle the hierarchy of the XML:

<pre>
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/*
 *  RSS (RDF Site Summary - formerly called Rich Site Summary) is a method of
 * describing news or other Web content that is available for "feeding"
 * (distribution or syndication) from an online publisher to Web users.
 * RSS is an application of the Extensible Markup Language (XML) that adheres
 * to the World Wide Web Consortium's Resource Description Framework (RDF).
 * Originally developed by Netscape for its browser's Netcenter channels,
 * the RSS specification is now available for anyone to use.
 */
/** Composite tag whose only id is "ITEM". */
class Item extends CompositeTag
{
    public String[] getIds ()
    {
        String[] ids = { "ITEM" };
        return (ids);
    }
}

/** Composite tag whose only id is "TITLE". */
class Title extends CompositeTag
{
    public String[] getIds ()
    {
        String[] ids = { "TITLE" };
        return (ids);
    }
}

/** Composite tag whose only id is "DESCRIPTION". */
class Description extends CompositeTag
{
    public String[] getIds ()
    {
        String[] ids = { "DESCRIPTION" };
        return (ids);
    }
}

/** Composite tag whose only id is "LINK". */
class Link extends CompositeTag
{
    public String[] getIds ()
    {
        String[] ids = { "LINK" };
        return (ids);
    }
}

/** Composite tag whose only id is "GUID". */
class Guid extends CompositeTag
{
    public String[] getIds ()
    {
        String[] ids = { "GUID" };
        return (ids);
    }
}

/** Composite tag whose only id is "PUBDATE". */
class PubDate extends CompositeTag
{
    public String[] getIds ()
    {
        String[] ids = { "PUBDATE" };
        return (ids);
    }
}

public class ResourceDescriptionFrameworkSiteSummary
{
    public static void main (String[] args) throws ParserException
    {
        Parser parser;
        PrototypicalNodeFactory factory;
        NodeList list;
        Item item;
        NodeList kids;
        Node node;
        Tag tag;
        String name;

        parser = new Parser ("http://sourceforge.net/export/rss2_projsummary.php?group_id=24399");
        factory = new PrototypicalNodeFactory (true); // empty
        factory.registerTag (new Item ());
        factory.registerTag (new Title ());
        factory.registerTag (new Description ());
        factory.registerTag (new Link ());
        factory.registerTag (new Guid ());
        factory.registerTag (new PubDate ());
        parser.setNodeFactory (factory);
        list = parser.extractAllNodesThatMatch (new NodeClassFilter (Item.class));
        for (NodeIterator iterator = list.elements (); iterator.hasMoreNodes (); )
        {
            item = (Item)iterator.nextNode ();
            kids = item.getChildren ();
            if (null != kids)
                for (int i = 0; i &lt; kids.size (); i++)
                {
                    node = kids.elementAt (i);
                    if (node instanceof Tag)
                    {
                        tag = (Tag)node;
                        name = tag.getTagName ();
                        if (name.equals ("TITLE") || name.equals ("DESCRIPTION"))
                            System.out.println (tag.toPlainTextString ());
                    }
                }
        }
    }
}

</pre>
<p>This isn't as pretty as it could be, but you get the idea:

<pre>
Project name: HTML Parser
Project description: HTML Parser is a library, written in Java, which allows you to parse HTML (HTML 4.0 supported).
It has been used by people on live projects. Developers appreciate how easy it is to use. The architecture is flexible, allowing you to extend it easily.
Developers on project: 16
Project administrators: &amp;#60;a href=&amp;#34;http://sourceforge.net/users/derrickoswald/&amp;#34;&amp;#62;derrickoswald&amp;#60;/a&amp;#62;, &amp;#60;a href=&amp;#34;http://sourceforge.net/users/somik/&amp;#34;&amp;#62;somik&amp;#60;/a&amp;#62;
Activity percentile (last week): 98.3413%
Most recent daily statistics (24 Jan 2004): Ranking: 251, Activity percentile: 98.34%,
Downloadable files: 25615 total downloads to date
Most recent daily statistics (24 Jan 2004): Download count: 19
Mailing lists (public): 4
Public mailing lists: htmlparser-developer, htmlparser-announce, htmlparser-user, htmlparser-cvs
Discussion forums (public): 2, containing 110 messages
Public discussion forums: Open Discussion, Help, htmlparser-user, htmlparser-developer
Tracker: Bugs (1 open/158 total)
Tracker description: Bug Tracking System
Tracker: Support Requests (1 open/20 total)
Tracker description: Tech Support Tracking System
Tracker: Patches (0 open/0 total)
Tracker description: Patch Tracking System
Tracker: Feature Requests (2 open/10 total)
Tracker description: Feature Request Tracking System
CVS (8169 commits/809 adds)
Most recent daily statistics (24 Jan 2004): Commit count: 0; Add count: 0 &amp;#60;br&amp;#62;&amp;#60;a href=&amp;#34;http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/htmlparser/&amp;#34;&amp;#62;[Web-based access to repository]&amp;#60;/a&amp;#62;

</pre>
<div id="actionbar" class="toolbar">

<hr class="printer" noshade="noshade" />

<p class="editdate">Last edited on Tuesday, January 27, 2004  6:04:21 pm.

<hr class="toolbar" noshade="noshade" />
</body></html>
Index: SamplePrograms.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/SamplePrograms.html,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** SamplePrograms.html	26 Jan 2004 01:02:09 -0000	1.7
--- SamplePrograms.html	29 Feb 2004 16:48:43 -0000	1.8
***************
*** 22,25 ****
--- 22,27 ----
  <li><a href="JavaBeans.html" class="wiki">JavaBeans</a>

+ <li><a href="RSSFeeds.html" class="named-wiki" title="RSSFeeds">Parsing RSS Feeds</a>
+ 
  <li><a href="WebCrawler.html" class="wiki">WebCrawler</a> - ignore this, it's old

***************
*** 33,37 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, January  7, 2004  6:12:30 pm.

  <hr class="toolbar" noshade="noshade" />
--- 35,39 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Tuesday, January 27, 2004  5:25:45 pm.

  <hr class="toolbar" noshade="noshade" />

Index: UsingCookiesWithParser.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/UsingCookiesWithParser.html,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** UsingCookiesWithParser.html	26 Oct 2003 19:46:17 -0000	1.5
--- UsingCookiesWithParser.html	29 Feb 2004 16:48:43 -0000	1.6
***************
*** 6,10 ****
  <p><b>Using Cookies with the Parser

! <p><b>Problem: (by <span class="wikiunknown"><u>ShanSivakolundhu)
  <br />
  In order to access a particular site I neet to have
--- 6,10 ----
  <p><b>Using Cookies with the Parser

! Problem: (by ShanSivakolundhu)
 
 In order to access a particular site I neet to have
***************
*** 16,20 ****
 URLConnection.connect();

! <p><b>Solution: (by <span class="wikiunknown"><u>BobLewis)
  <br />
  In order to send cookies in your Http requests, all
--- 16,20 ----
  URLConnection.connect();

! Solution: (by BobLewis)
 
 In order to send cookies in your Http requests, all
***************
*** 22,92 ****
 URL Connection.

! Generally what I've done is first create a
! HttpURLConnection, create some Cookie objects that are
! needed, and set the HTTP Header using those objects
! (See below for code to format the header value).
! 
! Then I'll create the Parser using the URLConnection
! something like this:

  <pre>
!         DefaultHTMLParserFeedback feedback =
!            new DefaultHTMLParserFeedback(DefaultHTMLParserFeedback.DEBUG);
! 
!         HTMLReader reader = null;
!         HTMLParser parser = null;
!         String charset = HttpUtil.getCharacterSet(urlConn);
! 
!         InputStreamReader isr =
!            new InputStreamReader(urlConn.getInputStream(), charset);
!         reader = new HTMLReader(isr, 8192);
!         parser = new HTMLParser(reader, feedback);

! The HttpUtil.getCharacterSet method used above is
! basically just taken from the method of the same name
! in the HTMLParser class. That method is protected, so
! I had to duplicate it elsewhere.

! <pre>
      /**
!      * set cookies to send in a HttpURLConnection&lt;br&gt;
!      * This method should only be called before any
!      * parameters are posted
!      * and before the connection is made.
!      * @param urlConn the HttpURLConnection to send
!      * the cookies through
!      * @param cookies the cookies to send
       */
!     public static void postCookies(HttpURLConnection urlConn, Cookie[] cookies) {
!         if ((cookies == null) || (cookies.length ==0)) {
!             return;
!         }
!         String[] cookieHeaders = new String[cookies.length];
!         urlConn.setRequestProperty("cookie",generateCookieHeader(cookies));
!     }

      /**
!      * generate a HTTP cookie header value string
!      * from an array of cookies
!      * @param cookies the cookies which should be set
!      * in the header value
!      * @return A string containing the HTTP Cookie
!      * Header value
       */
!     private static String generateCookieHeader(Cookie[] cookies) {
!         StringBuffer buf = new StringBuffer();

!         for (int i=0; i &lt; cookies.length;i++) {
!             buf.append(cookies[i].getName());
!             buf.append("=");
!             buf.append(cookies[i].getValue());
!             if (i+1 != cookies.length) {
!                 buf.append("; ");
              }
-             else buf.append(" ");
          }
!         return buf.toString();
      }

--- 22,138 ----
  URL Connection.

! Create the URL and open the connection, but before passing
! the connection to the parser, set the "Cookie" request property:

<pre>
! import java.net.URL;
! import java.net.URLConnection;
! import javax.servlet.http.Cookie;

! import org.htmlparser.Parser;
! import org.htmlparser.util.NodeIterator;

! /**
!  * Demonstrate cookie usage with the HTML Parser.
!  */
! public class CookieDemo
! {
      /**
!      * The cookies.
!      * You'll need to get these from your browser's cookie jar or somewhere.
!      * Only the cookies that apply to the URL you are using and haven't expired
!      * are supposed to be passed in the request.
!      * This is only part of a real cookie, much longer than shown.
       */
!     public static Cookie[] cookies =
!     {
!         new Cookie ("user", "%2536%2535%2538%2531%2539%2530%253a etc."),
!     };

      /**
!      * Generate a HTTP cookie header value string from an array of cookies.
!      * &lt;pre&gt;
!      *   The syntax for the header is:
!      *
!      *    cookie          =       "Cookie:" cookie-version
!      *                            1*((";" | ",") cookie-value)
!      *    cookie-value    =       NAME "=" VALUE [";" path] [";" domain]
!      *    cookie-version  =       "$Version" "=" value
!      *    NAME            =       attr
!      *    VALUE           =       value
!      *    path            =       "$Path" "=" value
!      *    domain          =       "$Domain" "=" value
!      *
!      * &lt;/pre&gt;
!      * @param cookies The cookies which should be set in the header value.
!      * @return A string containing the HTTP Cookie Header value.
!      * @see &lt;a href="http://www.ietf.org/rfc/rfc2109.txt"&gt;RFC 2109&lt;/a&gt;
       */
!     public static String generateCookieHeader (Cookie[] cookies)
!     {
!         int version;
!         boolean quote;
!         StringBuffer ret;

!         ret = new StringBuffer ();
! 
!         version = 0;
!         for (int i = 0; i &lt; cookies.length; i++)
!             version = Math.max (version, cookies[i].getVersion ());
!         if (0 != version)
!         {
!             ret.append ("$Version=\"");
!             ret.append (version);
!             ret.append ("\"");
!         }
!         for (int i = 0; i &lt; cookies.length; i++)
!         {
!             if (0 != ret.length ())
!                 ret.append ("; ");
!             ret.append (cookies[i].getName ());
!             ret.append ("=");
!             if (0 != version)
!                 ret.append ("\"");
!             ret.append (cookies[i].getValue ());
!             if (0 != version)
!                 ret.append ("\"");
!             if (0 != version)
!             {
!                 if ((null != cookies[i].getPath ())
!                     &amp;&amp; (0 != cookies[i].getPath ().length ()))
!                 {
!                     ret.append ("; $Path=\"");
!                     ret.append (cookies[i].getPath ());
!                     ret.append ("\"");
!                 }
!                 if ((null != cookies[i].getDomain ())
!                     &amp;&amp; (0 != cookies[i].getDomain ().length ()))
!                 {
!                     ret.append ("; $Domain=\"");
!                     ret.append (cookies[i].getDomain ());
!                     ret.append ("\"");
!                 }
              }
          }
! 
!         return (ret.toString ());
      }

+     public static void main (String[] args) throws Exception
+     {
+         Parser parser;
+         URL url;
+         URLConnection connection;
+ 
+         parser = new Parser ();
+         url = new URL ("http://slashdot.org");
+         connection = url.openConnection ();
+         connection.setRequestProperty ("Cookie", generateCookieHeader (cookies));
+         parser.setConnection (connection);
+         for (NodeIterator iterator = parser.elements (); iterator.hasMoreNodes (); )
+             System.out.println (iterator.nextNode ());
+     }
+ }
+ 

***************
*** 95,99 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, April  2, 2003  3:04:24 pm.

  <hr class="toolbar" noshade="noshade" />
--- 141,145 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Monday, January 26, 2004  7:26:47 pm.

  <hr class="toolbar" noshade="noshade" />

[Htmlparser-cvs] htmlparser/docs/docs RSSFeeds.html,NONE,1.1 SamplePrograms.html,1.7,1.8 UsingCookie

[Htmlparser-cvs] htmlparser/docs/docs RSSFeeds.html,NONE,1.1 SamplePrograms.html,1.7,1.8 UsingCookiesWithParser.html,1.5,1.6