[Htmlparser-user] reading from Yahoo

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Hi, I was away for a long time... Anyway, here is the program that I have written. I know very little programming, so please bear with me. Is there a better way to read from Yahoo? I am sure there is.
  One more thing: the program displays &#36; instead of the $ symbol. How do I overcome this? Any help would be appreciated.

  the program follows:

  import java.io.*;
  import java.net.*;
  import java.net.URL;

  import org.htmlparser.*;
  import org.htmlparser.util.*;
  import org.htmlparser.Parser;
  import org.htmlparser.lexer.Lexer;
  import org.htmlparser.tags.Span;
  import org.htmlparser.tags.FormTag;
  import org.htmlparser.tags.LinkTag;
  import org.htmlparser.tags.StyleTag;
  import org.htmlparser.tags.ScriptTag;
  import org.htmlparser.tags.ParagraphTag;
  import org.htmlparser.tags.CompositeTag;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;
  import org.htmlparser.nodes.TagNode;
  import org.htmlparser.nodes.TextNode;
  import org.htmlparser.filters.TagNameFilter;
  import org.htmlparser.filters.LinkStringFilter;

  /**
   * Extract the text of a news article linked from a Yahoo news page.
   * Illustrative program built on the HTML Parser library.
   *Step 1. Parse the page whose URL is given as the first command line argument.
   *Step 2. Collect the HTML tags in the page as nodes in a list.
   *Step 3. Keep only the SPAN tags in the list.
   *   Links are continuously updated at yahoo page, they are next to the SPAN tag with
   *   the 'recenttimedate' class attribute, so the parent of each such SPAN is remembered.
   *Step 4. Collect the links containing "http" found under those parents.
   *Step 5. Parse the first collected link and print the title and the paragraphs of
   *   the article. Character references (for example {@code &#36;} for the dollar
   *   sign) are decoded before printing.
   */
  public class StringExtract
  {
      /** Value of the class attribute on the SPAN tags that accompany the news links. */
      private static final String NEWS_SPAN_CLASS = "recenttimedate";

      /** Links containing this text lead to images, not to articles. */
      private static final String PHOTO_MARKER = "photos";

      /** Value of the class attribute on the tag at which article processing stops. */
      private static final String END_OF_ARTICLE_CLASS = "clearfix";

      /**
       * Program entry point.
       * @param args args[0] is the URL of the page to read the news links from.
       */
      public static void main(String args[]){
          if(args.length < 1)
          {
              System.err.println("Usage: java StringExtract <url>");
              return;
          }
          try{
              String sourceURL = args[0]; //sourceURL is the argument to read news from
  // Step 1. Parsing the input page.
              Parser parser = new Parser (sourceURL);
  // Step 2. Collecting Tags in a list.
              NodeList list = parser.parse (null);
  // Step 3. Keep only the SPAN tags in spanList.
              NodeList spanList = list.extractAllNodesThatMatch(new TagNameFilter ("SPAN"),true);

              // Remember the parent of every SPAN carrying the news class. The attribute
              // value is compared instead of the raw tag text so that quoting or
              // additional attributes on the tag do not defeat the match.
              NodeList li_tags = new NodeList();
              for(int i = 0; i < spanList.size(); i++)
              {
                  TagNode spanTag = (TagNode)spanList.elementAt(i);
                  Node parent = spanTag.getParent();
                  if(NEWS_SPAN_CLASS.equals(spanTag.getAttribute("class")) && parent != null)
                  {
                      li_tags.add(parent);
                  }
              }

  // Step 4. Extract the links found next to each matching span tag.
              //  There are http links and also other links, a_tags will contain only http links
              NodeList a_tags = new NodeList();
              LinkStringFilter linkTagFilter = new LinkStringFilter("http");
              for(NodeIterator e = li_tags.elements(); e.hasMoreNodes();)
              {
                  e.nextNode().collectInto(a_tags, linkTagFilter);
              }

              if(a_tags.size() == 0)
              {
                  System.err.println("No news links found at " + sourceURL);
                  return;
              }

              // Only the first collected link is processed.
              LinkTag linkAtag = (LinkTag)a_tags.elementAt(0);
              String interestingLink = linkAtag.extractLink();

              // In Yahoo, there are few http links which lead to images, we dont need them.
              boolean isPhotoLink = interestingLink.indexOf(PHOTO_MARKER) >= 0;

  // Step 5. Parse the link that was collected.
              if(linkAtag.isHTTPLink() && !linkAtag.getLinkText().equals("") && !isPhotoLink)
              {
                  printArticle(interestingLink);
              }
          }
          catch (Exception ex)
          {
              System.out.println("Printing Exceptional Error");
              ex.printStackTrace();
          }
      }

      /**
       * Parse one article page and print its title and paragraphs to standard output.
       * @param url the address of the article page.
       * @throws ParserException if the page cannot be fetched or parsed.
       */
      private static void printArticle(String url) throws ParserException
      {
          NodeList nodesLink = new Parser(url).parse(null);

          // Flatten the document: the children of every node are appended to the end
          // of the same list, so the loop also visits the nodes it has just added.
          // The children of nodes whose text is exactly "div" are not expanded.
          for(int n = 0; n < nodesLink.size(); n++)
          {
              Node node = nodesLink.elementAt(n);
              if((node.getChildren() != null) && (!node.getText().equals("div")))
                  nodesLink.add(node.getChildren());
          }

  // One link is one HTML document. nodesLink is the list of all nodes under one document.
          for(int k = 0; k < nodesLink.size(); k++)
          {
              Node cNode = nodesLink.elementAt(k);
              // Neighbours in the flattened list; null at either end of the list.
              Node prevNode = (k > 0) ? nodesLink.elementAt(k-1) : null;
              Node nextNode = (k+1 < nodesLink.size()) ? nodesLink.elementAt(k+1) : null;

              if(cNode instanceof TagNode)
              {
                  TagNode tag = (TagNode)cNode;
                  //Printing the title of the news
                  if(tag.getText().equals("title"))
                  {
                      printDecoded(tag.toPlainTextString());
                      System.out.println();
                  }
                  // A DIV with the end-of-article class stops the processing.
                  if(tag.getText().startsWith("div")
                      && END_OF_ARTICLE_CLASS.equals(tag.getAttribute("class")))
                  {
                      break;
                  }
                  continue;
              }

              // Only text nodes that have a predecessor can be classified below.
              if(!(cNode instanceof TextNode) || prevNode == null)
              {
                  continue;
              }

              String prevText = prevNode.getText();
  // This 'if block' prints each paragraph of the news
              if(prevText.equals("p"))
              {
                  if(nextNode == null || !nextNode.getText().startsWith("span"))
                  {
                      printDecoded(cNode.toHtml().trim());
                  }
                  else if(nextNode instanceof TagNode)
                  {
                      String nextClass = ((TagNode)nextNode).getAttribute("class");
                      if(nextClass != null)
                      {
                          if(!nextClass.equals(END_OF_ARTICLE_CLASS))
                          {
                              printDecoded(cNode.toHtml().trim());
                          }
                          else
                          {
                              // The paragraph is followed by the end-of-article marker.
                              break;
                          }
                      }
                  }
              }
  // This 'else if' block prints the first paragraph of the news (Because the first paragraph
  // is at a different place in the document).
              else if(prevText.equals("p/"))
              {
                  printDecoded(cNode.toHtml().trim());
              }
  // There are some words in the document where Yahoo provides search facility(for example, a person,
  // a country etc.) and it is in the form of link. This block extracts text from those links.
              else if(prevText.startsWith("span"))
              {
                  NodeList spanChildren = prevNode.getChildren();
                  if(spanChildren == null)
                  {
                      continue;
                  }
                  for(NodeIterator x = spanChildren.elements(); x.hasMoreNodes();)
                  {
                      Node aNode = x.nextNode();
                      if((aNode instanceof TagNode) && (((TagNode)aNode).getAttribute("href") != null))
                      {
                          printDecoded(aNode.toPlainTextString()+" "+cNode.toHtml().trim());
                      }
                  }
              }
          }
      }

      /**
       * Print one line of page text with its character references decoded,
       * so that {@code &#36;} is shown as the dollar sign.
       * @param text the text as it appears in the HTML source.
       */
      private static void printDecoded(String text)
      {
          System.out.println(Translate.decode(text));
      }
  }

---------------------------------
 Yahoo! Mail - Helps protect you from nasty viruses.