[Htmlparser-user] reading from Yahoo
Brought to you by:
derrickoswald
From: Riaz u. <ru...@ya...> - 2006-02-03 02:04:26
|
Hi, I was away for a long time...anyways here is the program that I had written. I know very little programming so dont get bored. Is there a better way to read from yahoo. I am sure there is: one more thing is that the program displays $ instead of $ symbol how do I overcome this. Please help anyone. the program follows: import java.io.*; import java.net.*; import java.net.URL; import org.htmlparser.*; import org.htmlparser.util.*; import org.htmlparser.Parser; import org.htmlparser.lexer.Lexer; import org.htmlparser.tags.Span; import org.htmlparser.tags.FormTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.StyleTag; import org.htmlparser.tags.ScriptTag; import org.htmlparser.tags.ParagraphTag; import org.htmlparser.tags.CompositeTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.nodes.TagNode; import org.htmlparser.nodes.TextNode; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.filters.LinkStringFilter; /** * Extract plaintext strings from a web page. * Illustrative program to gather the textual contents of a web page. * Uses a {@link org.htmlparser.beans.StringBean StringBean} to accumulate * the user visible text (what a browser would display) into a single string. *Step 1. Parse the page *Step 2. Collect the HTML tags in the page as nodes in a list. *Step 3. Keep Only the SPAN tags in the list. * Links are continuously updated at yahoo page, they are in between the SPAN tag with * 'recenttimedate' attribute. *Step 4. */ public class StringExtract { public static void main(String args[]){ try{ int i=0,j=0,k=0; boolean endOfnewsinthisPage = false; String sourceURL = args[0]; //sourceURL is the argument to read news from // Step 1. Parsing the input page. Parser parser = new Parser (sourceURL);//parser will hold the tree of the url NodeList li_tags = new NodeList(); // Step 2. Collecting Tags in a list. NodeList list = parser.parse (null); //news links are at the span tag (time), spanList stores the span tags // Step 3. Keep only the SPAN tags in spanList. NodeList spanList = list.extractAllNodesThatMatch(new TagNameFilter ("SPAN"),true); // Step 4. Extract link from each span tag. while(i < spanList.size()) { Span spanTag = (Span)spanList.elementAt(i); // System.out.println(spanTag.getText()); // We only need SPAN tags with attribute "class = 'recenttimedate'" // Move to the link in the span tag if(spanTag.getText().equals("span class=recenttimedate")) { li_tags.add(spanList.elementAt(i).getParent()); } i++; } i=0; NodeList a_tags = new NodeList(); NodeFilter filter = new TagNameFilter ("P"); LinkTag validLink = new LinkTag(); CompositeTag comptag = new CompositeTag(); String linkTag = "http"; LinkStringFilter linkTagFilter= new LinkStringFilter(linkTag); // There are http links and also other links, a_tags will contain only http links for( NodeIterator e = li_tags.elements(); e.hasMoreNodes();) { e.nextNode().collectInto(a_tags, linkTagFilter); } // BufferedWriter out = new BufferedWriter(new FileWriter("output.txt", true)); // while( i < a_tags.size()) // { LinkTag linkAtag = (LinkTag)a_tags.elementAt(0); //Extract link from each a_tags element String interestingLink = linkAtag.extractLink(); boolean exists = false; j=0; // In Yahoo, there are few http links which lead to images, we dont need them, the following loop // filters out those links. while(j < interestingLink.length() && (!exists)) { exists = interestingLink.substring(j).startsWith("photos"); j++; } // Step 5. Parse Each link that was collected. if((linkAtag.isHTTPLink()) && (!linkAtag.getLinkText().equals("")) && (!exists)) { Parser parseIndividualURLs = new Parser(interestingLink); NodeList nodesLink = parseIndividualURLs.parse(null); String newString = new String(); TextNode textNode = new TextNode(newString); for(NodeIterator x = nodesLink.elements(); x.hasMoreNodes();) { Node cNode = x.nextNode(); if((cNode.getChildren() != null) && (!cNode.getText().equals("div"))) nodesLink.add(cNode.getChildren()); } // One link is one HTML document. nodesLink is the list of all nodes under one document. for(k = 0; k < nodesLink.size(); k++) { Node cNode = nodesLink.elementAt(k); Node prevNode = null; Node nextNode = null; TagNode nextTagNode = null; TagNode prevTagNode = null; TagNode dNode = null; // if(cNode instanceof LinkTag) // { // LinkTag lnkTag = (LinkTag)cNode; // System.out.println(lnkTag.getLinkText()); // } if(!((k-1) < 0)) { prevNode = nodesLink.elementAt(k-1); if(prevNode instanceof TagNode) prevTagNode = (TagNode)nodesLink.elementAt(k-1); } if(!((k+1) > nodesLink.size())) { nextNode = nodesLink.elementAt(k+1); if(nextNode instanceof TagNode) nextTagNode = (TagNode)nodesLink.elementAt(k+1); } TagNode tNode = (TagNode)cNode.getParent(); NodeList newList = new NodeList(); //Printing the title of the news if(cNode.getText().equals("title")) { // out.write(cNode.toPlainTextString()); System.out.println(cNode.toPlainTextString()); System.out.println(); } if(cNode.getText().startsWith("div")) { dNode = (TagNode)cNode; if(dNode.getAttribute("class") != null) if(dNode.getAttribute("class").equals("clearfix")) k = nodesLink.size() + 1; } if(cNode instanceof TextNode) { // This 'if block' prints each paragraph of the news if(prevNode.getText().equals("p")) { if(!(nextNode.getText().startsWith("span"))) { // out.write(cNode.toHtml().trim()); System.out.println(cNode.toHtml().trim()); // here } else if(nextNode instanceof TagNode) { if(nextTagNode.getAttribute("class") != null) if(!(nextTagNode.getAttribute("class").equals("clearfix"))) { // out.write(cNode.toHtml().trim()); System.out.println(cNode.toHtml().trim()); } else k = nodesLink.size(); } } // This 'else if' block prints the first paragraph of the news (Because the first paragraph //at a different place in the document. else if(prevNode.getText().equals("p/")) { // out.write(cNode.toHtml().trim()); System.out.println(cNode.toHtml().trim()); } // There are some words in the document where Yahoo provides search facility(for example, a person, // a country etc.) and it is in the form of link. This block extracts text from those links. else if(prevNode.getText().startsWith("span")) { newList.add(prevNode.getChildren()); for(NodeIterator x=newList.elements();x.hasMoreNodes();) { Node aNode = x.nextNode(); if(aNode instanceof TagNode) { prevTagNode = (TagNode)aNode; if(prevTagNode.getAttribute("href") != null) { // out.write(aNode.toPlainTextString()+" "+cNode.toHtml().trim()); System.out.println(aNode.toPlainTextString()+" "+cNode.toHtml().trim()); } } } } } } // System.out.println("Link:"+linkAtag.extractLink()+":Text:" + linkAtag.getLinkText()); // System.out.println(); // System.out.println(); } // i++; // } // out.close(); } catch (Exception ex) { System.out.println("Printing Exceptional Error"); ex.printStackTrace(); } } } --------------------------------- Yahoo! Mail - Helps protect you from nasty viruses. |