Re: [Htmlparser-developer] HTMLParser Sample App

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Hi Don,
    It would be appreciated if you could post usage questions to the
htmlparser-user mailing list (the link is at http://htmlparser.sourceforge.net).
    To your query - the code you posted seems rather complex for a
not-so-complex task :)

    Here's how you would do it in HTML Parser (in the attached code). The
code I have given is the shortcut way. There is a way to get much shorter
code than what I am providing you, but that requires getting into the design
docs of the parser - and writing a Table Scanner. Then your code could
become something like this:

HTMLParser parser = new HTMLParser("http://www.nba.com");
HTMLNode node;
int tableCount = 0;
for (Enumeration e = parser.elements();e.hasMoreElements();) {
    node = (HTMLNode) e.nextElement();
    if (node instanceof HTMLTableNode) {
         tableCount ++;
        if (tableCount==4) {
            HTMLTableNode tableNode = (HTMLTableNode)node;
            tableNode.print();
        }
    }
}

Regards,
Somik

----- Original Message -----
From: "Don Taggart" <dta...@e-...>
To: <Htm...@li...>
Sent: Tuesday, March 12, 2002 1:33 AM
Subject: [Htmlparser-developer] HTMLParser Sample App


> Hi,
> I am attempting to grab the content of a certain table on any website. For
> instance, I'd like to get all of the text, tags, comments, etc. contained in
> the 4th table I run across. I've been able to do this successfully using
> the HTMLEditorKit in Swing, but it has a few bugs.
>
> Would your HTML Parser be useful for this scenario, and if so, could you
> give me some guidance on how to start?
>
> Thanks,
> Don
>
>
> Here's my code that goes and gets the contents of the 4th table at nba.com:
>
> import java.io.*;
> import java.net.*;
> import java.util.*;
> import javax.swing.text.*;
> import javax.swing.text.html.*;
> import javax.swing.text.html.parser.*;
>
> /**
>  * This small demo program shows how to use the
>  * HTMLEditorKit.Parser and its implementing class
>  * ParserDelegator in the Swing system.
>  */
>
> public class HtmlParseDemo2 {
>     public static void main(String [] args) {
>         Reader r;
>         String host = "";
>         String spec = "http://www.nba.com";
>        long endTime;
>        long endTime2;
>        long startTime = System.currentTimeMillis();
>        String snippet = "";
>
>
>         try {
>             if (spec.indexOf("://") > 0) {
>                 URL u = new URL(spec);
>                 host = u.getHost();
>                 Object content = u.getContent();
>
>                 if (content instanceof InputStream) {
>
>                     r = new InputStreamReader((InputStream)content);
>                 }
>                 else if (content instanceof Reader) {
>                     r = (Reader)content;
>                 }
>                 else {
>                     throw new Exception("Bad URL content type.");
>                 }
>             }
>             else {
>                 r = new FileReader(spec);
>             }
>
> endTime = System.currentTimeMillis();
>             System.out.println("Time to complete connection: " +
>                 (endTime - startTime));
>
>             HTMLEditorKit.Parser parser;
>             System.out.println("About to parse " + spec);
>             parser = new ParserDelegator();
>
>             HTMLParseLister2 snippetCallback = new HTMLParseLister2(host);
>
>             //Parse Away!
>             parser.parse(r, snippetCallback, true);
>             r.close();
>
>
>             endTime2 = System.currentTimeMillis();
>             System.out.println("Time to complete: " + (endTime2 -
> startTime));
>         }
>         catch (Exception e) {
>             System.err.println("Error: " + e);
>             e.printStackTrace(System.err);
>         }
>     }
> }
>
> /**
>  * HTML parsing proceeds by calling a callback for
>  * each and every piece of the HTML document.  This
>  * simple callback class simply prints an indented
>  * structural listing of the HTML data.
>  */
> class HTMLParseLister2 extends HTMLEditorKit.ParserCallback
> {
>
>
>
>    int indentSize = 0;
>    int tableNum = 0;
>     String atts;
>     String tabNum;
>     String endTable;
>     String tableLevel;
>     Stack tableStack = new Stack();
>    boolean finished = false;
>     HTML.Tag selectedTag = HTML.Tag.TABLE;
>     String selectedTable = Integer.toString(4);
>    boolean inImportantTag = false;
>    StringBuffer snippetString = new StringBuffer();
>
>
>
>    private String host;
>
>
>
>    public HTMLParseLister2(String host) {
>     this.host = host;
>     }
>
>     public String  getSnippet() {
> return snippetString.toString();
> }
>
>     protected void indent() {
>         indentSize += 4;
>     }
>
>     protected void unIndent() {
>         indentSize -= 4; if (indentSize < 0) indentSize = 0;
>     }
>
>     protected void pIndent() {
>         for(int i = 0; i < indentSize; i++) System.out.print(" ");
>     }
>
>     public void handleText(char[] data, int pos) {
>        if (!tableStack.empty() && !finished)
>        {
>        tableLevel = (String)tableStack.peek();
>         if (Integer.parseInt(tableLevel) >=
> (Integer.parseInt(selectedTable)))
>        {
>         //pIndent();
>         String str = new String(data);
>        System.out.println(str);
>         }
>        }
>
>        if (inImportantTag)
>     {
>     String str = new String(data);
>         System.out.println(str);
>     }
>     }
>
> // ********************************************************
>     public void handleComment(char[] data, int pos) {
>
>     if (!tableStack.empty() && !finished)
>     {
>     tableLevel = (String)tableStack.peek();
>         if (Integer.parseInt(tableLevel) >=
> (Integer.parseInt(selectedTable)))
>     {
>         //pIndent();
>         String str = new String(data);
>         //System.out.println("<!--" + str + "-->");
>         //indent();
>         //pIndent();
>     }
>     }
>
>     if (inImportantTag)
>     {
>     String str = new String(data);
>         System.out.println("<!--" + str + "-->");
>     }
>
>     }
> // ********************************************************
>
> // ********************************************************
>     public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
> {
>     // Is this Tag One of the few that we want to list outside the chosen
>     // component
>     if (t == HTML.Tag.STYLE || t == HTML.Tag.LINK)
>     {
>     atts = listAttributes(a);
>     inImportantTag = true;
>     System.out.print("<" + t.toString() + " " + atts + ">");
>     return;
>     }
>
>        if (t == selectedTag && !finished)
>        {
>
>      //pIndent();
>      tableNum++;
>         tabNum = Integer.toString(tableNum);
>         tableStack.push(tabNum);
>         atts = listAttributes(a);
>         tableLevel = (String)tableStack.peek();
>         if (Integer.parseInt(tableLevel) >=
> (Integer.parseInt(selectedTable)))
>         {
>         //System.out.println("<Table#" + tableLevel + ">");
>
>         }
>        }
>
>        if (!tableStack.empty() && !finished) {
>        tableLevel = (String)tableStack.peek();
>        if (Integer.parseInt(tableLevel) >=
> (Integer.parseInt(selectedTable)))
>        {
>        atts = listAttributes(a);
>         System.out.println("<" + t.toString() + " " + atts + ">");
>         }
>        }
>     }
>     // ********************************************************
>
>
> // ********************************************************
>     public void handleEndTag(HTML.Tag t, int pos) {
>     if (inImportantTag)
>     {
>     inImportantTag = false;
>     System.out.println("</" + t.toString() + ">");
>     }
>
>     if (!tableStack.empty() && !finished)
>     {
>        if (t == selectedTag)
>        {
>         //unIndent();
>         //pIndent();
>         tableLevel = (String)tableStack.peek();
>        if (Integer.parseInt(tableLevel) >=
> (Integer.parseInt(selectedTable))){
>         System.out.println("</" + t.toString() + ">");
>         }
>         if (tableStack.peek().equals(selectedTable))
>         finished = true;
>         endTable = (String) tableStack.pop();
>         }
>     }
>        if (!tableStack.empty() && !finished) {
>        tableLevel = (String)tableStack.peek();
>        if (Integer.parseInt(tableLevel) >=
> (Integer.parseInt(selectedTable)) && t != selectedTag) {
>        //pIndent();
>         System.out.println("</" + t.toString() + ">");
>         //pIndent();
>         }
>        }
>     }
> // ********************************************************
>
>
>
> // ********************************************************
>     public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
> {
>
>
>
>
>     if (t == HTML.Tag.LINK && !finished)
>     {
>     atts = listAttributes(a);
>     System.out.println("<" + t.toString() + " " + atts + ">");
>     }
>
>     if (!tableStack.empty() && !finished)
>     {
>
>
>     atts = listAttributes(a);
>     if(a.getAttribute(HTML.Attribute.ENDTAG) != null)
>     {
>     handleEndTag(t, pos);
>     return;
>     }
>     //if (tableStack.peek() == selectedTable)
>         //pIndent();
>
>         tableLevel = (String)tableStack.peek();
>         if (Integer.parseInt(tableLevel) >=
> (Integer.parseInt(selectedTable)))
>         System.out.println("<" + t.toString() + " " + atts + ">");
>     }
>     }
> // ********************************************************
>
>
>
>
> // ********************************************************
> private String listAttributes(AttributeSet attributes) {
>     Enumeration e = attributes.getAttributeNames();
>     String attString = "";
>
>     while (e.hasMoreElements()) {
>       Object name = e.nextElement();
>       Object value = attributes.getAttribute(name);
>
>       if (name.toString().equals("href") || name.toString().equals("src")
> || name.toString().equals("action"))
>       {
>       if (value.toString().charAt(0) == '/')
>       value = host + value;
>       }
>       attString = attString + name + "=\"" + value + "\" ";
>
>     }
>     return attString;
>   }
> // ********************************************************
>
> // ********************************************************
>     public void handleError(String errorMsg, int pos){
>         //System.out.println("Parsing error: " + errorMsg + " at " + pos);
>     }
> }
>
>
> _______________________________________________
> Htmlparser-developer mailing list
> Htm...@li...
> https://lists.sourceforge.net/lists/listinfo/htmlparser-developer