[Htmlparser-developer] HTMLParser Sample App
Brought to you by:
derrickoswald
From: Don T. <dta...@e-...> - 2002-03-11 16:37:08
|
/*
 * Hi, I am attempting to grab the content of a certain table on any website.
 * For instance I'd like to get all of the text, tags, comments, etc. contained
 * in the 4th table I run across. I've been able to do this successfully using
 * the HTMLEditorKit in Swing, but it has a few bugs. Would your HTML Parser be
 * useful for this scenario, and if so, could you give me some guidance on how
 * to start?
 *
 * Thanks, Don
 *
 * Here's my code that goes and gets the contents of the 4th table at nba.com
 */
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;

/**
 * This small demo program shows how to use the
 * HTMLEditorKit.Parser and its implementing class
 * ParserDelegator in the Swing system.
 */
public class HtmlParseDemo2 {

    /** Location parsed when no argument is given on the command line. */
    private static final String DEFAULT_SPEC = "http://www.nba.com";

    /**
     * Fetches a document and runs it through the Swing HTML parser with an
     * {@link HTMLParseLister2} callback, printing timing information.
     *
     * @param args optional; args[0] is a URL (anything containing "://") or
     *             a local file name. Defaults to http://www.nba.com.
     */
    public static void main(String[] args) {
        String spec = (args != null && args.length > 0) ? args[0] : DEFAULT_SPEC;
        String host = "";
        long startTime = System.currentTimeMillis();
        Reader r = null;

        try {
            if (spec.indexOf("://") > 0) {
                URL u = new URL(spec);
                host = u.getHost();
                Object content = u.getContent();
                if (content instanceof InputStream) {
                    // NOTE(review): decodes with the platform default charset,
                    // as the original did; the page's declared charset is ignored.
                    r = new InputStreamReader((InputStream) content);
                } else if (content instanceof Reader) {
                    r = (Reader) content;
                } else {
                    throw new Exception("Bad URL content type.");
                }
            } else {
                r = new FileReader(spec);
            }

            long endTime = System.currentTimeMillis();
            System.out.println("Time to complete connection: " + (endTime - startTime));

            System.out.println("About to parse " + spec);
            HTMLEditorKit.Parser parser = new ParserDelegator();
            HTMLParseLister2 snippetCallback = new HTMLParseLister2(host);

            // Parse away!
            parser.parse(r, snippetCallback, true);

            long endTime2 = System.currentTimeMillis();
            System.out.println("Time to complete: " + (endTime2 - startTime));
        } catch (Exception e) {
            System.err.println("Error: " + e);
            e.printStackTrace(System.err);
        } finally {
            // Close the reader even when connecting or parsing failed.
            if (r != null) {
                try {
                    r.close();
                } catch (IOException e) {
                    System.err.println("Error: " + e);
                }
            }
        }
    }
}

/**
 * HTML parsing proceeds by calling a callback for
 * each and every piece of the HTML document. This
 * callback counts TABLE start tags and echoes the tags
 * and text of the selected table (the 4th by default),
 * plus STYLE and LINK tags found anywhere before that
 * table has been closed. Everything that is printed is
 * also recorded and available from {@link #getSnippet()}.
 */
class HTMLParseLister2 extends HTMLEditorKit.ParserCallback {

    /** Line terminator appended to the snippet wherever println is used. */
    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

    int indentSize = 0;
    /** Number of TABLE start tags seen so far. */
    int tableNum = 0;
    String atts;
    String tabNum;
    String endTable;
    String tableLevel;
    /** Sequence numbers (as Strings) of the tables currently open. */
    Stack tableStack = new Stack();
    /** Set once the selected table has been closed; nothing is echoed afterwards. */
    boolean finished = false;
    HTML.Tag selectedTag = HTML.Tag.TABLE;
    /** Sequence number of the table to extract. */
    String selectedTable = Integer.toString(4);
    /** True between a STYLE/LINK start tag and the next end tag. */
    boolean inImportantTag = false;
    StringBuffer snippetString = new StringBuffer();
    private String host;

    /**
     * @param host host name prepended to root-relative href/src/action values
     */
    public HTMLParseLister2(String host) {
        this.host = host;
    }

    /**
     * @return everything this callback has printed so far
     */
    public String getSnippet() {
        return snippetString.toString();
    }

    protected void indent() {
        indentSize += 4;
    }

    protected void unIndent() {
        indentSize -= 4;
        if (indentSize < 0) {
            indentSize = 0;
        }
    }

    protected void pIndent() {
        for (int i = 0; i < indentSize; i++) {
            System.out.print(" ");
        }
    }

    /** Prints the text without a line break and records it in the snippet. */
    private void emit(String text) {
        System.out.print(text);
        snippetString.append(text);
    }

    /** Prints the text followed by a line break and records both in the snippet. */
    private void emitLine(String text) {
        System.out.println(text);
        snippetString.append(text).append(LINE_SEPARATOR);
    }

    /**
     * Reports whether the innermost open table is the selected table or one
     * opened after it, and the selected table has not been closed yet.
     */
    private boolean inSelectedTable() {
        if (tableStack.empty() || finished) {
            return false;
        }
        tableLevel = (String) tableStack.peek();
        return Integer.parseInt(tableLevel) >= Integer.parseInt(selectedTable);
    }

    /** Echoes text found inside the selected table or inside a STYLE/LINK tag. */
    public void handleText(char[] data, int pos) {
        // A single check so text matching both conditions is echoed only once.
        if (inSelectedTable() || inImportantTag) {
            emitLine(new String(data));
        }
    }

    /**
     * Echoes comments found inside a STYLE/LINK tag. Comments inside the
     * selected table are not echoed.
     */
    public void handleComment(char[] data, int pos) {
        if (inImportantTag) {
            emitLine("<!--" + new String(data) + "-->");
        }
    }

    /**
     * Tracks table nesting and echoes start tags that belong to the selected
     * table. STYLE and LINK start tags are always echoed (without a line
     * break) and switch on echoing of the text/comments that follow.
     */
    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        // STYLE and LINK are listed even outside the chosen table.
        if (t == HTML.Tag.STYLE || t == HTML.Tag.LINK) {
            atts = listAttributes(a);
            inImportantTag = true;
            emit("<" + t.toString() + " " + atts + ">");
            return;
        }

        if (t == selectedTag && !finished) {
            tableNum++;
            tabNum = Integer.toString(tableNum);
            tableStack.push(tabNum);
        }

        if (inSelectedTable()) {
            atts = listAttributes(a);
            emitLine("<" + t.toString() + " " + atts + ">");
        }
    }

    /**
     * Echoes end tags belonging to the selected table, pops the table stack
     * on a TABLE end tag, and marks the run finished when the selected table
     * itself is closed. Any end tag also ends an open STYLE/LINK section.
     */
    public void handleEndTag(HTML.Tag t, int pos) {
        String closing = "</" + t.toString() + ">";
        boolean emitted = false;

        if (inImportantTag) {
            inImportantTag = false;
            emitLine(closing);
            emitted = true;
        }

        if (t == selectedTag) {
            if (!tableStack.empty() && !finished) {
                // Skip the echo if the same tag was already printed above.
                if (inSelectedTable() && !emitted) {
                    emitLine(closing);
                }
                if (tableStack.peek().equals(selectedTable)) {
                    finished = true;
                }
                endTable = (String) tableStack.pop();
            }
            return;
        }

        if (inSelectedTable() && !emitted) {
            emitLine(closing);
        }
    }

    /**
     * Echoes simple (bodiless) tags inside the selected table, and LINK tags
     * anywhere until the selected table has been closed. A simple tag that
     * carries the ENDTAG attribute is forwarded to
     * {@link #handleEndTag(HTML.Tag, int)} while any table is open.
     */
    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        boolean linkEmitted = false;
        if (t == HTML.Tag.LINK && !finished) {
            atts = listAttributes(a);
            emitLine("<" + t.toString() + " " + atts + ">");
            linkEmitted = true;
        }

        if (tableStack.empty() || finished) {
            return;
        }

        if (a.getAttribute(HTML.Attribute.ENDTAG) != null) {
            handleEndTag(t, pos);
            return;
        }

        // A LINK inside the selected table was already echoed above.
        if (inSelectedTable() && !linkEmitted) {
            atts = listAttributes(a);
            emitLine("<" + t.toString() + " " + atts + ">");
        }
    }

    /**
     * Renders an attribute set as {@code name="value" } pairs, each followed
     * by a space. Root-relative href/src/action values (starting with a
     * single '/') get the host prepended; empty values and protocol-relative
     * values (starting with "//") are left untouched.
     */
    private String listAttributes(AttributeSet attributes) {
        StringBuffer rendered = new StringBuffer();
        Enumeration names = attributes.getAttributeNames();
        while (names.hasMoreElements()) {
            Object name = names.nextElement();
            Object value = attributes.getAttribute(name);
            String key = name.toString();
            if (key.equals("href") || key.equals("src") || key.equals("action")) {
                String text = String.valueOf(value);
                // startsWith is safe on empty strings, unlike charAt(0).
                // NOTE(review): the host is prepended without a scheme, as in
                // the original; confirm whether "http://" was intended.
                if (text.startsWith("/") && !text.startsWith("//")) {
                    value = host + text;
                }
            }
            rendered.append(name).append("=\"").append(value).append("\" ");
        }
        return rendered.toString();
    }

    /** Parse errors are deliberately ignored. */
    public void handleError(String errorMsg, int pos) {
        // Intentionally empty: error reporting is disabled.
    }
}