Re: [Htmlparser-developer] HTMLParser Sample App
Brought to you by:
derrickoswald
From: Somik R. <so...@ya...> - 2002-03-12 08:52:14
|
Hi Don, It will be appreciated if you can post usage doubts in the htmlparser-user mailing list (link is at http://htmlparser.sourceforge.net). To your query - the code you posted seems rather complex to do a not so complex task :) Here's how you would do it in HTML Parser (in the attached code). The code I have given is the shortcut-way. There is a way to get much shorter code than what I am providing you, but that requires getting into the design docs of the parser - and writing a Table Scanner. Then your code could become something like this: HTMLParser parser = new HTMLParser("http://www.nba.com"); HTMLNode node; int tableCount = 0; for (Enumeration e = parser.elements();e.hasMoreElements();) { node = (HTMLNode) e.nextElement(); if (node instanceof HTMLTableNode) { tableCount ++; if (tableCount==4) { HTMLTableNode tableNode = (HTMLTableNode)node; tableNode.print(); } } } Regards, Somik ----- Original Message ----- From: "Don Taggart" <dta...@e-...> To: <Htm...@li...> Sent: Tuesday, March 12, 2002 1:33 AM Subject: [Htmlparser-developer] HTMLParser Sample App > Hi, > I am attempting to grab the content of a certain table on any website. For > instance I'd like to get all of the text, tags, comments, etc contained in > the 4th table I run across. I've been able to do this successfully using > the htmleditorkit in swing, but it has a few bugs. > > Would your HTML Parser be useful for this scenario, and If so, could you > give me some guidance on how to start. > > Thanks, > Don > > > Here's my code that goes and gets the contents of the 4th table at nba.com > > import java.io.*; > import java.net.*; > import java.util.*; > import javax.swing.text.*; > import javax.swing.text.html.*; > import javax.swing.text.html.parser.*; > > /** > * This small demo program shows how to use the > * HTMLEditorKit.Parser and its implementing class > * ParserDelegator in the Swing system. 
> */ > > public class HtmlParseDemo2 { > public static void main(String [] args) { > Reader r; > String host = ""; > String spec = "http://www.nba.com"; > long endTime; > long endTime2; > long startTime = System.currentTimeMillis(); > String snippet = ""; > > > try { > if (spec.indexOf("://") > 0) { > URL u = new URL(spec); > host = u.getHost(); > Object content = u.getContent(); > > if (content instanceof InputStream) { > > r = new InputStreamReader((InputStream)content); > } > else if (content instanceof Reader) { > r = (Reader)content; > } > else { > throw new Exception("Bad URL content type."); > } > } > else { > r = new FileReader(spec); > } > > endTime = System.currentTimeMillis(); > System.out.println("Time to complete connection: " + (endTime - > startTime)); > > HTMLEditorKit.Parser parser; > System.out.println("About to parse " + spec); > parser = new ParserDelegator(); > > HTMLParseLister2 snippetCallback = new HTMLParseLister2(host); > > file://Parse Away! > parser.parse(r, snippetCallback, true); > r.close(); > > > endTime2 = System.currentTimeMillis(); > System.out.println("Time to complete: " + (endTime2 - > startTime)); > } > catch (Exception e) { > System.err.println("Error: " + e); > e.printStackTrace(System.err); > } > } > } > > /** > * HTML parsing proceeds by calling a callback for > * each and every piece of the HTML document. This > * simple callback class simply prints an indented > * structural listing of the HTML data. 
> */ > class HTMLParseLister2 extends HTMLEditorKit.ParserCallback > { > > > > int indentSize = 0; > int tableNum = 0; > String atts; > String tabNum; > String endTable; > String tableLevel; > Stack tableStack = new Stack(); > boolean finished = false; > HTML.Tag selectedTag = HTML.Tag.TABLE; > String selectedTable = Integer.toString(4); > boolean inImportantTag = false; > StringBuffer snippetString = new StringBuffer(); > > > > private String host; > > > > public HTMLParseLister2(String host) { > this.host = host; > } > > public String getSnippet() { > return snippetString.toString(); > } > > protected void indent() { > indentSize += 4; > } > > protected void unIndent() { > indentSize -= 4; if (indentSize < 0) indentSize = 0; > } > > protected void pIndent() { > for(int i = 0; i < indentSize; i++) System.out.print(" "); > } > > public void handleText(char[] data, int pos) { > if (!tableStack.empty() && !finished) > { > tableLevel = (String)tableStack.peek(); > if (Integer.parseInt(tableLevel) >= > (Integer.parseInt(selectedTable))) > { > file://pIndent(); > String str = new String(data); > System.out.println(str); > } > } > > if (inImportantTag) > { > String str = new String(data); > System.out.println(str); > } > } > > // ******************************************************** > public void handleComment(char[] data, int pos) { > > if (!tableStack.empty() && !finished) > { > tableLevel = (String)tableStack.peek(); > if (Integer.parseInt(tableLevel) >= > (Integer.parseInt(selectedTable))) > { > file://pIndent(); > String str = new String(data); > file://System.out.println("<!--" + str + "-->"); > file://indent(); > file://pIndent(); > } > } > > if (inImportantTag) > { > String str = new String(data); > System.out.println("<!--" + str + "-->"); > } > > } > // ******************************************************** > > // ******************************************************** > public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { > // Is this 
Tag One of the few that we want to list outside the chosen > component > if (t == HTML.Tag.STYLE || t == HTML.Tag.LINK) > { > atts = listAttributes(a); > inImportantTag = true; > System.out.print("<" + t.toString() + " " + atts + ">"); > return; > } > > if (t == selectedTag && !finished) > { > > file://pIndent(); > tableNum++; > tabNum = Integer.toString(tableNum); > tableStack.push(tabNum); > atts = listAttributes(a); > tableLevel = (String)tableStack.peek(); > if (Integer.parseInt(tableLevel) >= > (Integer.parseInt(selectedTable))) > { > file://System.out.println("<Table#" + tableLevel + ">"); > > } > } > > if (!tableStack.empty() && !finished) { > tableLevel = (String)tableStack.peek(); > if (Integer.parseInt(tableLevel) >= > (Integer.parseInt(selectedTable))) > { > atts = listAttributes(a); > System.out.println("<" + t.toString() + " " + atts + ">"); > } > } > } > // ******************************************************** > > > // ******************************************************** > public void handleEndTag(HTML.Tag t, int pos) { > if (inImportantTag) > { > inImportantTag = false; > System.out.println("</" + t.toString() + ">"); > } > > if (!tableStack.empty() && !finished) > { > if (t == selectedTag) > { > file://unIndent(); > file://pIndent(); > tableLevel = (String)tableStack.peek(); > if (Integer.parseInt(tableLevel) >= > (Integer.parseInt(selectedTable))){ > System.out.println("</" + t.toString() + ">"); > } > if (tableStack.peek().equals(selectedTable)) > finished = true; > endTable = (String) tableStack.pop(); > } > } > if (!tableStack.empty() && !finished) { > tableLevel = (String)tableStack.peek(); > if (Integer.parseInt(tableLevel) >= > (Integer.parseInt(selectedTable)) && t != selectedTag) { > file://pIndent(); > System.out.println("</" + t.toString() + ">"); > file://pIndent(); > } > } > } > // ******************************************************** > > > > // ******************************************************** > public void 
handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) > { > > > > > if (t == HTML.Tag.LINK && !finished) > { > atts = listAttributes(a); > System.out.println("<" + t.toString() + " " + atts + ">"); > } > > if (!tableStack.empty() && !finished) > { > > > atts = listAttributes(a); > if(a.getAttribute(HTML.Attribute.ENDTAG) != null) > { > handleEndTag(t, pos); > return; > } > file://if (tableStack.peek() == selectedTable) > file://pIndent(); > > tableLevel = (String)tableStack.peek(); > if (Integer.parseInt(tableLevel) >= > (Integer.parseInt(selectedTable))) > System.out.println("<" + t.toString() + " " + atts + ">"); > } > } > // ******************************************************** > > > > > // ******************************************************** > private String listAttributes(AttributeSet attributes) { > Enumeration e = attributes.getAttributeNames(); > String attString = ""; > > while (e.hasMoreElements()) { > Object name = e.nextElement(); > Object value = attributes.getAttribute(name); > > if (name.toString().equals("href") || name.toString().equals("src") > || name.toString().equals("action")) > { > if (value.toString().charAt(0) == '/') > value = host + value; > } > attString = attString + name + "=\"" + value + "\" "; > > } > return attString; > } > // ******************************************************** > > // ******************************************************** > public void handleError(String errorMsg, int pos){ > file://System.out.println("Parsing error: " + errorMsg + " at " + pos); > } > } > > > _______________________________________________ > Htmlparser-developer mailing list > Htm...@li... > https://lists.sourceforge.net/lists/listinfo/htmlparser-developer |