[Htmlparser-cvs] htmlparser/docs/docs CustomTagLinks.html,NONE,1.1 CustomVisitorLinks.html,NONE,1.1
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/docs/docs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26789/docs/docs Modified Files: CustomTagExtraction.html EmailExtraction.html FactoryMethod.html ImageExtraction.html LinkExtraction.html PostOperation.html ReverseHtml.html SamplePrograms.html SearchingForData.html StringExtraction.html TemplateMethod.html WritingYourOwnScanners.html index.html Added Files: CustomTagLinks.html CustomVisitorLinks.html FilterLinks.html LexerLinks.html LinkBeanLinks.html VisitorLinks.html Log Message: Update version to 1.4-20040125 --- NEW FILE: CustomTagLinks.html --- <html><head><title>Custom Tag Links</title></head><body> <div class="wikitext"> <p><b>Using Custom Tags to Extract Links <p>The use of custom tags provides for altered behaviour during the parse: <pre> import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.ParserException; class MyLinkTag extends LinkTag { public void doSemanticAction () throws ParserException { System.out.print ("\"" + getLinkText () + "\" => "); System.out.println (getLink ()); } } public class LinkDemo { public static void main (String[] args) throws ParserException { Parser parser = new Parser ("http://urlIWantToParse.com"); PrototypicalNodeFactory factory = new PrototypicalNodeFactory (); factory.registerTag (new MyLinkTag ()); parser.setNodeFactory (factory); for (NodeIterator e = parser.elements (); e.hasMoreNodes (); ) e.nextNode (); // just parsing the nodes executes doSemanticAction } } <div id="actionbar" class="toolbar"> <hr class="printer" noshade="noshade" /> <p class="editdate">Last edited on Tuesday, January 13, 2004 5:39:34 pm. <hr class="toolbar" noshade="noshade" /> </body></html> --- NEW FILE: CustomVisitorLinks.html --- <html><head><title>Custom Visitor Links</title></head><body> <div class="wikitext"> <p><b>Using a Custom Visitor to Extract Links <p>Creating a custom visitor is more powerful than just the processing of links demonstrated here: <pre> import org.htmlparser.Parser; import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.ParserException; import org.htmlparser.visitors.NodeVisitor; class MyCustomizedVisitor extends NodeVisitor { public MyCustomizedVisitor () { super (true); // recurse into children } public void visitTag (Tag tag) { // process tags here if (tag instanceof LinkTag) { LinkTag linkTag = (LinkTag)tag; System.out.print ("\"" + linkTag.getLinkText () + "\" => "); System.out.println (linkTag.getLink ()); } } public void visitStringNode (StringNode stringNode) { // process text in the page here } public void visitEndTag (Tag endTag) { // process end tags here, // checking for end tags can be useful when performing // more involved page processing } public void visitRemarkNode (RemarkNode remarkNode) { // process remark nodes here } } public class LinkDemo { public static void main (String[] args) throws ParserException { Parser parser = new Parser ("http://urlIWantToParse.com"); MyCustomizedVisitor visitor = new MyCustomizedVisitor (); parser.visitAllNodesWith (visitor); } } <div id="actionbar" class="toolbar"> <hr class="printer" noshade="noshade" /> <p class="editdate">Last edited on Wednesday, January 7, 2004 5:24:34 pm. <hr class="toolbar" noshade="noshade" /> </body></html> --- NEW FILE: FilterLinks.html --- <html><head><title>Filter Links</title></head><body> <div class="wikitext"> <p><b>Using a NodeFilter to Extract Links <p>The filter capability is much more powerful than the simple link extraction illustrated here: <pre> import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class LinkDemo { public static void main (String[] args) throws ParserException { Parser parser = new Parser ("http://urlIWantToParse.com"); NodeFilter filter = new NodeClassFilter (LinkTag.class); NodeList links = new NodeList (); for (NodeIterator e = parser.elements (); e.hasMoreNodes (); ) e.nextNode ().collectInto (links, filter); for (int i = 0; i < links.size (); i++) { LinkTag linkTag = (LinkTag)links.elementAt (i); System.out.print ("\"" + linkTag.getLinkText () + "\" => "); System.out.println (linkTag.getLink ()); } } } <p>In fact, this is so useful that there is a convenience method to apply a NodeClassFilter directly from the parser: <pre> import org.htmlparser.Parser; import org.htmlparser.util.ParserException; import org.htmlparser.Node; import org.htmlparser.tags.LinkTag; public class LinkDemo { public static void main (String[] args) throws ParserException { Parser parser = new Parser ("http://urlIWantToParse.com"); Node [] links = parser.extractAllNodesThatAre (LinkTag.class); for (int i = 0; i < links.length; i++) { LinkTag linkTag = (LinkTag)links[i]; System.out.print ("\"" + linkTag.getLinkText () + "\" => "); System.out.println (linkTag.getLink ()); } } } <div id="actionbar" class="toolbar"> <hr class="printer" noshade="noshade" /> <p class="editdate">Last edited on Wednesday, January 7, 2004 4:48:39 pm. <hr class="toolbar" noshade="noshade" /> </body></html> --- NEW FILE: LexerLinks.html --- <html><head><title>Lexer Links</title></head><body> <div class="wikitext"> <p><b>Using a Lexer to Extract Links <p>If you are after raw link text only, then you can use a Lexer to access the links: <pre> import java.io.IOException; import java.net.URL; import java.net.URLConnection; import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.util.ParserException; public class LinkDemo { public static void main (String[] args) throws ParserException, IOException { Node node; URL url = new URL ("http://urlIWantToParse.com"); URLConnection connection = url.openConnection (); Lexer lexer = new Lexer (connection); while (null != (node = lexer.nextNode ())) if (node instanceof TagNode) { TagNode tag = (TagNode)node; if (tag.getTagName ().equals ("A") && !tag.isEndTag ()) { String href = tag.getAttribute ("href"); if (null != href) System.out.println (href); } } } } <div id="actionbar" class="toolbar"> <hr class="printer" noshade="noshade" /> <p class="editdate">Last edited on Thursday, January 8, 2004 4:06:57 am. <hr class="toolbar" noshade="noshade" /> </body></html> --- NEW FILE: LinkBeanLinks.html --- <html><head><title>Link Bean Links</title></head><body> <div class="wikitext"> <p><b>Using a LinkBean to Extract Links <p>A LinkBean is a pretty easy way to get just the links: <pre> import java.net.URL; import org.htmlparser.beans.LinkBean; public class LinkDemo { public static void main (String[] args) { LinkBean lb = new LinkBean (); lb.setURL ("http://urlIWantToParse.com"); URL[] urls = lb.getLinks (); for (int i = 0; i < urls.length; i++) System.out.println (urls[i]); } } <div id="actionbar" class="toolbar"> <hr class="printer" noshade="noshade" /> <p class="editdate">Last edited on Wednesday, January 7, 2004 4:10:21 pm. <hr class="toolbar" noshade="noshade" /> </body></html> --- NEW FILE: VisitorLinks.html --- <html><head><title>Visitor Links</title></head><body> <div class="wikitext"> <p><b>Using an ObjectFindingVisitor to Extract Links <p>A visitor visits all links, and an ObjectFindingVisitor is designed to find one specific class of nodes, in this case LinkTag tags: <pre> import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.ParserException; import org.htmlparser.visitors.ObjectFindingVisitor; public class LinkDemo { public static void main (String[] args) throws ParserException { Parser parser = new Parser ("http://urlIWantToParse.com"); ObjectFindingVisitor visitor = new ObjectFindingVisitor (LinkTag.class); parser.visitAllNodesWith (visitor); Node[] links = visitor.getTags (); for (int i = 0; i < links.length; i++) { LinkTag linkTag = (LinkTag)links[i]; System.out.print ("\"" + linkTag.getLinkText () + "\" => "); System.out.println (linkTag.getLink ()); } } } <div id="actionbar" class="toolbar"> <hr class="printer" noshade="noshade" /> <p class="editdate">Last edited on Wednesday, January 7, 2004 4:09:50 pm. <hr class="toolbar" noshade="noshade" /> </body></html> Index: CustomTagExtraction.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/CustomTagExtraction.html,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** CustomTagExtraction.html 9 Nov 2003 17:07:07 -0000 1.7 --- CustomTagExtraction.html 26 Jan 2004 01:02:09 -0000 1.8 *************** *** 6,25 **** <p><b>Custom Tag Extraction ! <p>Custom tag extraction is easy. Simply create an array of tag names that you want to extract from a page, and pass it in to <a href="TagFindingVisitor.html" class="wiki">TagFindingVisitor</a>, like so : <pre> ! Parser parser = new Parser(..); ! String [] tagsToBeFound = {"P","BR","MYTAG"}; ! TagFindingVisitor visitor = new TagFindingVisitor(tagsToBeFound); ! parser.visitAllNodesWith(visitor); ! // First tag specified in search ! Node [] allPTags = visitor.getTags(0); ! // Second tag specified in search ! Node [] allBRTags = visitor.getTags(1); ! // Third tag specified in search ! Node [] allMyTags = visitor.getTags(2); ! <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a> ! // Just a test of wiki --- 6,33 ---- <p><b>Custom Tag Extraction ! <p>Custom tag extraction is easy. Simply create an array of tag names that you want to extract from a page, and pass it in to a TagFindingVisitor, like so: <pre> ! import org.htmlparser.Node; ! import org.htmlparser.Parser; ! import org.htmlparser.util.ParserException; ! import org.htmlparser.visitors.TagFindingVisitor; ! public class CustomTagDemo ! { ! public static void main (String[] args) throws ParserException ! { ! Parser parser = new Parser ("http://urlIWantToParse.com"); ! String [] tagsToBeFound = {"P","BR","MYTAG"}; ! TagFindingVisitor visitor = new TagFindingVisitor (tagsToBeFound); ! parser.visitAllNodesWith (visitor); ! // First tag specified in search ! Node [] allPTags = visitor.getTags(0); ! // Second tag specified in search ! Node [] allBRTags = visitor.getTags(1); ! // Third tag specified in search ! Node [] allMyTags = visitor.getTags(2); ! } ! } *************** *** 29,33 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, April 2, 2003 1:38:24 pm. <hr class="toolbar" noshade="noshade" /> --- 37,41 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, January 7, 2004 6:22:39 pm. <hr class="toolbar" noshade="noshade" /> Index: EmailExtraction.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/EmailExtraction.html,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** EmailExtraction.html 9 Nov 2003 17:07:07 -0000 1.5 --- EmailExtraction.html 26 Jan 2004 01:02:09 -0000 1.6 *************** *** 6,24 **** <p><b>Email Extraction ! <p>This is very similar to link extraction. You have to extract links from a page and verify that they are email addresses. Link tags have a method - <i>isMailLink() <pre> ! Parser parser = new Parser(..); ! parser.registerScanners(); ! Node links [] = parser.extractAllNodesThatAre(LinkTag.class); ! for (int i=0;i<links.length;i++) { ! LinkTag linkTag = links[i]; ! if (linkTag[i].isMailLink()) { ! // Yes, its an email id ! System.out.println("Email address: "+linkTag.getLink()); ! } ! } ! <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a>, February 16, 2003 11:41 am --- 6,48 ---- <p><b>Email Extraction ! <p>This is very similar to <a href="LinkExtraction.html" class="named-wiki" title="LinkExtraction">link extraction</a>. You have to extract links from a page and verify that they are email addresses. Link tags have a method - <i>isMailLink() to check if the HREF starts with "mailto:". Using an inner class in the NodeFilter example: <pre> ! import org.htmlparser.Node; ! import org.htmlparser.NodeFilter; ! import org.htmlparser.Parser; ! import org.htmlparser.tags.LinkTag; ! import org.htmlparser.util.NodeIterator; ! import org.htmlparser.util.NodeList; ! import org.htmlparser.util.ParserException; ! public class EmailLinkDemo ! { ! public static void main (String[] args) throws ParserException ! { ! Parser parser = new Parser ("http://urlIWantToParse.com"); ! NodeFilter filter = new NodeFilter () ! { ! /** ! * Accept nodes that are mail links. ! * @param node The node to check. ! */ ! public boolean accept (Node node) ! { ! return (LinkTag.class.isAssignableFrom (node.getClass ()) ! && ((LinkTag)node).isMailLink ()); ! } ! }; ! NodeList links = new NodeList (); ! for (NodeIterator e = parser.elements (); e.hasMoreNodes (); ) ! e.nextNode ().collectInto (links, filter); ! for (int i = 0; i < links.size (); i++) ! { ! LinkTag linkTag = (LinkTag)links.elementAt (i); ! System.out.print ("\"" + linkTag.getLinkText () + "\" => "); ! System.out.println (linkTag.getLink ()); ! } ! } ! } *************** *** 28,32 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Sunday, February 23, 2003 5:24:25 pm. <hr class="toolbar" noshade="noshade" /> --- 52,56 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, January 7, 2004 5:26:12 pm. <hr class="toolbar" noshade="noshade" /> Index: FactoryMethod.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/FactoryMethod.html,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** FactoryMethod.html 9 Nov 2003 17:07:07 -0000 1.6 --- FactoryMethod.html 26 Jan 2004 01:02:09 -0000 1.7 *************** *** 6,10 **** <p><b>Factory Method ! <p><i><a href="TagScanner.html" class="wiki">TagScanner</a> possess an FM for the creation of a tag. <pre> --- 6,10 ---- <p><b>Factory Method ! <p><i><span class="wikiunknown"><u>TagScanner possess an FM for the creation of a tag. <pre> Index: ImageExtraction.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/ImageExtraction.html,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** ImageExtraction.html 9 Nov 2003 17:07:07 -0000 1.6 --- ImageExtraction.html 26 Jan 2004 01:02:09 -0000 1.7 *************** *** 4,47 **** <div class="wikitext"> ! <p><b>Image Extractions ! ! <p>This is very similar to <a href="LinkExtraction.html" class="wiki">LinkExtraction</a>. ! ! <p>1. Use the <i><span class="wikiunknown"><u>ObjectFindingVisitor like so : ! ! <pre> ! Parser parser = new Parser("http://urlIWantToParse.com"); ! // Create a visitor, specify that you want to recurse through its children ! // Recursion is needed only if you register all scanners, and a link tag could be embedded ! // within a form tag. But if you register only the link scanner, you don't need recursion. ! ObjectFindingVisitor visitor = ! new ObjectFindingVisitor(ImageTag.class,true); ! ! parser.registerScanners(); ! ! // Instead of registering all scanners, ! // you could also do - parser.addScanner(new ImageScanner("")); ! parser.visitAllNodesWith(visitor); ! Node [] images = visitor.getTags(); ! for (int i=0;i<images.length;i++) { ! ImageTag imageTag = (ImageTag)images[i]; ! System.out.println(imageTag.getImageLocation()); ! } ! <p>2: Use <i>extractAllNodesThatAre() <pre> ! Parser parser = new Parser("http://urlIWantToParse.com"); ! parser.registerScanners(); ! // Instead of registering all scanners, ! // you could also do - parser.addScanner(new ImageScanner("")); ! ! Node [] images = parser.extractAllNodesThatAre(ImageTag.class); ! for (int i=0;i<images.length;i++) { ! ImageTag imageTag = (ImageTag)images[i]; ! System.out.println(imageTag.getImageLocation()); ! } ! <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a>, Sunday, February 16, 2003 2:02:18 pm. --- 4,30 ---- <div class="wikitext"> ! <p><b>Image Extraction ! <p>This is very similar to <a href="LinkExtraction.html" class="named-wiki" title="LinkExtraction">link extraction</a>. Instead of looking for LinkTag nodes you look for ImageTag nodes: <pre> ! import org.htmlparser.Parser; ! import org.htmlparser.util.ParserException; ! import org.htmlparser.Node; ! import org.htmlparser.tags.ImageTag; ! public class ImageDemo ! { ! public static void main (String[] args) throws ParserException ! { ! Parser parser = new Parser ("http://urlIWantToParse.com"); ! Node [] images = parser.extractAllNodesThatAre (ImageTag.class); ! for (int i = 0; i < images.length; i++) ! { ! ImageTag imageTag = (ImageTag)images[i]; ! System.out.println (imageTag.getImageURL ()); ! } ! } ! } *************** *** 51,55 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, June 25, 2003 9:11:46 am. <hr class="toolbar" noshade="noshade" /> --- 34,38 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, January 7, 2004 5:33:01 pm. <hr class="toolbar" noshade="noshade" /> Index: LinkExtraction.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/LinkExtraction.html,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** LinkExtraction.html 9 Nov 2003 17:07:07 -0000 1.5 --- LinkExtraction.html 26 Jan 2004 01:02:09 -0000 1.6 *************** *** 8,101 **** <p>There are many ways of extracting links. ! <p>1. Use the <span class="wikiunknown"><u>ObjectFindingVisitor to extract links, like so: ! ! <pre> ! Parser parser = new Parser("http://urlIWantToParse.com"); ! // Create a visitor, specify that you want to recurse through its children ! // Recursion is needed only if you register all scanners, and a link tag could be embedded ! // within a form tag. But if you register only the link scanner, you don't need recursion. ! ObjectFindingVisitor visitor = ! new ObjectFindingVisitor(LinkTag.class,true); ! ! parser.registerScanners(); ! ! // Instead of registering all scanners, ! // you could also do - parser.addScanner(new LinkScanner("")); ! parser.visitAllNodesWith(visitor); ! Node [] links = visitor.getTags(); ! for (int i=0;i<links.length;i++) { ! LinkTag linkTag = (LinkTag)links[i]; ! System.out.println(linkTag.getLink()); ! System.out.println(linkTag.getLinkText()); ! } ! ! <p>2. Use the parser utility method - extractAllNodesThatAre(). ! ! <pre> ! Parser parser = new Parser("http://urlIWantToParse.com"); ! parser.registerScanners(); ! Node [] links = parser.extractAllNodesThatAre(LinkTag.class); ! // Instead of registering all scanners, ! // you could also do - parser.addScanner(new LinkScanner("")); ! for (int i=0;i<links.length;i++) { ! LinkTag linkTag = (LinkTag)links[i]; ! System.out.println(linkTag.getLink()); ! System.out.println(linkTag.getLinkText()); ! } ! ! <p>3. It is possible that you are interested in extracting more than just links. In order to customize extraction, write your own visitor. Extend the Visitor class (in the package org.htmlparser.visitors - Parser v1.3 upwards) like so : ! ! <pre> ! public class MyCustomizedVisitor extends Visitor { ! public MyCustomizedVisitor(Parser parser) { ! super(true); /// Its usually a good idea to perform recursion ! // Add the scanners you want. ! // This decouples your application from having to know which scanners are required ! parser.addScanner(new LinkScanner("")); ! parser.addScanner(new ImageScanner("")); ! // or add all scanners with registerScanners() ! } ! ! public void visitTag(Tag tag) { ! // Collect any tags you want ! // You can also do type checking like so: ! if (tag instanceof MetaTag) { ! // This tag is a meta tag ! MetaTag metaTag = (MetaTag)tag; ! } ! } ! ! public void visitStringNode(StringNode stringNode) { ! // Collect text in the page here ! } ! ! public void visitLinkTag(LinkTag linkTag) { ! // Collect links here ! } ! public void visitImageTag(ImageTag imageTag) { ! // Collect images here ! } ! public void visitEndTag(EndTag endTag) { ! // Checking for end tags can be useful when performing more involved ! // searches in a page ! } ! public void visitRemarkNode(RemarkNode remarkNode) { ! // Collect remark nodes here ! } ! // Add getters to get the data you have collected.. ! } ! In your app.. ! Parser parser = new Parser(...); ! MyCustomizedVisitor visitor = new MyCustomizedVisitor(parser); ! parser.visitAllNodesWith(visitor); ! // You can now get the data from the visitor interface. - <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a> --- 8,25 ---- <p>There are many ways of extracting links. ! <ul> ! <li><a href="VisitorLinks.html" class="named-wiki" title="VisitorLinks">Use an ObjectFindingVisitor</a> ! <li><a href="CustomVisitorLinks.html" class="named-wiki" title="CustomVisitorLinks">Use a custom Visitor</a> ! <li><a href="LinkBeanLinks.html" class="named-wiki" title="LinkBeanLinks">Use a LinkBean</a> ! <li><a href="CustomTagLinks.html" class="named-wiki" title="CustomTagLinks">Use a custom Tag</a> ! <li><a href="FilterLinks.html" class="named-wiki" title="FilterLinks">Use a NodeFilter</a> + <li><a href="LexerLinks.html" class="named-wiki" title="LexerLinks">Use a low level Lexer</a> *************** *** 105,109 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Tuesday, September 2, 2003 1:59:15 pm. <hr class="toolbar" noshade="noshade" /> --- 29,33 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, January 7, 2004 5:22:23 pm. <hr class="toolbar" noshade="noshade" /> Index: PostOperation.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/PostOperation.html,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** PostOperation.html 9 Nov 2003 17:07:07 -0000 1.4 --- PostOperation.html 26 Jan 2004 01:02:09 -0000 1.5 *************** *** 31,35 **** // ... do parser operations ! <p><a href="images/Zip.java" class="namedurl"><span style="white-space: nowrap"><img src="/docs/themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Source</span> Code.</a><a href="images/Zip.java" class="namedurl"><span style="white-space: nowrap">Source</span> Code.</a> <a href="images/Zip.html" class="namedurl"><span style="white-space: nowrap"><img src="/docs/themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Pretty</span> Print Source Code</a><a href="images/Zip.html" class="namedurl"><span style="white-space: nowrap">Pretty</span> Print Source Code</a> <pre> --- 31,35 ---- // ... do parser operations ! <p><a href="images/Zip.java" class="namedurl"><span style="white-space: nowrap"><img src="/wiki/themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Source</span> Code.</a><a href="images/Zip.java" class="namedurl"><span style="white-space: nowrap">Source</span> Code.</a> <a href="images/Zip.html" class="namedurl"><span style="white-space: nowrap"><img src="/wiki/themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Pretty</span> Print Source Code</a><a href="images/Zip.html" class="namedurl"><span style="white-space: nowrap">Pretty</span> Print Source Code</a> <pre> Index: ReverseHtml.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/ReverseHtml.html,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** ReverseHtml.html 9 Nov 2003 17:07:07 -0000 1.5 --- ReverseHtml.html 26 Jan 2004 01:02:09 -0000 1.6 *************** *** 6,41 **** <p><b>Reverse Html Rendering ! <p>In order to get back the html representation of a web page, you may use toHTML() recursively. Here's one way to get it: <pre> ! Parser parser = new Parser(..); ! parser.registerScanners(); ! StringBuffer htmlBuffer = new StringBuffer(); ! for (NodeIterator i = parser.elements();i.hasMoreNodes();) { ! htmlBuffer.append(i.nextNode().toHTML()); ! } ! System.out.println("reverse html rendered after parse : "+htmlBuffer.toString()); ! <p>This usually goes through child nodes of composite tags (like links, forms, etc..) ! <p>Often, it might be desired to modify the html being reconstructed. In such a case, you must change the tag's attributes prior to calling toHTML(). ! <p>e.g. if the tag in question is a link tag, and you wish to modify the href, do this : <pre> ! linkTag.setAttribute("SRC",newUrlString); ! doSomethingWith(linkTag.toHTML()); ! <p><i>toHtml() is basically a reconstruction of the tag using its attributes (at the atomic level) and its children (at the macro/composite level). ! <p>You can also change the name of the tag by setting its <i>TAGNAME attribute, like so: <pre> ! tag.setAttribute(Tag.TAGNAME,newTagName); ! <p>This should enable you to perform any transformations on the html. ! Take a look at another way of modifying tags in <a href="WebRipper.html" class="wiki">WebRipper</a>. ! <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a> --- 6,62 ---- <p><b>Reverse Html Rendering ! <p>In order to get back the html representation of a web page, you may use toHtml() recursively. Here's one way to get it: <pre> ! import org.htmlparser.Parser; ! import org.htmlparser.util.NodeIterator; ! import org.htmlparser.util.ParserException; ! public class ToHtmlDemo ! { ! public static void main (String[] args) throws ParserException ! { ! Parser parser = new Parser ("http://urlIWantToParse.com"); ! StringBuffer html = new StringBuffer (4096); ! for (NodeIterator i = parser.elements();i.hasMoreNodes();) ! html.append (i.nextNode().toHtml ()); ! System.out.println (html); ! } ! } ! <p>Often, it might be desired to modify the html being reconstructed. In such a case, you must change the tag's attributes prior to calling toHtml(). ! For example, if the tag in question is a link tag, and you wish to modify the href, do this: ! <pre> ! linkTag.setLink ("http://newUrlString"); ! linkTag.toHtml (); ! ! <p>This is equivalent to: <pre> ! linkTag.setAttribute ("href", "http://newUrlString"); ! linkTag.toHtml (); ! <p>This latter would work on any tag, but few other tags have an HREF attribute according to the <a href="http://www.w3.org/TR/html4/" class="namedurl"><span style="white-space: nowrap">HTML</span> specification</a>. ! The <i>toHtml() method applies to all nodes, not just tags. For tags it is basically a reconstruction of the tag using its attributes (at the atomic level) and its children (at the macro/composite level). ! <p>You can also change the name of the tag like so: <pre> ! tag.setTagName (newTagName); ! <p>and there are numerous ways to add, remove or change the attributes of a tag. For example, to add or change the ID attribute to "EditArea" use: ! <pre> ! tag.setAttribute ("id", "EditArea", '"'); ! ! <p>Whole tags can be added and removed from the list of children held by each tag. For example, to add a <P> tag at the same level as another tag: ! ! <pre> ! newTag = new Tag (); ! newTag.setTagName ("P"); ! tag.getParent ().getChildren ().add (newTag); ! ! <p>Be careful, getChildren () may return null for an arbitrary tag. *************** *** 45,49 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Sunday, February 23, 2003 5:34:12 pm. <hr class="toolbar" noshade="noshade" /> --- 66,70 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, January 7, 2004 6:14:37 pm. <hr class="toolbar" noshade="noshade" /> Index: SamplePrograms.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/SamplePrograms.html,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** SamplePrograms.html 9 Nov 2003 17:07:07 -0000 1.6 --- SamplePrograms.html 26 Jan 2004 01:02:09 -0000 1.7 *************** *** 10,14 **** <li><a href="StringExtraction.html" class="wiki">StringExtraction</a> ! <li><a href="LinkExtraction.html" class="wiki">LinkExtraction</a> (includes example of customized parsing with HTMLVisitor) <li><a href="EmailExtraction.html" class="wiki">EmailExtraction</a> --- 10,14 ---- <li><a href="StringExtraction.html" class="wiki">StringExtraction</a> ! <li><a href="LinkExtraction.html" class="wiki">LinkExtraction</a> <li><a href="EmailExtraction.html" class="wiki">EmailExtraction</a> *************** *** 16,23 **** <li><a href="ImageExtraction.html" class="wiki">ImageExtraction</a> - <li><a href="WebCrawler.html" class="wiki">WebCrawler</a> - - <li><a href="WebRipper.html" class="wiki">WebRipper</a> - <li><a href="ReverseHtml.html" class="named-wiki" title="ReverseHtml">ReverseHtml rendering</a> --- 16,19 ---- *************** *** 26,29 **** --- 22,29 ---- <li><a href="JavaBeans.html" class="wiki">JavaBeans</a> + <li><a href="WebCrawler.html" class="wiki">WebCrawler</a> - ignore this, it's old + + <li><a href="WebRipper.html" class="wiki">WebRipper</a> - ignore this, it's old + *************** *** 33,37 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Thursday, April 24, 2003 4:45:21 am. <hr class="toolbar" noshade="noshade" /> --- 33,37 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Wednesday, January 7, 2004 6:12:30 pm. <hr class="toolbar" noshade="noshade" /> Index: SearchingForData.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/SearchingForData.html,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** SearchingForData.html 26 Oct 2003 19:46:17 -0000 1.3 --- SearchingForData.html 26 Jan 2004 01:02:09 -0000 1.4 *************** *** 29,33 **** <pre> - parser.registerScanners(); Node nodes [] = parser.extractAllNodesThatAre(TableTag.class); // Get the first table found --- 29,32 ---- *************** *** 73,77 **** <pre> - parser.registerScanners(); Node nodes [] = parser.extractAllNodesThatAre(TableTag.class); // Get the first table found --- 72,75 ---- *************** *** 103,107 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Saturday, April 19, 2003 10:38:30 pm. <hr class="toolbar" noshade="noshade" /> --- 101,105 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Thursday, January 8, 2004 4:15:12 am. <hr class="toolbar" noshade="noshade" /> Index: StringExtraction.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/StringExtraction.html,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** StringExtraction.html 9 Nov 2003 17:07:07 -0000 1.6 --- StringExtraction.html 26 Jan 2004 01:02:09 -0000 1.7 *************** *** 6,24 **** <p><b>String Extraction ! <p>To get all the text content from a web page, use the <a href="TextExtractingVisitor.html" class="wiki">TextExtractingVisitor</a>, like so : <pre> ! Parser parser = new Parser("http://pageIwantToParse.com"); ! TextExtractingVisitor visitor = new TextExtractingVisitor(); ! parser.visitAllNodesWith(visitor); ! System.out.println(visitor.getExtractedText()); ! <p>If you want to strip all escape characters, do: <pre> ! String cleanText = ! ParserUtils.removeEscapeCharacters( ! visitor.getExtractedText() ! ); --- 6,42 ---- <p><b>String Extraction ! <p>To get all the text content from a web page, use the TextExtractingVisitor, like so: <pre> ! import org.htmlparser.Parser; ! import org.htmlparser.util.ParserException; ! import org.htmlparser.visitors.TextExtractingVisitor; ! public class StringDemo ! { ! public static void main (String[] args) throws ParserException ! { ! Parser parser = new Parser ("http://pageIwantToParse.com"); ! TextExtractingVisitor visitor = new TextExtractingVisitor (); ! parser.visitAllNodesWith (visitor); ! System.out.println (visitor.getExtractedText()); ! } ! } ! <p>If you want a more browser like behaviour, use the StringBean like so: <pre> ! import org.htmlparser.beans.StringBean; ! public class StringDemo ! { ! public static void main (String[] args) ! { ! StringBean sb = new StringBean (); ! sb.setLinks (false); ! sb.setReplaceNonBreakingSpaces (true); ! sb.setCollapse (true); ! sb.setURL ("http://pageIwantToParse.com"); ! System.out.println (sb.getStrings ()); ! } ! } *************** *** 28,32 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Sunday, February 23, 2003 5:20:23 pm. <hr class="toolbar" noshade="noshade" /> --- 46,50 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Tuesday, January 6, 2004 6:36:18 pm. <hr class="toolbar" noshade="noshade" /> Index: TemplateMethod.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/TemplateMethod.html,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** TemplateMethod.html 9 Nov 2003 17:07:07 -0000 1.6 --- TemplateMethod.html 26 Jan 2004 01:02:09 -0000 1.7 *************** *** 6,10 **** <p><b>Template Method ! <p><i><a href="TagScanner.html" class="wiki">TagScanner</a> uses a template method to create a scanned node - it calls a matching tag scanner to do its job and produce a scanned node in a series of steps. <pre> --- 6,10 ---- <p><b>Template Method ! <p><i><span class="wikiunknown"><u>TagScanner uses a template method to create a scanned node - it calls a matching tag scanner to do its job and produce a scanned node in a series of steps. <pre> Index: WritingYourOwnScanners.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/WritingYourOwnScanners.html,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** WritingYourOwnScanners.html 9 Nov 2003 17:07:07 -0000 1.7 --- WritingYourOwnScanners.html 26 Jan 2004 01:02:09 -0000 1.8 *************** *** 5,14 **** <div class="wikitext"> <p><b>Writing Your Own Scanners ! ! <p>There are two types of scanners, depending on the type of tags that you wish to parse: <ul> ! <li><a href="TagScanner.html" class="wiki">TagScanner</a> - for parsing tags that have no child elements <li>CompositeTagScanner - for parsing tags with children --- 5,14 ---- <div class="wikitext"> <p><b>Writing Your Own Scanners ! <b>Warning: this is out of date and needs to be completely rewritten ! There are two types of scanners, depending on the type of tags that you wish to parse: <ul> ! <li>TagScanner - for parsing tags that have no child elements <li>CompositeTagScanner - for parsing tags with children *************** *** 29,33 **** <br /> <br /> ! 3. If a match was found, call the scan() method. For both <a href="TagScanner.html" class="wiki">TagScanner</a> and CompositeTagScanner, overriding this method is optional, and NOT recommended for standard cases. The default scan() methods will make a call to createTag. <br /> <br /> --- 29,33 ---- <br /> <br /> ! 3. If a match was found, call the scan() method. For both <span class="wikiunknown"><u>TagScanner and CompositeTagScanner, overriding this method is optional, and NOT recommended for standard cases. The default scan() methods will make a call to createTag. <br /> <br /> *************** *** 109,113 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Thursday, May 1, 2003 6:54:01 pm. <hr class="toolbar" noshade="noshade" /> --- 109,113 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Thursday, January 8, 2004 4:13:18 am. <hr class="toolbar" noshade="noshade" /> Index: index.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/index.html,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** index.html 2 Jan 2004 16:24:52 -0000 1.11 --- index.html 26 Jan 2004 01:02:09 -0000 1.12 *************** *** 6,9 **** --- 6,11 ---- <p><b>HTMLParser documentation + <p><a href="http://htmlparser.sourceforge.net/wiki/" class="namedurl"><span style="white-space: nowrap">This</span> page has moved to http://htmlparser.sourceforge.net/wiki</a> + <p>Welcome to the HTMLParser documentation page. You may visit *************** *** 13,17 **** <li><a href="SamplePrograms.html" class="wiki">SamplePrograms</a> - A quick tutorial on getting started with the parser ! <li><a href="WritingYourOwnScanners.html" class="wiki">WritingYourOwnScanners</a> - Learn how to write your own scanners to extend the capability of the parser <li><a href="SearchingForData.html" class="wiki">SearchingForData</a> - Learn how to perform powerful searches in html pages --- 15,19 ---- <li><a href="SamplePrograms.html" class="wiki">SamplePrograms</a> - A quick tutorial on getting started with the parser ! <li><a href="WritingYourOwnScanners.html" class="wiki">WritingYourOwnScanners</a> - ignore this, this is old <li><a href="SearchingForData.html" class="wiki">SearchingForData</a> - Learn how to perform powerful searches in html pages *************** *** 29,43 **** <li><a href="TestDrivenDevelopment.html" class="wiki">TestDrivenDevelopment</a> ! <li><a href="ParsingXml.html" class="wiki">ParsingXml</a> ! ! <li><a href="UnitTestingXsl.html" class="wiki">UnitTestingXsl</a> ! ! <li><a href="UnitTestingPdf.html" class="wiki">UnitTestingPdf</a> ! <li><a href="http://htmlparser.sourceforge.net/javadoc/" class="namedurl"><span style="white-space: nowrap">Javadocs</span> for Version 1.2</a> <li><a CLASS="namedurl" HREF="../javadoc/index.html"><span STYLE="white-space: nowrap">Javadocs</span></a> ! <li><a href="Benchmarks.html" class="named-wiki" title="Benchmarks">Benchmarks vs. JTidy</a> --- 31,41 ---- <li><a href="TestDrivenDevelopment.html" class="wiki">TestDrivenDevelopment</a> ! <li><a href="Benchmarks.html" class="named-wiki" title="Benchmarks">Benchmarks vs. JTidy</a> ! <li><a href="http://htmlparser.sourceforge.net/javadoc/" class="namedurl"><span style="white-space: nowrap">Javadocs</span></a> <li><a CLASS="namedurl" HREF="../javadoc/index.html"><span STYLE="white-space: nowrap">Javadocs</span></a> ! <li><a href="http://htmlparser.sourceforge.net/javadoc_1_2/" class="namedurl"><span style="white-space: nowrap">Javadocs</span> for Version 1.2</a> *************** *** 48,52 **** <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Tuesday, November 25, 2003 4:50:49 am. <hr class="toolbar" noshade="noshade" /> --- 46,50 ---- <hr class="printer" noshade="noshade" /> ! <p class="editdate">Last edited on Thursday, January 8, 2004 4:14:03 am. <hr class="toolbar" noshade="noshade" /> |