[Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications LinkExtractor.java,1.47,1.48 MailR
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-12-07 23:42:13
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications In directory sc8-pr-cvs1:/tmp/cvs-serv16537/parserapplications Modified Files: LinkExtractor.java MailRipper.java Robot.java Log Message: Remove most of the scanners. The only scanners left are ones that really do something different (script and jsp). Instead of registering a scanner to enable returning a specific tag you now add a tag to a PrototypicalNodeFactory. All known tags are 'registered' by default in a new Parser which is similar to having called the old 'registerDOMScanners()', so tags are fully nested. This is different behaviour, and specifically, you will need to recurse into returned nodes to get at what you want. I've tried to adjust the applications accordingly, but worked examples are still scarce. If you want to return only some of the derived tags while keeping most as generic tags, there are various constructors and manipulators on the factory. See the javadocs and examples in the tests package. Nearly all the old scanner tests are folded into the tag tests. toString() has been revamped. This means that the default Parser mainline now returns an indented listing of tags, making it easy to see the structure of a page. The downside is the text of the page had to have newlines, tabs etc. turned into escape sequences. But if you were really interested in content you would be using toHtml() or toPlainTextString(). 
Index: LinkExtractor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/LinkExtractor.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** LinkExtractor.java 9 Nov 2003 17:07:09 -0000 1.47 --- LinkExtractor.java 7 Dec 2003 23:41:40 -0000 1.48 *************** *** 45,49 **** try { this.parser = new Parser(location); // Create the parser object - parser.registerScanners(); // Register standard scanners (Very Important) } catch (ParserException e) { --- 45,48 ---- Index: MailRipper.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/MailRipper.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** MailRipper.java 9 Nov 2003 17:07:09 -0000 1.48 --- MailRipper.java 7 Dec 2003 23:41:40 -0000 1.49 *************** *** 33,40 **** --- 33,44 ---- import org.htmlparser.Node; + import org.htmlparser.NodeFilter; import org.htmlparser.Parser; + import org.htmlparser.filters.AndFilter; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.DefaultParserFeedback; import org.htmlparser.util.NodeIterator; + import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; *************** *** 44,113 **** * Pass a web site (or html file on your local disk) as an argument. */ ! public class MailRipper { ! private org.htmlparser.Parser parser; /** * MailRipper c'tor takes the url to be ripped * @param resourceLocation url to be ripped */ ! public MailRipper(String resourceLocation) { ! try { ! parser = new Parser(resourceLocation,new DefaultParserFeedback()); ! parser.registerScanners(); } ! catch (ParserException e) { ! System.err.println("Could not create parser object"); ! 
e.printStackTrace(); } } - public static void main(String[] args) { - System.out.println("Mail Ripper v" + Parser.getVersion ()); - if (args.length<1 || args[0].equals("-help")) - { - System.out.println(); - System.out.println("Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.MailRipper <resourceLocn/website>"); - System.out.println(); - System.out.println(" <resourceLocn> the name of the file to be parsed (with complete path "); - System.out.println(" if not in current directory)"); - System.out.println(" -help This screen"); - System.out.println(); - System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net"); - System.out.println(); - System.out.println("Example : java -classpath htmlparser.jar com.kizna.parserapplications.MailRipper http://htmlparser.sourceforge.net"); - System.out.println(); - System.out.println("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. "); - System.exit(-1); - } - String resourceLocation = "http://htmlparser.sourceforge.net"; - if (args.length!=0) resourceLocation = args[0]; ! MailRipper ripper = new MailRipper(resourceLocation); ! System.out.println("Ripping Site "+resourceLocation); ! try { ! for (Enumeration e=ripper.rip();e.hasMoreElements();) { ! LinkTag tag = (LinkTag)e.nextElement(); ! System.out.println("Ripped mail address : "+tag.getLink()); ! } ! } ! catch (ParserException e) { ! e.printStackTrace(); ! } } /** * Rip all mail addresses from the given url, and return an enumeration of such mail addresses. ! * @return Enumeration of mail addresses (a vector of LinkTag) */ ! public Enumeration rip() throws ParserException { ! Node node; ! Vector mailAddresses = new Vector(); ! for (NodeIterator e = parser.elements();e.hasMoreNodes();) ! { ! node = e.nextNode(); ! if (node instanceof LinkTag) ! { ! 
LinkTag linkTag = (LinkTag)node; ! if (linkTag.isMailLink()) mailAddresses.addElement(linkTag); ! } ! } ! return mailAddresses.elements(); } } --- 48,134 ---- * Pass a web site (or html file on your local disk) as an argument. */ ! public class MailRipper ! { ! private Parser parser; ! /** * MailRipper c'tor takes the url to be ripped * @param resourceLocation url to be ripped */ ! public MailRipper (String resourceLocation) ! { ! try ! { ! parser = new Parser (resourceLocation,new DefaultParserFeedback ()); } ! catch (ParserException e) ! { ! System.err.println ("Could not create parser object"); ! e.printStackTrace (); } } ! public static void main (String[] args) ! { ! System.out.println ("Mail Ripper v" + Parser.getVersion ()); ! if (args.length<1 || args[0].equals ("-help")) ! { ! System.out.println (); ! System.out.println ("Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.MailRipper <resourceLocn/website>"); ! System.out.println (); ! System.out.println (" <resourceLocn> the name of the file to be parsed (with complete path "); ! System.out.println (" if not in current directory)"); ! System.out.println (" -help This screen"); ! System.out.println (); ! System.out.println ("HTML Parser home page : http://htmlparser.sourceforge.net"); ! System.out.println (); ! System.out.println ("Example : java -classpath htmlparser.jar com.kizna.parserapplications.MailRipper http://htmlparser.sourceforge.net"); ! System.out.println (); ! System.out.println ("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. "); ! System.exit (-1); ! } ! String resourceLocation = "http://htmlparser.sourceforge.net"; ! if (args.length!=0) resourceLocation = args[0]; ! ! MailRipper ripper = new MailRipper (resourceLocation); ! System.out.println ("Ripping Site "+resourceLocation); ! try ! { ! 
NodeList list; ! ! list = ripper.rip (); ! for (NodeIterator iterator = list.elements (); iterator.hasMoreNodes (); ) ! { ! LinkTag mail = (LinkTag)iterator.nextNode (); ! System.out.println (mail.getLink ()); ! } ! } ! catch (ParserException e) ! { ! e.printStackTrace (); ! } } + /** * Rip all mail addresses from the given url, and return an enumeration of such mail addresses. ! * @return A node list of mail addresses (LinkTag type). */ ! public NodeList rip() throws ParserException ! { ! NodeList ret; ! ! ret = parser.extractAllNodesThatMatch ( ! new AndFilter ( ! new NodeClassFilter (LinkTag.class), ! new NodeFilter () ! { ! public boolean accept (Node node) ! { ! return (((LinkTag)node).isMailLink ()); ! } ! } ! )); ! ! return (ret); } } Index: Robot.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/Robot.java,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** Robot.java 9 Nov 2003 17:07:09 -0000 1.50 --- Robot.java 7 Dec 2003 23:41:40 -0000 1.51 *************** *** 46,50 **** try { parser = new Parser(resourceLocation,new DefaultParserFeedback()); - parser.registerScanners(); } catch (ParserException e) { --- 46,49 ---- *************** *** 89,93 **** { Parser newParser = new Parser(linkTag.getLink(),new DefaultParserFeedback()); - newParser.registerScanners(); System.out.print("Crawling to "+linkTag.getLink()); crawl(newParser,crawlDepth-1); --- 88,91 ---- |