[Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications MailRipper.java,1.42,1.43 Robot.java,1.44,1.45
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-09-03 23:36:52
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications In directory sc8-pr-cvs1:/tmp/cvs-serv31228/parserapplications Modified Files: MailRipper.java Robot.java Log Message: Change tabs to spaces in all source files. Index: MailRipper.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/MailRipper.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** MailRipper.java 24 Aug 2003 21:59:42 -0000 1.42 --- MailRipper.java 3 Sep 2003 23:36:19 -0000 1.43 *************** *** 46,113 **** public class MailRipper { private org.htmlparser.Parser parser; ! /** ! * MailRipper c'tor takes the url to be ripped ! * @param resourceLocation url to be ripped ! */ ! public MailRipper(String resourceLocation) { ! try { ! parser = new Parser(resourceLocation,new DefaultParserFeedback()); ! parser.registerScanners(); ! } ! catch (ParserException e) { ! System.err.println("Could not create parser object"); ! e.printStackTrace(); ! } ! } ! public static void main(String[] args) { ! System.out.println("Mail Ripper v" + Parser.getVersion ()); ! if (args.length<1 || args[0].equals("-help")) ! { ! System.out.println(); ! System.out.println("Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.MailRipper <resourceLocn/website>"); ! System.out.println(); ! System.out.println(" <resourceLocn> the name of the file to be parsed (with complete path "); ! System.out.println(" if not in current directory)"); ! System.out.println(" -help This screen"); ! System.out.println(); ! System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net"); ! System.out.println(); ! System.out.println("Example : java -classpath htmlparser.jar com.kizna.parserapplications.MailRipper http://htmlparser.sourceforge.net"); ! System.out.println(); ! 
System.out.println("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. "); ! System.exit(-1); ! } ! String resourceLocation = "http://htmlparser.sourceforge.net"; ! if (args.length!=0) resourceLocation = args[0]; ! ! MailRipper ripper = new MailRipper(resourceLocation); ! System.out.println("Ripping Site "+resourceLocation); ! try { ! for (Enumeration e=ripper.rip();e.hasMoreElements();) { ! LinkTag tag = (LinkTag)e.nextElement(); ! System.out.println("Ripped mail address : "+tag.getLink()); ! } ! } ! catch (ParserException e) { ! e.printStackTrace(); ! } ! } ! /** ! * Rip all mail addresses from the given url, and return an enumeration of such mail addresses. ! * @return Enumeration of mail addresses (a vector of LinkTag) ! */ ! public Enumeration rip() throws ParserException { ! Node node; ! Vector mailAddresses = new Vector(); ! for (NodeIterator e = parser.elements();e.hasMoreNodes();) ! { ! node = e.nextNode(); ! if (node instanceof LinkTag) ! { ! LinkTag linkTag = (LinkTag)node; ! if (linkTag.isMailLink()) mailAddresses.addElement(linkTag); ! } ! } ! return mailAddresses.elements(); ! } } --- 46,113 ---- public class MailRipper { private org.htmlparser.Parser parser; ! /** ! * MailRipper c'tor takes the url to be ripped ! * @param resourceLocation url to be ripped ! */ ! public MailRipper(String resourceLocation) { ! try { ! parser = new Parser(resourceLocation,new DefaultParserFeedback()); ! parser.registerScanners(); ! } ! catch (ParserException e) { ! System.err.println("Could not create parser object"); ! e.printStackTrace(); ! } ! } ! public static void main(String[] args) { ! System.out.println("Mail Ripper v" + Parser.getVersion ()); ! if (args.length<1 || args[0].equals("-help")) ! { ! System.out.println(); ! 
System.out.println("Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.MailRipper <resourceLocn/website>"); ! System.out.println(); ! System.out.println(" <resourceLocn> the name of the file to be parsed (with complete path "); ! System.out.println(" if not in current directory)"); ! System.out.println(" -help This screen"); ! System.out.println(); ! System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net"); ! System.out.println(); ! System.out.println("Example : java -classpath htmlparser.jar com.kizna.parserapplications.MailRipper http://htmlparser.sourceforge.net"); ! System.out.println(); ! System.out.println("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. "); ! System.exit(-1); ! } ! String resourceLocation = "http://htmlparser.sourceforge.net"; ! if (args.length!=0) resourceLocation = args[0]; ! ! MailRipper ripper = new MailRipper(resourceLocation); ! System.out.println("Ripping Site "+resourceLocation); ! try { ! for (Enumeration e=ripper.rip();e.hasMoreElements();) { ! LinkTag tag = (LinkTag)e.nextElement(); ! System.out.println("Ripped mail address : "+tag.getLink()); ! } ! } ! catch (ParserException e) { ! e.printStackTrace(); ! } ! } ! /** ! * Rip all mail addresses from the given url, and return an enumeration of such mail addresses. ! * @return Enumeration of mail addresses (a vector of LinkTag) ! */ ! public Enumeration rip() throws ParserException { ! Node node; ! Vector mailAddresses = new Vector(); ! for (NodeIterator e = parser.elements();e.hasMoreNodes();) ! { ! node = e.nextNode(); ! if (node instanceof LinkTag) ! { ! LinkTag linkTag = (LinkTag)node; ! if (linkTag.isMailLink()) mailAddresses.addElement(linkTag); ! } ! } ! return mailAddresses.elements(); ! 
} } Index: Robot.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/Robot.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** Robot.java 24 Aug 2003 21:59:42 -0000 1.44 --- Robot.java 3 Sep 2003 23:36:19 -0000 1.45 *************** *** 40,138 **** public class Robot { private org.htmlparser.Parser parser; ! /** ! * Robot crawler - Provide the starting url ! */ ! public Robot(String resourceLocation) { ! try { ! parser = new Parser(resourceLocation,new DefaultParserFeedback()); ! parser.registerScanners(); ! } ! catch (ParserException e) { ! System.err.println("Error, could not create parser object"); ! e.printStackTrace(); ! } ! } ! /** ! * Crawl using a given crawl depth. ! * @param crawlDepth Depth of crawling ! */ ! public void crawl(int crawlDepth) throws ParserException ! { ! try { ! crawl(parser,crawlDepth); ! } ! catch (ParserException e) { ! throw new ParserException("HTMLParserException at crawl("+crawlDepth+")",e); ! } ! } ! /** ! * Crawl using a given parser object, and a given crawl depth. ! * @param parser Parser object ! * @param crawlDepth Depth of crawling ! */ ! public void crawl(Parser parser,int crawlDepth) throws ParserException { ! System.out.println(" crawlDepth = "+crawlDepth); ! for (NodeIterator e = parser.elements();e.hasMoreNodes();) ! { ! Node node = e.nextNode(); ! if (node instanceof LinkTag) ! { ! LinkTag linkTag = (LinkTag)node; ! { ! if (!linkTag.isMailLink()) ! { ! if (linkTag.getLink().toUpperCase().indexOf("HTM")!=-1 || ! linkTag.getLink().toUpperCase().indexOf("COM")!=-1 || ! linkTag.getLink().toUpperCase().indexOf("ORG")!=-1) ! { ! if (crawlDepth>0) ! { ! Parser newParser = new Parser(linkTag.getLink(),new DefaultParserFeedback()); ! newParser.registerScanners(); ! System.out.print("Crawling to "+linkTag.getLink()); ! crawl(newParser,crawlDepth-1); ! } ! 
else System.out.println(linkTag.getLink()); ! } ! } ! } ! } ! } ! } ! public static void main(String[] args) ! { ! System.out.println("Robot Crawler v" + Parser.getVersion ()); ! if (args.length<2 || args[0].equals("-help")) ! { ! System.out.println(); ! System.out.println("Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.Robot <resourceLocn/website> <depth>"); ! System.out.println(); ! System.out.println(" <resourceLocn> the name of the file to be parsed (with complete path "); ! System.out.println(" if not in current directory)"); ! System.out.println(" <depth> No of links to be followed from each link"); ! System.out.println(" -help This screen"); ! System.out.println(); ! System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net"); ! System.out.println(); ! System.out.println("Example : java -classpath htmlparser.jar com.kizna.parserapplications.Robot http://www.google.com 3"); ! System.out.println(); ! System.out.println("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. "); ! System.exit(-1); ! } ! String resourceLocation=""; ! int crawlDepth = 1; ! if (args.length!=0) resourceLocation = args[0]; ! if (args.length==2) crawlDepth=Integer.valueOf(args[1]).intValue(); ! ! ! Robot robot = new Robot(resourceLocation); ! System.out.println("Crawling Site "+resourceLocation); ! try { ! robot.crawl(crawlDepth); ! } ! catch (ParserException e) { ! e.printStackTrace(); ! } ! } } --- 40,138 ---- public class Robot { private org.htmlparser.Parser parser; ! /** ! * Robot crawler - Provide the starting url ! */ ! public Robot(String resourceLocation) { ! try { ! parser = new Parser(resourceLocation,new DefaultParserFeedback()); ! parser.registerScanners(); ! } ! catch (ParserException e) { ! System.err.println("Error, could not create parser object"); ! 
e.printStackTrace(); ! } ! } ! /** ! * Crawl using a given crawl depth. ! * @param crawlDepth Depth of crawling ! */ ! public void crawl(int crawlDepth) throws ParserException ! { ! try { ! crawl(parser,crawlDepth); ! } ! catch (ParserException e) { ! throw new ParserException("HTMLParserException at crawl("+crawlDepth+")",e); ! } ! } ! /** ! * Crawl using a given parser object, and a given crawl depth. ! * @param parser Parser object ! * @param crawlDepth Depth of crawling ! */ ! public void crawl(Parser parser,int crawlDepth) throws ParserException { ! System.out.println(" crawlDepth = "+crawlDepth); ! for (NodeIterator e = parser.elements();e.hasMoreNodes();) ! { ! Node node = e.nextNode(); ! if (node instanceof LinkTag) ! { ! LinkTag linkTag = (LinkTag)node; ! { ! if (!linkTag.isMailLink()) ! { ! if (linkTag.getLink().toUpperCase().indexOf("HTM")!=-1 || ! linkTag.getLink().toUpperCase().indexOf("COM")!=-1 || ! linkTag.getLink().toUpperCase().indexOf("ORG")!=-1) ! { ! if (crawlDepth>0) ! { ! Parser newParser = new Parser(linkTag.getLink(),new DefaultParserFeedback()); ! newParser.registerScanners(); ! System.out.print("Crawling to "+linkTag.getLink()); ! crawl(newParser,crawlDepth-1); ! } ! else System.out.println(linkTag.getLink()); ! } ! } ! } ! } ! } ! } ! public static void main(String[] args) ! { ! System.out.println("Robot Crawler v" + Parser.getVersion ()); ! if (args.length<2 || args[0].equals("-help")) ! { ! System.out.println(); ! System.out.println("Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.Robot <resourceLocn/website> <depth>"); ! System.out.println(); ! System.out.println(" <resourceLocn> the name of the file to be parsed (with complete path "); ! System.out.println(" if not in current directory)"); ! System.out.println(" <depth> No of links to be followed from each link"); ! System.out.println(" -help This screen"); ! System.out.println(); ! 
System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net"); ! System.out.println(); ! System.out.println("Example : java -classpath htmlparser.jar com.kizna.parserapplications.Robot http://www.google.com 3"); ! System.out.println(); ! System.out.println("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. "); ! System.exit(-1); ! } ! String resourceLocation=""; ! int crawlDepth = 1; ! if (args.length!=0) resourceLocation = args[0]; ! if (args.length==2) crawlDepth=Integer.valueOf(args[1]).intValue(); ! ! ! Robot robot = new Robot(resourceLocation); ! System.out.println("Crawling Site "+resourceLocation); ! try { ! robot.crawl(crawlDepth); ! } ! catch (ParserException e) { ! e.printStackTrace(); ! } ! } } |