htmlparser-cvs Mailing List for HTML Parser (Page 32)
Brought to you by:
derrickoswald
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(141) |
Jun
(108) |
Jul
(66) |
Aug
(127) |
Sep
(155) |
Oct
(149) |
Nov
(72) |
Dec
(72) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(100) |
Feb
(36) |
Mar
(21) |
Apr
(3) |
May
(87) |
Jun
(28) |
Jul
(84) |
Aug
(5) |
Sep
(14) |
Oct
|
Nov
|
Dec
|
2005 |
Jan
(1) |
Feb
(39) |
Mar
(26) |
Apr
(38) |
May
(14) |
Jun
(10) |
Jul
|
Aug
|
Sep
(13) |
Oct
(8) |
Nov
(10) |
Dec
|
2006 |
Jan
|
Feb
(1) |
Mar
(17) |
Apr
(20) |
May
(28) |
Jun
(24) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2015 |
Jan
|
Feb
|
Mar
(1) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: <der...@us...> - 2003-10-29 03:31:30
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv12839/src/org/htmlparser/tests/scannersTests Modified Files: BaseHREFScannerTest.java ImageScannerTest.java LinkScannerTest.java Log Message: Move LinkProcess out of scanners and into Page, untangling A, IMG and BASE scanners. Move form action determination to tag. The scanners have no special actions on behalf of tags anymore. Index: BaseHREFScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/BaseHREFScannerTest.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** BaseHREFScannerTest.java 27 Oct 2003 02:18:05 -0000 1.30 --- BaseHREFScannerTest.java 29 Oct 2003 03:31:18 -0000 1.31 *************** *** 65,72 **** public void testScan() throws ParserException{ createParser("<html><head><TITLE>test page</TITLE><BASE HREF=\"http://www.abc.com/\"><a href=\"home.cfm\">Home</a>...</html>","http://www.google.com/test/index.html"); ! LinkScanner linkScanner = new LinkScanner("-l"); ! parser.addScanner(linkScanner); parser.addScanner(new TitleScanner("-t")); ! parser.addScanner(linkScanner.createBaseHREFScanner("-b")); parseAndAssertNodeCount(7); //Base href tag should be the 4th tag --- 65,71 ---- public void testScan() throws ParserException{ createParser("<html><head><TITLE>test page</TITLE><BASE HREF=\"http://www.abc.com/\"><a href=\"home.cfm\">Home</a>...</html>","http://www.google.com/test/index.html"); ! parser.addScanner(new LinkScanner("-l")); parser.addScanner(new TitleScanner("-t")); ! 
parser.addScanner(new BaseHrefScanner("-b")); parseAndAssertNodeCount(7); //Base href tag should be the 4th tag Index: ImageScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ImageScannerTest.java,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** ImageScannerTest.java 27 Oct 2003 02:18:05 -0000 1.37 --- ImageScannerTest.java 29 Oct 2003 03:31:18 -0000 1.38 *************** *** 59,63 **** createParser("<IMG SRC=\"../abc/def/mypic.jpg\">","http://www.yahoo.com/ghi?abcdefg"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); --- 59,63 ---- createParser("<IMG SRC=\"../abc/def/mypic.jpg\">","http://www.yahoo.com/ghi?abcdefg"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); *************** *** 68,72 **** public void testEvaluate() { ! ImageScanner scanner = new ImageScanner("-i",new LinkProcessor()); Tag tag = new Tag (); tag.setTagName ("img"); --- 68,72 ---- public void testEvaluate() { ! ImageScanner scanner = new ImageScanner("-i"); Tag tag = new Tag (); tag.setTagName ("img"); *************** *** 83,87 **** createParser ("<img width=638 height=53 border=0 usemap=\"#m\" src=" + locn + " alt=Yahoo>"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); --- 83,87 ---- createParser ("<img width=638 height=53 border=0 usemap=\"#m\" src=" + locn + " alt=Yahoo>"); // Register the image scanner ! 
parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); *************** *** 99,103 **** // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); --- 99,103 ---- // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); *************** *** 113,117 **** // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); --- 113,117 ---- // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); *************** *** 122,126 **** public void testRelativeImageScan2() throws ParserException { createParser("<IMG SRC=\"abc/def/mypic.jpg\">","http://www.yahoo.com"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); --- 122,126 ---- public void testRelativeImageScan2() throws ParserException { createParser("<IMG SRC=\"abc/def/mypic.jpg\">","http://www.yahoo.com"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); *************** *** 132,136 **** createParser("<IMG SRC=\"../abc/def/mypic.jpg\">","http://www.yahoo.com/ghi"); // Register the image scanner ! 
parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); --- 132,136 ---- createParser("<IMG SRC=\"../abc/def/mypic.jpg\">","http://www.yahoo.com/ghi"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); *************** *** 147,151 **** createParser("<IMG SRC=\"../abc/def/Hello World.jpg\">","http://www.yahoo.com/ghi"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); --- 147,151 ---- createParser("<IMG SRC=\"../abc/def/Hello World.jpg\">","http://www.yahoo.com/ghi"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); *************** *** 159,163 **** Parser.setLineSeparator("\r\n"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); --- 159,163 ---- Parser.setLineSeparator("\r\n"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); assertTrue("Node identified should be HTMLImageTag",node[0] instanceof ImageTag); *************** *** 177,181 **** Node [] node = new AbstractNode[10]; // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); int i = 0; Node thisNode; --- 177,181 ---- Node [] node = new AbstractNode[10]; // Register the image scanner ! 
parser.addScanner(new ImageScanner("-i")); int i = 0; Node thisNode; Index: LinkScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/LinkScannerTest.java,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** LinkScannerTest.java 27 Oct 2003 02:18:05 -0000 1.46 --- LinkScannerTest.java 29 Oct 2003 03:31:18 -0000 1.47 *************** *** 36,39 **** --- 36,40 ---- import org.htmlparser.StringNode; import org.htmlparser.lexer.nodes.Attribute; + import org.htmlparser.scanners.ImageScanner; import org.htmlparser.scanners.LinkScanner; import org.htmlparser.tags.ImageTag; *************** *** 269,275 **** createParser("<A HREF=\"mytest.html\"><IMG SRC=\"abcd.jpg\">Hello World</A>","http://www.yahoo.com"); // Register the image scanner ! LinkScanner linkScanner = new LinkScanner("-l"); ! parser.addScanner(linkScanner); ! parser.addScanner(linkScanner.createImageScanner("-i")); parseAndAssertNodeCount(1); --- 270,275 ---- createParser("<A HREF=\"mytest.html\"><IMG SRC=\"abcd.jpg\">Hello World</A>","http://www.yahoo.com"); // Register the image scanner ! parser.addScanner(new LinkScanner("-l")); ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); *************** *** 346,352 **** createParser("<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>","http://transfer.go.com"); // Register the image scanner ! LinkScanner linkScanner = new LinkScanner("-l"); ! parser.addScanner(linkScanner); ! 
parser.addScanner(linkScanner.createImageScanner("-i")); parseAndAssertNodeCount(1); --- 346,351 ---- createParser("<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>","http://transfer.go.com"); // Register the image scanner ! parser.addScanner(new LinkScanner("-l")); ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); |
From: <der...@us...> - 2003-10-29 03:31:29
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv12839/src/org/htmlparser Modified Files: Parser.java Log Message: Move LinkProcess out of scanners and into Page, untangling A, IMG and BASE scanners. Move form action determination to tag. The scanners have no special actions on behalf of tags anymore. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.71 retrieving revision 1.72 diff -C2 -d -r1.71 -r1.72 *** Parser.java 28 Oct 2003 12:54:21 -0000 1.71 --- Parser.java 29 Oct 2003 03:31:17 -0000 1.72 *************** *** 49,52 **** --- 49,53 ---- import org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; import org.htmlparser.scanners.AppletScanner; + import org.htmlparser.scanners.BaseHrefScanner; import org.htmlparser.scanners.BodyScanner; import org.htmlparser.scanners.BulletListScanner; *************** *** 57,60 **** --- 58,62 ---- import org.htmlparser.scanners.HeadScanner; import org.htmlparser.scanners.HtmlScanner; + import org.htmlparser.scanners.ImageScanner; import org.htmlparser.scanners.JspScanner; import org.htmlparser.scanners.LinkScanner; *************** *** 686,700 **** * This method should be invoked in order to register some common scanners. The scanners that get added are : <br> * LinkScanner (filter key "-l")<br> ! * HTMLImageScanner (filter key "-i")<br> ! * HTMLScriptScanner (filter key "-s") <br> ! * HTMLStyleScanner (filter key "-t") <br> ! * HTMLJspScanner (filter key "-j") <br> ! * HTMLAppletScanner (filter key "-a") <br> ! * HTMLMetaTagScanner (filter key "-m") <br> ! * HTMLTitleScanner (filter key "-t") <br> ! * HTMLDoctypeScanner (filter key "-d") <br> ! * HTMLFormScanner (filter key "-f") <br> ! * HTMLFrameSetScanner(filter key "-r") <br> ! * HTMLBaseHREFScanner(filter key "-b") <br> * <br> * Call this method after creating the Parser object. e.g. 
<BR> --- 688,704 ---- * This method should be invoked in order to register some common scanners. The scanners that get added are : <br> * LinkScanner (filter key "-l")<br> ! * ImageScanner (filter key "-i")<br> ! * ScriptScanner (filter key "-s") <br> ! * StyleScanner (filter key "-t") <br> ! * JspScanner (filter key "-j") <br> ! * AppletScanner (filter key "-a") <br> ! * MetaTagScanner (filter key "-m") <br> ! * TitleScanner (filter key "-t") <br> ! * DoctypeScanner (filter key "-d") <br> ! * FormScanner (filter key "-f") <br> ! * FrameSetScanner(filter key "-r") <br> ! * BulletListScanner(filter key "-bulletList") <br> ! * DivScanner(filter key "-div") <br> ! * TableScanner(filter key "") <br> * <br> * Call this method after creating the Parser object. e.g. <BR> *************** *** 710,719 **** return; } ! LinkScanner linkScanner = new LinkScanner(LinkTag.LINK_TAG_FILTER); ! // Note - The BaseHREF and Image scanners share the same ! // link processor - internally linked up with the factory ! // method in the link scanner class ! addScanner(linkScanner); ! addScanner(linkScanner.createImageScanner(ImageTag.IMAGE_TAG_FILTER)); addScanner(new ScriptScanner("-s")); addScanner(new StyleScanner("-t")); --- 714,719 ---- return; } ! addScanner(new LinkScanner(LinkTag.LINK_TAG_FILTER)); ! addScanner(new ImageScanner(ImageTag.IMAGE_TAG_FILTER)); addScanner(new ScriptScanner("-s")); addScanner(new StyleScanner("-t")); *************** *** 725,729 **** addScanner(new FormScanner("-f",this)); addScanner(new FrameSetScanner("-r")); ! addScanner(linkScanner.createBaseHREFScanner("-b")); addScanner(new BulletListScanner("-bulletList",this)); // addScanner(new SpanScanner("-p")); --- 725,729 ---- addScanner(new FormScanner("-f",this)); addScanner(new FrameSetScanner("-r")); ! addScanner(new BaseHrefScanner("-b")); addScanner(new BulletListScanner("-bulletList",this)); // addScanner(new SpanScanner("-p")); |
From: <der...@us...> - 2003-10-29 03:31:29
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests
In directory sc8-pr-cvs1:/tmp/cvs-serv12839/src/org/htmlparser/tests

Modified Files:
    FunctionalTests.java
Log Message:
Move LinkProcess out of scanners and into Page, untangling A, IMG and BASE scanners. Move form action determination to tag. The scanners have no special actions on behalf of tags anymore.

Index: FunctionalTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/FunctionalTests.java,v
retrieving revision 1.47
retrieving revision 1.48
diff -C2 -d -r1.47 -r1.48
*** FunctionalTests.java 26 Oct 2003 19:46:25 -0000 1.47
--- FunctionalTests.java 29 Oct 2003 03:31:18 -0000 1.48
***************
*** 99,103 ****
      public int countImageTagsWithHTMLParser() throws ParserException {
          Parser parser = new Parser("http://education.yahoo.com/",new DefaultParserFeedback());
!         parser.addScanner(new ImageScanner("-i",new LinkProcessor()));
          int parserImgTagCount = 0;
          Node node;
--- 99,103 ----
      public int countImageTagsWithHTMLParser() throws ParserException {
          Parser parser = new Parser("http://education.yahoo.com/",new DefaultParserFeedback());
!         parser.addScanner(new ImageScanner("-i"));
          int parserImgTagCount = 0;
          Node node;
|
From: <der...@us...> - 2003-10-29 03:31:29
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv12839/src/org/htmlparser/lexer Modified Files: Page.java Log Message: Move LinkProcess out of scanners and into Page, untangling A, IMG and BASE scanners. Move form action determination to tag. The scanners have no special actions on behalf of tags anymore. Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** Page.java 27 Oct 2003 02:18:04 -0000 1.22 --- Page.java 29 Oct 2003 03:31:17 -0000 1.23 *************** *** 40,43 **** --- 40,44 ---- import org.htmlparser.util.*; + import org.htmlparser.util.LinkProcessor; /** *************** *** 81,84 **** --- 82,91 ---- /** + * The processor of relative links on this page. + * Holds any overridden base HREF. + */ + protected LinkProcessor mProcessor; + + /** * Messages for page not there (404). */ *************** *** 121,124 **** --- 128,132 ---- throw new IllegalArgumentException ("connection cannot be null"); setConnection (connection); + mProcessor = null; } *************** *** 142,145 **** --- 150,154 ---- mConnection = null; mUrl = null; + mProcessor = null; } *************** *** 163,166 **** --- 172,176 ---- mConnection = null; mUrl = null; + mProcessor = null; } *************** *** 660,663 **** --- 670,694 ---- throw new ParserException (ioe.getMessage (), ioe); } + } + + /** + * Get the link processor associated with this page. + * @return The link processor that has the base HREF. + */ + public LinkProcessor getLinkProcessor () + { + if (null == mProcessor) + mProcessor = new LinkProcessor (); + + return (mProcessor); + } + + /** + * Set the link processor associated with this page. + * @param processor The new link processor for this page. 
+ */ + public void setLinkProcessor (LinkProcessor processor) + { + mProcessor = processor; } |
From: <der...@us...> - 2003-10-29 03:31:28
|
Update of /cvsroot/htmlparser/htmlparser
In directory sc8-pr-cvs1:/tmp/cvs-serv12839

Modified Files:
    build.xml
Log Message:
Move LinkProcess out of scanners and into Page, untangling A, IMG and BASE scanners. Move form action determination to tag. The scanners have no special actions on behalf of tags anymore.

Index: build.xml
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/build.xml,v
retrieving revision 1.51
retrieving revision 1.52
diff -C2 -d -r1.51 -r1.52
*** build.xml 26 Oct 2003 19:46:16 -0000 1.51
--- build.xml 29 Oct 2003 03:31:17 -0000 1.52
***************
*** 268,271 ****
--- 268,272 ----
          <include name="org/htmlparser/util/SimpleNodeIterator.class"/>
          <include name="org/htmlparser/util/SpecialHashtable.class"/>
+         <include name="org/htmlparser/util/LinkProcessor.class"/>
          <include name="org/htmlparser/util/sort/**/*.class"/>
          <include name="org/htmlparser/parserHelper/SpecialHashtable.class"/>
|
From: <der...@us...> - 2003-10-29 03:31:28
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs1:/tmp/cvs-serv12839/src/org/htmlparser/tests/tagTests Modified Files: ImageTagTest.java Log Message: Move LinkProcess out of scanners and into Page, untangling A, IMG and BASE scanners. Move form action determination to tag. The scanners have no special actions on behalf of tags anymore. Index: ImageTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/ImageTagTest.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** ImageTagTest.java 26 Oct 2003 19:46:27 -0000 1.34 --- ImageTagTest.java 29 Oct 2003 03:31:18 -0000 1.35 *************** *** 61,65 **** createParser("<IMG alt=Google height=115 src=\"goo/title_homepage4.gif\" width=305>","http://www.google.com/test/index.html"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); --- 61,65 ---- createParser("<IMG alt=Google height=115 src=\"goo/title_homepage4.gif\" width=305>","http://www.google.com/test/index.html"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); *************** *** 81,85 **** createParser("<IMG alt=Google height=115 src=\"../goo/title_homepage4.gif\" width=305>","http://www.google.com/test/"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); --- 81,85 ---- createParser("<IMG alt=Google height=115 src=\"../goo/title_homepage4.gif\" width=305>","http://www.google.com/test/"); // Register the image scanner ! 
parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); *************** *** 101,105 **** createParser("<IMG alt=Google height=115 src=\"../../goo/title_homepage4.gif\" width=305>","http://www.google.com/test/test/index.html"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); --- 101,105 ---- createParser("<IMG alt=Google height=115 src=\"../../goo/title_homepage4.gif\" width=305>","http://www.google.com/test/test/index.html"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); *************** *** 118,122 **** createParser("<IMG SRC='abcd.jpg'>","http://www.cj.com/"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); --- 118,122 ---- createParser("<IMG SRC='abcd.jpg'>","http://www.cj.com/"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); *************** *** 137,141 **** createParser("<IMG SRC=>","http://www.google.com/test/index.html"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); --- 137,141 ---- createParser("<IMG SRC=>","http://www.google.com/test/index.html"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); *************** *** 150,154 **** createParser(img,"http://www.google.com/test/test/index.html"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i",new LinkProcessor())); parseAndAssertNodeCount(1); --- 150,154 ---- createParser(img,"http://www.google.com/test/test/index.html"); // Register the image scanner ! parser.addScanner(new ImageScanner("-i")); parseAndAssertNodeCount(1); |
From: <der...@us...> - 2003-10-29 03:31:28
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors
In directory sc8-pr-cvs1:/tmp/cvs-serv12839/src/org/htmlparser/visitors

Modified Files:
    UrlModifyingVisitor.java
Log Message:
Move LinkProcess out of scanners and into Page, untangling A, IMG and BASE scanners. Move form action determination to tag. The scanners have no special actions on behalf of tags anymore.

Index: UrlModifyingVisitor.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/UrlModifyingVisitor.java,v
retrieving revision 1.35
retrieving revision 1.36
diff -C2 -d -r1.35 -r1.36
*** UrlModifyingVisitor.java 26 Oct 2003 19:46:29 -0000 1.35
--- UrlModifyingVisitor.java 29 Oct 2003 03:31:18 -0000 1.36
***************
*** 32,35 ****
--- 32,36 ----
  import org.htmlparser.Parser;
  import org.htmlparser.StringNode;
+ import org.htmlparser.scanners.ImageScanner;
  import org.htmlparser.scanners.LinkScanner;
  import org.htmlparser.tags.ImageTag;
***************
*** 45,55 ****
          super(true,false);
          this.parser = parser;
!         LinkScanner linkScanner = new LinkScanner();
!         parser.addScanner(linkScanner);
!         parser.addScanner(
!             linkScanner.createImageScanner(
!                 ImageTag.IMAGE_TAG_FILTER
!             )
!         );
          this.linkPrefix =linkPrefix;
          modifiedResult = new StringBuffer();
--- 46,51 ----
          super(true,false);
          this.parser = parser;
!         parser.addScanner(new LinkScanner());
!         parser.addScanner(new ImageScanner(ImageTag.IMAGE_TAG_FILTER));
          this.linkPrefix =linkPrefix;
          modifiedResult = new StringBuffer();
|
From: <der...@us...> - 2003-10-29 03:31:28
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv12839/src/org/htmlparser/scanners Modified Files: BaseHrefScanner.java FormScanner.java ImageScanner.java LinkScanner.java Log Message: Move LinkProcess out of scanners and into Page, untangling A, IMG and BASE scanners. Move form action determination to tag. The scanners have no special actions on behalf of tags anymore. Index: BaseHrefScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/BaseHrefScanner.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** BaseHrefScanner.java 26 Oct 2003 19:46:19 -0000 1.28 --- BaseHrefScanner.java 29 Oct 2003 03:31:17 -0000 1.29 *************** *** 33,37 **** import org.htmlparser.tags.BaseHrefTag; import org.htmlparser.tags.Tag; - import org.htmlparser.util.LinkProcessor; import org.htmlparser.util.ParserException; --- 33,36 ---- *************** *** 43,48 **** public class BaseHrefScanner extends TagScanner { - private LinkProcessor processor; - public BaseHrefScanner() { --- 42,45 ---- *************** *** 50,57 **** } ! public BaseHrefScanner(String filter,LinkProcessor processor) { super(filter); - this.processor = processor; } --- 47,53 ---- } ! public BaseHrefScanner(String filter) { super(filter); } *************** *** 73,81 **** ret.setAttributesEx (attributes); - // special step here - // Need to set the base url for the current link processor, - // which can't be done in the tag because it doesn't have it. 
- processor.setBaseUrl (ret.getBaseUrl ()); - return (ret); } --- 69,72 ---- Index: FormScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/FormScanner.java,v retrieving revision 1.51 retrieving revision 1.52 diff -C2 -d -r1.51 -r1.52 *** FormScanner.java 28 Oct 2003 12:54:21 -0000 1.51 --- FormScanner.java 29 Oct 2003 03:31:17 -0000 1.52 *************** *** 70,106 **** /** - * Extract the location of the image, given the tag, and the url - * of the html page in which this tag exists. - * @param tag The form tag with the 'ACTION' attribute. - * @param url URL of web page being parsed. - */ - public String extractFormLocn(Tag tag,String url) throws ParserException - { - try { - String formURL= tag.getAttribute("ACTION"); - if (formURL==null) return ""; else - return (new LinkProcessor()).extract(formURL, url); - } - catch (Exception e) { - String msg; - if (tag!=null) msg= tag.getText(); else msg=""; - throw new ParserException("HTMLFormScanner.extractFormLocn() : Error in extracting form location, tag = "+msg+", url = "+url,e); - } - } - - public String extractFormName(Tag tag) - { - return tag.getAttribute("NAME"); - } - - public String extractFormMethod(Tag tag) - { - String method = tag.getAttribute("METHOD"); - if (method==null) method = FormTag.GET; - return method.toUpperCase(); - - } - - /** * @see org.htmlparser.scanners.TagScanner#getID() */ --- 70,73 ---- *************** *** 122,131 **** ret.setEndTag (endTag); ret.setChildren (children); - - // special step here... - // ... is it true that without an ACTION the default is to send it back to the same page? 
- String formUrl = extractFormLocn(startTag, page.getUrl ()); - if (formUrl!=null && formUrl.length()>0) - startTag.setAttribute("ACTION",formUrl); return (ret); --- 89,92 ---- Index: ImageScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ImageScanner.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** ImageScanner.java 26 Oct 2003 19:46:20 -0000 1.32 --- ImageScanner.java 29 Oct 2003 03:31:17 -0000 1.33 *************** *** 39,45 **** import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.Tag; - import org.htmlparser.util.LinkProcessor; import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserUtils; /** * Scans for the Image Tag. This is a subclass of TagScanner, and is called using a --- 39,45 ---- import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserUtils; + /** * Scans for the Image Tag. This is a subclass of TagScanner, and is called using a *************** *** 51,55 **** { public static final String IMAGE_SCANNER_ID = "IMG"; ! private LinkProcessor processor; /** * Overriding the default constructor --- 51,55 ---- { public static final String IMAGE_SCANNER_ID = "IMG"; ! /** * Overriding the default constructor *************** *** 58,70 **** { super(); - processor = new LinkProcessor(); } /** * Overriding the constructor to accept the filter */ ! public ImageScanner(String filter,LinkProcessor processor) { super(filter); - this.processor = processor; } --- 58,69 ---- { super(); } + /** * Overriding the constructor to accept the filter */ ! public ImageScanner(String filter) { super(filter); } *************** *** 84,100 **** ret.setEndPosition (end); ret.setAttributesEx (attributes); - - // special step here... 
- // Need to update the imageURL string in the image tag, - // but not the SRC attribute which it does when you set the ImageURL - // property. Can't do it in the tag, because the tag doesn't have the - // current link processor object which might have a BASE href different - // than the page. - String src = ret.getAttribute ("SRC"); - ret.setImageURL (processor.extract (ret.getImageURL (), page.getUrl ())); - if (null == src) - ret.removeAttribute ("SRC"); - else - ret.setAttribute ("SRC", src); return (ret); --- 83,86 ---- Index: LinkScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/LinkScanner.java,v retrieving revision 1.58 retrieving revision 1.59 diff -C2 -d -r1.58 -r1.59 *** LinkScanner.java 28 Oct 2003 10:31:02 -0000 1.58 --- LinkScanner.java 29 Oct 2003 03:31:17 -0000 1.59 *************** *** 53,57 **** private static final String MATCH_NAME [] = {"A"}; public static final String LINK_SCANNER_ID = "A"; - public LinkProcessor processor; private final static String ENDERS [] = { "A","TD","TR","FORM","LI","BODY", "HTML" }; private final static String ENDTAG_ENDERS [] = { "TD","TR","FORM","LI","BODY", "HTML" }; --- 53,56 ---- *************** *** 69,73 **** public LinkScanner(String filter) { super(filter,MATCH_NAME,ENDERS,ENDTAG_ENDERS); - processor = new LinkProcessor(); } --- 68,71 ---- *************** *** 99,110 **** } - public BaseHrefScanner createBaseHREFScanner(String filter) { - return new BaseHrefScanner(filter,processor); - } - - public ImageScanner createImageScanner(String filter) { - return new ImageScanner(filter,processor); - } - /** * @see org.htmlparser.scanners.TagScanner#getID() --- 97,100 ---- *************** *** 113,116 **** return MATCH_NAME; } - } --- 103,105 ---- |
From: <der...@us...> - 2003-10-28 12:55:45
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv5437/tests/scannersTests Modified Files: TagScannerTest.java Log Message: Remove TagScanner cruft. Index: TagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/TagScannerTest.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** TagScannerTest.java 27 Oct 2003 02:18:05 -0000 1.35 --- TagScannerTest.java 28 Oct 2003 12:54:22 -0000 1.36 *************** *** 53,97 **** } - public void testAbsorbLeadingBlanks() - { - String test = " This is a test"; - String result = TagScanner.absorbLeadingBlanks(test); - assertEquals("Absorb test","This is a test",result); - } - - public void testExtractXMLData() throws ParserException { - createParser( - "<MESSAGE>\n"+ - "Abhi\n"+ - "Sri\n"+ - "</MESSAGE>"); - Parser.setLineSeparator("\r\n"); - NodeIterator e = parser.elements(); - - Node node = e.nextNode(); - try { - String result = TagScanner.extractXMLData (node, "MESSAGE", e); - assertEquals("Result","\nAbhi\nSri\n",result); - } - catch (ParserException ex) { - assertTrue(e.toString(),false); - } - } - - public void testExtractXMLDataSingle() throws ParserException { - createParser( - "<MESSAGE>Test</MESSAGE>"); - NodeIterator e = parser.elements(); - - Node node = (Node)e.nextNode(); - try { - String result = TagScanner.extractXMLData (node, "MESSAGE", e); - assertEquals("Result","Test",result); - } - catch (ParserException ex) { - assertTrue(e.toString(),false); - } - } - public void testTagExtraction() throws ParserException { --- 53,56 ---- *************** *** 102,118 **** } - /** - * Captures bug reported by Raghavender Srimantula - * Problem is in isXMLTag - when it uses equals() to - * find a match - */ - public void testIsXMLTag() throws ParserException { - createParser("<OPTION value=\"#\">Select a 
destination</OPTION>"); - Node node; - NodeIterator e = parser.elements(); - node = (Node)e.nextNode(); - assertTrue("OPTION tag could not be identified",TagScanner.isXMLTagFound(node,"OPTION")); - } - public void testRemoveChars() { String test = "hello\nworld\n\tqsdsds"; --- 61,64 ---- *************** *** 120,146 **** assertEquals("Removing Chars","helloworld\tqsdsds",result); } - - public void testRemoveChars2() { - String test = "hello\r\nworld\r\n\tqsdsds"; - TagScanner scanner = new TagScanner() { - public Tag scan(Tag tag,String url,Lexer lexer) { return null;} - public boolean evaluate(Tag tag,TagScanner previousOpenScanner) { return false; } - public String [] getID() { return null; } - protected Tag createTag (Page page, int start, int end, Vector attributes, Tag tag, String url) { return null; } - }; - String result = scanner.removeChars(test,"\r\n"); - assertEquals("Removing Chars","helloworld\tqsdsds",result); - } - - /** - * Bug report by Cedric Rosa - * in absorbLeadingBlanks - crashes if the tag - * is empty - */ - public void testAbsorbLeadingBlanksBlankTag() { - String testData = new String(""); - String result=TagScanner.absorbLeadingBlanks(testData); - assertEquals("",result); - } - } --- 66,68 ---- |
From: <der...@us...> - 2003-10-28 12:55:11
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv5437/scanners Modified Files: CompositeTagScanner.java FormScanner.java ScriptScanner.java TagScanner.java Log Message: Remove TagScanner cruft. Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.75 retrieving revision 1.76 diff -C2 -d -r1.75 -r1.76 *** CompositeTagScanner.java 28 Oct 2003 10:31:02 -0000 1.75 --- CompositeTagScanner.java 28 Oct 2003 12:54:21 -0000 1.76 *************** *** 238,242 **** scanner = parser.getScanner (name); if ((null != scanner) && scanner.evaluate (next, this)) ! node = scanner.createScannedNode (next, lexer.getPage ().getUrl (), lexer); } } --- 238,242 ---- scanner = parser.getScanner (name); if ((null != scanner) && scanner.evaluate (next, this)) ! node = scanner.scan (next, lexer.getPage ().getUrl (), lexer); } } *************** *** 252,255 **** --- 252,256 ---- composite = (CompositeTag)createTag (lexer.getPage (), tag.elementBegin (), endTag.elementEnd (), tag.getAttributesEx (), tag, endTag, nodeList); + composite.setThisScanner (this); for (int i = 0; i < composite.getChildCount (); i++) composite.childAt (i).setParent (composite); Index: FormScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/FormScanner.java,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** FormScanner.java 28 Oct 2003 10:31:02 -0000 1.50 --- FormScanner.java 28 Oct 2003 12:54:21 -0000 1.51 *************** *** 45,50 **** { private static final String [] MATCH_ID = { "FORM" }; - public static final String PREVIOUS_DIRTY_LINK_MESSAGE="Encountered a form tag after an open link tag.\nThere should have been an end tag for the link before the form tag 
began.\nCorrecting this.."; - private boolean linkScannerAlreadyOpen=false; private static final String [] formTagEnders = {"FORM","HTML","BODY"}; --- 45,48 ---- *************** *** 110,133 **** { return MATCH_ID; - } - - public boolean evaluate(Tag tag, TagScanner previousOpenScanner) - { - if (previousOpenScanner instanceof LinkScanner) - { - linkScannerAlreadyOpen = true; - StringBuffer msg= new StringBuffer(); - msg.append(tag.toHtml ()); - msg.append(PREVIOUS_DIRTY_LINK_MESSAGE); - feedback.warning(msg.toString()); - // This is dirty HTML. Assume the current tag is - // not a new link tag - but an end tag. This is actually a really wild bug - - // Internet Explorer actually parses such tags. - // So - we shall then proceed to fool the scanner into sending an endtag of type </A> - // For this - set the dirty flag to true and return - } - else - linkScannerAlreadyOpen = false; - return super.evaluate(tag, previousOpenScanner); } --- 108,111 ---- Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** ScriptScanner.java 28 Oct 2003 03:04:18 -0000 1.45 --- ScriptScanner.java 28 Oct 2003 12:54:21 -0000 1.46 *************** *** 169,172 **** --- 169,173 ---- //TODO: use the factory: ret = createTag (lexer.getPage (), tag.elementBegin(), end.elementEnd(), tag.getAttributesEx (), tag, end, new NodeList (last)); + ret.setThisScanner (this); } finally Index: TagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** TagScanner.java 27 Oct 2003 02:18:04 -0000 1.45 --- TagScanner.java 28 Oct 2003 12:54:21 -0000 1.46 *************** *** 70,138 **** Serializable { ! /** ! 
* A filter which is used to associate this tag. The filter contains a string ! * that is used to match which tags are to be allowed to pass through. This can ! * be useful when one wishes to dynamically filter out all tags except one type ! * which may be programmed later than the parser. Is also useful for command line ! * implementations of the parser. ! */ ! protected String filter; ! ! /** ! * HTMLParserFeedback object automatically initialized ! */ ! protected ParserFeedback feedback; ! /** ! * Default Constructor, automatically registers the scanner into a static array of ! * scanners inside Tag ! */ ! public TagScanner() ! { ! this.filter=""; ! } ! /** ! * This constructor automatically registers the scanner, and sets the filter for this ! * tag. ! * @param filter The filter which will allow this tag to pass through. ! */ ! public TagScanner(String filter) ! { ! this.filter=filter; ! } ! /** ! * Insert the method's description here. ! * Creation date: (6/4/2001 11:44:09 AM) ! * @return java.lang.String ! * @param c char */ ! public String absorb(String s,char c) { ! int index = s.indexOf(c); ! if (index!=-1) s=s.substring(index+1,s.length()); ! return s; } /** ! * Remove whitespace from the front of the given string. ! * @param s The string to trim. ! * @return Either the same string or a string with whitespace chopped off. */ ! public static String absorbLeadingBlanks (String s) { ! int length; ! int i; ! String ret; ! ! i = 0; ! length = s.length (); ! while (i < length && Character.isWhitespace (s.charAt (i))) ! i++; ! if (0 == i) ! ret = s; ! else if (length == i) ! ret = ""; ! else ! ret = s.substring (i); ! ! return (ret); } --- 70,99 ---- Serializable { ! /** ! * A filter which is used to associate this tag. The filter contains a string ! * that is used to match which tags are to be allowed to pass through. This can ! * be useful when one wishes to dynamically filter out all tags except one type ! * which may be programmed later than the parser. 
Is also useful for command line ! * implementations of the parser. */ ! protected String filter; ! ! /** ! * Default Constructor, automatically registers the scanner into a static array of ! * scanners inside Tag ! */ ! public TagScanner () ! { ! this (""); } /** ! * This constructor automatically registers the scanner, and sets the filter for this ! * tag. ! * @param filter The filter which will allow this tag to pass through. */ ! public TagScanner (String filter) { ! this.filter=filter; } *************** *** 153,356 **** } ! /** ! * Pull the text between two matching capitalized 'XML' tags. ! * @deprecated This reads ahead on your iterator and doesn't put them back if it's not an XML tag. ! */ ! public static String extractXMLData (Node node, String tagName, NodeIterator iterator) ! throws ! ParserException { - try - { - String xmlData = ""; - - boolean xmlTagFound = isXMLTagFound (node, tagName); - if (xmlTagFound) - { - try - { - do - { - node = iterator.nextNode (); - if (node!=null) - { - if (node instanceof StringNode) - { - StringNode stringNode = (StringNode)node; - if (xmlData.length ()>0) - xmlData+=" "; - xmlData += stringNode.getText (); - } - else - if (!(node instanceof Tag && ((Tag)node).isEndTag ())) - xmlTagFound = false; - } - } - while (node instanceof StringNode); - - } - - catch (Exception e) - { - throw new ParserException ("TagScanner.extractXMLData() : error while trying to find xml tag",e); - } - } - // check end tag matches start tag - if (xmlTagFound) - { - if (node!=null) - { - if (node instanceof Tag && ((Tag)node).isEndTag ()) - { - Tag endTag = (Tag)node; - if (!endTag.getTagName ().equals (tagName)) - xmlTagFound = false; - } - - } - - } - if (xmlTagFound) - return xmlData; - else - return null; - } - catch (Exception e) - { - throw new ParserException ("TagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); - } - } - - public String getFilter() { return filter; } - public static boolean 
isXMLTagFound(Node node, String tagName) { - boolean xmlTagFound=false; - if (node instanceof Tag) { - Tag tag = (Tag)node; - if (tag.getText().toUpperCase().indexOf(tagName)==0) { - xmlTagFound=true; - } - } - return xmlTagFound; - } - - public final Tag createScannedNode(Tag tag,String url,Lexer lexer) throws ParserException { - Tag thisTag = scan(tag,url,lexer); - thisTag.setThisScanner(this); - thisTag.setAttributesEx(tag.getAttributesEx()); - return thisTag; - } - - /** - * Override this method to create your own tag type - * @param tagData - * @param tag - * @param url - * @return Tag - * @throws ParserException - */ - protected abstract Tag createTag(Page page, int start, int end, Vector attributes, Tag tag, String url) throws ParserException; - /** * Scan the tag and extract the information related to this type. The url of the * initiating scan has to be provided in case relative links are found. The initial * url is then prepended to it to give an absolute link. ! * The NodeReader is provided in order to do a lookahead operation. We assume that * the identification has already been performed using the evaluate() method. ! * @param tag HTML Tag to be scanned for identification ! * @param url The initiating url of the scan (Where the html page lies) ! * @param reader The reader object responsible for reading the html page */ ! public Tag scan(Tag tag,String url,Lexer lexer) throws ParserException ! { ! return (createTag(lexer.getPage (), tag.elementBegin(), tag.elementEnd(), tag.getAttributesEx (), tag, url)); ! } ! ! public String removeChars(String s,String occur) { ! StringBuffer newString = new StringBuffer(); ! int index; ! do { ! index = s.indexOf(occur); ! if (index!=-1) { ! newString.append(s.substring(0,index)); ! s=s.substring(index+occur.length()); ! } ! } ! while (index!=-1); ! newString.append(s); ! return newString.toString(); ! } ! ! public abstract String [] getID(); ! ! public final void setFeedback(ParserFeedback feedback) { ! 
this.feedback = feedback; ! } ! ! public static Map adjustScanners(Parser parser) { ! Map ret; ! ! ret = parser.getScanners(); ! // Remove all existing scanners ! parser.flushScanners(); return (ret); } - public static void restoreScanners(Parser parser, Hashtable tempScanners) - { - // Flush the scanners - parser.setScanners(tempScanners); - } - /** ! * Insert an EndTag in the currentLine, just before the occurence of the provided tag */ ! public String insertEndTagBeforeNode(AbstractNode node, String currentLine) { ! String newLine = currentLine.substring(0,node.elementBegin()); ! newLine += "</A>"; ! newLine += currentLine.substring(node.elementBegin(),currentLine.length()); ! return newLine; ! } ! ! // protected Tag getReplacedEndTag(Tag tag, NodeReader reader, String currentLine) { ! // // Replace tag - it was a <A> tag - replace with </a> ! // String newLine = replaceFaultyTagWithEndTag(tag, currentLine); ! // reader.changeLine(newLine); ! // return new EndTag( ! // new TagData( ! // tag.elementBegin(), ! // tag.elementBegin()+3, ! // tag.getTagName(), ! // currentLine ! // ) ! // ); ! // } ! ! public String replaceFaultyTagWithEndTag(Tag tag, String currentLine) { ! String newLine = currentLine.substring(0,tag.elementBegin()); ! newLine+="</"+tag.getTagName()+">"; ! newLine+=currentLine.substring(tag.elementEnd()+1,currentLine.length()); ! ! return newLine; ! } ! ! // protected Tag getInsertedEndTag(Tag tag, String currentLine) { ! // // Insert end tag ! // String newLine = insertEndTagBeforeNode(tag, currentLine); ! // reader.changeLine(newLine); ! // return new EndTag( ! // new TagData( ! // tag.elementBegin(), ! // tag.elementBegin()+3, ! // tag.getTagName(), ! // currentLine ! // ) ! // ); ! // } ! } --- 114,154 ---- } ! public String getFilter() { return filter; } /** * Scan the tag and extract the information related to this type. The url of the * initiating scan has to be provided in case relative links are found. 
The initial * url is then prepended to it to give an absolute link. ! * The Lexer is provided in order to do a lookahead operation. We assume that * the identification has already been performed using the evaluate() method. ! * @param tag HTML Tag to be scanned for identification. ! * @param url The initiating url of the scan (Where the html page lies). ! * @param lexer Provides html page access. ! * @return The resultant tag (may be unchanged). */ ! public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException { ! Tag ret; ! ! ret = createTag(lexer.getPage (), tag.elementBegin(), tag.elementEnd(), tag.getAttributesEx (), tag, url); ! ret.setThisScanner(this); return (ret); } /** ! * Create a tag. ! * Override this method to create your own tag type. ! * @param tagData ! * @param tag ! * @param url ! * @return Tag ! * @throws ParserException */ ! protected abstract Tag createTag(Page page, int start, int end, Vector attributes, Tag tag, String url) throws ParserException; + public abstract String [] getID(); } |
From: <der...@us...> - 2003-10-28 12:55:07
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv5437/util Modified Files: IteratorImpl.java Log Message: Remove TagScanner cruft. Index: IteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** IteratorImpl.java 28 Oct 2003 03:04:19 -0000 1.32 --- IteratorImpl.java 28 Oct 2003 12:54:22 -0000 1.33 *************** *** 76,80 **** scanner = parser.getScanner (name); if ((null != scanner) && scanner.evaluate (tag, null)) ! ret = scanner.createScannedNode (tag, mLexer.getPage ().getUrl (), mLexer); } } --- 76,80 ---- scanner = parser.getScanner (name); if ((null != scanner) && scanner.evaluate (tag, null)) ! ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer); } } |
From: <der...@us...> - 2003-10-28 12:55:07
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv5437 Modified Files: Parser.java Log Message: Remove TagScanner cruft. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.70 retrieving revision 1.71 diff -C2 -d -r1.70 -r1.71 *** Parser.java 28 Oct 2003 03:04:17 -0000 1.70 --- Parser.java 28 Oct 2003 12:54:21 -0000 1.71 *************** *** 536,545 **** * @param scanner TagScanner object (or derivative) to be added to the list of registered scanners */ ! public void addScanner(TagScanner scanner) { String ids[] = scanner.getID(); for (int i=0;i<ids.length;i++) { scanners.put(ids[i],scanner); } - scanner.setFeedback(feedback); } --- 536,545 ---- * @param scanner TagScanner object (or derivative) to be added to the list of registered scanners */ ! public void addScanner(TagScanner scanner) ! { String ids[] = scanner.getID(); for (int i=0;i<ids.length;i++) { scanners.put(ids[i],scanner); } } |
From: <der...@us...> - 2003-10-28 10:31:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv15003/tests/scannersTests Modified Files: CompositeTagScannerTest.java Log Message: Replaced isAllowSelfChildren() using tagEnders set. Index: CompositeTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/CompositeTagScannerTest.java,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** CompositeTagScannerTest.java 28 Oct 2003 03:04:19 -0000 1.46 --- CompositeTagScannerTest.java 28 Oct 2003 10:31:02 -0000 1.47 *************** *** 559,563 **** public CustomScanner(boolean selfChildrenAllowed) { ! super("", MATCH_NAME, new String[] {}, selfChildrenAllowed); } --- 559,563 ---- public CustomScanner(boolean selfChildrenAllowed) { ! super("", MATCH_NAME, selfChildrenAllowed ? new String[] {} : MATCH_NAME); } *************** *** 590,594 **** public AnotherScanner(boolean acceptCustomTagsButDontAcceptCustomEndTags) { ! super("", MATCH_NAME, new String[] {}, new String[] {"CUSTOM"}, true); } --- 590,594 ---- public AnotherScanner(boolean acceptCustomTagsButDontAcceptCustomEndTags) { ! super("", MATCH_NAME, new String[] {}, new String[] {"CUSTOM"}); } |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv15003/scanners Modified Files: BodyScanner.java BulletScanner.java CompositeTagScanner.java FormScanner.java HeadScanner.java LabelScanner.java LinkScanner.java OptionTagScanner.java SelectTagScanner.java TableColumnScanner.java TableRowScanner.java TableScanner.java TextareaTagScanner.java TitleScanner.java Log Message: Replaced isAllowSelfChildren() using tagEnders set. Index: BodyScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/BodyScanner.java,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** BodyScanner.java 26 Oct 2003 19:46:19 -0000 1.20 --- BodyScanner.java 28 Oct 2003 10:31:02 -0000 1.21 *************** *** 43,47 **** { private static final String MATCH_NAME [] = {"BODY"}; - private static final String ENDERS [] = {}; private static final String END_TAG_ENDERS [] = {"HTML"}; --- 43,46 ---- *************** *** 53,57 **** public BodyScanner(String filter) { ! super(filter,MATCH_NAME,ENDERS,END_TAG_ENDERS,false); } --- 52,56 ---- public BodyScanner(String filter) { ! super(filter,MATCH_NAME,MATCH_NAME,END_TAG_ENDERS); } Index: BulletScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/BulletScanner.java,v retrieving revision 1.25 retrieving revision 1.26 diff -C2 -d -r1.25 -r1.26 *** BulletScanner.java 28 Oct 2003 03:04:18 -0000 1.25 --- BulletScanner.java 28 Oct 2003 10:31:02 -0000 1.26 *************** *** 54,58 **** public BulletScanner(String filter) { ! super(filter, MATCH_STRING, ENDERS, END_TAG_ENDERS, false); } --- 54,58 ---- public BulletScanner(String filter) { ! 
super(filter, MATCH_STRING, ENDERS, END_TAG_ENDERS); } Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.74 retrieving revision 1.75 diff -C2 -d -r1.74 -r1.75 *** CompositeTagScanner.java 28 Oct 2003 03:04:18 -0000 1.74 --- CompositeTagScanner.java 28 Oct 2003 10:31:02 -0000 1.75 *************** *** 48,52 **** * <li>Tags which will trigger a match</li> * <li>Tags which when encountered before a legal end tag, should force a correction</li> - * <li>Preventing more tags of its own type to appear as children * </ul> * Here are examples of each:<BR> --- 48,51 ---- *************** *** 79,90 **** * This is useful when you know that a certain tag can never hold children of its own type. * e.g. <FORM> can never have more form tags within it. If it does, it is an error and should ! * be corrected. The default behavior is to allow nesting. * <pre> * MyScanner extends CompositeTagScanner { * private static final String [] MATCH_IDS = { "FORM" }; - * private static final String [] ENDERS = {}; * private static final String [] END_TAG_ENDERS = { "BODY", "HTML" }; * MyScanner() { ! * super(MATCH_IDS, ENDERS,END_TAG_ENDERS, false); * } * ... --- 78,88 ---- * This is useful when you know that a certain tag can never hold children of its own type. * e.g. <FORM> can never have more form tags within it. If it does, it is an error and should ! * be corrected. Specify the tagEnders set to contain (at least) the match ids. * <pre> * MyScanner extends CompositeTagScanner { * private static final String [] MATCH_IDS = { "FORM" }; * private static final String [] END_TAG_ENDERS = { "BODY", "HTML" }; * MyScanner() { ! * super(MATCH_IDS, MATCH_IDS, END_TAG_ENDERS, false); * } * ... 
*************** *** 96,100 **** { protected String [] nameOfTagToMatch; - private boolean allowSelfChildren; protected Set tagEnderSet; private Set endTagEnderSet; --- 94,97 ---- *************** *** 111,127 **** } - public CompositeTagScanner(String [] nameOfTagToMatch, String [] tagEnders, boolean allowSelfChildren) - { - this("",nameOfTagToMatch,tagEnders,allowSelfChildren); - } - public CompositeTagScanner(String filter, String [] nameOfTagToMatch) { ! this(filter,nameOfTagToMatch,new String [] {},true); ! } ! ! public CompositeTagScanner(String filter, String [] nameOfTagToMatch, String [] tagEnders) ! { ! this(filter,nameOfTagToMatch,tagEnders,true); } --- 108,114 ---- } public CompositeTagScanner(String filter, String [] nameOfTagToMatch) { ! this(filter,nameOfTagToMatch,new String [] {}); } *************** *** 129,136 **** String filter, String [] nameOfTagToMatch, ! String [] tagEnders, ! boolean allowSelfChildren) { ! this(filter,nameOfTagToMatch,tagEnders,new String[] {}, allowSelfChildren); } --- 116,122 ---- String filter, String [] nameOfTagToMatch, ! String [] tagEnders) { ! this(filter,nameOfTagToMatch,tagEnders,new String[] {}); } *************** *** 139,146 **** String [] nameOfTagToMatch, String [] tagEnders, ! String [] endTagEnders, ! boolean allowSelfChildren) { ! this(filter,nameOfTagToMatch,tagEnders,endTagEnders, allowSelfChildren, false); } --- 125,131 ---- String [] nameOfTagToMatch, String [] tagEnders, ! String [] endTagEnders) { ! 
this(filter,nameOfTagToMatch,tagEnders,endTagEnders, false); } *************** *** 171,180 **** String [] tagEnders, String [] endTagEnders, - boolean allowSelfChildren, boolean balance_quotes) { super(filter); this.nameOfTagToMatch = nameOfTagToMatch; - this.allowSelfChildren = allowSelfChildren; this.balance_quotes = balance_quotes; this.tagEnderSet = new HashSet(); --- 156,163 ---- *************** *** 193,200 **** * If it's not an empty XML tag, the lexer is repeatedly asked for * subsequent nodes until an end tag is found or a node is encountered ! * that matches the tag ender set or end tag ender set, or a node of ! * the same type is found and {@link #isAllowSelfChildren} returns ! * <code>false</code>. In all but the first case, a virtual end tag ! * is created. Each node found that is not the end tag is added to * the list of children.<p> * The scanner's {@link #createTag} method is called with details about --- 176,182 ---- * If it's not an empty XML tag, the lexer is repeatedly asked for * subsequent nodes until an end tag is found or a node is encountered ! * that matches the tag ender set or end tag ender set. ! * In the latter case, a virtual end tag is created. ! * Each node found that is not the end tag is added to * the list of children.<p> * The scanner's {@link #createTag} method is called with details about *************** *** 213,216 **** --- 195,199 ---- NodeList nodeList; Tag endTag; + String match; String name; TagScanner scanner; *************** *** 220,223 **** --- 203,207 ---- nodeList = new NodeList (); endTag = null; + match = tag.getTagName (); if (tag.isEmptyXmlTag ()) *************** *** 234,248 **** name = next.getTagName (); // check for normal end tag ! if (next.isEndTag () && name.equals (tag.getTagName ())) { endTag = next; node = null; } ! else if (isTagToBeEndedFor (next) || // check DTD ! ( // check for child of same name not allowed ! !(next.isEndTag ()) && ! !isAllowSelfChildren () && ! name.equals (tag.getTagName ()) ! 
)) { // insert a virtual end tag and backup one node --- 218,227 ---- name = next.getTagName (); // check for normal end tag ! if (next.isEndTag () && name.equals (match)) { endTag = next; node = null; } ! else if (isTagToBeEndedFor (next)) // check DTD { // insert a virtual end tag and backup one node *************** *** 338,341 **** --- 317,321 ---- ret = false; + name = tag.getTagName (); if (tag.isEndTag ()) *************** *** 345,353 **** return (ret); - } - - public final boolean isAllowSelfChildren() - { - return allowSelfChildren; } } --- 325,328 ---- Index: FormScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/FormScanner.java,v retrieving revision 1.49 retrieving revision 1.50 diff -C2 -d -r1.49 -r1.50 *** FormScanner.java 28 Oct 2003 03:04:18 -0000 1.49 --- FormScanner.java 28 Oct 2003 10:31:02 -0000 1.50 *************** *** 47,51 **** public static final String PREVIOUS_DIRTY_LINK_MESSAGE="Encountered a form tag after an open link tag.\nThere should have been an end tag for the link before the form tag began.\nCorrecting this.."; private boolean linkScannerAlreadyOpen=false; ! private static final String [] formTagEnders = {"HTML","BODY"}; /** --- 47,51 ---- public static final String PREVIOUS_DIRTY_LINK_MESSAGE="Encountered a form tag after an open link tag.\nThere should have been an end tag for the link before the form tag began.\nCorrecting this.."; private boolean linkScannerAlreadyOpen=false; ! private static final String [] formTagEnders = {"FORM","HTML","BODY"}; /** *************** *** 64,68 **** public FormScanner(String filter, Parser parser) { ! super(filter,MATCH_ID,formTagEnders,false); parser.addScanner(new InputTagScanner("-i")); parser.addScanner(new TextareaTagScanner("-t")); --- 64,68 ---- public FormScanner(String filter, Parser parser) { ! 
super(filter,MATCH_ID,formTagEnders); parser.addScanner(new InputTagScanner("-i")); parser.addScanner(new TextareaTagScanner("-t")); Index: HeadScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/HeadScanner.java,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** HeadScanner.java 26 Oct 2003 19:46:19 -0000 1.17 --- HeadScanner.java 28 Oct 2003 10:31:02 -0000 1.18 *************** *** 45,49 **** { private static final String MATCH_NAME [] = {"HEAD"}; ! private static final String ENDERS [] = {"BODY"}; private static final String END_TAG_ENDERS [] = {"HTML"}; --- 45,49 ---- { private static final String MATCH_NAME [] = {"HEAD"}; ! private static final String ENDERS [] = {"HEAD","BODY"}; private static final String END_TAG_ENDERS [] = {"HTML"}; *************** *** 55,59 **** public HeadScanner(String filter) { ! super(filter,MATCH_NAME,ENDERS,END_TAG_ENDERS,false); } --- 55,59 ---- public HeadScanner(String filter) { ! super(filter,MATCH_NAME,ENDERS,END_TAG_ENDERS); } Index: LabelScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/LabelScanner.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** LabelScanner.java 26 Oct 2003 19:46:20 -0000 1.35 --- LabelScanner.java 28 Oct 2003 10:31:02 -0000 1.36 *************** *** 43,51 **** public LabelScanner() { ! super(MATCH_NAME,new String [] {},false); } public LabelScanner(String filter) { ! super(filter,MATCH_NAME,new String [] {},false); } --- 43,51 ---- public LabelScanner() { ! super(MATCH_NAME,MATCH_NAME); } public LabelScanner(String filter) { ! 
super(filter,MATCH_NAME,MATCH_NAME); } Index: LinkScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/LinkScanner.java,v retrieving revision 1.57 retrieving revision 1.58 diff -C2 -d -r1.57 -r1.58 *** LinkScanner.java 27 Oct 2003 02:18:04 -0000 1.57 --- LinkScanner.java 28 Oct 2003 10:31:02 -0000 1.58 *************** *** 54,58 **** public static final String LINK_SCANNER_ID = "A"; public LinkProcessor processor; ! private final static String ENDERS [] = { "TD","TR","FORM","LI","BODY", "HTML" }; private final static String ENDTAG_ENDERS [] = { "TD","TR","FORM","LI","BODY", "HTML" }; --- 54,58 ---- public static final String LINK_SCANNER_ID = "A"; public LinkProcessor processor; ! private final static String ENDERS [] = { "A","TD","TR","FORM","LI","BODY", "HTML" }; private final static String ENDTAG_ENDERS [] = { "TD","TR","FORM","LI","BODY", "HTML" }; *************** *** 68,72 **** */ public LinkScanner(String filter) { ! super(filter,MATCH_NAME,ENDERS,ENDTAG_ENDERS, false); processor = new LinkProcessor(); } --- 68,72 ---- */ public LinkScanner(String filter) { ! super(filter,MATCH_NAME,ENDERS,ENDTAG_ENDERS); processor = new LinkProcessor(); } Index: OptionTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/OptionTagScanner.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** OptionTagScanner.java 28 Oct 2003 03:04:18 -0000 1.36 --- OptionTagScanner.java 28 Oct 2003 10:31:02 -0000 1.37 *************** *** 44,48 **** public OptionTagScanner(String filter) { ! super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } --- 44,48 ---- public OptionTagScanner(String filter) { ! 
super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS); } Index: SelectTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/SelectTagScanner.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** SelectTagScanner.java 28 Oct 2003 03:04:18 -0000 1.34 --- SelectTagScanner.java 28 Oct 2003 10:31:02 -0000 1.35 *************** *** 48,52 **** public SelectTagScanner(String filter) { ! super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } --- 48,52 ---- public SelectTagScanner(String filter) { ! super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS); } Index: TableColumnScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TableColumnScanner.java,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** TableColumnScanner.java 26 Oct 2003 19:46:21 -0000 1.37 --- TableColumnScanner.java 28 Oct 2003 10:31:02 -0000 1.38 *************** *** 44,48 **** public TableColumnScanner(String filter) { ! this(filter, MATCH_STRING, new String[] {}, new String[] {}, false); } --- 44,48 ---- public TableColumnScanner(String filter) { ! this(filter, MATCH_STRING, MATCH_STRING, new String[] {}); } *************** *** 51,63 **** String[] nameOfTagToMatch, String [] tagEnders, ! String [] endTagEnders, ! boolean allowSelfChildren) { super( filter, nameOfTagToMatch, tagEnders, ! endTagEnders, ! allowSelfChildren ! ); } public Tag createTag(Page page, int start, int end, Vector attributes, Tag startTag, Tag endTag, NodeList children) throws ParserException --- 51,60 ---- String[] nameOfTagToMatch, String [] tagEnders, ! String [] endTagEnders) { super( filter, nameOfTagToMatch, tagEnders, ! 
endTagEnders); } public Tag createTag(Page page, int start, int end, Vector attributes, Tag startTag, Tag endTag, NodeList children) throws ParserException Index: TableRowScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TableRowScanner.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** TableRowScanner.java 26 Oct 2003 19:46:21 -0000 1.40 --- TableRowScanner.java 28 Oct 2003 10:31:02 -0000 1.41 *************** *** 46,50 **** public TableRowScanner(String filter,Parser parser) { ! this(filter, parser, MATCH_STRING, new String[] {}, new String[] {}, false); } --- 46,50 ---- public TableRowScanner(String filter,Parser parser) { ! this(filter, parser, MATCH_STRING, MATCH_STRING, new String[] {}); } *************** *** 54,66 **** String[] nameOfTagToMatch, String [] tagEnders, ! String [] endTagEnders, ! boolean allowSelfChildren) { super( filter, nameOfTagToMatch, tagEnders, ! endTagEnders, ! allowSelfChildren ! ); parser.addScanner(new TableColumnScanner()); } --- 54,63 ---- String[] nameOfTagToMatch, String [] tagEnders, ! String [] endTagEnders) { super( filter, nameOfTagToMatch, tagEnders, ! endTagEnders); parser.addScanner(new TableColumnScanner()); } Index: TableScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TableScanner.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** TableScanner.java 26 Oct 2003 19:46:21 -0000 1.39 --- TableScanner.java 28 Oct 2003 10:31:02 -0000 1.40 *************** *** 48,54 **** public TableScanner(Parser parser,String filter) { ! super(filter, MATCH_STRING, ENDERS, ENDTAG_ENDERS, true); parser.addScanner(new TableRowScanner(parser)); - } --- 48,53 ---- public TableScanner(Parser parser,String filter) { ! 
super(filter, MATCH_STRING, ENDERS, ENDTAG_ENDERS); parser.addScanner(new TableRowScanner(parser)); } Index: TextareaTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TextareaTagScanner.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** TextareaTagScanner.java 28 Oct 2003 03:04:18 -0000 1.31 --- TextareaTagScanner.java 28 Oct 2003 10:31:02 -0000 1.32 *************** *** 45,49 **** public TextareaTagScanner(String filter) { ! super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } --- 45,49 ---- public TextareaTagScanner(String filter) { ! super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS); } Index: TitleScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TitleScanner.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** TitleScanner.java 27 Oct 2003 02:18:04 -0000 1.33 --- TitleScanner.java 28 Oct 2003 10:31:02 -0000 1.34 *************** *** 42,50 **** public class TitleScanner extends CompositeTagScanner { private static final String MATCH_NAME [] = {"TITLE"}; ! private static final String ENDERS [] = {"BODY"}; private static final String END_TAG_ENDERS [] = {"HEAD", "HTML"}; public TitleScanner(String filter) { ! super(filter,MATCH_NAME,ENDERS,END_TAG_ENDERS,false); } --- 42,50 ---- public class TitleScanner extends CompositeTagScanner { private static final String MATCH_NAME [] = {"TITLE"}; ! private static final String ENDERS [] = {"TITLE","BODY"}; private static final String END_TAG_ENDERS [] = {"HEAD", "HTML"}; public TitleScanner(String filter) { ! super(filter,MATCH_NAME,ENDERS,END_TAG_ENDERS); } |
From: <der...@us...> - 2003-10-28 03:05:42
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs1:/tmp/cvs-serv19975/tests/tagTests Modified Files: OptionTagTest.java SelectTagTest.java TextareaTagTest.java Log Message: Moved the recursion from the NodeFactory to the CompositeTagScanner where it belongs. Also needed to kick off the recursion in IteratorImpl. The scanner is obtained in a kludgy way -- just 'til tags know their own scanners. Also fixed the other NodeFactory signatures to have a Page rather than a Lexer. Index: OptionTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/OptionTagTest.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** OptionTagTest.java 26 Oct 2003 19:46:27 -0000 1.35 --- OptionTagTest.java 28 Oct 2003 03:04:19 -0000 1.36 *************** *** 32,37 **** package org.htmlparser.tests.tagTests; - import java.util.Stack; - import org.htmlparser.scanners.OptionTagScanner; import org.htmlparser.tags.OptionTag; --- 32,35 ---- *************** *** 70,74 **** super.setUp(); createParser(testHTML); ! parser.addScanner(new OptionTagScanner("-option", new Stack ())); parseAndAssertNodeCount(13); } --- 68,72 ---- super.setUp(); createParser(testHTML); ! 
parser.addScanner(new OptionTagScanner("-option")); parseAndAssertNodeCount(13); } Index: SelectTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/SelectTagTest.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** SelectTagTest.java 26 Oct 2003 19:46:27 -0000 1.35 --- SelectTagTest.java 28 Oct 2003 03:04:19 -0000 1.36 *************** *** 32,37 **** package org.htmlparser.tests.tagTests; - import java.util.Stack; - import org.htmlparser.scanners.OptionTagScanner; import org.htmlparser.scanners.SelectTagScanner; --- 32,35 ---- *************** *** 80,85 **** super.setUp(); createParser(testHTML); ! parser.addScanner(new SelectTagScanner("-s", new Stack ())); ! parser.addScanner(new OptionTagScanner("-o", new Stack ())); parseAndAssertNodeCount(1); assertTrue("Node 1 should be Select Tag",node[0] instanceof SelectTag); --- 78,83 ---- super.setUp(); createParser(testHTML); ! parser.addScanner(new SelectTagScanner("-s")); ! parser.addScanner(new OptionTagScanner("-o")); parseAndAssertNodeCount(1); assertTrue("Node 1 should be Select Tag",node[0] instanceof SelectTag); Index: TextareaTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/TextareaTagTest.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** TextareaTagTest.java 26 Oct 2003 19:46:27 -0000 1.33 --- TextareaTagTest.java 28 Oct 2003 03:04:19 -0000 1.34 *************** *** 29,34 **** package org.htmlparser.tests.tagTests; - import java.util.Stack; - import org.htmlparser.scanners.TextareaTagScanner; import org.htmlparser.tags.TextareaTag; --- 29,32 ---- *************** *** 62,66 **** super.setUp(); createParser(testHTML); ! 
parser.addScanner(new TextareaTagScanner("-t", new Stack ())); parseAndAssertNodeCount(5); } --- 60,64 ---- super.setUp(); createParser(testHTML); ! parser.addScanner(new TextareaTagScanner("-t")); parseAndAssertNodeCount(5); } |
From: <der...@us...> - 2003-10-28 03:05:42
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv19975/scanners Modified Files: BulletListScanner.java BulletScanner.java CompositeTagScanner.java FormScanner.java OptionTagScanner.java ScriptScanner.java SelectTagScanner.java TextareaTagScanner.java Log Message: Moved the recursion from the NodeFactory to the CompositeTagScanner where it belongs. Also needed to kick off the recursion in IteratorImpl. The scanner is obtained in a kludgy way -- just 'til tags know their own scanners. Also fixed the other NodeFactory signatures to have a Page rather than a Lexer. Index: BulletListScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/BulletListScanner.java,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** BulletListScanner.java 26 Oct 2003 19:46:19 -0000 1.19 --- BulletListScanner.java 28 Oct 2003 03:04:18 -0000 1.20 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; --- 29,32 ---- *************** *** 47,51 **** private static final String [] MATCH_STRING = { "UL", "OL" }; private final static String ENDERS [] = { "BODY", "HTML" }; - private Stack ulli = new Stack(); public BulletListScanner(Parser parser) --- 46,49 ---- *************** *** 57,61 **** { super(filter, MATCH_STRING, ENDERS); ! parser.addScanner(new BulletScanner("-bullet",ulli)); } --- 55,59 ---- { super(filter, MATCH_STRING, ENDERS); ! 
parser.addScanner(new BulletScanner("-bullet")); } *************** *** 80,88 **** return MATCH_STRING; } - - public void beforeScanningStarts() - { - ulli.push(this); - } - } --- 78,80 ---- Index: BulletScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/BulletScanner.java,v retrieving revision 1.24 retrieving revision 1.25 diff -C2 -d -r1.24 -r1.25 *** BulletScanner.java 26 Oct 2003 19:46:19 -0000 1.24 --- BulletScanner.java 28 Oct 2003 03:04:18 -0000 1.25 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; import org.htmlparser.lexer.Page; --- 29,32 ---- *************** *** 50,68 **** { private static final String [] MATCH_STRING = {"LI"}; ! private final static String ENDERS [] = { "BODY", "HTML" }; private final static String END_TAG_ENDERS [] = { "UL" }; - private Stack ulli; - - public BulletScanner(Stack ulli) - { - this("",ulli); - } ! public BulletScanner(String filter, Stack ulli) { super(filter, MATCH_STRING, ENDERS, END_TAG_ENDERS, false); - this.ulli = ulli; } ! public Tag createTag(Page page, int start, int end, Vector attributes, Tag startTag, Tag endTag, NodeList children) throws ParserException { --- 49,60 ---- { private static final String [] MATCH_STRING = {"LI"}; ! private final static String ENDERS [] = { "LI", "BODY", "HTML" }; private final static String END_TAG_ENDERS [] = { "UL" }; ! public BulletScanner(String filter) { super(filter, MATCH_STRING, ENDERS, END_TAG_ENDERS, false); } ! 
public Tag createTag(Page page, int start, int end, Vector attributes, Tag startTag, Tag endTag, NodeList children) throws ParserException { *************** *** 85,110 **** return MATCH_STRING; } - - /** - * This is the logic that decides when a bullet tag can be allowed - */ - public boolean shouldCreateEndTagAndExit() - { - if (ulli.size()==0) - return false; - CompositeTagScanner parentScanner = (CompositeTagScanner)ulli.peek(); - if (parentScanner == this) - { - ulli.pop(); - return true; - } - else - return false; - } - - public void beforeScanningStarts() - { - ulli.push(this); - } - } --- 77,79 ---- Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.73 retrieving revision 1.74 diff -C2 -d -r1.73 -r1.74 *** CompositeTagScanner.java 26 Oct 2003 19:46:19 -0000 1.73 --- CompositeTagScanner.java 28 Oct 2003 03:04:18 -0000 1.74 *************** *** 188,196 **** /** * Collect the children. - * Performs an immediate call to {@link #shouldCreateEndTagAndExit} to - * allow subclasses to override the scan is a primitive way. If - * <code>true</code>, returns a virtual end tag and repositions the lexer - * to re-read that same tag.<p> - * Otherwise, calls {@link #beforeScanningStarts} and begins scanning. * An initial test is performed for an empty XML tag, in which case * the start tag and end tag of the returned tag are the same and it has --- 188,191 ---- *************** *** 202,206 **** * <code>false</code>. In all but the first case, a virtual end tag * is created. Each node found that is not the end tag is added to ! * the list of children and a call made to {@link #childNodeEncountered}.<p> * The scanner's {@link #createTag} method is called with details about * the start tag, end tag and children. The attributes from the start tag --- 197,201 ---- * <code>false</code>. 
In all but the first case, a virtual end tag * is created. Each node found that is not the end tag is added to ! * the list of children.<p> * The scanner's {@link #createTag} method is called with details about * the start tag, end tag and children. The attributes from the start tag *************** *** 211,217 **** * @param url The url for the page the tag is discovered on. * @param lexer The source of subsequent nodes. ! * @return The scanner specific tag from the call to {@link #createTag}., ! * or the virtual end tag if {@link #shouldCreateEndTagAndExit} returned ! * <code>true</code>. */ public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException --- 206,210 ---- * @param url The url for the page the tag is discovered on. * @param lexer The source of subsequent nodes. ! * @return The scanner specific tag from the call to {@link #createTag}. */ public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException *************** *** 220,284 **** NodeList nodeList; Tag endTag; CompositeTag composite; Tag ret; ! if (shouldCreateEndTagAndExit ()) ! { ! ret = createVirtualEndTag (tag, lexer.getPage (), tag.elementBegin ()); ! lexer.setPosition (tag.elementBegin ()); ! } else ! { ! beforeScanningStarts (); ! nodeList = new NodeList (); ! endTag = null; ! ! if (tag.isEmptyXmlTag ()) ! endTag = tag; ! else ! do { ! node = lexer.nextNode (balance_quotes); ! if (null != node) { ! if (node instanceof Tag) { ! Tag end = (Tag)node; ! // check for normal end tag ! if (end.isEndTag () && end.getTagName ().equals (tag.getTagName ())) ! { ! endTag = end; ! node = null; ! } ! else if (isTagToBeEndedFor (end) || // check DTD ! ( // check for child of same name not allowed ! !(end.isEndTag ()) && ! !isAllowSelfChildren () && ! end.getTagName ().equals (tag.getTagName ()) ! )) ! { ! endTag = createVirtualEndTag (tag, lexer.getPage (), end.elementBegin ()); ! lexer.setPosition (end.elementBegin ()); ! node = null; ! } } ! ! if (null != node) { ! 
nodeList.add (node); ! childNodeEncountered (node); } } } while (null != node); ! ! if (null == endTag) ! endTag = createVirtualEndTag (tag, lexer.getPage (), lexer.getCursor ().getPosition ()); ! ! composite = (CompositeTag)createTag (lexer.getPage (), tag.elementBegin (), endTag.elementEnd (), tag.getAttributesEx (), tag, endTag, nodeList); ! for (int i = 0; i < composite.getChildCount (); i++) ! composite.childAt (i).setParent (composite); ! ret = composite; ! } return (ret); --- 213,280 ---- NodeList nodeList; Tag endTag; + String name; + TagScanner scanner; CompositeTag composite; Tag ret; ! nodeList = new NodeList (); ! endTag = null; ! ! if (tag.isEmptyXmlTag ()) ! endTag = tag; else ! do ! { ! node = lexer.nextNode (balance_quotes); ! if (null != node) { ! if (node instanceof Tag) { ! Tag next = (Tag)node; ! name = next.getTagName (); ! // check for normal end tag ! if (next.isEndTag () && name.equals (tag.getTagName ())) { ! endTag = next; ! node = null; } ! else if (isTagToBeEndedFor (next) || // check DTD ! ( // check for child of same name not allowed ! !(next.isEndTag ()) && ! !isAllowSelfChildren () && ! name.equals (tag.getTagName ()) ! )) { ! // insert a virtual end tag and backup one node ! endTag = createVirtualEndTag (tag, lexer.getPage (), next.elementBegin ()); ! lexer.setPosition (next.elementBegin ()); ! node = null; ! } ! else if (!next.isEndTag ()) ! { ! // now recurse if there is a scanner for this type of tag ! // whoah! really cheat here to get the parser ! // maybe eventually the tag will know it's own scanner eh ! org.htmlparser.Parser parser = (org.htmlparser.Parser)lexer.getNodeFactory (); ! scanner = parser.getScanner (name); ! if ((null != scanner) && scanner.evaluate (next, this)) ! node = scanner.createScannedNode (next, lexer.getPage ().getUrl (), lexer); } } + + if (null != node) + nodeList.add (node); } + } while (null != node); ! ! if (null == endTag) ! 
endTag = createVirtualEndTag (tag, lexer.getPage (), lexer.getCursor ().getPosition ()); ! ! composite = (CompositeTag)createTag (lexer.getPage (), tag.elementBegin (), endTag.elementEnd (), tag.getAttributesEx (), tag, endTag, nodeList); ! for (int i = 0; i < composite.getChildCount (); i++) ! composite.childAt (i).setParent (composite); ! ret = composite; ! return (ret); *************** *** 312,332 **** /** - * Override this method if you wish to create any data structures or do anything - * before the start of the scan. This is just after a tag has triggered the scanner - * but before the scanner begins its processing. - */ - public void beforeScanningStarts() - { - } - - /** - * This method is called everytime a child to the composite is found. It is useful when we - * need to store special children seperately. Though, all children are collected anyway into a node list. - */ - public void childNodeEncountered(Node node) - { - } - - /** * For composite tags this shouldn't be used and hence throws an exception. */ --- 308,311 ---- *************** *** 368,383 **** } ! public final boolean isAllowSelfChildren() { return allowSelfChildren; - } - - /** - * Override this method to implement scanner logic that determines if the current scanner is - * to be allowed. This is useful when there are rules which dont allow recursive tags of the same - * type. @see BulletScanner - * @return boolean true/false - */ - public boolean shouldCreateEndTagAndExit() { - return false; } } --- 347,353 ---- } ! public final boolean isAllowSelfChildren() ! 
{ return allowSelfChildren; } } Index: FormScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/FormScanner.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** FormScanner.java 27 Oct 2003 02:18:04 -0000 1.48 --- FormScanner.java 28 Oct 2003 03:04:18 -0000 1.49 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; --- 29,32 ---- *************** *** 50,55 **** private static final String [] formTagEnders = {"HTML","BODY"}; - private Stack stack = new Stack(); - /** * Constructs a form scanner. --- 49,52 ---- *************** *** 69,75 **** super(filter,MATCH_ID,formTagEnders,false); parser.addScanner(new InputTagScanner("-i")); ! parser.addScanner(new TextareaTagScanner("-t",stack)); ! parser.addScanner(new SelectTagScanner("-select", stack)); ! parser.addScanner(new OptionTagScanner("-option",stack)); } --- 66,72 ---- super(filter,MATCH_ID,formTagEnders,false); parser.addScanner(new InputTagScanner("-i")); ! parser.addScanner(new TextareaTagScanner("-t")); ! parser.addScanner(new SelectTagScanner("-select")); ! parser.addScanner(new OptionTagScanner("-option")); } *************** *** 139,148 **** FormTag ret; - // special step here... - // not sure why the recursion is tracked this way, - // rather than using the ENDERS and END_TAG_ENDERS arrays... 
- if (!stack.empty () && (this == stack.peek ())) - stack.pop (); - ret = new FormTag (); ret.setPage (page); --- 136,139 ---- *************** *** 161,169 **** return (ret); - } - - public void beforeScanningStarts() - { - stack.push(this); } } --- 152,155 ---- Index: OptionTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/OptionTagScanner.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** OptionTagScanner.java 26 Oct 2003 19:46:21 -0000 1.35 --- OptionTagScanner.java 28 Oct 2003 03:04:18 -0000 1.36 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; import org.htmlparser.lexer.Page; --- 29,32 ---- *************** *** 43,55 **** private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT", "OPTION" }; private static final String [] END_TAG_ENDERS = { "SELECT", "FORM", "BODY", "HTML" }; - private Stack stack; - - public OptionTagScanner(Stack stack) { - this("", stack); - } ! public OptionTagScanner(String filter, Stack stack) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); - this.stack = stack; } --- 42,48 ---- private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT", "OPTION" }; private static final String [] END_TAG_ENDERS = { "SELECT", "FORM", "BODY", "HTML" }; ! public OptionTagScanner(String filter) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } *************** *** 62,71 **** OptionTag ret; - // special step here... - // not sure why the recursion is tracked this way, - // rather than using the ENDERS and END_TAG_ENDERS arrays... 
- if (!stack.empty () && (this == stack.peek ())) - stack.pop (); - ret = new OptionTag (); ret.setPage (page); --- 55,58 ---- *************** *** 76,110 **** ret.setEndTag (endTag); ret.setChildren (children); - - return (ret); - } - - public void beforeScanningStarts () - { - stack.push (this); - } - - /** - * This is the logic that decides when a option tag can be allowed - */ - public boolean shouldCreateEndTagAndExit () - { - boolean ret; - - ret = false; - - if (0 != stack.size ()) - { - TagScanner parentScanner = (TagScanner)stack.peek (); - if (parentScanner instanceof CompositeTagScanner) - { - CompositeTagScanner scanner = (CompositeTagScanner)parentScanner; - if (scanner.tagEnderSet.contains (MATCH_NAME[0])) // should loop over names - { - stack.pop (); - ret = true; - } - } - } return (ret); --- 63,66 ---- Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** ScriptScanner.java 26 Oct 2003 19:46:21 -0000 1.44 --- ScriptScanner.java 28 Oct 2003 03:04:18 -0000 1.45 *************** *** 135,139 **** else // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); } else if (node instanceof RemarkNode) --- 135,139 ---- else // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer.getPage (), node.elementBegin (), node.elementEnd ()); } else if (node instanceof RemarkNode) *************** *** 145,149 **** // TODO: need to remove this cast // last = (StringNode)factory.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); ! 
last = (StringNode)factory.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); } } --- 145,149 ---- // TODO: need to remove this cast // last = (StringNode)factory.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); ! last = (StringNode)factory.createStringNode (lexer.getPage (), node.elementBegin (), node.elementEnd ()); } } *************** *** 163,167 **** if (null == last) // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer, position, position); // build new end tag if required if (null == end) --- 163,167 ---- if (null == last) // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer.getPage (), position, position); // build new end tag if required if (null == end) Index: SelectTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/SelectTagScanner.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** SelectTagScanner.java 26 Oct 2003 19:46:21 -0000 1.33 --- SelectTagScanner.java 28 Oct 2003 03:04:18 -0000 1.34 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; --- 29,32 ---- *************** *** 46,60 **** private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT" }; private static final String [] END_TAG_ENDERS = {"FORM", "BODY", "HTML" }; - private Stack stack; - - public SelectTagScanner(Stack stack) - { - this("", stack); - } ! public SelectTagScanner(String filter, Stack stack) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); - this.stack = stack; } --- 45,52 ---- private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT" }; private static final String [] END_TAG_ENDERS = {"FORM", "BODY", "HTML" }; ! 
public SelectTagScanner(String filter) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } *************** *** 68,77 **** SelectTag ret; - // special step here... - // not sure why the recursion is tracked this way, - // rather than using the ENDERS and END_TAG_ENDERS arrays... - if (!stack.empty () && (this == stack.peek ())) - stack.pop (); - ret = new SelectTag (); ret.setPage (page); --- 60,63 ---- *************** *** 82,116 **** ret.setEndTag (endTag); ret.setChildren (children); - - return (ret); - } - - public void beforeScanningStarts () - { - stack.push (this); - } - - /** - * This is the logic that decides when a option tag can be allowed - */ - public boolean shouldCreateEndTagAndExit () - { - boolean ret; - - ret = false; - - if (0 != stack.size ()) - { - TagScanner parentScanner = (TagScanner)stack.peek (); - if (parentScanner instanceof CompositeTagScanner) - { - CompositeTagScanner scanner = (CompositeTagScanner)parentScanner; - if (scanner.tagEnderSet.contains (MATCH_NAME[0])) // should loop over names - { - stack.pop (); - ret = true; - } - } - } return (ret); --- 68,71 ---- Index: TextareaTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TextareaTagScanner.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** TextareaTagScanner.java 26 Oct 2003 19:46:21 -0000 1.30 --- TextareaTagScanner.java 28 Oct 2003 03:04:18 -0000 1.31 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; import org.htmlparser.lexer.Page; --- 29,32 ---- *************** *** 43,57 **** private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT", "OPTION" }; private static final String [] END_TAG_ENDERS = {"FORM", "BODY", "HTML" }; - private Stack stack; - - public TextareaTagScanner(Stack stack) - { - this("", stack); - } ! 
public TextareaTagScanner(String filter, Stack stack) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); - this.stack = stack; } --- 42,49 ---- private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT", "OPTION" }; private static final String [] END_TAG_ENDERS = {"FORM", "BODY", "HTML" }; ! public TextareaTagScanner(String filter) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } *************** *** 64,73 **** TextareaTag ret; - // special step here... - // not sure why the recursion is tracked this way, - // rather than using the ENDERS and END_TAG_ENDERS arrays... - if (!stack.empty () && (this == stack.peek ())) - stack.pop (); - ret = new TextareaTag (); ret.setPage (page); --- 56,59 ---- *************** *** 78,112 **** ret.setEndTag (endTag); ret.setChildren (children); - - return (ret); - } - - public void beforeScanningStarts () - { - stack.push (this); - } - - /** - * This is the logic that decides when a option tag can be allowed - */ - public boolean shouldCreateEndTagAndExit () - { - boolean ret; - - ret = false; - - if (0 != stack.size ()) - { - TagScanner parentScanner = (TagScanner)stack.peek (); - if (parentScanner instanceof CompositeTagScanner) - { - CompositeTagScanner scanner = (CompositeTagScanner)parentScanner; - if (scanner.tagEnderSet.contains (MATCH_NAME[0])) // should loop over names - { - stack.pop (); - ret = true; - } - } - } return (ret); --- 64,67 ---- |
From: <der...@us...> - 2003-10-28 03:05:42
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv19975 Modified Files: Parser.java Log Message: Moved the recursion from the NodeFactory to the CompositeTagScanner where it belongs. Also needed to kick off the recursion in IteratorImpl. The scanner is obtained in a kludgy way -- just 'til tags know their own scanners. Also fixed the other NodeFactory signatures to have a Page rather than a Lexer. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.69 retrieving revision 1.70 diff -C2 -d -r1.69 -r1.70 *** Parser.java 27 Oct 2003 02:18:02 -0000 1.69 --- Parser.java 28 Oct 2003 03:04:17 -0000 1.70 *************** *** 981,993 **** /** * Create a new string node. ! * @param lexer The lexer parsing this string. * @param start The beginning position of the string. * @param end The ending positiong of the string. */ ! public Node createStringNode (Lexer lexer, int start, int end) { Node ret; ! ret = new StringNode (lexer.getPage (), start, end); if (null != stringNodeFactory) { --- 981,993 ---- /** * Create a new string node. ! * @param page The page the node is on. * @param start The beginning position of the string. * @param end The ending positiong of the string. */ ! public Node createStringNode (Page page, int start, int end) { Node ret; ! ret = new StringNode (page, start, end); if (null != stringNodeFactory) { *************** *** 1005,1015 **** /** * Create a new remark node. ! * @param lexer The lexer parsing this remark. * @param start The beginning position of the remark. * @param end The ending positiong of the remark. */ ! public Node createRemarkNode (Lexer lexer, int start, int end) { ! return (new RemarkNode (lexer.getPage (), start, end)); } --- 1005,1015 ---- /** * Create a new remark node. ! * @param page The page the node is on. 
* @param start The beginning position of the remark. * @param end The ending positiong of the remark. */ ! public Node createRemarkNode (Page page, int start, int end) { ! return (new RemarkNode (page, start, end)); } *************** *** 1020,1091 **** * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. ! * @param lexer The lexer parsing this tag. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. */ ! public Node createTagNode (Lexer lexer, int start, int end, Vector attributes) ! throws ! ParserException ! { ! return (new Tag (lexer.getPage (), start, end, attributes)); ! } ! ! /** ! * Scan a new tag node. ! * Provides composite tags the opportunity to collect their children by ! * scanning forward using the same lexer that created the composite tag. ! * On isolating a tag, processing in the lexer is: ! * <pre><code> ! * Node node = getNodeFactory ().createTagNode (this, begin, end, attributes); ! * node = getNodeFactory ().scanTagNode (this, node); ! * </code></pre> ! * This two step process, allows a node factory to only handle node ! * creation if it wishes, and delegate the recursion and scanning of child ! * nodes to the original factory. ! * Without giving too much implementation details, the low level lexer node ! * factory simply returns the same tag, while the higher level parser node ! * factory checks for a scanner registered for the node type and if there ! * is one, calls the scanner to create the specific type of node, which ! * advances the lexer past the children of the node. ! * @param lexer The lexer that parsed this tag. ! * @param tag The tag (just) created by createTagNode. Although this is ! * of type Node, it can safely be cast to the type returned by ! * {@link #createTagNode createTagNode}. ! * @return Either the same node or a new node containing children. ! 
* In any case the lexer should be positioned to proceed with the isolation ! * of the next unknown node. ! */ ! public Node scanTagNode (Lexer lexer, Node tag) throws ParserException { ! String name; ! TagScanner save; ! TagScanner scanner; ! Tag ret; ! ! ret = (Tag)tag; ! if (!ret.isEndTag ()) ! { ! // now recurse if there is a scanner for this type of tag ! name = ret.getTagName (); ! scanner = (TagScanner)scanners.get (name); ! save = mScanner; ! if ((null != scanner) && scanner.evaluate (ret, save)) ! { ! mScanner = scanner; ! try ! { ! ret = scanner.createScannedNode (ret, lexer.getPage ().getUrl (), lexer); ! } ! finally ! { ! mScanner = save; ! } ! } ! } ! ! return (ret); } } --- 1020,1033 ---- * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. ! * @param page The page the node is on. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. */ ! public Node createTagNode (Page page, int start, int end, Vector attributes) throws ParserException { ! return (new Tag (page, start, end, attributes)); } } |
From: <der...@us...> - 2003-10-28 03:05:42
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv19975/lexer Modified Files: Lexer.java Log Message: Moved the recursion from the NodeFactory to the CompositeTagScanner where it belongs. Also needed to kick off the recursion in IteratorImpl. The scanner is obtained in a kludgy way -- just 'til tags know their own scanners. Also fixed the other NodeFactory signatures to have a Page rather than a Lexer. Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** Lexer.java 26 Oct 2003 19:46:18 -0000 1.17 --- Lexer.java 28 Oct 2003 03:04:18 -0000 1.18 *************** *** 368,372 **** { // got some characters mCursor = cursor; ! ret = getNodeFactory ().createStringNode (this, begin, end); } else --- 368,372 ---- { // got some characters mCursor = cursor; ! ret = getNodeFactory ().createStringNode (this.getPage (), begin, end); } else *************** *** 752,757 **** return (makeString (cursor)); mCursor = cursor; ! ret = getNodeFactory ().createTagNode (this, begin, end, attributes); ! ret = getNodeFactory ().scanTagNode (this, ret); } else --- 752,756 ---- return (makeString (cursor)); mCursor = cursor; ! ret = getNodeFactory ().createTagNode (this.getPage (), begin, end, attributes); } else *************** *** 896,900 **** return (makeString (cursor)); mCursor = cursor; ! ret = getNodeFactory ().createRemarkNode (this, begin, end); } else --- 895,899 ---- return (makeString (cursor)); mCursor = cursor; ! ret = getNodeFactory ().createRemarkNode (this.getPage (), begin, end); } else *************** *** 910,931 **** /** * Create a new string node. ! * @param lexer The lexer parsing this string. * @param start The beginning position of the string. * @param end The ending positiong of the string. */ ! 
public Node createStringNode (Lexer lexer, int start, int end) { ! return (new StringNode (lexer.getPage (), start, end)); } /** * Create a new remark node. ! * @param lexer The lexer parsing this remark. * @param start The beginning position of the remark. * @param end The ending positiong of the remark. */ ! public Node createRemarkNode (Lexer lexer, int start, int end) { ! return (new RemarkNode (lexer.getPage (), start, end)); } --- 909,930 ---- /** * Create a new string node. ! * @param page The page the node is on. * @param start The beginning position of the string. * @param end The ending positiong of the string. */ ! public Node createStringNode (Page page, int start, int end) { ! return (new StringNode (page, start, end)); } /** * Create a new remark node. ! * @param page The page the node is on. * @param start The beginning position of the remark. * @param end The ending positiong of the remark. */ ! public Node createRemarkNode (Page page, int start, int end) { ! return (new RemarkNode (page, start, end)); } *************** *** 936,977 **** * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. ! * @param lexer The lexer parsing this tag. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. */ ! public Node createTagNode (Lexer lexer, int start, int end, Vector attributes) ! { ! return (new TagNode (lexer.getPage (), start, end, attributes)); ! } ! ! /** ! * Scan a new tag node. ! * Provides composite tags the opportunity to collect their children by ! * scanning forward using the same lexer that created the composite tag. ! * On isolating a tag, processing in the lexer is: ! * <pre><code> ! * Node node = getNodeFactory ().createTagNode (this, begin, end, attributes); ! * node = getNodeFactory ().scanTagNode (this, node); ! * </code></pre> ! 
* This two step process, allows a node factory to only handle node ! * creation if it wishes, and delegate the recursion and scanning of child ! * nodes to the original factory. ! * Without giving too much implementation details, the low level lexer node ! * factory simply returns the same tag, while the higher level parser node ! * factory checks for a scanner registered for the node type and if there ! * is one, calls the scanner to create the specific type of node, which ! * advances the lexer past the children of the node. ! * @param lexer The lexer that parsed this tag. ! * @param tag The tag (just) created by createTagNode. Although this is ! * of type Node, it can safely be cast to the type returned by ! * {@link #createTagNode createTagNode}. ! * @return Either the same node or a new node containing children. ! * In any case the lexer should be positioned to proceed with the isolation ! * of the next unknown node. ! */ ! public Node scanTagNode (Lexer lexer, Node tag) { ! return (tag); } --- 935,946 ---- * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. ! * @param page The page the node is on. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. */ ! public Node createTagNode (Page page, int start, int end, Vector attributes) { ! return (new TagNode (page, start, end, attributes)); } |
From: <der...@us...> - 2003-10-28 03:05:41
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes In directory sc8-pr-cvs1:/tmp/cvs-serv19975/lexer/nodes Modified Files: NodeFactory.java Log Message: Moved the recursion from the NodeFactory to the CompositeTagScanner where it belongs. Also needed to kick off the recursion in IteratorImpl. The scanner is obtained in a kludgy way -- just 'til tags know their own scanners. Also fixed the other NodeFactory signatures to have a Page rather than a Lexer. Index: NodeFactory.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/NodeFactory.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** NodeFactory.java 20 Oct 2003 01:28:02 -0000 1.2 --- NodeFactory.java 28 Oct 2003 03:04:18 -0000 1.3 *************** *** 28,32 **** import java.util.Vector; ! import org.htmlparser.lexer.Lexer; import org.htmlparser.Node; import org.htmlparser.util.ParserException; --- 28,32 ---- import java.util.Vector; ! import org.htmlparser.lexer.Page; import org.htmlparser.Node; import org.htmlparser.util.ParserException; *************** *** 41,49 **** /** * Create a new string node. ! * @param lexer The lexer parsing this string. * @param start The beginning position of the string. * @param end The ending positiong of the string. */ ! public Node createStringNode (Lexer lexer, int start, int end) throws ParserException; --- 41,49 ---- /** * Create a new string node. ! * @param page The page the node is on. * @param start The beginning position of the string. * @param end The ending positiong of the string. */ ! public Node createStringNode (Page page, int start, int end) throws ParserException; *************** *** 51,59 **** /** * Create a new remark node. ! * @param lexer The lexer parsing this remark. * @param start The beginning position of the remark. * @param end The ending positiong of the remark. */ ! 
public Node createRemarkNode (Lexer lexer, int start, int end) throws ParserException; --- 51,59 ---- /** * Create a new remark node. ! * @param page The page the node is on. * @param start The beginning position of the remark. * @param end The ending positiong of the remark. */ ! public Node createRemarkNode (Page page, int start, int end) throws ParserException; *************** *** 65,103 **** * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. ! * @param lexer The lexer parsing this tag. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. */ ! public Node createTagNode (Lexer lexer, int start, int end, Vector attributes) ! throws ! ParserException; ! ! /** ! * Scan a new tag node. ! * Provides composite tags the opportunity to collect their children by ! * scanning forward using the same lexer that created the composite tag. ! * On isolating a tag, processing in the lexer is: ! * <pre><code> ! * Node node = getNodeFactory ().createTagNode (this, begin, end, attributes); ! * node = getNodeFactory ().scanTagNode (this, node); ! * </code></pre> ! * This two step process, allows a node factory to only handle node ! * creation if it wishes, and delegate the recursion and scanning of child ! * nodes to the original factory. ! * Without giving too much implementation details, the low level lexer node ! * factory simply returns the same tag, while the higher level parser node ! * factory checks for a scanner registered for the node type and if there ! * is one, calls the scanner to create the specific type of node, which ! * advances the lexer past the children of the node. ! * @param lexer The lexer that parsed this tag. ! * @param tag The tag (just) created by createTagNode. Although this is ! * of type Node, it can safely be cast to the type returned by ! * {@link #createTagNode createTagNode}. ! 
* @return Either the same node or a new node containing children. ! * In any case the lexer should be positioned to proceed with the isolation ! * of the next unknown node. ! */ ! public Node scanTagNode (Lexer lexer, Node tag) throws ParserException; --- 65,74 ---- * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. ! * @param page The page the node is on. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. */ ! public Node createTagNode (Page page, int start, int end, Vector attributes) throws ParserException; |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv19975/tests/scannersTests Modified Files: CompositeTagScannerTest.java OptionTagScannerTest.java SelectTagScannerTest.java TextareaTagScannerTest.java Log Message: Moved the recursion from the NodeFactory to the CompositeTagScanner where it belongs. Also needed to kick off the recursion in IteratorImpl. The scanner is obtained in a kludgy way -- just 'til tags know their own scanners. Also fixed the other NodeFactory signatures to have a Page rather than a Lexer. Index: CompositeTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/CompositeTagScannerTest.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** CompositeTagScannerTest.java 26 Oct 2003 19:46:26 -0000 1.45 --- CompositeTagScannerTest.java 28 Oct 2003 03:04:19 -0000 1.46 *************** *** 432,436 **** customTag = (CustomTag)node[1]; assertStringEquals( ! "first custom tag html", "<CUSTOM>something</CUSTOM>", customTag.toHtml() --- 432,436 ---- customTag = (CustomTag)node[1]; assertStringEquals( ! "second custom tag html", "<CUSTOM>something</CUSTOM>", customTag.toHtml() *************** *** 438,442 **** Tag endTag = (Tag)node[2]; assertStringEquals( ! "first custom tag html", "</CUSTOM>", endTag.toHtml() --- 438,442 ---- Tag endTag = (Tag)node[2]; assertStringEquals( ! "third custom tag html", "</CUSTOM>", endTag.toHtml() *************** *** 468,472 **** customTag = (CustomTag)node[1]; assertStringEquals( ! "first custom tag html", "<CUSTOM>something</CUSTOM>", customTag.toHtml() --- 468,472 ---- customTag = (CustomTag)node[1]; assertStringEquals( ! "second custom tag html", "<CUSTOM>something</CUSTOM>", customTag.toHtml() *************** *** 485,489 **** Tag endTag = (Tag)node[2]; assertStringEquals( ! 
"first custom tag html", "</CUSTOM>", endTag.toHtml() --- 485,489 ---- Tag endTag = (Tag)node[2]; assertStringEquals( ! "third custom tag html", "</CUSTOM>", endTag.toHtml() Index: OptionTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/OptionTagScannerTest.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** OptionTagScannerTest.java 26 Oct 2003 19:46:27 -0000 1.32 --- OptionTagScannerTest.java 28 Oct 2003 03:04:19 -0000 1.33 *************** *** 29,34 **** package org.htmlparser.tests.scannersTests; - import java.util.Stack; - import org.htmlparser.Node; import org.htmlparser.StringNode; --- 29,32 ---- *************** *** 67,71 **** public void testScan() throws ParserException { ! scanner = new OptionTagScanner("-i", new Stack ()); createParser(testHTML,"http://www.google.com/test/index.html"); parser.addScanner(scanner); --- 65,69 ---- public void testScan() throws ParserException { ! scanner = new OptionTagScanner("-i"); createParser(testHTML,"http://www.google.com/test/index.html"); parser.addScanner(scanner); Index: SelectTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/SelectTagScannerTest.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** SelectTagScannerTest.java 26 Oct 2003 19:46:27 -0000 1.31 --- SelectTagScannerTest.java 28 Oct 2003 03:04:19 -0000 1.32 *************** *** 29,34 **** package org.htmlparser.tests.scannersTests; - import java.util.Stack; - import org.htmlparser.scanners.OptionTagScanner; import org.htmlparser.scanners.SelectTagScanner; --- 29,32 ---- *************** *** 69,77 **** { ! scanner = new SelectTagScanner("-i", new Stack ()); createParser(testHTML,"http://www.google.com/test/index.html"); ! 
scanner = new SelectTagScanner("-ta", new Stack ()); parser.addScanner(scanner); ! parser.addScanner(new OptionTagScanner("", new Stack ())); --- 67,75 ---- { ! scanner = new SelectTagScanner("-i"); createParser(testHTML,"http://www.google.com/test/index.html"); ! scanner = new SelectTagScanner("-ta"); parser.addScanner(scanner); ! parser.addScanner(new OptionTagScanner("")); Index: TextareaTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/TextareaTagScannerTest.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** TextareaTagScannerTest.java 26 Oct 2003 19:46:27 -0000 1.29 --- TextareaTagScannerTest.java 28 Oct 2003 03:04:19 -0000 1.30 *************** *** 29,34 **** package org.htmlparser.tests.scannersTests; - import java.util.Stack; - import org.htmlparser.scanners.TextareaTagScanner; import org.htmlparser.tags.TextareaTag; --- 29,32 ---- *************** *** 61,67 **** public void testScan() throws ParserException { ! scanner = new TextareaTagScanner("-i", new Stack ()); createParser(testHTML); ! scanner = new TextareaTagScanner("-ta", new Stack ()); parser.addScanner(scanner); parseAndAssertNodeCount(5); --- 59,65 ---- public void testScan() throws ParserException { ! scanner = new TextareaTagScanner("-i"); createParser(testHTML); ! scanner = new TextareaTagScanner("-ta"); parser.addScanner(scanner); parseAndAssertNodeCount(5); |
From: <der...@us...> - 2003-10-28 03:05:22
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv19975/util Modified Files: IteratorImpl.java Log Message: Moved the recursion from the NodeFactory to the CompositeTagScanner where it belongs. Also needed to kick off the recursion in IteratorImpl. The scanner is obtained in a kludgy way -- just 'til tags know their own scanners. Also fixed the other NodeFactory signatures to have a Page rather than a Lexer. Index: IteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** IteratorImpl.java 26 Oct 2003 19:46:28 -0000 1.31 --- IteratorImpl.java 28 Oct 2003 03:04:19 -0000 1.32 *************** *** 58,62 **** --- 58,85 ---- ret = mLexer.nextNode (); if (null != ret) + { + // kick off recursion for the top level node + if (ret instanceof org.htmlparser.tags.Tag) + { + org.htmlparser.tags.Tag tag; + String name; + org.htmlparser.scanners.TagScanner scanner; + + tag = (org.htmlparser.tags.Tag)ret; + if (!tag.isEndTag ()) + { + // now recurse if there is a scanner for this type of tag + name = tag.getTagName (); + // whoah! really cheat here to get the parser + // maybe eventually the tag will know it's own scanner eh + org.htmlparser.Parser parser = (org.htmlparser.Parser)mLexer.getNodeFactory (); + scanner = parser.getScanner (name); + if ((null != scanner) && scanner.evaluate (tag, null)) + ret = scanner.createScannedNode (tag, mLexer.getPage ().getUrl (), mLexer); + } + } + preRead.addElement (ret); + } } catch (Exception e) { |
From: <der...@us...> - 2003-10-27 02:20:20
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv25308/tests/scannersTests Modified Files: AppletScannerTest.java BaseHREFScannerTest.java FormScannerTest.java FrameSetScannerTest.java ImageScannerTest.java LinkScannerTest.java ScriptScannerTest.java StyleScannerTest.java TagScannerTest.java Log Message: Some speed improvements; passing tags to evaluate, creating strings without string buffers, etc. Index: AppletScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/AppletScannerTest.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** AppletScannerTest.java 26 Oct 2003 19:46:26 -0000 1.30 --- AppletScannerTest.java 27 Oct 2003 02:18:04 -0000 1.31 *************** *** 47,57 **** } - public void testEvaluate() - { - AppletScanner scanner = new AppletScanner("-a"); - boolean retVal = scanner.evaluate(" Applet ",null); - assertEquals("Evaluation of APPLET tag",new Boolean(true),new Boolean(retVal)); - } - public void testScan() throws ParserException { --- 47,50 ---- Index: BaseHREFScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/BaseHREFScannerTest.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** BaseHREFScannerTest.java 26 Oct 2003 19:46:26 -0000 1.29 --- BaseHREFScannerTest.java 27 Oct 2003 02:18:05 -0000 1.30 *************** *** 63,73 **** } - public void testEvaluate() { - String testData1 = "BASE HREF=\"http://www.abc.com/\""; - assertTrue("Data 1 Should have evaluated true",scanner.evaluate(testData1,null)); - String testData2 = "Base href=\"http://www.abc.com/\""; - assertTrue("Data 2 Should have evaluated true",scanner.evaluate(testData2,null)); - } - public void testScan() throws ParserException{ 
createParser("<html><head><TITLE>test page</TITLE><BASE HREF=\"http://www.abc.com/\"><a href=\"home.cfm\">Home</a>...</html>","http://www.google.com/test/index.html"); --- 63,66 ---- Index: FormScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/FormScannerTest.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** FormScannerTest.java 26 Oct 2003 19:46:26 -0000 1.38 --- FormScannerTest.java 27 Oct 2003 02:18:05 -0000 1.39 *************** *** 88,101 **** } - public void testEvaluate() { - String line1="form method=\"post\" onsubmit=\"return implementsearch()\" name=frmsearch id=form"; - String line2="FORM method=\"post\" onsubmit=\"return implementsearch()\" name=frmsearch id=form"; - String line3="Form method=\"post\" onsubmit=\"return implementsearch()\" name=frmsearch id=form"; - FormScanner formScanner = new FormScanner("",Parser.createParser("")); - assertTrue("Line 1",formScanner.evaluate(line1,null)); - assertTrue("Line 2",formScanner.evaluate(line2,null)); - assertTrue("Line 3",formScanner.evaluate(line3,null)); - } - public void assertTypeNameSize(String description,String type,String name,String size,InputTag inputTag) { assertEquals(description+" type",type,inputTag.getAttribute("TYPE")); --- 88,91 ---- Index: FrameSetScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/FrameSetScannerTest.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** FrameSetScannerTest.java 26 Oct 2003 19:46:26 -0000 1.29 --- FrameSetScannerTest.java 27 Oct 2003 02:18:05 -0000 1.30 *************** *** 33,36 **** --- 33,37 ---- import org.htmlparser.tags.FrameSetTag; import org.htmlparser.tags.FrameTag; + import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; import 
org.htmlparser.util.ParserException; *************** *** 45,58 **** public FrameSetScannerTest(String name) { super(name); - } - - public void testEvaluate() { - String line1="frameset rows=\"115,*\" frameborder=\"NO\" border=\"0\" framespacing=\"0\""; - String line2="FRAMESET rows=\"115,*\" frameborder=\"NO\" border=\"0\" framespacing=\"0\""; - String line3="Frameset rows=\"115,*\" frameborder=\"NO\" border=\"0\" framespacing=\"0\""; - FrameSetScanner frameSetScanner = new FrameSetScanner(""); - assertTrue("Line 1",frameSetScanner.evaluate(line1,null)); - assertTrue("Line 2",frameSetScanner.evaluate(line2,null)); - assertTrue("Line 3",frameSetScanner.evaluate(line3,null)); } --- 46,49 ---- Index: ImageScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ImageScannerTest.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** ImageScannerTest.java 26 Oct 2003 19:46:26 -0000 1.36 --- ImageScannerTest.java 27 Oct 2003 02:18:05 -0000 1.37 *************** *** 31,34 **** --- 31,35 ---- import org.htmlparser.Node; import org.htmlparser.Parser; + import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.scanners.ImageScanner; import org.htmlparser.scanners.TableScanner; *************** *** 68,72 **** { ImageScanner scanner = new ImageScanner("-i",new LinkProcessor()); ! boolean retVal = scanner.evaluate(" img ",null); assertEquals("Evaluation of IMG tag",new Boolean(true),new Boolean(retVal)); } --- 69,75 ---- { ImageScanner scanner = new ImageScanner("-i",new LinkProcessor()); ! Tag tag = new Tag (); ! tag.setTagName ("img"); ! 
boolean retVal = scanner.evaluate (tag ,null); assertEquals("Evaluation of IMG tag",new Boolean(true),new Boolean(retVal)); } Index: LinkScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/LinkScannerTest.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** LinkScannerTest.java 26 Oct 2003 19:46:26 -0000 1.45 --- LinkScannerTest.java 27 Oct 2003 02:18:05 -0000 1.46 *************** *** 84,88 **** assertTrue("Third node should be a tag",node[2] instanceof Tag); assertTrue("Fourth node should be a link",node[3] instanceof LinkTag); ! // LinkScanner.evaluate() says anythin less than 5 characters isn't a link: assertTrue("Fifth node should be a tag",node[4] instanceof Tag); assertTrue("Sixth node should be a tag",node[5] instanceof Tag); --- 84,88 ---- assertTrue("Third node should be a tag",node[2] instanceof Tag); assertTrue("Fourth node should be a link",node[3] instanceof LinkTag); ! // LinkScanner.evaluate() says no HREF means it isn't a link: assertTrue("Fifth node should be a tag",node[4] instanceof Tag); assertTrue("Sixth node should be a tag",node[5] instanceof Tag); *************** *** 158,162 **** { LinkScanner scanner = new LinkScanner("-l"); ! boolean retVal = scanner.evaluate(" a href ",null); assertEquals("Evaluation of the Link tag",new Boolean(true),new Boolean(retVal)); } --- 158,165 ---- { LinkScanner scanner = new LinkScanner("-l"); ! Tag tag = new Tag (); ! tag.setTagName ("a"); ! tag.setAttribute ("href", "https://www.redhat.com/"); ! 
boolean retVal = scanner.evaluate(tag,null); assertEquals("Evaluation of the Link tag",new Boolean(true),new Boolean(retVal)); } Index: ScriptScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** ScriptScannerTest.java 26 Oct 2003 19:46:27 -0000 1.44 --- ScriptScannerTest.java 27 Oct 2003 02:18:05 -0000 1.45 *************** *** 49,59 **** } - public void testEvaluate() - { - ScriptScanner scanner = new ScriptScanner("-s"); - boolean retVal = scanner.evaluate(" script ",null); - assertEquals("Evaluation of SCRIPT tag",new Boolean(true),new Boolean(retVal)); - } - public void testScan() throws ParserException { --- 49,52 ---- Index: StyleScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/StyleScannerTest.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** StyleScannerTest.java 26 Oct 2003 19:46:27 -0000 1.31 --- StyleScannerTest.java 27 Oct 2003 02:18:05 -0000 1.32 *************** *** 44,54 **** } - public void testEvaluate() - { - StyleScanner scanner = new StyleScanner("-s"); - boolean retVal = scanner.evaluate("style ",null); - assertEquals("Evaluation of STYLE tag",new Boolean(true),new Boolean(retVal)); - } - public void testScan() throws ParserException { --- 44,47 ---- Index: TagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/TagScannerTest.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** TagScannerTest.java 26 Oct 2003 19:46:27 -0000 1.34 --- TagScannerTest.java 27 Oct 2003 02:18:05 -0000 1.35 *************** *** 125,129 **** TagScanner scanner = 
new TagScanner() { public Tag scan(Tag tag,String url,Lexer lexer) { return null;} ! public boolean evaluate(String s,TagScanner previousOpenScanner) { return false; } public String [] getID() { return null; } protected Tag createTag (Page page, int start, int end, Vector attributes, Tag tag, String url) { return null; } --- 125,129 ---- TagScanner scanner = new TagScanner() { public Tag scan(Tag tag,String url,Lexer lexer) { return null;} ! public boolean evaluate(Tag tag,TagScanner previousOpenScanner) { return false; } public String [] getID() { return null; } protected Tag createTag (Page page, int start, int end, Vector attributes, Tag tag, String url) { return null; } |
From: <der...@us...> - 2003-10-27 02:18:38
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv25308/scanners Modified Files: FormScanner.java LinkScanner.java TagScanner.java TitleScanner.java Log Message: Some speed improvements; passing tags to evaluate, creating strings without string buffers, etc. Index: FormScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/FormScanner.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** FormScanner.java 26 Oct 2003 19:46:19 -0000 1.47 --- FormScanner.java 27 Oct 2003 02:18:04 -0000 1.48 *************** *** 115,126 **** } ! public boolean evaluate(String s, TagScanner previousOpenScanner) { ! if (previousOpenScanner instanceof LinkScanner) { linkScannerAlreadyOpen = true; StringBuffer msg= new StringBuffer(); ! msg.append("<"); ! msg.append(s); ! msg.append(">"); msg.append(PREVIOUS_DIRTY_LINK_MESSAGE); feedback.warning(msg.toString()); --- 115,125 ---- } ! public boolean evaluate(Tag tag, TagScanner previousOpenScanner) { ! if (previousOpenScanner instanceof LinkScanner) ! { linkScannerAlreadyOpen = true; StringBuffer msg= new StringBuffer(); ! msg.append(tag.toHtml ()); msg.append(PREVIOUS_DIRTY_LINK_MESSAGE); feedback.warning(msg.toString()); *************** *** 133,137 **** else linkScannerAlreadyOpen = false; ! return super.evaluate(s, previousOpenScanner); } --- 132,136 ---- else linkScannerAlreadyOpen = false; ! return super.evaluate(tag, previousOpenScanner); } Index: LinkScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/LinkScanner.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** LinkScanner.java 26 Oct 2003 19:46:20 -0000 1.56 --- LinkScanner.java 27 Oct 2003 02:18:04 -0000 1.57 *************** *** 89,117 **** /** ! 
* Template Method, used to decide if this scanner can handle the Link tag type. If ! * the evaluation returns true, the calling side makes a call to scan(). ! * @param s The complete text contents of the Tag. ! * @param previousOpenScanner Indicates any previous scanner which hasnt completed, before the current ! * scan has begun, and hence allows us to write scanners that can work with dirty html */ ! public boolean evaluate (String s, TagScanner previousOpenScanner) { ! char ch; ! boolean ret; ! ! // eat up leading blanks ! s = absorbLeadingBlanks (s); ! if (5 > s.length ()) ! ret = false; ! else ! { ! ch = s.charAt (0); ! if ((ch=='a' || ch=='A') && Character.isWhitespace (s.charAt (1))) ! ret = -1 != s.toUpperCase().indexOf ("HREF"); ! else ! ret = false; ! } ! ! return (ret); } --- 89,100 ---- /** ! * Check if we can handle this tag. ! * @param tag The generic tag with the name A. ! * @param previousOpenScanner Indicates any previous scanner which hasn't ! * completed, before the current scan has begun. */ ! public boolean evaluate (Tag tag, TagScanner previousOpenScanner) { ! return (null != tag.getAttributeEx ("HREF")); } Index: TagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** TagScanner.java 26 Oct 2003 19:46:21 -0000 1.44 --- TagScanner.java 27 Oct 2003 02:18:04 -0000 1.45 *************** *** 57,61 **** * <br> * If you wish to write your own scanner, then you must implement scan(). ! * You MAY implement evaluate() as well, if your evaluation logic is not based on a simple text match. * You MUST implement getID() - which identifies your scanner uniquely in the hashtable of scanners. * --- 57,62 ---- * <br> * If you wish to write your own scanner, then you must implement scan(). ! * You MAY implement evaluate() as well, if your evaluation logic is not based ! 
* on a match of the tag name. * You MUST implement getID() - which identifies your scanner uniquely in the hashtable of scanners. * *************** *** 136,221 **** } ! /** ! * This method is used to decide if this scanner can handle this tag type. If the ! * evaluation returns true, the calling side makes a call to scan(). ! * <strong>This method has to be implemented meaningfully only if a first-word match with ! * the scanner id does not imply a match (or extra processing needs to be done). ! * Default returns true</strong> ! * @param tagContents The complete text contents of the Tag. ! * @param previousOpenScanner Indicates any previous scanner which hasnt completed, before the current ! * scan has begun, and hence allows us to write scanners that can work with dirty html ! */ ! public boolean evaluate(String tagContents,TagScanner previousOpenScanner) { ! return true; ! } ! ! /** ! * Pull the text between two matching capitalized 'XML' tags. ! * @deprecated This reads ahead on your iterator and doesn't put them back if it's not an XML tag. ! */ ! public static String extractXMLData (Node node, String tagName, NodeIterator iterator) throws ! ParserException ! { ! try ! { ! String xmlData = ""; ! ! boolean xmlTagFound = isXMLTagFound (node, tagName); ! if (xmlTagFound) ! { ! try ! { ! do ! { ! node = iterator.nextNode (); ! if (node!=null) ! { ! if (node instanceof StringNode) ! { ! StringNode stringNode = (StringNode)node; ! if (xmlData.length ()>0) ! xmlData+=" "; ! xmlData += stringNode.getText (); ! } ! else ! if (!(node instanceof Tag && ((Tag)node).isEndTag ())) ! xmlTagFound = false; ! } ! } ! while (node instanceof StringNode); ! ! } ! ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : error while trying to find xml tag",e); ! } ! } ! // check end tag matches start tag ! if (xmlTagFound) ! { ! if (node!=null) ! { ! if (node instanceof Tag && ((Tag)node).isEndTag ()) ! { ! Tag endTag = (Tag)node; ! 
if (!endTag.getTagName ().equals (tagName)) ! xmlTagFound = false; ! } ! ! } ! ! } ! if (xmlTagFound) ! return xmlData; ! else ! return null; ! } ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! } ! } public String getFilter() { --- 137,224 ---- } ! /** ! * This method is used to decide if this scanner can handle this tag type. If the ! * evaluation returns true, the calling side makes a call to scan(). ! * <strong>This method has to be implemented meaningfully only if a first-word match with ! * the scanner id does not imply a match (or extra processing needs to be done). ! * Default returns true</strong> ! * @param tag The tag with a name that matches a value from {@link #getID}. ! * @param previousOpenScanner Indicates any previous scanner which hasn't ! * completed, before the current scan has begun, and hence allows us to ! * write scanners that can work with dirty html. ! */ ! public boolean evaluate (Tag tag, TagScanner previousOpenScanner) ! { ! return (true); ! } ! ! /** ! * Pull the text between two matching capitalized 'XML' tags. ! * @deprecated This reads ahead on your iterator and doesn't put them back if it's not an XML tag. ! */ ! public static String extractXMLData (Node node, String tagName, NodeIterator iterator) throws ! ParserException ! { ! try ! { ! String xmlData = ""; ! ! boolean xmlTagFound = isXMLTagFound (node, tagName); ! if (xmlTagFound) ! { ! try ! { ! do ! { ! node = iterator.nextNode (); ! if (node!=null) ! { ! if (node instanceof StringNode) ! { ! StringNode stringNode = (StringNode)node; ! if (xmlData.length ()>0) ! xmlData+=" "; ! xmlData += stringNode.getText (); ! } ! else ! if (!(node instanceof Tag && ((Tag)node).isEndTag ())) ! xmlTagFound = false; ! } ! } ! while (node instanceof StringNode); ! ! } ! ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : error while trying to find xml tag",e); ! } ! } ! 
// check end tag matches start tag ! if (xmlTagFound) ! { ! if (node!=null) ! { ! if (node instanceof Tag && ((Tag)node).isEndTag ()) ! { ! Tag endTag = (Tag)node; ! if (!endTag.getTagName ().equals (tagName)) ! xmlTagFound = false; ! } ! ! } ! ! } ! if (xmlTagFound) ! return xmlData; ! else ! return null; ! } ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! } ! } public String getFilter() { Index: TitleScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TitleScanner.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** TitleScanner.java 26 Oct 2003 19:46:21 -0000 1.32 --- TitleScanner.java 27 Oct 2003 02:18:04 -0000 1.33 *************** *** 53,60 **** } ! public boolean evaluate(String tagNameBeingChecked, TagScanner previousOpenScanner) { ! absorbLeadingBlanks(tagNameBeingChecked); ! return (tagNameBeingChecked.toUpperCase ().startsWith (MATCH_NAME[0]) && ((null == previousOpenScanner) ! || !previousOpenScanner.getID ()[0].equals ("TITLE"))); } --- 53,59 ---- } ! public boolean evaluate(Tag tag, TagScanner previousOpenScanner) ! { ! return ((null == previousOpenScanner) || !previousOpenScanner.getID ()[0].equals ("TITLE")); } |
From: <der...@us...> - 2003-10-27 02:18:24
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes In directory sc8-pr-cvs1:/tmp/cvs-serv25308/lexer/nodes Modified Files: TagNode.java Log Message: Some speed improvements; passing tags to evaluate, creating strings without string buffers, etc. Index: TagNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/TagNode.java,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** TagNode.java 26 Oct 2003 19:46:18 -0000 1.19 --- TagNode.java 27 Oct 2003 02:18:04 -0000 1.20 *************** *** 157,160 **** --- 157,161 ---- StringBuffer buffer; char quote; + Attribute attribute; // first determine if there's whitespace in the value *************** *** 203,207 **** else quote = 0; ! setAttribute (key, value, quote); } --- 204,216 ---- else quote = 0; ! attribute = getAttributeEx (key); ! if (null != attribute) ! { // see if we can splice it in rather than replace it ! attribute.setValue (value); ! if (0 != quote) ! attribute.setQuote (quote); ! } ! else ! setAttribute (key, value, quote); } *************** *** 594,605 **** public String toHtml () { ! StringBuffer ret; Vector attributes; Attribute attribute; ! ret = new StringBuffer (); attributes = getAttributesEx (); ret.append ("<"); ! for (int i = 0; i < attributes.size (); i++) { attribute = (Attribute)attributes.elementAt (i); --- 603,623 ---- public String toHtml () { ! int length; ! int size; Vector attributes; Attribute attribute; + StringBuffer ret; ! length = 2; attributes = getAttributesEx (); + size = attributes.size (); + for (int i = 0; i < size; i++) + { + attribute = (Attribute)attributes.elementAt (i); + length += attribute.getLength (); + } + ret = new StringBuffer (length); ret.append ("<"); ! for (int i = 0; i < size; i++) { attribute = (Attribute)attributes.elementAt (i); |
From: <der...@us...> - 2003-10-27 02:18:24
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv25308 Modified Files: Parser.java Log Message: Some speed improvements; passing tags to evaluate, creating strings without string buffers, etc. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.68 retrieving revision 1.69 diff -C2 -d -r1.68 -r1.69 *** Parser.java 26 Oct 2003 19:46:17 -0000 1.68 --- Parser.java 27 Oct 2003 02:18:02 -0000 1.69 *************** *** 1073,1077 **** scanner = (TagScanner)scanners.get (name); save = mScanner; ! if ((null != scanner) && scanner.evaluate (ret.getText (), save)) { mScanner = scanner; --- 1073,1077 ---- scanner = (TagScanner)scanners.get (name); save = mScanner; ! if ((null != scanner) && scanner.evaluate (ret, save)) { mScanner = scanner; |