htmlparser-cvs Mailing List for HTML Parser (Page 26)
Brought to you by:
derrickoswald
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(141) |
Jun
(108) |
Jul
(66) |
Aug
(127) |
Sep
(155) |
Oct
(149) |
Nov
(72) |
Dec
(72) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(100) |
Feb
(36) |
Mar
(21) |
Apr
(3) |
May
(87) |
Jun
(28) |
Jul
(84) |
Aug
(5) |
Sep
(14) |
Oct
|
Nov
|
Dec
|
2005 |
Jan
(1) |
Feb
(39) |
Mar
(26) |
Apr
(38) |
May
(14) |
Jun
(10) |
Jul
|
Aug
|
Sep
(13) |
Oct
(8) |
Nov
(10) |
Dec
|
2006 |
Jan
|
Feb
(1) |
Mar
(17) |
Apr
(20) |
May
(28) |
Jun
(24) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2015 |
Jan
|
Feb
|
Mar
(1) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: <der...@us...> - 2004-01-02 05:01:32
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs1:/tmp/cvs-serv27740/lexerTests Modified Files: AttributeTests.java LexerTests.java Log Message: Added testcases but was unable to reproduce the following bugs in the version 1.4 codebase: 839264 toHtml() parse error in Javascripts with "form" keyword 833592 DOCTYPE element is not parsed correctly 826764 ParserException occurs only when using setInputHTML() instea 825820 Words conjoined 825645 <input> not getting parsed inside table 813838 links not parsed correctly and #851882 zero length alt tag causes bug in ImageScanner #832530 empty attribute causes parser to fail #805598 attribute src in tag img sometimes not correctly parsed (these 3 are all the same bug, duplicates of the following): #753012 IMG SRC not parsed v1.3 & v1.4 #755929 Empty string attr. value causes attr parsing to be stopped #778781 SRC-attribute suppression in IMG-tags Also reviewed these test cases, again, with none reproducible in 1.4: #788746 parser crashes on comments like <!-- foobar --!> #772700 Jsp Tags are not parsed correctly when in quoted attributes. 
Index: AttributeTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/AttributeTests.java,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** AttributeTests.java 8 Dec 2003 01:31:53 -0000 1.8 --- AttributeTests.java 2 Jan 2004 05:01:28 -0000 1.9 *************** *** 38,41 **** --- 38,42 ---- import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.lexer.nodes.PageAttribute; + import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; *************** *** 519,522 **** --- 520,604 ---- assertTrue ("Attribute missing", table.containsKey ("OTHER")); assertEquals ("Attribute has wrong value", "fred", (String)table.get ("OTHER")); + } + + /** + * see bug #778781 SRC-attribute suppression in IMG-tags + * & #753012 IMG SRC not parsed v1.3 & v1.4 + * & #755929 Empty string attr. value causes attr parsing to be stopped + * & #778781 SRC-attribute suppression in IMG-tags + * & #832530 empty attribute causes parser to fail + * & #851882 zero length alt tag causes bug in ImageScanner + * + * HTML before parse: + * <img src="images/first" alt="first">" + * <img src="images/second" alt=""> + * <img alt="third" src="images/third"> + * <img alt="" src="images/fourth"> + * + * HTML after parse: + * <IMG ALT="first" SRC="images/first"> + * <IMG ALT="" SRC="images/second"> + * <IMG ALT="third" SRC="images/third"> + * <IMG ALT=""> + */ + public void testSrcAndAlt () throws ParserException + { + String html = "<img src=\"images/first\" alt=\"first\">"; + + createParser (html); + parseAndAssertNodeCount (1); + assertTrue ("Node should be an ImageTag", node[0] instanceof ImageTag); + ImageTag img = (ImageTag)node[0]; + assertTrue ("bad source", "images/first".equals (img.getImageURL ())); + assertTrue ("bad alt", "first".equals (img.getAttribute ("alt"))); + assertStringEquals ("toHtml()", html, 
img.toHtml ()); + } + + /** + * see bug #778781 SRC-attribute suppression in IMG-tags + */ + public void testSrcAndEmptyAlt () throws ParserException + { + String html = "<img src=\"images/second\" alt=\"\">"; + + createParser (html); + parseAndAssertNodeCount (1); + assertTrue ("Node should be an ImageTag", node[0] instanceof ImageTag); + ImageTag img = (ImageTag)node[0]; + assertTrue ("bad source", "images/second".equals (img.getImageURL ())); + assertTrue ("bad alt", "".equals (img.getAttribute ("alt"))); + assertStringEquals ("toHtml()", html, img.toHtml ()); + } + + /** + * see bug #778781 SRC-attribute suppression in IMG-tags + */ + public void testAltAndSrc () throws ParserException + { + String html = "<img alt=\"third\" src=\"images/third\">"; + + createParser (html); + parseAndAssertNodeCount (1); + assertTrue ("Node should be an ImageTag", node[0] instanceof ImageTag); + ImageTag img = (ImageTag)node[0]; + assertTrue ("bad source", "images/third".equals (img.getImageURL ())); + assertTrue ("bad alt", "third".equals (img.getAttribute ("alt"))); + assertStringEquals ("toHtml()", html, img.toHtml ()); + } + + /** + * see bug #778781 SRC-attribute suppression in IMG-tags + */ + public void testEmptyAltAndSrc () throws ParserException + { + String html = "<img alt=\"\" src=\"images/third\">"; + + createParser (html); + parseAndAssertNodeCount (1); + assertTrue ("Node should be an ImageTag", node[0] instanceof ImageTag); + ImageTag img = (ImageTag)node[0]; + assertTrue ("bad source", "images/third".equals (img.getImageURL ())); + assertTrue ("bad alt", "".equals (img.getAttribute ("alt"))); + assertStringEquals ("toHtml()", html, img.toHtml ()); } } Index: LexerTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** LexerTests.java 31 Dec 2003 14:40:50 -0000 1.13 
--- LexerTests.java 2 Jan 2004 05:01:28 -0000 1.14 *************** *** 697,700 **** --- 697,750 ---- } + /** + * See bug #825820 Words conjoined + */ + public void testConjoined () + throws + ParserException + { + StringBuffer buffer; + NodeIterator iterator; + Node node; + String expected; + + expected = "The Title\nThis is the body."; + String html1 = "<html><title>The Title\n</title>" + + "<body>This is <a href=\"foo.html\">the body</a>.</body></html>"; + createParser (html1); + buffer = new StringBuffer (); + for (iterator = parser.elements (); iterator.hasMoreNodes (); ) + { + node = iterator.nextNode (); + String text = node.toPlainTextString (); + buffer.append (text); + } + assertStringEquals ("conjoined text", expected, buffer.toString ()); + + String html2 = "<html><title>The Title</title>\n" + + "<body>This is <a href=\"foo.html\">the body</a>.</body></html>"; + createParser (html2); + buffer = new StringBuffer (); + for (iterator = parser.elements (); iterator.hasMoreNodes (); ) + { + node = iterator.nextNode (); + String text = node.toPlainTextString (); + buffer.append (text); + } + assertStringEquals ("conjoined text", expected, buffer.toString ()); + + String html3 = "<html><title>The Title</title>" + + "<body>\nThis is <a href=\"foo.html\">the body</a>.</body></html>"; + createParser (html3); + buffer = new StringBuffer (); + for (iterator = parser.elements (); iterator.hasMoreNodes (); ) + { + node = iterator.nextNode (); + String text = node.toPlainTextString (); + buffer.append (text); + } + assertStringEquals ("conjoined text", expected, buffer.toString ()); + } + } |
From: <der...@us...> - 2004-01-01 17:16:56
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors In directory sc8-pr-cvs1:/tmp/cvs-serv1008 Modified Files: UrlModifyingVisitor.java Log Message: Fix support request #824989 UrlModifyingVisitor taking out HTML comments. Added remark node handling to preserve comments. Index: UrlModifyingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/UrlModifyingVisitor.java,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** UrlModifyingVisitor.java 8 Dec 2003 01:31:56 -0000 1.41 --- UrlModifyingVisitor.java 1 Jan 2004 17:16:53 -0000 1.42 *************** *** 33,36 **** --- 33,37 ---- import org.htmlparser.Node; import org.htmlparser.Parser; + import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; import org.htmlparser.tags.CompositeTag; *************** *** 59,64 **** } ! public void visitStringNode(StringNode stringNode) { ! modifiedResult.append(stringNode.toHtml()); } --- 60,71 ---- } ! public void visitRemarkNode (RemarkNode remarkNode) ! { ! modifiedResult.append (remarkNode.toHtml()); ! } ! ! public void visitStringNode(StringNode stringNode) ! { ! modifiedResult.append (stringNode.toHtml()); } |
From: <der...@us...> - 2004-01-01 16:54:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs1:/tmp/cvs-serv30232 Modified Files: FormTagTest.java Log Message: Added test case for investigation of support request #772998 Cannot extract input tags. Not reproducible. Index: FormTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/FormTagTest.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** FormTagTest.java 8 Dec 2003 01:31:55 -0000 1.40 --- FormTagTest.java 1 Jan 2004 16:54:47 -0000 1.41 *************** *** 449,451 **** --- 449,530 ---- ); } + + /** + * From support request #772998 Cannot extract input tags + * The getFormInputs list was reporting zero size and textarea tags were + * in the inputs list. + * Neither of these was reproducible. + */ + public void testTextArea () throws Exception + { + FormTag formTag; + NodeList nl; + InputTag inpTag; + TextareaTag texTag; + + String html = "<body onload=\"otextnloadHandler()\" onunload=\"closeAdvanced()\">\n" + + " <form name=\"searchForm\" onsubmit=\"doSearch()\">\n" + + " <table id=\"searchTable\" align=\"left\" valign=\"middle\" cellspacing=\"0\" cellpadding=\"0\" border=\"0\">\n" + + " <tbody><tr nowrap=\"\" valign=\"middle\">\n" + + " <td id=\"searchTD\">\n" + + " <label id=\"searchLabel\" for=\"searchWord\">\n" + + " Search:\n" + + " </label>\n" + + " </td>\n" + + "\n" + + " <td>\n" + + " <input type=\"text\" id=\"searchWord\" name=\"searchWord\" value=\"\" size=\"24\" maxlength=\"256\" alt=\"Search Expression\">\n" + + " </td>\n" + + // note: this was added as there weren't any textarea tags in the page referenced + " <td>\n" + + " <textarea name=\"mytextarea\" rows=\"1\" cols=\"12\" alt=\"Free Form Text\">\n" + + " The text.\n" + + " </textarea>\n" + + " </td>\n" + + " <td>\n" + + " <input type=\"button\" onclick=\"this.blur();doSearch()\" value=\"GO\" id=\"go\" 
alt=\"GO\">\n" + + " <input type=\"hidden\" name=\"maxHits\" value=\"500\">\n" + + " </td>\n" + + " <td nowrap=\"nowrap\">\n" + + "\n" + + " <a id=\"scopeLabel\" href=\"javascript:openAdvanced();\" title=\"Search only the following topics\" alt=\"Search only the following topics\" onmouseover=\"window.status='Search only the following topics'; return true;\" onmouseout=\"window.status='';\">Search scope:</a>\n" + + " </td>\n" + + " <td nowrap=\"nowrap\">\n" + + " <input type=\"hidden\" name=\"workingSet\" value=\"All topics\">\n" + + " <div id=\"scope\">All topics</div>\n" + + " </td>\n" + + " </tr>\n" + + "\n" + + " </tbody></table>\n" + + " </form>\n" + + "\n" + + "</body>\n"; + createParser (html); + formTag = + (FormTag)(parser.extractAllNodesThatAre ( + FormTag.class + )[0]); + assertNotNull ("Should have found a form tag",formTag); + assertStringEquals ("name", "searchForm", formTag.getFormName ()); + nl = formTag.getFormInputs (); + assertTrue ("4 inputs", 4 == nl.size ()); + inpTag = (InputTag)nl.elementAt (0); + assertStringEquals ("name", "searchWord", inpTag.getAttribute ("name")); + assertStringEquals ("value", "", inpTag.getAttribute ("value")); + inpTag = (InputTag)nl.elementAt (1); + assertNull ("name", inpTag.getAttribute ("name")); + assertStringEquals ("value", "GO", inpTag.getAttribute ("value")); + inpTag = (InputTag)nl.elementAt (2); + assertStringEquals ("name", "maxHits", inpTag.getAttribute ("name")); + assertStringEquals ("value", "500", inpTag.getAttribute ("value")); + inpTag = (InputTag)nl.elementAt (3); + assertStringEquals ("name", "workingSet", inpTag.getAttribute ("name")); + assertStringEquals ("value", "All topics", inpTag.getAttribute ("value")); + nl = formTag.getFormTextareas (); + assertTrue ("1 textarea", 1 == nl.size ()); + texTag = (TextareaTag)nl.elementAt (0); + assertStringEquals ("name", "mytextarea", texTag.getAttribute ("name")); + assertTrue ("only 1 child", 1 == texTag.getChildCount ()); + assertStringEquals ("text 
contents", "\n The text.\n ", texTag.getChild (0).toHtml ()); + } } |
From: <der...@us...> - 2003-12-31 14:40:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs1:/tmp/cvs-serv31101/tests/lexerTests Modified Files: LexerTests.java Log Message: Fix bug #789439 Japanese page causes OutOfMemory Exception Modified the lexer to skip over JIS escape sequences. Index: LexerTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** LexerTests.java 8 Dec 2003 01:31:53 -0000 1.12 --- LexerTests.java 31 Dec 2003 14:40:50 -0000 1.13 *************** *** 38,41 **** --- 38,42 ---- import java.net.URL; import java.net.URLConnection; + import java.util.HashSet; import org.htmlparser.Node; *************** *** 51,54 **** --- 52,57 ---- import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; + import org.htmlparser.util.NodeIterator; + import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; *************** *** 592,595 **** --- 595,699 ---- // tests.testSpeedStreamWithTags (); // } + + static final HashSet mAcceptable; + static + { + mAcceptable = new HashSet (); + mAcceptable.add ("A"); + mAcceptable.add ("BODY"); + mAcceptable.add ("BR"); + mAcceptable.add ("CENTER"); + mAcceptable.add ("FONT"); + mAcceptable.add ("HEAD"); + mAcceptable.add ("HR"); + mAcceptable.add ("HTML"); + mAcceptable.add ("IMG"); + mAcceptable.add ("P"); + mAcceptable.add ("TABLE"); + mAcceptable.add ("TD"); + mAcceptable.add ("TITLE"); + mAcceptable.add ("TR"); + } + + /** + * Test case for bug #789439 Japanese page causes OutOfMemory Exception + * No exception is thrown in the current version of the parser, + * however, the problem is that ISO-2022-JP (aka JIS) encoding sometimes + * causes spurious tags. 
+ * The root cause is characters bracketed by [esc]$B and [esc](J (contrary + * to what is indicated in the j_s_nightingale analysis of the problem) that + * sometimes have an angle bracket (< or 0x3c) embedded in them. These + * are taken to be tags by the parser, instead of being considered strings. + * <p> + * The URL referenced has an ISO-8859-1 encoding (the default), but + * Japanese characters intermixed on the page with English, using the JIS + * encoding. We detect failure by looking for weird tag names which were + * not correctly handled as string nodes. + * <p> + * Here is a partial dump of the page with escape sequences: + * <pre> + * 0002420 1b 24 42 3f 79 4a 42 25 47 25 38 25 2b 25 61 43 + * 0002440 35 44 65 43 44 1b 28 4a 20 77 69 74 68 20 43 61 + * .. + * 0002720 6c 22 3e 4a 53 6b 79 1b 24 42 42 50 31 7e 25 5a + * 0002740 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 3c + * .. + * 0003060 20 69 1b 24 42 25 62 21 3c 25 49 42 50 31 7e 25 + * 0003100 5a 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a + * .. + * 0003220 1b 24 42 25 2d 25 3f 25 5e 25 2f 25 69 24 4e 25 + * 0003240 5b 21 3c 25 60 25 5a 21 3c 25 38 1b 28 4a 3c 2f + * .. + * 0003320 6e 65 31 2e 70 6c 22 3e 1b 24 42 3d 60 48 77 43 + * 0003340 66 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 2d 2d 2d 2d + * .. + * 0004400 46 6f 72 75 6d 20 30 30 39 20 28 1b 24 42 3e 21 + * 0004420 3c 6a 24 4b 31 4a 4a 21 44 2e 24 4a 24 49 1b 28 + * 0004440 4a 29 3c 2f 41 3e 3c 49 4d 47 20 53 52 43 3d 22 + * </pre> + * <p> + * The fix proposed by j_s_nightingale is implemented to swallow JIS + * escape sequences in the string parser. + * Apparently the fix won't help EUC-JP and Shift-JIS though, so this may + * still be a problem. + * It's theoretically possible that JIS encoding, or another one, + * could be used as attribute names or values within tags as well, + * but this is considered improbable and is therefore not handled in + * the tag parser state machine. 
+ */ + public void testJIS () + throws ParserException + { + Parser parser; + NodeIterator iterator; + + parser = new Parser ("http://www.009.com/"); + iterator = parser.elements (); + while (iterator.hasMoreNodes ()) + checkTagNames (iterator.nextNode ()); + } + + /** + * Check the tag name for one of the ones expected on the page. + * Recursively check the children. + */ + public void checkTagNames (Node node) + { + Tag tag; + String name; + NodeList children; + + if (node instanceof Tag) + { + tag = (Tag)node; + name = tag.getTagName (); + if (!mAcceptable.contains (name)) + fail ("unrecognized tag name \"" + name + "\""); + children = tag.getChildren (); + if (null != children) + for (int i = 0; i < children.size (); i++) + checkTagNames (children.elementAt (i)); + } + } } |
From: <der...@us...> - 2003-12-31 14:40:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv31101/lexer Modified Files: Lexer.java Log Message: Fix bug #789439 Japanese page causes OutOfMemory Exception Modified the lexer to skip over JIS escape sequences. Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** Lexer.java 8 Dec 2003 01:31:51 -0000 1.22 --- Lexer.java 31 Dec 2003 14:40:49 -0000 1.23 *************** *** 312,315 **** --- 312,359 ---- /** + * Advance the cursor through a JIS escape sequence. + * @param cursor A cursor positioned within the escape sequence. + */ + protected void scanJIS (Cursor cursor) + throws + ParserException + { + boolean done; + char ch; + int state; + + done = false; + state = 0; + while (!done) + { + ch = mPage.getCharacter (cursor); + if (0 == ch) + done = true; + else + switch (state) + { + case 0: + if (0x1b == ch) // escape + state = 1; + break; + case 1: + if ('(' == ch) + state = 2; + else + state = 0; + break; + case 2: + if ('J' == ch) + done = true; + else + state = 0; + break; + default: + throw new IllegalStateException ("how the fuck did we get in state " + state); + } + } + } + + /** * Parse a string node. * Scan characters until "</", "<%", "<!" 
or < followed by a *************** *** 325,331 **** boolean done; char ch; - int length; - int begin; - int end; char quote; Node ret; --- 369,372 ---- *************** *** 338,341 **** --- 379,403 ---- if (0 == ch) done = true; + else if (0x1b == ch) // escape + { + ch = mPage.getCharacter (cursor); + if (0 == ch) + done = true; + else if ('$' == ch) + { + ch = mPage.getCharacter (cursor); + if (0 == ch) + done = true; + else if ('B' == ch) + scanJIS (cursor); + else + { + cursor.retreat (); + cursor.retreat (); + } + } + else + cursor.retreat (); + } else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) quote = ch; // enter quoted state |
From: <der...@us...> - 2003-12-31 04:03:40
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs1:/tmp/cvs-serv1538 Modified Files: SourceTests.java Log Message: Fix Source 'SameChars' test. Index: SourceTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/SourceTests.java,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** SourceTests.java 8 Dec 2003 01:31:54 -0000 1.13 --- SourceTests.java 31 Dec 2003 04:03:37 -0000 1.14 *************** *** 296,336 **** { String link; - ArrayList chars1; - ArrayList chars2; URL url; ! URLConnection connection; InputStreamReader in; ! int c; Source source; int index; // pick a big file ! link = "http://sourceforge.net/projects/htmlparser/HTMLParser_Coverage.html"; ! chars1 = new ArrayList (); ! chars2 = new ArrayList (); try { url = new URL (link); ! connection = url.openConnection (); ! connection.connect (); ! in = new InputStreamReader (new BufferedInputStream (connection.getInputStream ()), DEFAULT_CHARSET); ! while (-1 != (c = in.read ())) ! chars1.add (new Character ((char)c)); ! in.close (); ! ! connection = url.openConnection (); ! connection.connect (); ! source = new Source (new Stream (connection.getInputStream ())); ! while (-1 != (c = source.read ())) ! chars2.add (new Character ((char)c)); ! source.close (); ! index = 0; ! while (index < chars1.size ()) { ! assertEquals ("characters differ at position " + index, chars1.get (index), chars2.get (index)); index++; } ! assertTrue ("extra characters", index == chars2.size ()); } catch (MalformedURLException murle) --- 296,331 ---- { String link; URL url; ! URLConnection connection1; ! URLConnection connection2; InputStreamReader in; ! int c1; ! int c2; Source source; int index; // pick a big file ! link = "http://htmlparser.sourceforge.net/HTMLParser_Coverage.html"; try { url = new URL (link); ! connection1 = url.openConnection (); ! 
connection1.connect (); ! in = new InputStreamReader (new BufferedInputStream (connection1.getInputStream ()), "UTF-8"); ! connection2 = url.openConnection (); ! connection2.connect (); ! source = new Source (new Stream (connection2.getInputStream ()), "UTF-8"); index = 0; ! while (-1 != (c1 = in.read ())) { ! c2 = source.read (); ! if (c1 != c2) ! fail ("characters differ at position " + index + ", expected " + c1 + ", actual " + c2); index++; } ! c2 = source.read (); ! assertTrue ("extra characters", -1 == c2); ! source.close (); ! in.close (); } catch (MalformedURLException murle) |
From: <der...@us...> - 2003-12-31 02:50:54
|
Update of /cvsroot/htmlparser/htmlparser/.ssh In directory sc8-pr-cvs1:/tmp/cvs-serv23579/.ssh Removed Files: known_hosts random_seed Log Message: Add filter support to NodeList. Rework LinkExtractor and remove MailRipper and Robot example programs. Clean out docs directory. --- known_hosts DELETED --- --- random_seed DELETED --- |
From: <der...@us...> - 2003-12-31 02:50:54
|
Update of /cvsroot/htmlparser/htmlparser/bin In directory sc8-pr-cvs1:/tmp/cvs-serv23579/bin Added Files: linkextractor linkextractor.bat Removed Files: crawler.bat ripper.bat Log Message: Add filter support to NodeList. Rework LinkExtractor and remove MailRipper and Robot example programs. Clean out docs directory. --- NEW FILE: linkextractor --- #! /bin/sh if [ -z "$HTMLPARSER_HOME" ] ; then ## resolve links - $0 may be a link to the home PRG="$0" progname=`basename "$0"` saveddir=`pwd` # need this for relative symlinks dirname_prg=`dirname "$PRG"` cd "$dirname_prg" while [ -h "$PRG" ] ; do ls=`ls -ld "$PRG"` link=`expr "$ls" : '.*-> \(.*\)$'` if expr "$link" : '/.*' > /dev/null; then PRG="$link" else PRG=`dirname "$PRG"`"/$link" fi done HTMLPARSER_HOME=`dirname "$PRG"`/.. cd "$saveddir" # make it fully qualified HTMLPARSER_HOME=`cd "$HTMLPARSER_HOME" && pwd` fi if [ -z "$JAVACMD" ] ; then if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables JAVACMD="$JAVA_HOME/jre/sh/java" else JAVACMD="$JAVA_HOME/bin/java" fi else JAVACMD=`which java 2> /dev/null ` if [ -z "$JAVACMD" ] ; then JAVACMD=java fi fi fi if [ ! -x "$JAVACMD" ] ; then echo "Error: JAVA_HOME is not defined correctly." 
echo " We cannot execute $JAVACMD" exit 1 fi if [ -n "$CLASSPATH" ] ; then LOCALCLASSPATH="$CLASSPATH" fi HTMLPARSER_LIB="${HTMLPARSER_HOME}/lib" # add in the parser .jar file if [ -z "$LOCALCLASSPATH" ] ; then LOCALCLASSPATH="${HTMLPARSER_LIB}/htmlparser.jar" else LOCALCLASSPATH="${HTMLPARSER_LIB}/htmlparser.jar":"$LOCALCLASSPATH" fi # handle 1.1x JDKs if [ -n "$JAVA_HOME" ] ; then if [ -f "$JAVA_HOME/lib/classes.zip" ] ; then LOCALCLASSPATH="$LOCALCLASSPATH:$JAVA_HOME/lib/classes.zip" fi fi "$JAVACMD" -classpath "$LOCALCLASSPATH" org.htmlparser.parserapplications.LinkExtractor "$@" --- NEW FILE: linkextractor.bat --- java -jar ..\lib\htmlparser.jar org.htmlparser.parserapplications.LinkExtractor %1 %2 --- crawler.bat DELETED --- --- ripper.bat DELETED --- |
From: <der...@us...> - 2003-12-31 02:50:54
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv23579/src/org/htmlparser/util Modified Files: NodeList.java Log Message: Add filter support to NodeList. Rework LinkExtractor and remove MailRipper and Robot example programs. Clean out docs directory. Index: NodeList.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/NodeList.java,v retrieving revision 1.51 retrieving revision 1.52 diff -C2 -d -r1.51 -r1.52 *** NodeList.java 20 Dec 2003 23:47:55 -0000 1.51 --- NodeList.java 31 Dec 2003 02:50:50 -0000 1.52 *************** *** 33,36 **** --- 33,38 ---- import org.htmlparser.Node; + import org.htmlparser.NodeFilter; + import org.htmlparser.filters.NodeClassFilter; public class NodeList implements Serializable { *************** *** 183,200 **** /** ! * Search for nodes of the given type non-recursively. ! * @param classType The class to search for. */ ! public NodeList searchFor (Class classType) { ! return (searchFor (classType, false)); } /** ! * Search for nodes of the given type recursively. ! * @param classType The class to search for. * @param recursive If <code>true<code> digs into the children recursively. */ ! public NodeList searchFor (Class classType, boolean recursive) { String name; --- 185,202 ---- /** ! * Filter the list with the given filter non-recursively. ! * @param filter The filter to use. */ ! public NodeList extractAllNodesThatMatch (NodeFilter filter) { ! return (extractAllNodesThatMatch (filter, false)); } /** ! * Filter the list with the given filter. ! * @param filter The filter to use. * @param recursive If <code>true<code> digs into the children recursively. */ ! public NodeList extractAllNodesThatMatch (NodeFilter filter, boolean recursive) { String name; *************** *** 204,212 **** ret = new NodeList (); - name = classType.getName (); for (int i = 0; i < size; i++) { node = nodeData[i]; ! 
if (node.getClass ().getName ().equals (name)) ret.add (node); if (recursive) --- 206,213 ---- ret = new NodeList (); for (int i = 0; i < size; i++) { node = nodeData[i]; ! if (filter.accept (node)) ret.add (node); if (recursive) *************** *** 214,222 **** children = node.getChildren (); if (null != children) ! ret.add (children.searchFor (classType, recursive)); } } return (ret); } } --- 215,242 ---- children = node.getChildren (); if (null != children) ! ret.add (children.extractAllNodesThatMatch (filter, recursive)); } } return (ret); + } + + /** + * Convenience method to search for nodes of the given type non-recursively. + * @param classType The class to search for. + */ + public NodeList searchFor (Class classType) + { + return (searchFor (classType, false)); + } + + /** + * Convenience method to search for nodes of the given type. + * @param classType The class to search for. + * @param recursive If <code>true</code> digs into the children recursively. + */ + public NodeList searchFor (Class classType, boolean recursive) + { + return (extractAllNodesThatMatch (new NodeClassFilter (classType), recursive)); } } |
From: <der...@us...> - 2003-12-31 02:50:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications In directory sc8-pr-cvs1:/tmp/cvs-serv23579/src/org/htmlparser/parserapplications Modified Files: LinkExtractor.java Removed Files: MailRipper.java Robot.java Log Message: Add filter support to NodeList. Rework LinkExtractor and remove MailRipper and Robot example programs. Clean out docs directory. Index: LinkExtractor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/LinkExtractor.java,v retrieving revision 1.49 retrieving revision 1.50 diff -C2 -d -r1.49 -r1.50 *** LinkExtractor.java 8 Dec 2003 01:31:52 -0000 1.49 --- LinkExtractor.java 31 Dec 2003 02:50:50 -0000 1.50 *************** *** 1,4 **** ! // HTMLParser Library v1_4_20031207 - A java-based parser for HTML ! // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or --- 1,12 ---- ! // HTMLParser Library $Name$ - A java-based parser for HTML ! // http://sourceforge.org/projects/htmlparser ! // Copyright (C) 2003 Derrick Oswald ! // ! // Revision Control Information ! // ! // $Source$ ! // $Author$ ! // $Date$ ! // $Revision$ // // This library is free software; you can redistribute it and/or *************** *** 9,35 **** // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ! // ! // For any questions or suggestions, you can write to me at : ! // Email :so...@in... 
// - // Postal Address : - // Somik Raha - // Extreme Programmer & Coach - // Industrial Logic Corporation - // 2583 Cedar Street, Berkeley, - // CA 94708, USA - // Website : http://www.industriallogic.com package org.htmlparser.parserapplications; import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.ParserException; --- 17,38 ---- // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.parserapplications; + import javax.swing.JOptionPane; import org.htmlparser.Node; + import org.htmlparser.NodeFilter; import org.htmlparser.Parser; + import org.htmlparser.filters.AndFilter; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.LinkTag; + import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; *************** *** 38,77 **** * and prints them on standard output. */ ! public class LinkExtractor { ! private String location; ! private Parser parser; ! public LinkExtractor(String location) { ! this.location = location; ! try { ! this.parser = new Parser(location); // Create the parser object ! } ! catch (ParserException e) { ! e.printStackTrace(); ! } ! ! } ! public void extractLinks() throws ParserException { ! System.out.println("Parsing "+location+" for links..."); ! Node [] links = parser.extractAllNodesThatAre(LinkTag.class); ! for (int i = 0;i < links.length;i++) { ! LinkTag linkTag = (LinkTag)links[i]; ! // To extract only mail addresses, uncomment the following line ! // if (linkTag.isMailLink()) ! 
System.out.println(linkTag.getLink()); ! } ! } ! public static void main(String[] args) { ! if (args.length != 1) { ! System.err.println("Syntax Error : Please provide the location(URL or file) to parse"); ! System.exit(-1); } ! LinkExtractor linkExtractor = new LinkExtractor(args[0]); ! try { ! linkExtractor.extractLinks(); } ! catch (ParserException e) { ! e.printStackTrace(); } } } --- 41,94 ---- * and prints them on standard output. */ ! public class LinkExtractor ! { ! /** ! */ ! public static void main (String[] args) ! { ! String url; ! Parser parser; ! NodeFilter filter; ! NodeList list; ! if (0 >= args.length) ! { ! url = (String)JOptionPane.showInputDialog ( ! null, ! "Enter the URL to extract links from:", ! "Web Site", ! JOptionPane.PLAIN_MESSAGE, ! null, ! null, ! "http://htmlparser.sourceforge.net/docs/"); ! if (null == url) ! System.exit (1); } ! else ! url = args[0]; ! filter = new NodeClassFilter (LinkTag.class); ! if ((1 < args.length) && args[1].equalsIgnoreCase ("-maillinks")) ! filter = new AndFilter ( ! filter, ! new NodeFilter () ! { ! public boolean accept (Node node) ! { ! return (((LinkTag)node).isMailLink ()); ! } ! } ! ); ! try ! { ! parser = new Parser (url); ! list = parser.extractAllNodesThatMatch (filter); ! for (int i = 0; i < list.size (); i++) ! System.out.println (list.elementAt (i).toHtml ()); } ! catch (ParserException e) ! { ! e.printStackTrace (); } + System.exit (0); } } --- MailRipper.java DELETED --- --- Robot.java DELETED --- |
From: <der...@us...> - 2003-12-31 02:50:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs1:/tmp/cvs-serv23579/src/org/htmlparser/tests/lexerTests Modified Files: StreamTests.java Log Message: Add filter support to NodeList. Rework LinkExtractor and remove MailRipper and Robot example programs. Clean out docs directory. Index: StreamTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/StreamTests.java,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** StreamTests.java 8 Dec 2003 01:31:54 -0000 1.13 --- StreamTests.java 31 Dec 2003 02:50:50 -0000 1.14 *************** *** 97,137 **** { String link; - ArrayList bytes1; - ArrayList bytes2; URL url; ! URLConnection connection; BufferedInputStream in; ! int b; Stream stream; int index; // pick a big file ! link = "http://sourceforge.net/projects/htmlparser/HTMLParser_Coverage.html"; ! bytes1 = new ArrayList (); ! bytes2 = new ArrayList (); try { url = new URL (link); ! connection = url.openConnection (); ! connection.connect (); ! in = new BufferedInputStream (connection.getInputStream ()); ! while (-1 != (b = in.read ())) ! bytes1.add (new Byte ((byte)b)); ! in.close (); ! ! connection = url.openConnection (); ! connection.connect (); ! stream = new Stream (connection.getInputStream ()); ! while (-1 != (b = stream.read ())) ! bytes2.add (new Byte ((byte)b)); ! stream.close (); ! index = 0; ! while (index < bytes1.size ()) { ! assertEquals ("bytes differ at position " + index, bytes1.get (index), bytes2.get (index)); index++; } ! assertTrue ("extra bytes", index == bytes2.size ()); } catch (MalformedURLException murle) --- 97,132 ---- { String link; URL url; ! URLConnection connection1; ! URLConnection connection2; BufferedInputStream in; ! int b1; ! int b2; Stream stream; int index; // pick a big file ! 
link = "http://htmlparser.sourceforge.net/HTMLParser_Coverage.html"; try { url = new URL (link); ! connection1 = url.openConnection (); ! connection1.connect (); ! in = new BufferedInputStream (connection1.getInputStream ()); ! connection2 = url.openConnection (); ! connection2.connect (); ! stream = new Stream (connection2.getInputStream ()); index = 0; ! while (-1 != (b1 = in.read ())) { ! b2 = stream.read (); ! if (b1 != b2) ! fail ("bytes differ at position " + index + ", expected " + b1 + ", actual " + b2); index++; } ! b2 = stream.read (); ! stream.close (); ! in.close (); ! assertTrue ("extra bytes", b2 == -1); } catch (MalformedURLException murle) *************** *** 289,293 **** // pick a small file > 2000 bytes ! link = "http://sourceforge.net/projects/htmlparser/overview-summary.html"; bytes1 = new ArrayList (); bytes2 = new ArrayList (); --- 284,288 ---- // pick a small file > 2000 bytes ! link = "http://htmlparser.sourceforge.net/javadoc_1_3/overview-summary.html"; bytes1 = new ArrayList (); bytes2 = new ArrayList (); *************** *** 364,368 **** // pick a small file > 2000 bytes ! link = "http://sourceforge.net/projects/htmlparser/overview-summary.html"; bytes1 = new ArrayList (); bytes2 = new ArrayList (); --- 359,363 ---- // pick a small file > 2000 bytes ! link = "http://htmlparser.sourceforge.net/javadoc_1_3/overview-summary.html"; bytes1 = new ArrayList (); bytes2 = new ArrayList (); |
From: <der...@us...> - 2003-12-31 02:50:53
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv23579/src/org/htmlparser/lexer Modified Files: Stream.java Log Message: Add filter support to NodeList. Rework LinkExtractor and remove MailRipper and Robot example programs. Clean out docs directory. Index: Stream.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Stream.java,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** Stream.java 8 Dec 2003 01:31:51 -0000 1.10 --- Stream.java 31 Dec 2003 02:50:50 -0000 1.11 *************** *** 256,260 **** fill (false); if (0 != available ()) ! ret = mBuffer[mOffset++]; else ret = EOF; --- 256,260 ---- fill (false); if (0 != available ()) ! ret = mBuffer[mOffset++] & 0xff; else ret = EOF; |
From: <der...@us...> - 2003-12-31 02:50:53
|
Update of /cvsroot/htmlparser/htmlparser/src/doc-files In directory sc8-pr-cvs1:/tmp/cvs-serv23579/src/doc-files Modified Files: todo.html Log Message: Add filter support to NodeList. Rework LinkExtractor and remove MailRipper and Robot example programs. Clean out docs directory. Index: todo.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/doc-files/todo.html,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** todo.html 20 Dec 2003 23:47:54 -0000 1.2 --- todo.html 31 Dec 2003 02:50:49 -0000 1.3 *************** *** 15,21 **** </li> <li> - Implement the new filtering mechanism for NodeList.searchFor (). - </li> - <li> As of now, it's more likely that the javadocs are lying to you than providing any helpful advice. This needs to be reworked completely. --- 15,18 ---- |
From: <der...@us...> - 2003-12-31 02:50:52
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs1:/tmp/cvs-serv23579/docs Modified Files: panel.html Removed Files: banner.html bar.gif docsindex.html sample.html Log Message: Add filter support to NodeList. Rework LinkExtractor and remove MailRipper and Robot example programs. Clean out docs directory. Index: panel.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/panel.html,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** panel.html 24 Feb 2003 06:23:02 -0000 1.5 --- panel.html 31 Dec 2003 02:50:49 -0000 1.6 *************** *** 15,22 **** <li> <a href="http://sourceforge.net/project/showfiles.php?group_id=24399&release_id=129477" target="mainFrame">Download</a></li> ! <li><a href="samples/index.html" target="mainFrame">Sample Programs</a></li> <li> <a href="docs/index.html" target="_parent">Documentation</a></li> ! <li><a href="articles/index.html" target="mainFrame">Articles</a></li> <li> <a href="mailinglists.html" target="mainFrame">Mailing Lists</a> </li> --- 15,22 ---- <li> <a href="http://sourceforge.net/project/showfiles.php?group_id=24399&release_id=129477" target="mainFrame">Download</a></li> ! <li> <a href="samples/index.html" target="mainFrame">Sample Programs</a></li> <li> <a href="docs/index.html" target="_parent">Documentation</a></li> ! <li> <a href="articles/index.html" target="mainFrame">Articles</a></li> <li> <a href="mailinglists.html" target="mainFrame">Mailing Lists</a> </li> --- banner.html DELETED --- --- bar.gif DELETED --- --- docsindex.html DELETED --- --- sample.html DELETED --- |
From: <der...@us...> - 2003-12-29 14:18:27
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications In directory sc8-pr-cvs1:/tmp/cvs-serv29167/parserapplications Added Files: SiteCapturer.java Log Message: Add simplistic web site capture example application. Demonstration of using custom tags in the NodeFactory. Fixed various issues with URL rewriting. --- NEW FILE: SiteCapturer.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v $ // $Author: derrickoswald $ // $Date: 2003/12/29 14:18:24 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.parserapplications; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.HashSet; import javax.swing.JFileChooser; import javax.swing.JOptionPane; import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.tags.BaseHrefTag; import org.htmlparser.tags.FrameTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * Save a web site locally. * Illustrative prgram to save a web site contents locally. * It was created to demonstrate URL rewriting in it's simplest form. * It uses customized tags in the NodeFactory to alter the URLs. * This program has a number of limitations: * <ul> * <li>it doesn't capture forms, this would involve too many assumptions</li> * <li>it doesn't capture script references, so funky onMouseOver and other * non-static content will not be faithfully reproduced</li> * <li>it doesn't handle style sheets</li> * <li>it doesn't dig into attributes that might reference resources, so * for example, background images won't necessarily be captured</li> * <li>worst of all, it gets confused when a URL both has content and is * the prefix for other content, * i.e. 
http://whatever.com/top and http://whatever.com/top/sub.html both * yield content, since this cannot be faithfully replicated to a static * directory structure (this happens a lot with servlet based sites)</li> *</ul> */ public class SiteCapturer { /** * The web site to capture. * This is used as the base URL in deciding whether to adjust a link * and whether to capture a page or not. */ protected String mSource; /** * The local directory to capture to. * This is used as a base prefix for files saved locally. */ protected String mTarget; /** * The list of pages to capture. * Links are added to this list as they are discovered, and removed in * sequential order (FIFO queue) leading to a breadth * first traversal of the web site space. */ protected ArrayList mPages; /** * The set of pages already captured. * Used to avoid repeated acquisition of the same page. */ protected HashSet mFinished; /** * The list of resources to copy. * Images and other resources are added to this list as they are discovered. */ protected ArrayList mImages; /** * The set of resources already copied. * Used to avoid repeated acquisition of the same images and other resources. */ protected HashSet mCopied; /** * The parser to use for processing. */ protected Parser mParser; /** * If <code>true</code>, save resources locally too, * otherwise, leave resource links pointing to original page. */ protected boolean mCaptureResources; /** * Copy buffer size. * Resources are moved to disk in chunks this size or less. */ protected final int TRANSFER_SIZE = 4096; /** * Create a web site capturer. 
*/ public SiteCapturer () { PrototypicalNodeFactory factory; mPages = new ArrayList (); mFinished = new HashSet (); mImages = new ArrayList (); mCopied = new HashSet (); mParser = new Parser (); factory = new PrototypicalNodeFactory (); factory.registerTag (new LocalLinkTag ()); factory.registerTag (new LocalFrameTag ()); factory.registerTag (new LocalBaseHrefTag ()); factory.registerTag (new LocalImageTag ()); mParser.setNodeFactory (factory); } /** * Getter for property source. * @return Value of property source. */ public String getSource () { return (mSource); } /** * Setter for property source. * This is the base URL to capture. URL's that don't start with this prefix * are ignored (left as is), while the ones with this URL as a base are * re-homed to the local target. * @param source New value of property source. */ public void setSource (String source) { if (source.endsWith ("/")) source = source.substring (0, source.length () - 1); mSource = source; } /** * Getter for property target. * @return Value of property target. */ public String getTarget () { return (mTarget); } /** * Setter for property target. * This is the local directory under which to save the site's pages. * @param target New value of property target. */ public void setTarget (String target) { mTarget = target; } /** * Getter for property captureResources. * If <code>true</code>, the images and other resources referenced by * the site and within the base URL tree are also copied locally to the * target directory. If <code>false</code>, the image links are left 'as * is', still refering to the original site. * @return Value of property captureResources. */ public boolean getCaptureResources () { return (mCaptureResources); } /** * Setter for property captureResources. * @param capture New value of property captureResources. */ public void setCaptureResources (boolean capture) { mCaptureResources = capture; } /** * Returns <code>true</code> if the link is one we are interested in. 
* @param link The link to be checked. * @return <code>true</code> if the link has the source URL as a prefix * and doesn't contain '?' or '#'; the former because we won't be able to * handle server side queries in the static target directory structure and * the latter because presumably the full page with that reference has * already been captured previously. This performs a case insensitive * comparison, which is cheating really, but it's cheap. */ protected boolean isToBeCaptured (String link) { return ( link.toLowerCase ().startsWith (getSource ().toLowerCase ()) && (-1 == link.indexOf ("?")) && (-1 == link.indexOf ("#"))); } /** * Returns <code>true</code> if the link contains text/html content. * @return <code>true</code> if the HTTP header indicates the type is * "text/html". */ protected boolean isHtml (String link) throws ParserException { URL url; URLConnection connection; String type; boolean ret; ret = false; try { url = new URL (link); connection = url.openConnection (); type = connection.getContentType (); ret = type.startsWith ("text/html"); } catch (Exception e) { throw new ParserException ("URL " + link + " has a problem", e); } return (ret); } /** * Converts a link to local. * A relative link can be used to construct both a URL and a file name. * Basically, the operation is to strip off the base url, if any, * and then prepend as many dot-dots as necessary to make * it relative to the current page. * A bit of a kludge handles the root page specially by calling it * index.html, even though that probably isn't it's real file name. * This isn't pretty, but it works for me. * @param link The link to make relative. * @param current The current page URL, or empty if it's an absolute URL * that needs to be converted. * @return The URL relative to the current page. 
*/ protected String makeLocalLink (String link, String current) { int i; int j; String ret; if (link.equals (getSource ())) ret = "index.html"; // handle the root page specially else if (link.startsWith (getSource ()) && (link.length () > getSource ().length ())) ret = link.substring (getSource ().length () + 1); else ret = link; // give up // make it relative to the current page by prepending "../" for // each '/' in the current local path if ((null != current) && link.startsWith (getSource ()) && (current.length () > getSource ().length ())) { current = current.substring (getSource ().length () + 1); i = 0; while (-1 != (j = current.indexOf ('/', i))) { ret = "../" + ret; i = j + 1; } } return (ret); } /** * Copy a resource (image) locally. * Removes one element from the 'to be copied' list and saves the * resource it points to locally as a file. */ protected void copy () { String link; File file; File dir; URL source; byte[] data; InputStream in; FileOutputStream out; int read; link = (String)mImages.remove (0); mCopied.add (link); if (getCaptureResources ()) { file = new File (getTarget (), makeLocalLink (link, "")); System.out.println ("copying " + link + " to " + file.getAbsolutePath ()); // ensure directory exists dir = file.getParentFile (); if (!dir.exists ()) dir.mkdirs (); try { source = new URL (link); data = new byte [TRANSFER_SIZE]; try { in = source.openStream (); try { out = new FileOutputStream (file); try { while (-1 != (read = in.read (data, 0, data.length))) out.write (data, 0, read); } finally { out.close (); } } catch (FileNotFoundException fnfe) { fnfe.printStackTrace (); } finally { in.close (); } } catch (FileNotFoundException fnfe) { System.err.println ("broken link " + fnfe.getMessage () + " ignored"); } } catch (MalformedURLException murle) { murle.printStackTrace (); } catch (IOException ioe) { ioe.printStackTrace (); } } } /** * Process a single page. 
*/ protected void process () throws ParserException { String url; NodeList list; File file; File dir; PrintWriter out; // get the next URL and add it to the done pile url = (String)mPages.remove (0); System.out.println ("processing " + url); mFinished.add (url); try { // fetch the page and gather the list of nodes mParser.setURL (url); list = new NodeList (); for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); ) list.add (e.nextNode ()); // URL conversion occurs in the tags // save the page locally file = new File (getTarget (), makeLocalLink (url, "")); dir = file.getParentFile (); if (!dir.exists ()) dir.mkdirs (); try { out = new PrintWriter (new FileOutputStream (file)); for (int i = 0; i < list.size (); i++) out.print (list.elementAt (i).toHtml ()); out.close (); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace (); } } catch (ParserException pe) { String message; // this exception handling is suboptimal, // but it recognizes resources that aren't text/html message = pe.getMessage (); if ((null != message) && (message.endsWith ("does not contain text"))) { if (!mCopied.contains (url)) if (!mImages.contains (url)) mImages.add (url); mFinished.remove (url); } else throw pe; } } /** * Link tag that rewrites the HREF. * The HREF is changed to a local target if it matches the source. 
*/ class LocalLinkTag extends LinkTag { public void doSemanticAction () throws ParserException { boolean html; String link; // get the link link = getLink (); // check if it needs to be captured if (isToBeCaptured (link)) { // add the link to a list to be processed if (mFinished.contains (link)) html = true; else if (mPages.contains (link)) html = true; else if (mCopied.contains (link)) html = false; else if (mImages.contains (link)) html = false; else { // this test is expensive, do it reluctantly html = isHtml (link); if (html) mPages.add (link); else mImages.add (link); } // alter the link if (html || (!html && getCaptureResources ())) link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ()); setLink (link); } } } /** * Frame tag that rewrites the SRC URLs. * The SRC URLs are mapped to local targets if they match the source. */ class LocalFrameTag extends FrameTag { public void doSemanticAction () throws ParserException { boolean html; String link; // get the link link = getFrameLocation (); // check if it needs to be captured if (isToBeCaptured (link)) { // add the link to a list to be processed if (mFinished.contains (link)) html = true; else if (mPages.contains (link)) html = true; else if (mCopied.contains (link)) html = false; else if (mImages.contains (link)) html = false; else { // this test is expensive, do it reluctantly html = isHtml (link); if (html) mPages.add (link); else mImages.add (link); } // alter the link if (html || (!html && getCaptureResources ())) link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ()); setFrameLocation (link); } } } /** * Image tag that rewrites the SRC URL. * If resources are being captured the SRC is mapped to a local target if * it matches the source, otherwise it is convered to a full URL to point * back to the original site. 
*/ class LocalImageTag extends ImageTag { public void doSemanticAction () throws ParserException { String image; // get the image url image = getImageURL (); // check if it needs to be captured if (isToBeCaptured (image)) { // add the image to the list needing to be copied if (!mCopied.contains (image)) if (!mImages.contains (image)) mImages.add (image); if (getCaptureResources ()) image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ()); // alter the link setImageURL (image); } } } /** * Base tag that doesn't show. * The toHtml() method is overridden to return an empty string, * effectively shutting off the base reference. */ class LocalBaseHrefTag extends BaseHrefTag { // we don't want to have a base pointing back at the source page public String toHtml () { return (""); } } /** * Perform the capture. */ public void capture () { mPages.clear (); mPages.add (getSource ()); while (0 != mPages.size ()) try { process (); while (0 != mImages.size ()) copy (); } catch (ParserException pe) { // this exception handling is suboptimal, // but it messages correctly about broken links Throwable throwable; throwable = pe.getThrowable (); if (null != throwable) { throwable = throwable.getCause (); if (throwable instanceof FileNotFoundException) System.err.println ("broken link " + ((FileNotFoundException)throwable).getMessage () + " ignored"); else pe.printStackTrace (); } else pe.printStackTrace (); } } /** * Mainline to capture a web site locally. * @param args The command line arguments. * There are three arguments the web site to capture, the local directory * to save it to, and a flag (true or false) to indicate whether resources * such as images and video are to be captured as well. * These are requested via dialog boxes if not supplied. 
*/ public static void main (String[] args) throws MalformedURLException, IOException { SiteCapturer worker; String url; JFileChooser chooser; URL source; String path; File target; Boolean capture; int ret; worker = new SiteCapturer (); if (0 >= args.length) { url = (String)JOptionPane.showInputDialog ( null, "Enter the URL to capture:", "Web Site", JOptionPane.PLAIN_MESSAGE, null, null, "http://htmlparser.sourceforge.net/docs"); if (null != url) worker.setSource (url); else System.exit (1); } else worker.setSource (args[0]); if (1 >= args.length) { url = worker.getSource (); source = new URL (url); path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath (); target = new File (path); chooser = new JFileChooser (target); chooser.setDialogType (JFileChooser.SAVE_DIALOG); chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY); chooser.setSelectedFile (target); // this doesn't frickin' work chooser.setMultiSelectionEnabled (false); chooser.setDialogTitle ("Target Directory"); ret = chooser.showSaveDialog (null); if (ret == JFileChooser.APPROVE_OPTION) worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ()); else System.exit (1); } else worker.setTarget (args[1]); if (2 >= args.length) { capture = (Boolean)JOptionPane.showInputDialog ( null, "Should resources be captured:", "Capture Resources", JOptionPane.PLAIN_MESSAGE, null, new Object[] { Boolean.TRUE, Boolean.FALSE}, Boolean.TRUE); if (null != capture) worker.setCaptureResources (capture.booleanValue ()); else System.exit (1); } else worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ())); worker.capture (); System.exit (0); } } |
From: <der...@us...> - 2003-12-29 14:18:27
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs1:/tmp/cvs-serv29167/tests/tagTests Modified Files: BaseHrefTagTest.java Log Message: Add simplistic web site capture example application. Demonstration of using custom tags in the NodeFactory. Fixed various issues with URL rewriting. Index: BaseHrefTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/BaseHrefTagTest.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** BaseHrefTagTest.java 8 Dec 2003 01:31:55 -0000 1.36 --- BaseHrefTagTest.java 29 Dec 2003 14:18:24 -0000 1.37 *************** *** 79,83 **** assertTrue("Base href tag should be the 4th tag", node[3] instanceof BaseHrefTag); BaseHrefTag baseRefTag = (BaseHrefTag)node[3]; ! assertEquals("Base HREF Url","http://www.abc.com",baseRefTag.getBaseUrl()); } --- 79,83 ---- assertTrue("Base href tag should be the 4th tag", node[3] instanceof BaseHrefTag); BaseHrefTag baseRefTag = (BaseHrefTag)node[3]; ! assertEquals("Base HREF Url","http://www.abc.com/",baseRefTag.getBaseUrl()); } |
From: <der...@us...> - 2003-12-29 14:18:27
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv29167/tags Modified Files: BaseHrefTag.java FrameTag.java Log Message: Add simplistic web site capture example application. Demonstration of using custom tags in the NodeFactory. Fixed various issues with URL rewriting. Index: BaseHrefTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/BaseHrefTag.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** BaseHrefTag.java 8 Dec 2003 01:31:52 -0000 1.34 --- BaseHrefTag.java 29 Dec 2003 14:18:23 -0000 1.35 *************** *** 71,75 **** base = getAttribute ("HREF"); if (base != null && base.length() > 0) ! base = LinkProcessor.removeLastSlash (base.trim()); base = (null == base) ? "" : base; --- 71,75 ---- base = getAttribute ("HREF"); if (base != null && base.length() > 0) ! base = base.trim (); base = (null == base) ? "" : base; *************** *** 80,90 **** { setAttribute ("HREF", base); - } - - public String toString() - { - return "BASE TAG\n"+ - "--------\n"+ - "Name : "+getBaseUrl(); } --- 80,83 ---- Index: FrameTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/FrameTag.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** FrameTag.java 8 Dec 2003 01:31:52 -0000 1.32 --- FrameTag.java 29 Dec 2003 14:18:24 -0000 1.33 *************** *** 58,71 **** /** ! * Returns the location of the frames. ! * TODO: handle base url? */ ! public String getFrameLocation() { ! String relativeFrame = getAttribute ("SRC"); ! if (relativeFrame==null) return ""; else ! return (new LinkProcessor()).extract(relativeFrame, getPage ().getUrl ()); } --- 58,82 ---- /** ! * Returns the location of the frame. ! * @return The contents of the SRC attribute converted to an absolute URL. */ ! 
public String getFrameLocation () { ! String src; ! ! src = getAttribute ("SRC"); ! if (null == src) return ""; else ! return (getPage ().getLinkProcessor ().extract (src, getPage ().getUrl ())); ! } ! ! /** ! * Sets the location of the frame. ! * @param url The new frame location. ! */ ! public void setFrameLocation (String url) ! { ! setAttribute ("SRC", url); } |
From: <der...@us...> - 2003-12-29 14:18:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv29167/lexer Modified Files: Page.java Log Message: Add simplistic web site capture example application. Demonstration of using custom tags in the NodeFactory. Fixed various issues with URL rewriting. Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** Page.java 16 Dec 2003 02:29:55 -0000 1.29 --- Page.java 29 Dec 2003 14:18:09 -0000 1.30 *************** *** 60,63 **** --- 60,69 ---- /** + * The default content type. + * In the absence of alternate information, assume html content. + */ + public static final String DEFAULT_CONTENT_TYPE = "text/html"; + + /** * The URL this page is coming from. * Cached value of <code>getConnection().toExternalForm()</code> or *************** *** 310,315 **** { Stream stream; String charset; - mConnection = connection; --- 316,321 ---- { Stream stream; + String type; String charset; mConnection = connection; *************** *** 327,331 **** throw new ParserException (ioe.getMessage (), ioe); } ! charset = getCharacterSet (); try { --- 333,343 ---- throw new ParserException (ioe.getMessage (), ioe); } ! type = getContentType (); ! if (!type.startsWith ("text")) ! throw new ParserException ( ! "URL " ! + connection.getURL ().toExternalForm () ! + " does not contain text"); ! charset = getCharset (type); try { *************** *** 391,394 **** --- 403,423 ---- /** + * Try and extract the content type from the HTTP header. + * @return The content type. + */ + public String getContentType () + { + URLConnection connection; + String ret; + + ret = DEFAULT_CONTENT_TYPE; + connection = getConnection (); + if (null != connection) + ret = connection.getContentType (); + + return (ret); + } + + /** * Read the character at the cursor position. 
* The cursor position can be behind or equal to the current source position. *************** *** 479,505 **** // update the EOL index in any case mIndex.add (cursor); - - return (ret); - } - - /** - * Try and extract the character set from the HTTP header. - * @return The character set name to use for this HTML page. - */ - public String getCharacterSet () - { - final String CONTENT_TYPE_STRING = "Content-Type"; - URLConnection connection; - String string; - String ret; - - ret = DEFAULT_CHARSET; - connection = getConnection (); - if (null != connection) - { - string = connection.getHeaderField (CONTENT_TYPE_STRING); - if (null != string) - ret = getCharset (string); - } return (ret); --- 508,511 ---- |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv12747/org/htmlparser/scanners Modified Files: CompositeTagScanner.java JspScanner.java ScriptScanner.java TagScanner.java package.html Added Files: Scanner.java Log Message: Reduce recursion on the JVM stack in CompositeTagScanner. Pass a stack of open tags to the scanner. Add smarter tag closing by walking up the stack on encountering an unopened end tag. Avoids a problem with bad HTML such as that found at http://scores.nba.com/games/20031029/scoreboard.html by Shaun Roach. Added testInvalidNesting to CompositeTagScanner Test based on the above. --- NEW FILE: Scanner.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/Scanner.java,v $ // $Author: derrickoswald $ // $Date: 2003/12/20 23:47:55 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.scanners; import org.htmlparser.lexer.Lexer; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * Generic interface for scanning. * Tags needing specialized operations can provide an object that implements * this interface via getThisScanner(). * By default non-composite tags simply perform the semantic action and * return while composite tags will gather their children. */ public interface Scanner { /** * Scan the tag. * The Lexer is provided in order to do a lookahead operation. * @param tag HTML tag to be scanned for identification. * @param lexer Provides html page access. * @param stack The parse stack. May contain pending tags that enclose * this tag. Nodes on the stack should be considered incomplete. * @return The resultant tag (may be unchanged). * @exception ParserException if an unrecoverable problem occurs. */ public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException; } Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.83 retrieving revision 1.84 diff -C2 -d -r1.83 -r1.84 *** CompositeTagScanner.java 8 Dec 2003 13:13:59 -0000 1.83 --- CompositeTagScanner.java 20 Dec 2003 23:47:55 -0000 1.84 *************** *** 1,4 **** ! // HTMLParser Library v1_4_20031207 - A java-based parser for HTML ! // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or --- 1,12 ---- ! // HTMLParser Library $Name$ - A java-based parser for HTML ! // http://sourceforge.org/projects/htmlparser ! // Copyright (C) 2003 Somik Raha ! // ! 
// Revision Control Information ! // ! // $Source$ ! // $Author$ ! // $Date$ ! // $Revision$ // // This library is free software; you can redistribute it and/or *************** *** 9,29 **** // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ! // ! // For any questions or suggestions, you can write to me at : ! // Email :so...@in... // - // Postal Address : - // Somik Raha - // Extreme Programmer & Coach - // Industrial Logic Corporation - // 2583 Cedar Street, Berkeley, - // CA 94708, USA - // Website : http://www.industriallogic.com package org.htmlparser.scanners; --- 17,27 ---- // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.scanners; *************** *** 37,40 **** --- 35,39 ---- import org.htmlparser.lexer.Page; import org.htmlparser.lexer.nodes.Attribute; + import org.htmlparser.scanners.Scanner; import org.htmlparser.tags.CompositeTag; import org.htmlparser.tags.Tag; *************** *** 43,167 **** /** ! * To create your own scanner that can create tags tht hold children, create a subclass of this class. ! * The composite tag scanner can be configured with:<br> ! * <ul> ! 
* <li>Tags which will trigger a match</li> ! * <li>Tags which when encountered before a legal end tag, should force a correction</li> ! * </ul> ! * Here are examples of each:<BR> ! * <B>Tags which will trigger a match</B> ! * If we wish to recognize <mytag>, ! * <pre> ! * MyScanner extends CompositeTagScanner { ! * private static final String [] MATCH_IDS = { "MYTAG" }; ! * MyScanner() { ! * super(MATCH_IDS); ! * } ! * ... ! * } ! * </pre> ! * <B>Tags which force correction</B> ! * If we wish to insert end tags if we get a </BODY> or </HTML> without recieving ! * </mytag> ! * <pre> ! * MyScanner extends CompositeTagScanner { ! * private static final String [] MATCH_IDS = { "MYTAG" }; ! * private static final String [] ENDERS = {}; ! * private static final String [] END_TAG_ENDERS = { "BODY", "HTML" }; ! * MyScanner() { ! * super(MATCH_IDS, ENDERS, END_TAG_ENDERS, true); ! * } ! * ... ! * } ! * </pre> ! * <B>Preventing children of same type</B> ! * This is useful when you know that a certain tag can never hold children of its own type. ! * e.g. <FORM> can never have more form tags within it. If it does, it is an error and should ! * be corrected. Specify the tagEnders set to contain (at least) the match ids. ! * <pre> ! * MyScanner extends CompositeTagScanner { ! * private static final String [] MATCH_IDS = { "FORM" }; ! * private static final String [] END_TAG_ENDERS = { "BODY", "HTML" }; ! * MyScanner() { ! * super(MATCH_IDS, MATCH_IDS, END_TAG_ENDERS, false); ! * } ! * ... ! * } ! * </pre> ! * Inside the scanner, use createTag() to specify what tag needs to be created. */ public class CompositeTagScanner extends TagScanner { ! protected Set tagEnderSet; ! private Set endTagEnderSet; ! private boolean balance_quotes; ! ! public CompositeTagScanner() ! { ! this(new String[] {}); ! } ! ! public CompositeTagScanner(String [] tagEnders) ! { ! this("",tagEnders); ! } ! ! public CompositeTagScanner(String filter) ! { ! this(filter,new String [] {}); ! } ! ! 
public CompositeTagScanner( ! String filter, ! String [] tagEnders) ! { ! this(filter,tagEnders,new String[] {}); ! } ! public CompositeTagScanner( ! String filter, ! String [] tagEnders, ! String [] endTagEnders) ! { ! this(filter,tagEnders,endTagEnders, false); ! } ! /** ! * Constructor specifying all member fields. ! * @param filter A string that is used to match which tags are to be allowed ! * to pass through. This can be useful when one wishes to dynamically filter ! * out all tags except one type which may be programmed later than the parser. ! * @param tagEnders The non-endtag tag names which signal that no closing ! * end tag was found. For example, encountering <FORM> while ! * scanning a <A> link tag would mean that no </A> was found ! * and needs to be corrected. ! * @param endTagEnders The endtag names which signal that no closing end ! * tag was found. For example, encountering </HTML> while ! * scanning a <BODY> tag would mean that no </BODY> was found ! * and needs to be corrected. These items are not prefixed by a '/'. ! * @param balance_quotes <code>true</code> if scanning string nodes needs to ! * honour quotes. For example, ScriptScanner defines this <code>true</code> ! * so that text within <SCRIPT></SCRIPT> ignores tag-like text ! * within quotes. ! */ ! public CompositeTagScanner( ! String filter, ! String [] tagEnders, ! String [] endTagEnders, ! boolean balance_quotes) { - super(filter); - this.balance_quotes = balance_quotes; - this.tagEnderSet = new HashSet(); - for (int i=0;i<tagEnders.length;i++) - tagEnderSet.add(tagEnders[i]); - this.endTagEnderSet = new HashSet(); - for (int i=0;i<endTagEnders.length;i++) - endTagEnderSet.add(endTagEnders[i]); } /** * Collect the children. ! * An initial test is performed for an empty XML tag, in which case * the start tag and end tag of the returned tag are the same and it has * no children.<p> --- 42,78 ---- /** ! * The main scanning logic for nested tags. ! 
* When asked to scan, this class gathers nodes into a heirarchy of tags. */ public class CompositeTagScanner extends TagScanner { ! /** ! * Determine whether to use JVM or NodeList stack. ! * This can be set to true to get the original behaviour of ! * recursion into composite tags on the JVM stack. ! * This may lead to StackOverFlowException problems in some cases ! * i.e. Windows. ! */ ! private static final boolean mUseJVMStack = false; ! /** ! * Determine whether unexpected end tags should cause stack roll-up. ! * This can be set to true to get the original behaviour of gathering ! * end tags into whatever tag is open. ! * This can be expensive, but should only be needed in the presence of ! * bad HTML. ! */ ! private static final boolean mLeaveEnds = false; ! /** ! * Create a composite tag scanner. ! */ ! public CompositeTagScanner () { } /** * Collect the children. ! * <p>An initial test is performed for an empty XML tag, in which case * the start tag and end tag of the returned tag are the same and it has * no children.<p> *************** *** 171,221 **** * In the latter case, a virtual end tag is created. * Each node found that is not the end tag is added to ! * the list of children.<p> ! * The scanner's {@link #createTag} method is called with details about ! * the start tag, end tag and children. The attributes from the start tag ! * will wind up duplicated in the newly created tag, so the start tag is ! * kind of redundant (and may be removed in subsequent refactoring). ! * @param tag The tag this scanner is responsible for. This will be the ! * start (and possibly end) tag passed to {@link #createTag}. ! * @param url The url for the page the tag is discovered on. * @param lexer The source of subsequent nodes. ! * @return The scanner specific tag from the call to {@link #createTag}. */ ! public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException { Node node; ! NodeList nodeList; ! Tag endTag; ! String match; String name; ! 
TagScanner scanner; CompositeTag ret; ! nodeList = new NodeList (); ! endTag = null; ! match = tag.getTagName (); ! if (tag.isEmptyXmlTag ()) ! endTag = tag; else do { ! node = lexer.nextNode (balance_quotes); if (null != node) { if (node instanceof Tag) { ! Tag next = (Tag)node; name = next.getTagName (); // check for normal end tag ! if (next.isEndTag () && name.equals (match)) { ! endTag = next; node = null; } ! else if (isTagToBeEndedFor (tag, next)) // check DTD { ! // insert a virtual end tag and backup one node ! endTag = createVirtualEndTag (tag, lexer.getPage (), next.getStartPosition ()); lexer.setPosition (next.getStartPosition ()); node = null; --- 82,131 ---- * In the latter case, a virtual end tag is created. * Each node found that is not the end tag is added to ! * the list of children. The end tag is special and not a child.<p> ! * Nodes that also have a CompositeTagScanner as their scanner are ! * recursed into, which provides the nested structure of an HTML page. ! * This method operates in two possible modes, depending on a private boolean. ! * It can recurse on the JVM stack, which has caused some overflow problems ! * in the past, or it can use the supplied stack argument to nest scanning ! * of child tags within itself. The former is left as an option in the code, ! * mostly to help subsequent modifiers visualize what the internal nesting ! * is doing. ! * @param tag The tag this scanner is responsible for. * @param lexer The source of subsequent nodes. ! * @param stack The parse stack. May contain pending tags that enclose ! * this tag. ! * @return The resultant tag (may be unchanged). */ ! public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException { Node node; ! Tag next; String name; ! Scanner scanner; CompositeTag ret; ! ret = (CompositeTag)tag; ! if (ret.isEmptyXmlTag ()) ! ret.setEndTag (ret); else do { ! node = lexer.nextNode (false); if (null != node) { if (node instanceof Tag) { ! 
next = (Tag)node; name = next.getTagName (); // check for normal end tag ! if (next.isEndTag () && name.equals (ret.getTagName ())) { ! ret.setEndTag (next); node = null; } ! else if (isTagToBeEndedFor (ret, next)) // check DTD { ! // backup one node. insert a virtual end tag later lexer.setPosition (next.getStartPosition ()); node = null; *************** *** 225,249 **** // now recurse if there is a scanner for this type of tag scanner = next.getThisScanner (); ! if ((null != scanner) && scanner.evaluate (next, null)) ! node = scanner.scan (next, lexer.getPage ().getUrl (), lexer); } } ! if (null != node) ! nodeList.add (node); } } while (null != node); ! if (null == endTag) ! endTag = createVirtualEndTag (tag, lexer.getPage (), lexer.getCursor ().getPosition ()); ! ! ret = (CompositeTag)tag; ! ret.setEndTag (endTag); ! ret.setChildren (nodeList); ! for (int i = 0; i < ret.getChildCount (); i++) ! ret.childAt (i).setParent (ret); ! endTag.setParent (ret); ! ret.doSemanticAction (); return (ret); --- 135,275 ---- // now recurse if there is a scanner for this type of tag scanner = next.getThisScanner (); ! if (null != scanner) ! { ! if (mUseJVMStack) ! { // JVM stack recursion ! node = scanner.scan (next, lexer, stack); ! addChild (ret, node); ! } ! else ! { ! // fake recursion: ! if ((scanner == this) && (next instanceof CompositeTag)) ! { ! CompositeTag ondeck = (CompositeTag)next; ! if (ondeck.isEmptyXmlTag ()) ! { ! ondeck.setEndTag (ondeck); ! finishTag (ondeck, lexer); ! addChild (ret, ondeck); ! } ! else ! { ! stack.add (ret); ! ret = ondeck; ! } ! } ! else ! { // normal recursion if switching scanners ! node = scanner.scan (next, lexer, stack); ! addChild (ret, node); ! } ! } ! } ! else ! addChild (ret, next); ! } ! else ! { ! if (!mUseJVMStack && !mLeaveEnds) ! { ! // Since all non-end tags are consumed by the ! // previous clause, we're here because we have an ! // end tag with no opening tag... this could be bad. ! // There are two cases... ! 
// 1) The tag hasn't been registered, in which case ! // we just add it as a simple child, like it's ! // opening tag ! // 2) There may be an opening tag further up the ! // parse stack that needs closing. ! // So, we ask the factory for a node like this one ! // (since end tags never have scanners) and see ! // if it's scanner is a composite tag scanner. ! // If it is we walk up the parse stack looking for ! // something that needs this end tag to finish it. ! // If there is something, we close off all the tags ! // walked over and continue on as if nothing ! // happened. ! Vector attributes = new Vector (); ! attributes.addElement (new Attribute (name, null)); ! Tag opener = (Tag)lexer.getNodeFactory ().createTagNode ( ! next.getPage (), next.getStartPosition (), next.getEndPosition (), ! attributes); ! ! scanner = opener.getThisScanner (); ! if ((null != scanner) && (scanner == this)) ! { ! // uh-oh ! int index = -1; ! for (int i = stack.size () - 1; (-1 == index) && (i >= 0); i--) ! { ! // short circuit here... assume everything on the stack is a CompositeTag and has this as it's scanner ! // we'll need to stop if either of those conditions isn't met ! CompositeTag boffo = (CompositeTag)stack.elementAt (i); ! if (name.equals (boffo.getTagName ())) ! index = i; ! else if (isTagToBeEndedFor (boffo, next)) // check DTD ! index = i; ! } ! if (-1 != index) ! { ! // finish off the current one first ! finishTag (ret, lexer); ! addChild ((CompositeTag)stack.elementAt (stack.size () - 1), ret); ! for (int i = stack.size () - 1; i > index; i--) ! { ! CompositeTag fred = (CompositeTag)stack.remove (i); ! finishTag (fred, lexer); ! addChild ((CompositeTag)stack.elementAt (i - 1), fred); ! } ! ret = (CompositeTag)stack.remove (index); ! node = null; ! } ! else ! addChild (ret, next); // default behaviour ! } ! else ! addChild (ret, next); // default behaviour ! } ! else ! addChild (ret, next); } } + else + addChild (ret, node); + } ! if (!mUseJVMStack) ! { ! 
// handle coming out of fake recursion ! if (null == node) ! { ! int depth = stack.size (); ! if (0 != depth) ! { ! node = stack.elementAt (depth - 1); ! if (node instanceof CompositeTag) ! { ! CompositeTag precursor = (CompositeTag)node; ! scanner = precursor.getThisScanner (); ! if (scanner == this) ! { ! stack.remove (depth - 1); ! finishTag (ret, lexer); ! addChild (precursor, ret); ! ret = precursor; ! } ! else ! node = null; // normal recursion ! } ! else ! node = null; // normal recursion ! } ! } } } while (null != node); ! finishTag (ret, lexer); return (ret); *************** *** 251,264 **** /** * Creates an end tag with the same name as the given tag. - * NOTE: This does not call the {@link #createTag} method, but may in the - * future after refactoring. * @param tag The tag to end. * @param page The page the tag is on (virtually). * @param position The offset into the page at which the tag is to * be anchored. ! * @return An end tag with the name "/" + tag.getTagName() and a start ! * and end position at the given position. The fact these are equal may ! * be used to distinguish it as a virtual tag. */ protected Tag createVirtualEndTag (Tag tag, Page page, int position) --- 277,319 ---- /** + * Add a child to the given tag. + * @param parent The parent tag. + * @param child The child node. + */ + protected void addChild (Tag parent, Node child) + { + if (null == parent.getChildren ()) + parent.setChildren (new NodeList ()); + child.setParent (parent); + parent.getChildren ().add (child); + } + + /** + * Finish off a tag. + * Perhap add a virtual end tag. + * Set the end tag parent as this tag. + * Perform the semantic acton. + * @param tag The tag to finish off. + * @param lexer A lexer positioned at the end of the tag. 
+ */ + protected void finishTag (CompositeTag tag, Lexer lexer) + throws + ParserException + { + if (null == tag.getEndTag ()) + tag.setEndTag (createVirtualEndTag (tag, lexer.getPage (), lexer.getCursor ().getPosition ())); + tag.getEndTag ().setParent (tag); + tag.doSemanticAction (); + } + + /** * Creates an end tag with the same name as the given tag. * @param tag The tag to end. * @param page The page the tag is on (virtually). * @param position The offset into the page at which the tag is to * be anchored. ! * @return An end tag with the name '"/" + tag.getTagName()' and a start ! * and end position at the given position. The fact these positions are ! * equal may be used to distinguish it as a virtual tag later on. */ protected Tag createVirtualEndTag (Tag tag, Page page, int position) *************** *** 277,287 **** /** ! * For composite tags this shouldn't be used and hence throws an exception. */ - public Tag createTag (Page page, int start, int end, Vector attributes, Tag tag, String url) throws ParserException - { - throw new ParserException ("composite tags shouldn't be using this"); - } - public final boolean isTagToBeEndedFor (Tag current, Tag tag) { --- 332,344 ---- /** ! * Determine if the current tag should be terminated by the given tag. ! * Examines the 'enders' or 'end tag enders' lists of the current tag ! * for a match with the given tag. Which list is chosen depends on whether ! * tag is an end tag ('end tag enders') or not ('enders'). ! * @param current The tag that might need to be ended. ! * @param tag The candidate tag that might end the current one. ! * @return <code>true</code> if the name of the given tag is a member of ! * the appropriate list. 
*/ public final boolean isTagToBeEndedFor (Tag current, Tag tag) { Index: JspScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/JspScanner.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** JspScanner.java 8 Dec 2003 01:31:52 -0000 1.33 --- JspScanner.java 20 Dec 2003 23:47:55 -0000 1.34 *************** *** 1,4 **** ! // HTMLParser Library v1_4_20031207 - A java-based parser for HTML ! // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or --- 1,12 ---- ! // HTMLParser Library $Name$ - A java-based parser for HTML ! // http://sourceforge.org/projects/htmlparser ! // Copyright (C) 2003 Somik Raha ! // ! // Revision Control Information ! // ! // $Source$ ! // $Author$ ! // $Date$ ! // $Revision$ // // This library is free software; you can redistribute it and/or *************** *** 9,71 **** // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ! // ! // For any questions or suggestions, you can write to me at : ! // Email :so...@in... // - // Postal Address : - // Somik Raha - // Extreme Programmer & Coach - // Industrial Logic Corporation - // 2583 Cedar Street, Berkeley, - // CA 94708, USA - // Website : http://www.industriallogic.com - package org.htmlparser.scanners; ! import java.util.Vector; ! import org.htmlparser.lexer.Page; ! ///////////////////////// ! // HTML Parser Imports // ! ///////////////////////// ! import org.htmlparser.tags.JspTag; ! 
import org.htmlparser.tags.Tag; ! import org.htmlparser.util.ParserException; ! ! public class JspScanner extends TagScanner { ! ! public JspScanner() { ! super(); ! } ! ! public JspScanner(String filter) { ! super(filter); ! } ! ! public String [] getID() { ! String [] ids = new String[3]; ! ids[0] = "%"; ! ids[1] = "%="; ! ids[2] = "%@"; ! return ids; ! } ! ! public Tag createTag (Page page, int start, int end, Vector attributes, Tag tag, String url) throws ParserException { - JspTag ret; - - ret = new JspTag (); - ret.setPage (page); - ret.setStartPosition (start); - ret.setEndPosition (end); - ret.setAttributesEx (attributes); - - return (ret); } } --- 17,41 ---- // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.scanners; ! /** ! * Placeholder for <em>yet to be written</em> scanner for JSP tags. ! * This vacuous class does nothing special at the moment. ! */ ! public class JspScanner extends TagScanner ! { ! /** ! * Create a new JspScanner. ! */ ! public JspScanner () { } } Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** ScriptScanner.java 8 Dec 2003 01:31:52 -0000 1.53 --- ScriptScanner.java 20 Dec 2003 23:47:55 -0000 1.54 *************** *** 1,4 **** ! // HTMLParser Library v1_4_20031207 - A java-based parser for HTML ! 
// Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or --- 1,12 ---- ! // HTMLParser Library $Name$ - A java-based parser for HTML ! // http://sourceforge.org/projects/htmlparser ! // Copyright (C) 2003 Somik Raha ! // ! // Revision Control Information ! // ! // $Source$ ! // $Author$ ! // $Date$ ! // $Revision$ // // This library is free software; you can redistribute it and/or *************** *** 9,29 **** // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ! // ! // For any questions or suggestions, you can write to me at : ! // Email :so...@in... // - // Postal Address : - // Somik Raha - // Extreme Programmer & Coach - // Industrial Logic Corporation - // 2583 Cedar Street, Berkeley, - // CA 94708, USA - // Website : http://www.industriallogic.com package org.htmlparser.scanners; --- 17,27 ---- // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.scanners; *************** *** 53,85 **** CompositeTagScanner { ! private static final String SCRIPT_END_TAG = "</SCRIPT>"; ! private static final String MATCH_NAME [] = {"SCRIPT"}; ! 
private static final String ENDERS [] = {"BODY", "HTML"}; ! ! public ScriptScanner() { ! super("",ENDERS); ! } ! ! public ScriptScanner(String filter) { ! super(filter,ENDERS); ! } ! ! public String [] getID() { ! return MATCH_NAME; ! } ! ! public Tag createTag(Page page, int start, int end, Vector attributes, Tag startTag, Tag endTag, NodeList children) throws ParserException { - ScriptTag ret; - - ret = new ScriptTag (); - ret.setPage (page); - ret.setStartPosition (start); - ret.setEndPosition (end); - ret.setAttributesEx (attributes); - ret.setEndTag (endTag); - ret.setChildren (children); - - return (ret); } --- 51,59 ---- CompositeTagScanner { ! /** ! * Create a script scanner. ! */ ! public ScriptScanner() { } *************** *** 88,95 **** * Accumulates nodes returned from the lexer, until </SCRIPT>, * <BODY> or <HTML> is encountered. Replaces the node factory ! * in the lexer with a new Parser to avoid other scanners missing their ! * end tags and accumulating even the </SCRIPT>. */ ! public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException { --- 62,72 ---- * Accumulates nodes returned from the lexer, until </SCRIPT>, * <BODY> or <HTML> is encountered. Replaces the node factory ! * in the lexer with a new (empty) one to avoid other scanners missing their ! * end tags and accumulating even the </SCRIPT> tag. ! * @param tag The tag this scanner is responsible for. ! * @param lexer The source of subsequent nodes. ! * @param stack The parse stack, <em>not used</em>. */ ! public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException { *************** *** 118,122 **** if (node instanceof Tag) if ( ((Tag)node).isEndTag () ! && ((Tag)node).getTagName ().equals (MATCH_NAME[0])) { end = (Tag)node; --- 95,99 ---- if (node instanceof Tag) if ( ((Tag)node).isEndTag () ! 
&& ((Tag)node).getTagName ().equals (tag.getIds ()[0])) { end = (Tag)node; *************** *** 181,194 **** return (ret); - } - - /** - * Gets the end tag that the scanner uses to stop scanning. Subclasses of - * <code>ScriptScanner</code> you should override this method. - * @return String containing the end tag to search for, i.e. </SCRIPT> - */ - public String getEndTag() - { - return SCRIPT_END_TAG; } } --- 158,161 ---- Index: TagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v retrieving revision 1.52 retrieving revision 1.53 diff -C2 -d -r1.52 -r1.53 *** TagScanner.java 8 Dec 2003 13:13:59 -0000 1.52 --- TagScanner.java 20 Dec 2003 23:47:55 -0000 1.53 *************** *** 1,4 **** ! // HTMLParser Library v1_4_20031207 - A java-based parser for HTML ! // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or --- 1,12 ---- ! // HTMLParser Library $Name$ - A java-based parser for HTML ! // http://sourceforge.org/projects/htmlparser ! // Copyright (C) 2003 Somik Raha ! // ! // Revision Control Information ! // ! // $Source$ ! // $Author$ ! // $Date$ ! // $Revision$ // // This library is free software; you can redistribute it and/or *************** *** 9,147 **** // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ! // ! // For any questions or suggestions, you can write to me at : ! // Email :so...@in... 
// - // Postal Address : - // Somik Raha - // Extreme Programmer & Coach - // Industrial Logic Corporation - // 2583 Cedar Street, Berkeley, - // CA 94708, USA - // Website : http://www.industriallogic.com package org.htmlparser.scanners; ! ////////////////// ! // Java Imports // ! ////////////////// import java.io.Serializable; - import java.util.Hashtable; - import java.util.Map; - import java.util.Vector; - import org.htmlparser.AbstractNode; - import org.htmlparser.Node; - import org.htmlparser.Parser; - import org.htmlparser.StringNode; import org.htmlparser.lexer.Lexer; - import org.htmlparser.lexer.Page; - import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.tags.Tag; ! import org.htmlparser.util.NodeIterator; import org.htmlparser.util.ParserException; - import org.htmlparser.util.ParserFeedback; /** ! * TagScanner is an abstract superclass which is subclassed to create specific ! * scanners. ! * This isn't much use other than creating a specific tag type since scanning ! * is mostly done by the lexer level. If you want to match end tags and ! * handle special syntax between tags, then you'll probably want to subclass ! * {@link CompositeTagScanner} instead. Use TagScanner when you have meta task ! * to do like setting the BASE url for the page when a BASE tag is encountered. ! * <br> ! * If you wish to write your own scanner, then you must implement scan(). ! * You MAY implement evaluate() as well, if your evaluation logic is not based ! * on a match of the tag name. ! * You MUST implement getID() - which identifies your scanner uniquely in the hashtable of scanners. ! * ! * <br> ! * Also, you have a feedback object provided to you, should you want to send log messages. This object is ! * instantiated by Parser when a scanner is added to its collection. ! * */ public class TagScanner implements Serializable { /** ! * A filter which is used to associate this tag. The filter contains a string ! 
* that is used to match which tags are to be allowed to pass through. This can ! * be useful when one wishes to dynamically filter out all tags except one type ! * which may be programmed later than the parser. Is also useful for command line ! * implementations of the parser. ! */ ! protected String filter; ! ! /** ! * Default Constructor, automatically registers the scanner into a static array of ! * scanners inside Tag */ public TagScanner () { - this (""); } /** ! * This constructor automatically registers the scanner, and sets the filter for this ! * tag. ! * @param filter The filter which will allow this tag to pass through. ! */ ! public TagScanner (String filter) ! { ! this.filter=filter; ! } ! ! /** ! * This method is used to decide if this scanner can handle this tag type. If the ! * evaluation returns true, the calling side makes a call to scan(). ! * <strong>This method has to be implemented meaningfully only if a first-word match with ! * the scanner id does not imply a match (or extra processing needs to be done). ! * Default returns true</strong> ! * @param tag The tag with a name that matches a value from {@link #getID}. ! * @param previousOpenScanner Indicates any previous scanner which hasn't ! * completed, before the current scan has begun, and hence allows us to ! * write scanners that can work with dirty html. ! */ ! public boolean evaluate (Tag tag, TagScanner previousOpenScanner) ! { ! return (true); ! } ! ! public String getFilter() ! { ! return filter; ! } ! ! /** ! * Scan the tag and extract the information related to this type. The url of the ! * initiating scan has to be provided in case relative links are found. The initial ! * url is then prepended to it to give an absolute link. ! * The Lexer is provided in order to do a lookahead operation. We assume that ! * the identification has already been performed using the evaluate() method. ! * @param tag HTML Tag to be scanned for identification. ! 
* @param url The initiating url of the scan (Where the html page lies). * @param lexer Provides html page access. * @return The resultant tag (may be unchanged). */ ! public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException { ! Tag ret; ! ! ret = tag; ! ret.doSemanticAction (); ! ! return (ret); ! } ! public String [] getID () ! { ! return (new String[0]); } } --- 17,73 ---- // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of ! // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software ! // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.scanners; ! import java.io.Serializable; import org.htmlparser.lexer.Lexer; import org.htmlparser.tags.Tag; ! import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** ! * TagScanner is an abstract superclass, subclassed to create specific scanners. ! * When asked to scan the tag, this class does nothing other than perform the ! * tag's semantic action. ! * Use TagScanner when you have a meta task to do like setting the BASE url for ! * the page when a BASE tag is encountered. ! * If you want to match end tags and handle special syntax between tags, ! * then you'll probably want to subclass {@link CompositeTagScanner} instead. */ public class TagScanner implements + Scanner, Serializable { /** ! * Create a (non-composite) tag scanner. */ public TagScanner () { } /** ! * Scan the tag. ! * For this implementation, the only operation is to perform the tag's ! * semantic action. ! * @param tag The tag to scan. * @param lexer Provides html page access. + * @param stack The parse stack. May contain pending tags that enclose + * this tag. 
* @return The resultant tag (may be unchanged). */ ! public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException { ! tag.doSemanticAction (); ! return (tag); } } Index: package.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/package.html,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** package.html 8 Dec 2003 01:31:52 -0000 1.18 --- package.html 20 Dec 2003 23:47:55 -0000 1.19 *************** *** 3,54 **** <head> <!-- ! @(#)package.html 1.60 98/01/27 ! ! HTMLParser Library v1_4_20031207 - A java-based parser for HTML ! Copyright (C) Dec 31, 2000 Somik Raha ! ! This library is free software; you can redistribute it and/or ! modify it under the terms of the GNU Lesser General Public ! License as published by the Free Software Foundation; either ! version 2.1 of the License, or (at your option) any later version. ! ! This library is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ! Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public ! License along with this library; if not, write to the Free Software ! Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ! ! For any questions or suggestions, you can write to me at : ! Email :so...@in... ! ! Postal Address : ! Somik Raha ! Extreme Programmer & Coach ! Industrial Logic Corporation ! 2583 Cedar Street, Berkeley, ! CA 94708, USA ! Website : http://www.industriallogic.com --> </head> <body bgcolor="white"> ! The scanners package contains scanners that can be fired automatically upon the identification of tags. ! Developers should familiarize themselves with this package, as extension to this framework will be mostly in the form of ! addition of custom scanners. ! ! ! 
<h2>Related Documentation</h2> ! ! For overviews, tutorials, examples, guides, and tool documentation, please see: ! <ul> ! <li><a href="http://htmlparser.sourceforge.net">HTML Parser Home Page</a> ! </ul> ! ! <!-- Put @see and @since tags down here. --> ! </body> </html> --- 3,51 ---- <head> <!-- + HTMLParser Library $Name$ - A java-based parser for HTML + http://sourceforge.org/projects/htmlparser + Copyright (C) 2003 Somik Raha ! Revision Control Information + $Source$ + $Author$ + $Date$ + $Revision$ + // + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + // + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + // + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + // --> </head> <body bgcolor="white"> ! The scanners package contains classes responsible for the tertiary ! identification of tags. The lower level classes in the {@link ! org.htmlparser.lexer.Lexer lexer} package convert ! byte streams to characters and characters to nodes (via the {@link ! org.htmlparser.lexer.nodes.NodeFactory NodeFactory}). In the case of tags, the ! scanners in this package can then complete the tag or override the current tag ! and return an augmented tag. The existing implementation of the {@link ! org.htmlparser.scanners.CompositeTagScanner composite tag ! scanner}, for example, gathers the children of composite tags, identifying the ! nested structure of HTML documents. The {@link ! 
org.htmlparser.scanners.ScriptScanner script scanner} overrides the nodes ! returned by the lexer and creates a tag containing a single string that is the ! script code.<br> ! You might need to create a scanner (that implements the {@link Scanner Scanner} interface) if ! the text you are trying to parse doesn't look like HTML, as is the case for the ! script scanner, or the normal processing of tags by nesting their structure is ! inadequate. </body> </html> |
From: <der...@us...> - 2003-12-20 23:47:58
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv12747/org/htmlparser/util Modified Files: IteratorImpl.java NodeList.java PeekingIteratorImpl.java Log Message: Reduce recursion on the JVM stack in CompositeTagScanner. Pass a stack of open tags to the scanner. Add smarter tag closing by walking up the stack on encountering an unopened end tag. Avoids a problem with bad HTML such as that found at http://scores.nba.com/games/20031029/scoreboard.html by Shaun Roach. Added testInvalidNesting to CompositeTagScanner Test based on the above. Index: IteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** IteratorImpl.java 8 Dec 2003 01:31:56 -0000 1.37 --- IteratorImpl.java 20 Dec 2003 23:47:55 -0000 1.38 *************** *** 32,36 **** import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; ! import org.htmlparser.scanners.TagScanner; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeIterator; --- 32,36 ---- import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; ! import org.htmlparser.scanners.Scanner; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeIterator; *************** *** 69,72 **** --- 69,76 ---- public Node nextNode() throws ParserException { + Tag tag; + String name; + Scanner scanner; + NodeList stack; Node ret; *************** *** 79,86 **** if (ret instanceof Tag) { - Tag tag; - String name; - TagScanner scanner; - tag = (Tag)ret; if (!tag.isEndTag ()) --- 83,86 ---- *************** *** 88,93 **** // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); ! if ((null != scanner) && scanner.evaluate (tag, null)) ! 
ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer); } } --- 88,96 ---- // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); ! if (null != scanner) ! { ! stack = new NodeList (); ! ret = scanner.scan (tag, mLexer, stack); ! } } } Index: NodeList.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/NodeList.java,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** NodeList.java 8 Dec 2003 01:31:56 -0000 1.50 --- NodeList.java 20 Dec 2003 23:47:55 -0000 1.51 *************** *** 158,165 **** } ! public void remove(int index) { System.arraycopy(nodeData, index+1, nodeData, index, size-index-1); nodeData[size-1] = null; size--; } --- 158,168 ---- } ! public Node remove(int index) { ! Node ret; ! ret = nodeData[index]; System.arraycopy(nodeData, index+1, nodeData, index, size-index-1); nodeData[size-1] = null; size--; + return (ret); } Index: PeekingIteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/PeekingIteratorImpl.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** PeekingIteratorImpl.java 8 Nov 2003 21:30:57 -0000 1.1 --- PeekingIteratorImpl.java 20 Dec 2003 23:47:55 -0000 1.2 *************** *** 31,34 **** --- 31,37 ---- import org.htmlparser.Node; import org.htmlparser.lexer.Lexer; + import org.htmlparser.scanners.Scanner; + import org.htmlparser.tags.Tag; + import org.htmlparser.util.NodeList; /** *************** *** 50,53 **** --- 53,60 ---- public Node peek () throws ParserException { + Tag tag; + String name; + Scanner scanner; + NodeList stack; Node ret; *************** *** 63,69 **** if (ret instanceof org.htmlparser.tags.Tag) { - org.htmlparser.tags.Tag tag; - String name; - org.htmlparser.scanners.TagScanner scanner; tag = (org.htmlparser.tags.Tag)ret; --- 70,73 ---- 
*************** *** 72,77 **** // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); ! if ((null != scanner) && scanner.evaluate (tag, null)) ! ret = scanner.scan (tag, mLexer.getPage ().getUrl (), mLexer); } } --- 76,84 ---- // now recurse if there is a scanner for this type of tag scanner = tag.getThisScanner (); ! if (null != scanner) ! { ! stack = new NodeList (); ! ret = scanner.scan (tag, mLexer, stack); ! } } } |
From: <der...@us...> - 2003-12-20 23:47:58
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv12747/org/htmlparser/tests/scannersTests Modified Files: CompositeTagScannerTest.java Log Message: Reduce recursion on the JVM stack in CompositeTagScanner. Pass a stack of open tags to the scanner. Add smarter tag closing by walking up the stack on encountering an unopened end tag. Avoids a problem with bad HTML such as that found at http://scores.nba.com/games/20031029/scoreboard.html by Shaun Roach. Added testInvalidNesting to CompositeTagScanner Test based on the above. Index: CompositeTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/CompositeTagScannerTest.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** CompositeTagScannerTest.java 8 Dec 2003 13:13:59 -0000 1.56 --- CompositeTagScannerTest.java 20 Dec 2003 23:47:55 -0000 1.57 *************** *** 37,40 **** --- 37,45 ---- import org.htmlparser.scanners.CompositeTagScanner; import org.htmlparser.tags.CompositeTag; + import org.htmlparser.tags.Div; + import org.htmlparser.tags.LinkTag; + import org.htmlparser.tags.TableColumn; + import org.htmlparser.tags.TableRow; + import org.htmlparser.tags.TableTag; import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; *************** *** 756,759 **** --- 761,802 ---- return (mEndTagEnders); } + } + + /** + * Extracted from "http://scores.nba.com/games/20031029/scoreboard.html" + * which has a lot of table columns with unclosed DIV tags because the + * closing DIV doesn't have a slash. + * This caused java.lang.StackOverflowError on Windows. + * Tests the new non-recursive CompositeTagScanner with the walk back + * through the parse stack. + * See also Bug #750117 StackOverFlow while Node-Iteration and + * others. 
+ */ + public void testInvalidNesting () throws ParserException + { + String html = "<table cellspacing=\"2\" cellpadding=\"0\" border=\"0\" width=\"600\">\n" + + "<tr>\n" + + "<td><div class=\"ScoreBoardSec\"> <a target=\"_parent\" class=\"ScoreBoardSec\" href=\"http://www.nba.com/heat/\">Heat</a><div></td>\n" + + "</tr>\n" + + "</table>"; + createParser (html); + parseAndAssertNodeCount (1); + assertType ("table", TableTag.class, node[0]); + TableTag table = (TableTag)node[0]; + assertTrue ("table should have 3 nodes", 3 == table.getChildCount ()); + assertType ("row", TableRow.class, table.childAt (1)); + TableRow row = (TableRow)table.childAt (1); + assertTrue ("row should have 3 nodes", 3 == row.getChildCount ()); + assertType ("column", TableColumn.class, row.childAt (1)); + TableColumn column = (TableColumn)row.childAt (1); + assertTrue ("column should have 1 node", 1 == column.getChildCount ()); + assertType ("element", Div.class, column.childAt (0)); + Div div = (Div)column.childAt (0); + assertTrue ("div should have 3 nodes", 3 == div.getChildCount ()); + assertType ("link", LinkTag.class, div.childAt (1)); + LinkTag link = (LinkTag)div.childAt (1); + assertTrue ("link contents", link.getLink ().equals ("http://www.nba.com/heat/")); + assertType ("bogus div", Div.class, div.childAt (2)); + assertTrue ("bogus div should have no children", 0 == ((Div)div.childAt (2)).getChildCount ()); } } |
From: <der...@us...> - 2003-12-20 23:47:58
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv12747/org/htmlparser/tags Modified Files: CompositeTag.java Log Message: Reduce recursion on the JVM stack in CompositeTagScanner. Pass a stack of open tags to the scanner. Add smarter tag closing by walking up the stack on encountering an unopened end tag. Avoids a problem with bad HTML such as that found at http://scores.nba.com/games/20031029/scoreboard.html by Shaun Roach. Added testInvalidNesting to CompositeTagScanner Test based on the above. Index: CompositeTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v retrieving revision 1.69 retrieving revision 1.70 diff -C2 -d -r1.69 -r1.70 *** CompositeTag.java 16 Dec 2003 02:29:56 -0000 1.69 --- CompositeTag.java 20 Dec 2003 23:47:55 -0000 1.70 *************** *** 374,379 **** } ! public int getChildCount() { ! return (getChildren ().size ()); } --- 374,384 ---- } ! public int getChildCount() ! { ! NodeList children; ! ! children = getChildren (); ! ! return ((null == children) ? 0 : children.size ()); } *************** *** 480,491 **** } } ! // eliminate virtual tags ! // if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ())) ! { ! for (int i = 0; i <= level; i++) ! buffer.append (" "); ! buffer.append (getEndTag ().toString ()); ! buffer.append (System.getProperty ("line.separator")); ! } } } --- 485,498 ---- } } ! ! if (null != getEndTag ()) ! // eliminate virtual tags ! // if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ())) ! { ! for (int i = 0; i <= level; i++) ! buffer.append (" "); ! buffer.append (getEndTag ().toString ()); ! buffer.append (System.getProperty ("line.separator")); ! } } } |
From: <der...@us...> - 2003-12-20 23:47:58
|
Update of /cvsroot/htmlparser/htmlparser/src/doc-files In directory sc8-pr-cvs1:/tmp/cvs-serv12747/doc-files Modified Files: todo.html Log Message: Reduce recursion on the JVM stack in CompositeTagScanner. Pass a stack of open tags to the scanner. Add smarter tag closing by walking up the stack on encountering an unopened end tag. Avoids a problem with bad HTML such as that found at http://scores.nba.com/games/20031029/scoreboard.html by Shaun Roach. Added testInvalidNesting to CompositeTagScanner Test based on the above. Index: todo.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/doc-files/todo.html,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** todo.html 16 Dec 2003 02:29:56 -0000 1.1 --- todo.html 20 Dec 2003 23:47:54 -0000 1.2 *************** *** 74,83 **** </li> <li> - The recursion that currently happens on the JVM stack can probably be done via a - stack of open tags passed to the scanner. This would probably avoid the 'Stack - overflow' exceptions observed on Windows and also allow for smarter tag closing - (in conjuction with the end tag enders list). - </li> - <li> Change all the headers to match the new format. The integration process needs to be revamped to use the $Name: CVS substitution (via 'get label'), so a checkin --- 74,77 ---- |
From: <der...@us...> - 2003-12-16 02:29:59
|
Update of /cvsroot/htmlparser/htmlparser/src/doc-files In directory sc8-pr-cvs1:/tmp/cvs-serv22177/src/doc-files Added Files: building.html overview.html todo.html Log Message: Javadoc changes and additions. Stylesheet, overview, build instructions and todo list. Added HTMLTaglet, an inline Javadoc taglet for embedding HTML into javadocs. --- NEW FILE: building.html --- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <HTML> <HEAD> <TITLE>How to Build the HTML Parser Libraries</TITLE> <link REL ="stylesheet" TYPE="text/css" HREF="../stylesheet.css" TITLE="Style"> </HEAD> <BODY bgcolor="white"> <H1>How to Build the HTML Parser libraries</H1> <H2>JDK</H2> Set up java. I won't include instructions here, just a link to the <a href="http://java.sun.com/j2se">Sun j2se site</a>. I use version 1.4.1, and you need a JDK (java development kit), not a JRE (java runtime environment).<p> Test your installation by typing command:<p> <code>javac</code><p> This should display help on the java compiler options. <H2>Ant</H2> Set up ant, the Java-based build tool from the <a href="http://jakarta.apache.org/ant/index.html">Apache Jakarta project</a>. It is kind of like Make, but without Make's wrinkles. The build.xml file the HTML Parser uses relies on command tags available in Ant version 1.4.1 or higher. The version currently used on the build machine is 1.5.3. The current version of Ant is available <a href="http://archive.apache.org/dist/ant/ant-current-bin.zip">here</a>.<p> Basically you unzip the file into a directory and add an ANT_HOME environment variable that points at it. Test your installation by typing the command:<p> <code>ant -help</code><p> This should display help on ant options. <H2>Third Party Libraries</H2> Any needed third-party libraries are included in the lib directory.<p> The unit test code relies on lib/junit.jar from the <a href="http://sourceforge.net/projects/junit">JUnit project</a>. 
The version used on the build machine is 3.8.1 which you can get <a href="http://prdownloads.sourceforge.net/junit/junit3.8.1.zip?download">here</a>. <H2>Sources</H2> The distribution zip file contains a src.jar file. If you've unpacked the distribution this file should be in the top level directory you chose.<p> Unjar this file with the command:<p> <code>jar -xf src.jar</code><p> There should now be a build.xml in the top level directory. <H2>Building</H2> The default ant target 'htmlparser' builds everything:<p> <code>ant</code><p> If you just want to build some of the parts see the help list:<p> <pre><code>ant -projecthelp Package glom the release and source files into the distribution zip file Release prepare the release files changelog create the change log from CVS logs checkstyle check source code adheres to coding standards clean cleanup compile compile all java files compilelexer compile lexer java files compileparser compile parser java files htmlparser same as Package plus cleanup init initialize version properties jar create htmlparser.jar and htmllexer.jar jarlexer create htmllexer.jar jarparser create htmlparser.jar javadoc create JavaDoc (API) documentation sources create the source zip test run the JUnit tests thumbelina create thumbelina.jar versionSource update the version in all java files </code></pre><p> <H2>Developing</H2> For development purposes you might want to get an Integrated Development Environment (IDE) such as <a href="http://www.netbeans.org/">NetBeans</a> or <a href="http://eclipse.org/">Eclipse</a>. Mount the org directory where the HTML Parser was installed along with the <code>junit.jar</code> file from the <code>lib</code> directory. "Build All" should work. <H2>CVS</H2> The most recent files are only available via CVS: <pre> server: cvs.htmlparser.sourceforge.net repository: /cvsroot/htmlparser </pre> For read-only access use 'pserver' and anonymous access with no password. 
For commit access you'll need to set up ssh (see <a href="http://sourceforge.net/docman/display_doc.php?docid=6841&group_id=1">an introduction to SSH on sourceforge</a> and <a href="http://sourceforge.net/docman/display_doc.php?docid=761&group_id=1">a guide on setting up ssh keys</a>). <p>Short instructions from Karle Kaila:<p> <pre> I have installed SSH software from <a href="http://www.f-secure.com">www.f-secure.com</a> I think it was something like F-Secure SSH 5.2 for Win95/98/ME/NT4.0/2000/XP Client It is a nice graphical SSH client both for terminal use and filetransfer and it also contains commandline ssh2 software that CVS needs. To access CVS I first set it up with these commands set CVS_RSH=ssh2 set CVSROOT=use...@cv...:/cvsroot/htmlparser username = your sourceforge username In an empty directory I then can give CVS commands such as cvs checkout htmlparser It asks for your password to sourceforge This retrieves the latest fileversions. Check the CVS commands in some handbook you can find on the internet. The manual I found is called Version Management with CVS by Per Cederqvist et al. perhaps from http://www.cvshome.org Derrick says: I need CVSROOT=:ext:use...@cv...:/cvsroot/htmlparser CVS_RSH=ssh </pre> <H2>Other</H2> Some of the build.xml targets (like changelog) rely on Perl to execute, and need a sourceforge login via ssh (secure shell). This is unlikely to be needed by the casual user. </BODY> </HTML> --- NEW FILE: overview.html --- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <HTML> <HEAD> <TITLE>HTML Parser Libraries Overview</TITLE> </HEAD> <BODY> <h1>The HTML Parser Libraries.</h1> These java libraries provide access to the contents of local or remote HTML resources in a programmatic way. 
<h2>Components</h2> The HTML Parser distribution is composed of: <li>a low level {@link org.htmlparser.lexer.Lexer lexer} that converts characters into tags</li> <li>a high level {@link org.htmlparser.Parser parser} that provides a hierarchical document view</li> <li>several example applications</li> <p> <h2>Building</h2> To build the system you'll need to get the sources from the <a href="http://sourceforge.net/project/showfiles.php?group_id=24399&release_id=161563">HTML Parser project on Sourceforge</a> if you haven't already, and then follow the <A href="{@docRoot}/doc-files/building.html">build instructions</A>. <h2>History</h2> <p> Originally started by Somik Raha, the HTML Parser has evolved with input from numerous people, and through several revisions... <h2>Outstanding Issues.</h2> The <A href="{@docRoot}/doc-files/todo.html">ToDo list</A> lists things that can or should be done. <h2>Mailing Lists.</h2> If you want to be notified when new releases of HTML Parser are available, join the <A HREF="http://lists.sourceforge.net/lists/listinfo/htmlparser-announce" target="_top">HTML Parser Announcement List</A>.<br> If you have questions about the usage of the parser, join the <A HREF="http://lists.sourceforge.net/lists/listinfo/htmlparser-user" target="_top">HTML Parser User List</A>.<br> If you want to join as a developer, please sign up on the <A HREF="http://lists.sourceforge.net/lists/listinfo/htmlparser-developer" target="_top">HTML Parser Developer List</A> </BODY> </HTML> --- NEW FILE: todo.html --- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <HTML> <HEAD> <TITLE>ToDo List for the HTML Parser Libraries</TITLE> <link REL ="stylesheet" TYPE="text/css" HREF="../stylesheet.css" TITLE="Style"> </HEAD> <BODY> <ul> <li> It looks like there are enough bugs and requests to warrant another 1.3 point release with some patched files. I hate to work on a branch, but it may be the only way to get everyone off my back. 
</li> <li> Implement the new filtering mechanism for NodeList.searchFor (). </li> <li> As of now, it's more likely that the javadocs are lying to you than providing any helpful advice. This needs to be reworked completely. </li> <li> There are some changes needed in the lexer state machines to handle JSP constructs and also whitespace either side of attribute equals signs. Currently the latter is handled by a kludgy fixAttributes() method applied after a tag is parsed, but it would be better handled in the state machine initially. The former isn't handled at all, and would involve all nodes possibly having children (a remark or string node can have embedded JSP, i.e. <!-- this remark, created on <%@ date() %>, needs to be handled -->. So some design work needs to be done to analyze the state transitions and gating characters. </li> <li> toHtml(boolean verbatim/fixed) - one of the design goals for the new Lexer subsystem was to be able to regurgitate the original HTML via the toHtml() method, so the original page is unmodified except for any explicit user edits, i.e. link URL edits. But the parser fixes broken HTML without asking, so you can't get back an unadulterated page from toHtml(). A lot of test cases assume fixed HTML. Either a parameter on toHtml() or another method would be needed to provide the choice of the original HTML or the fixed HTML. There's some initial work on eliminating the added virtual end tags commented out in TagNode, but it will also require a way to remember broken tags, like: <pre> <title>The Title</title</head><body>... </pre> </li> <li> Some GUI based parser application showing the HTML parse tree in one panel and the HTML text in another, with the tree node selected being highlighted in the text, or the text cursor setting the tree node selected, would be really good. A filter builder tool to graphically construct a program to extract a snippet from an HTML page would blow people away. 
</li> <li> Rework all the applications for a better 'out of the box' experience for new and novice users. Fix all the scripts in /bin (for unix and windows) and add any others that don't exist already. </li> <li> The tag-enders and end-tag-enders lists are only a partial solution to the HTML specification for block and inline tags. By marking each tag as a block or inline tag and ensuring block tags don't overlap, a better parsing job could be done, i.e. <pre> <FORM> .... <TABLE> ... </FORM></TABLE> </pre> would be rearranged as <pre> <FORM> .... <TABLE> ... </TABLE></FORM> </pre> This needs some design work. </li> <li> The recursion that currently happens on the JVM stack can probably be done via a stack of open tags passed to the scanner. This would probably avoid the 'Stack overflow' exceptions observed on Windows and also allow for smarter tag closing (in conjuction with the end tag enders list). </li> <li> Change all the headers to match the new format. The integration process needs to be revamped to use the $Name: CVS substitution (via 'get label'), so a checkin isn't required every integration. </li> <li> The default is now the equivalent of the old 'RegisterDomTags', so the operation of the following mainlines needs to be revisited: <ol> <li> Generate </li> <li> Parser </li> <li> LinkBean </li> <li> Robot </li> <li> InstanceofPerformanceTest </li> <li> StringBean </li> <li> MailRipper </li> <li> LinkExtractor </li> <li> BeanyBaby </li> </ol> </li> <li> decode() can be optimized by introducing parameters for start and end in the convertToChar( String bigString, int startToLook, int endToLook) to eliminate the substring operations. </li> <li> Use <A href="http://trove4j.sourceforge.net/javadocs/gnu/trove/TObjectIntHashMap.html"> TObjectIntHashMap</A> or use a sorted list similar to the newline index in PageIndex to avoid the HashMap and the 336 Character objects in Translate. </li> <li> Modify StringBean so it can be driven by a visitor externally. 
See <A href="http://sourceforge.net/mailarchive/forum.php?forum_id=2023&max_rows=25&style=flat&viewmonth=200311&viewday=12">StringBean.diff</A>. </li> </ul> </BODY> </HTML> |
From: <der...@us...> - 2003-12-16 02:29:59
|
Update of /cvsroot/htmlparser/htmlparser/resources In directory sc8-pr-cvs1:/tmp/cvs-serv22177/resources Added Files: HtmlTaglet.java stylesheet.css Log Message: Javadoc changes and additions. Stylesheet, overview, build instructions and todo list. Added HTMLTaglet, an inline Javadoc taglet for embedding HTML into javadocs. --- NEW FILE: HtmlTaglet.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/resources/HtmlTaglet.java,v $ // $Author: derrickoswald $ // $Date: 2003/12/16 02:29:56 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // import com.sun.tools.doclets.Taglet; import com.sun.javadoc.*; import java.util.Map; import org.htmlparser.util.Translate; /** * A JavaDoc Taglet that encodes HTML. * This inline <A href="http://java.sun.com/j2se/1.4.2/docs/tooldocs/javadoc/taglet/com/sun/tools/doclets/Taglet.html">taglet</A> * converts HTML into character references for embedding into * <A href="http://java.sun.com/j2se/1.4.2/docs/tooldocs/javadoc/index.html">Javadocs</A>. * For example, it converts <html> into &lt;html&gt;. 
* Typical usage is to embed an example stream of html into the javadoc for * a class or method: * <pre><font color="green"> * /** * * Gather DIV elements. * * This method takes {@.html <div></div>} pairs and * * constructs nested ... * </font></pre> * This is useful so that the documentation is readable while coding and when * presented in the generated javadocs. Normally, embedding HTML in the * javadoc documentation requires the use of character entity references, * otherwise the HTML is interpreted by the javadoc tool and is stripped out. * The programmer can manually embed the character translations to pass the * HTML through, but the resultant comment is extremely hard to read and * understand when editing the code directly. Plus there is the added * possibility of an incorrect encoding because of the manual step.<p> * The use of this taglet requires a 1.4.x or higher JDK, but it is not * expected that users with older JDKs will be generating javadocs when they * are provided in the distribution.<p> * The name was supposed to be "html", but a warning message is generated * by the javadoc tool if a custom tag name doesn't contain any dots. So the * next best name ".html" is used instead, with a passing resemblance to * directives in nroff. */ public class HtmlTaglet implements Taglet { private static final String NAME = ".html"; /** * Construct a taglet for encoding HTML in doc comments. */ public HtmlTaglet () { } /** * Return the name of this custom taglet. */ public String getName () { return (NAME); } /** * Will return true since <code>{@.html}</code> * can be used in field documentation. * @return <code>true</code> since <code>{@.html}</code> * can be used in field documentation. */ public boolean inField () { return (true); } /** * Will return true since <code>{@.html}</code> * can be used in constructor documentation. * @return <code>true</code> since <code>{@.html}</code> * can be used in constructor documentation. 
*/ public boolean inConstructor () { return (true); } /** * Will return true since <code>{@.html}</code> * can be used in method documentation. * @return <code>true</code> since <code>{@.html}</code> * can be used in method documentation. */ public boolean inMethod () { return (true); } /** * Will return true since <code>{@.html}</code> * can be used in method documentation. * @return <code>true</code> since <code>{@.html}</code> * can be used in overview documentation. */ public boolean inOverview () { return (true); } /** * Will return true since <code>{@.html}</code> * can be used in package documentation. * @return <code>true</code> since <code>{@.html}</code> * can be used in package documentation. */ public boolean inPackage () { return (true); } /** * Will return true since <code>{@.html}</code> * can be used in type documentation (classes or interfaces). * @return true since <code>{@.html}</code> * can be used in type documentation. */ public boolean inType () { return (true); } /** * Will return true since <code>{@.html}</code> * is an inline tag. * @return <code>true</code> since <code>{@.html}</code> * is an inline tag. */ public boolean isInlineTag () { return (true); } /** * Register this Taglet. * @param tagletMap the map to register this tag to. */ public static void register (Map tagletMap) { HtmlTaglet tag = new HtmlTaglet (); tagletMap.put (tag.getName (), tag); } /** * Format the given string to appear "as is" within a JavaDoc comment. * This method is more complicated than it needs to be, since you might * say why not just use PRE tags surrounding the text. Unfortunately, PRE * is a block level tag that breaks the flow of text, preventing inline * operation. Instead we manually format the whitespace (actually just * spaces and newlines) within the string to preserve the format. 
*/ protected String format (String s) { int base; int offset; StringBuffer ret; ret = new StringBuffer (512); base = 0; offset = 0; while (-1 != (offset = s.indexOf ('\n', base))) { ret.append (Translate.encode (s.substring (base, offset))); ret.append ("<br>\n"); base = offset + 1; } if (base != s.length ()) ret.append (Translate.encode (s.substring (base))); s = ret.toString (); ret.setLength (0); for (int i = 0; i < s.length (); i++) if (' ' == s.charAt (i)) ret.append (" "); else ret.append (s.charAt (i)); return (ret.toString ()); } /** * Given the <code>Tag</code> representation of this custom * tag, return its string representation. * @param tag the <code>Tag</code> representation of this custom tag. */ public String toString (Tag tag) { return (format (tag.text ())); } /** * Given an array of <code>Tag</code>s representing this custom * tag, return its string representation. * @param tags the array of <code>Tag</code>s representing of this custom tag. */ public String toString(Tag[] tags) { StringBuffer ret; if (0 == tags.length) return (null); else { ret = new StringBuffer (512); for (int i = 0; i < tags.length; i++) { if (i > 0) ret.append ("<br>\n"); ret.append (format (tags[i].text())); } return (ret.toString ()); } } } --- NEW FILE: stylesheet.css --- /* Javadoc style sheet */ /* Define colors, fonts and other style attributes here to override the defaults */ /* Page background color */ body { background-color: #EEEEEE } /* Headings */ h1 { font-size: 145% } /* Table colors */ .TableHeadingColor { background: #CCFFCC } /* Dark green */ .TableSubHeadingColor { background: #EEFFEE } /* Light green */ .TableRowColor { background: #EEEEEE } /* Grey */ /* Font used in left-hand frame lists */ .FrameTitleFont { font-size: 100%; font-family: Helvetica, Arial, sans-serif } .FrameHeadingFont { font-size: 90%; font-family: Helvetica, Arial, sans-serif } .FrameItemFont { font-size: 90%; font-family: Helvetica, Arial, sans-serif } /* Navigation bar fonts and 
colors */ .NavBarCell1 { background-color:#EEFFEE;} /* Light green */ .NavBarCell1Rev { background-color:#558B55;} /* Dark green */ .NavBarFont1 { font-family: Arial, Helvetica, sans-serif; color:#000000;} .NavBarFont1Rev { font-family: Arial, Helvetica, sans-serif; color:#EEEEEE;} .NavBarCell2 { font-family: Arial, Helvetica, sans-serif; background-color:#EEEEEE;} .NavBarCell3 { font-family: Arial, Helvetica, sans-serif; background-color:#EEEEEE;} |