htmlparser-cvs Mailing List for HTML Parser (Page 21)
Brought to you by:
derrickoswald
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(141) |
Jun
(108) |
Jul
(66) |
Aug
(127) |
Sep
(155) |
Oct
(149) |
Nov
(72) |
Dec
(72) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(100) |
Feb
(36) |
Mar
(21) |
Apr
(3) |
May
(87) |
Jun
(28) |
Jul
(84) |
Aug
(5) |
Sep
(14) |
Oct
|
Nov
|
Dec
|
2005 |
Jan
(1) |
Feb
(39) |
Mar
(26) |
Apr
(38) |
May
(14) |
Jun
(10) |
Jul
|
Aug
|
Sep
(13) |
Oct
(8) |
Nov
(10) |
Dec
|
2006 |
Jan
|
Feb
(1) |
Mar
(17) |
Apr
(20) |
May
(28) |
Jun
(24) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2015 |
Jan
|
Feb
|
Mar
(1) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: <der...@us...> - 2004-02-29 14:34:35
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv8741/scanners Added Files: StyleScanner.java Log Message: Fix bug #900125 Style Tag Children not grouped Added StyleScanner, a near copy of ScriptScanner. Added testStyleChildren() in StyleTagTest to check its operation. |
From: <der...@us...> - 2004-02-29 14:34:35
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv8741/tags Modified Files: StyleTag.java Log Message: Fix bug #900125 Style Tag Children not grouped Added StyleScanner, a near copy of ScriptScanner. Added testStyleChildren() in StyleTagTest to check it's operation. Index: StyleTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/StyleTag.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** StyleTag.java 2 Jan 2004 16:24:55 -0000 1.34 --- StyleTag.java 29 Feb 2004 14:16:27 -0000 1.35 *************** *** 27,30 **** --- 27,32 ---- package org.htmlparser.tags; + import org.htmlparser.scanners.StyleScanner; + /** * A StyleTag represents a <style> tag. *************** *** 38,45 **** --- 40,53 ---- /** + * The set of end tag names that indicate the end of this tag. + */ + private static final String[] mEndTagEnders = new String[] {"BODY", "HTML"}; + + /** * Create a new style tag. */ public StyleTag () { + setThisScanner (new StyleScanner ()); } *************** *** 54,79 **** /** * Get the style data in this tag. * @return The HTML of the children of this tag. */ ! public String getStyleCode() { ! return getChildrenHTML(); } /** * Print the contents of the style node. */ public String toString() { ! String guts = toHtml(); ! guts = guts.substring (1, guts.length () - 2); ! StringBuffer sb = new StringBuffer(); ! sb.append("Style Node : \n"); ! sb.append("\n"); ! sb.append("Code\n"); ! sb.append("****\n"); ! sb.append(guts+"\n"); ! return sb.toString(); } } --- 62,100 ---- /** + * Return the set of end tag names that cause this tag to finish. + * @return The names of following end tags that stop further scanning. + */ + public String[] getEndTagEnders () + { + return (mEndTagEnders); + } + + /** * Get the style data in this tag. * @return The HTML of the children of this tag. */ ! 
public String getStyleCode () { ! return (getChildrenHTML ()); } /** * Print the contents of the style node. + * @return A string suitable for debugging or a printout. */ public String toString() { ! String guts; ! StringBuffer ret; ! ! ret = new StringBuffer (); ! ! guts = toHtml (); ! guts = guts.substring (1, guts.length () - 1); ! ret.append ("Style node :\n"); ! ret.append (guts); ! ret.append ("\n"); ! ! return (ret.toString ()); } } |
From: <der...@us...> - 2004-02-29 14:34:35
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv8741/tests/tagTests Modified Files: StyleTagTest.java Log Message: Fix bug #900125 Style Tag Children not grouped Added StyleScanner, a near copy of ScriptScanner. Added testStyleChildren() in StyleTagTest to check it's operation. Index: StyleTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/StyleTagTest.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** StyleTagTest.java 2 Jan 2004 16:24:57 -0000 1.35 --- StyleTagTest.java 29 Feb 2004 14:16:27 -0000 1.36 *************** *** 28,31 **** --- 28,32 ---- import org.htmlparser.Parser; + import org.htmlparser.StringNode; import org.htmlparser.tags.HeadTag; import org.htmlparser.tags.Html; *************** *** 65,69 **** "</STYLE>"; createParser(style); - Parser.setLineSeparator("\r\n"); parseAndAssertNodeCount(1); assertTrue(node[0] instanceof StyleTag); --- 66,69 ---- *************** *** 130,132 **** --- 130,163 ---- assertStringEquals("Expected Style Code",expectedCode,styleTag.getStyleCode()); } + + /** + * See bug #900125 Style Tag Children not grouped + */ + public void testStyleChildren () throws ParserException + { + String style = + "\nbody {color:white}\n" + + "<!--\n" + + ".teliabox {\n" + + "color: #A9014E;\n" + + "text-align: center;\n" + + "background-image:url(hallo.gif);\n" + + "}\n" + + "-->"; + String html = + "<style type=\"text/css\" media=\"screen\">" + + style + + "</style>"; + StyleTag tag; + StringNode string; + + createParser (html); + parseAndAssertNodeCount (1); + assertTrue ("Node should be a STYLE tag", node[0] instanceof StyleTag); + tag = (StyleTag)node[0]; + assertTrue ("STYLE tag should have one child", 1 == tag.getChildCount ()); + assertTrue ("Child should be a StringNode", tag.getChild (0) instanceof StringNode); + string 
= (StringNode)tag.getChild (0); + assertStringEquals ("Style text incorrect", style, string.toHtml ()); + } } |
From: <der...@us...> - 2004-02-29 13:10:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25994/tests Modified Files: ParserTest.java Log Message: Fix bug #900128 RemarkNode.setText() does not set Text Add override setText() to StringNode and RemarkNode. Add unit tests to excercise the new code. Remove remaining XX_FILTER constants. Index: ParserTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTest.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** ParserTest.java 25 Jan 2004 21:33:12 -0000 1.56 --- ParserTest.java 29 Feb 2004 12:52:21 -0000 1.57 *************** *** 42,45 **** --- 42,46 ---- import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; + import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; import org.htmlparser.filters.NodeClassFilter; *************** *** 884,886 **** --- 885,958 ---- } } + + /** + * See bug #900128 RemarkNode.setText() does not set Text + */ + public void testSetStringText () throws Exception + { + String text; + String html; + String newtext; + String newhtml; + Node txt; + + text = "This is just text."; + html = "<body>" + text + "</body>"; + newtext = "This is different text."; + newhtml = "<body>" + newtext + "</body>"; + createParser (html); + parseAndAssertNodeCount (1); + assertStringEquals ("html wrong", html, node[0].toHtml ()); + assertTrue ("wrong number of children", 1 == node[0].getChildren ().size ()); + assertTrue ("string node expected", node[0].getChildren ().elementAt (0) instanceof StringNode); + txt = node[0].getChildren ().elementAt (0); + assertStringEquals ("string html wrong", text, txt.toHtml ()); + assertStringEquals ("string contents wrong", text, txt.getText ()); + assertTrue ("toString wrong", txt.toString ().endsWith (text)); + txt.setText (newtext); + assertStringEquals ("html wrong", newhtml, 
node[0].toHtml ()); + assertStringEquals ("new string html wrong", newtext, txt.toHtml ()); + assertStringEquals ("new string contents wrong", newtext, txt.getText ()); + assertTrue ("toString wrong", txt.toString ().endsWith (newtext)); + } + + /** + * See bug #900128 RemarkNode.setText() does not set Text + */ + public void testSetRemarkText () throws Exception + { + String text; + String remark; + String html; + String newtext; + String newremark; + String newhtml; + Node rem; + + text = " This is a remark. "; + remark = "<!--" + text + "-->"; + html = "<body>" + remark + "</body>"; + newtext = " This is a different remark. "; + newremark = "<!--" + newtext + "-->"; + newhtml = "<body>" + newremark + "</body>"; + createParser (html); + parseAndAssertNodeCount (1); + assertStringEquals ("html wrong", html, node[0].toHtml ()); + assertTrue ("wrong number of children", 1 == node[0].getChildren ().size ()); + assertTrue ("remark node expected", node[0].getChildren ().elementAt (0) instanceof RemarkNode); + rem = node[0].getChildren ().elementAt (0); + assertStringEquals ("remark html wrong", remark, rem.toHtml ()); + assertStringEquals ("remark contents wrong", text, rem.getText ()); + assertTrue ("toString wrong", rem.toString ().endsWith (text)); + rem.setText (newtext); + assertStringEquals ("html wrong", newhtml, node[0].toHtml ()); + assertStringEquals ("new remark html wrong", newremark, rem.toHtml ()); + assertStringEquals ("new remark contents wrong", newtext, rem.getText ()); + assertTrue ("toString wrong", rem.toString ().endsWith (newtext)); + rem.setText (newremark); + assertStringEquals ("html wrong", newhtml, node[0].toHtml ()); + assertStringEquals ("new remark html wrong", newremark, rem.toHtml ()); + assertStringEquals ("new remark contents wrong", newtext, rem.getText ()); + assertTrue ("toString wrong", rem.toString ().endsWith (newtext)); + } } |
From: <der...@us...> - 2004-02-29 13:10:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25994/tags Modified Files: ImageTag.java LinkTag.java Log Message: Fix bug #900128 RemarkNode.setText() does not set Text Add override setText() to StringNode and RemarkNode. Add unit tests to exercise the new code. Remove remaining XX_FILTER constants. Index: ImageTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ImageTag.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** ImageTag.java 25 Jan 2004 21:33:12 -0000 1.42 --- ImageTag.java 29 Feb 2004 12:52:21 -0000 1.43 *************** *** 39,44 **** public class ImageTag extends Tag { - public static final String IMAGE_TAG_FILTER="-i"; - /** * The set of names handled by this tag. --- 39,42 ---- Index: LinkTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/LinkTag.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** LinkTag.java 2 Jan 2004 16:24:55 -0000 1.47 --- LinkTag.java 29 Feb 2004 12:52:21 -0000 1.48 *************** *** 37,42 **** public class LinkTag extends CompositeTag { - public static final String LINK_TAG_FILTER="-l"; - /** * The set of names handled by this tag. --- 37,40 ---- |
From: <der...@us...> - 2004-02-29 13:10:25
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25994/lexer/nodes Modified Files: RemarkNode.java StringNode.java Log Message: Fix bug #900128 RemarkNode.setText() does not set Text Add override setText() to StringNode and RemarkNode. Add unit tests to excercise the new code. Remove remaining XX_FILTER constants. Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/RemarkNode.java,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** RemarkNode.java 2 Jan 2004 16:24:53 -0000 1.16 --- RemarkNode.java 29 Feb 2004 12:52:20 -0000 1.17 *************** *** 38,56 **** public class RemarkNode extends AbstractNode { ! public final static String REMARK_NODE_FILTER="-r"; /** ! * Constructor takes in the text string, beginning and ending posns. ! * @param page The page this string is on. ! * @param start The beginning position of the string. ! * @param end The ending positiong of the string. */ public RemarkNode (Page page, int start, int end) { super (page, start, end); } /** * Returns the text contents of the comment tag. */ public String getText() --- 38,72 ---- public class RemarkNode extends AbstractNode { ! /** ! * The contents of the remark node, or override text. ! */ ! protected String mText; /** ! * Constructor takes in the text string. ! * @param text The string node text. For correct generation of HTML, this ! * should not contain representations of tags (unless they are balanced). ! */ ! public RemarkNode (String text) ! { ! super (null, 0, 0); ! setText (text); ! } ! ! /** ! * Constructor takes in the page and beginning and ending posns. ! * @param page The page this remark is on. ! * @param start The beginning position of the remark. ! * @param end The ending positiong of the remark. 
*/ public RemarkNode (Page page, int start, int end) { super (page, start, end); + mText = null; } /** * Returns the text contents of the comment tag. + * @return The contents of the text inside the comment delimiters. */ public String getText() *************** *** 60,73 **** String ret; ! start = getStartPosition () + 4; ! end = getEndPosition () - 3; ! if (start >= end) ! ret = ""; else ! ret = mPage.getText (start, end); return (ret); } public String toPlainTextString() { --- 76,108 ---- String ret; ! if (null == mText) ! { ! start = getStartPosition () + 4; // <!-- ! end = getEndPosition () - 3; // --> ! if (start >= end) ! ret = ""; ! else ! ret = mPage.getText (start, end); ! } else ! ret = mText; return (ret); } + /** + * Sets the string contents of the node. + * If the text has the remark delimiters (<!-- -->), these are stripped off. + * @param text The new text for the node. + */ + public void setText (String text) + { + mText = text; + if (text.startsWith ("<!--") && text.endsWith ("-->")) + mText = text.substring (4, text.length () - 3); + nodeBegin = 0; + nodeEnd = mText.length (); + } + public String toPlainTextString() { *************** *** 77,85 **** public String toHtml() { ! return (mPage.getText (getStartPosition (), getEndPosition ())); } /** * Print the contents of the remark tag. */ public String toString() --- 112,138 ---- public String toHtml() { ! StringBuffer buffer; ! String ret; ! ! if (null == mText) ! ret = mPage.getText (getStartPosition (), getEndPosition ()); ! else ! { ! buffer = new StringBuffer (mText.length () + 7); ! buffer.append ("<!--"); ! buffer.append (mText); ! buffer.append ("-->"); ! ret = buffer.toString (); ! } ! ! return (ret); } /** * Print the contents of the remark tag. + * This is suitable for display in a debugger or output to a printout. + * Control characters are replaced by their equivalent escape + * sequence and contents is truncated to 80 characters. + * @return A string representation of the remark node. 
*/ public String toString() *************** *** 95,110 **** endpos = getEndPosition (); ret = new StringBuffer (endpos - startpos + 20); ! start = new Cursor (getPage (), startpos); ! end = new Cursor (getPage (), endpos); ! ret.append ("Rem ("); ! ret.append (start); ! ret.append (","); ! ret.append (end); ! ret.append ("): "); ! while (start.getPosition () < endpos) { ! try { ! c = mPage.getCharacter (start); switch (c) { --- 148,203 ---- endpos = getEndPosition (); ret = new StringBuffer (endpos - startpos + 20); ! if (null == mText) { ! start = new Cursor (getPage (), startpos); ! end = new Cursor (getPage (), endpos); ! ret.append ("Rem ("); ! ret.append (start); ! ret.append (","); ! ret.append (end); ! ret.append ("): "); ! start.setPosition (startpos + 4); // <!-- ! endpos -= 3; // --> ! while (start.getPosition () < endpos) { ! try ! { ! c = mPage.getCharacter (start); ! switch (c) ! { ! case '\t': ! ret.append ("\\t"); ! break; ! case '\n': ! ret.append ("\\n"); ! break; ! case '\r': ! ret.append ("\\r"); ! break; ! default: ! ret.append (c); ! } ! } ! catch (ParserException pe) ! { ! // not really expected, but we're only doing toString, so ignore ! } ! if (77 <= ret.length ()) ! { ! ret.append ("..."); ! break; ! } ! } ! } ! else ! { ! ret.append ("Rem ("); ! ret.append (startpos); ! ret.append (","); ! ret.append (endpos); ! ret.append ("): "); ! while (startpos < endpos) ! { ! c = mText.charAt (startpos); switch (c) { *************** *** 121,133 **** ret.append (c); } ! } ! catch (ParserException pe) ! { ! // not really expected, but we'return only doing toString, so ignore ! } ! if (77 <= ret.length ()) ! { ! ret.append ("..."); ! break; } } --- 214,223 ---- ret.append (c); } ! if (77 <= ret.length ()) ! { ! ret.append ("..."); ! break; ! } ! 
startpos++; } } Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/StringNode.java,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** StringNode.java 2 Jan 2004 16:24:53 -0000 1.17 --- StringNode.java 29 Feb 2004 12:52:20 -0000 1.18 *************** *** 38,45 **** public class StringNode extends AbstractNode { ! public static final String STRING_FILTER = "-string"; /** ! * Constructor takes in the text string, beginning and ending posns. * @param page The page this string is on. * @param start The beginning position of the string. --- 38,59 ---- public class StringNode extends AbstractNode { ! /** ! * The contents of the string node, or override text. ! */ ! protected String mText; /** ! * Constructor takes in the text string. ! * @param text The string node text. For correct generation of HTML, this ! * should not contain representations of tags (unless they are balanced). ! */ ! public StringNode (String text) ! { ! super (null, 0, 0); ! setText (text); ! } ! ! /** ! * Constructor takes in the page and beginning and ending posns. * @param page The page this string is on. * @param start The beginning position of the string. *************** *** 49,52 **** --- 63,67 ---- { super (page, start, end); + mText = null; } *************** *** 65,81 **** public void setText (String text) { ! mPage = new Page (text); nodeBegin = 0; ! nodeEnd = text.length (); ! // TODO: this really needs work ! try ! { ! Cursor cursor = new Cursor (mPage, nodeBegin); ! for (int i = nodeBegin; i < nodeEnd; i++) ! mPage.getCharacter (cursor); ! } ! catch (ParserException pe) ! { ! } } --- 80,86 ---- public void setText (String text) { ! mText = text; nodeBegin = 0; ! nodeEnd = mText.length (); } *************** *** 87,91 **** public String toHtml () { ! 
return (mPage.getText (getStartPosition (), getEndPosition ())); } --- 92,102 ---- public String toHtml () { ! String ret; ! ! ret = mText; ! if (null == ret) ! ret = mPage.getText (getStartPosition (), getEndPosition ()); ! ! return (ret); } *************** *** 109,124 **** endpos = getEndPosition (); ret = new StringBuffer (endpos - startpos + 20); ! start = new Cursor (getPage (), startpos); ! end = new Cursor (getPage (), endpos); ! ret.append ("Txt ("); ! ret.append (start); ! ret.append (","); ! ret.append (end); ! ret.append ("): "); ! while (start.getPosition () < endpos) { ! try { ! c = mPage.getCharacter (start); switch (c) { --- 120,173 ---- endpos = getEndPosition (); ret = new StringBuffer (endpos - startpos + 20); ! if (null == mText) { ! start = new Cursor (getPage (), startpos); ! end = new Cursor (getPage (), endpos); ! ret.append ("Txt ("); ! ret.append (start); ! ret.append (","); ! ret.append (end); ! ret.append ("): "); ! while (start.getPosition () < endpos) { ! try ! { ! c = mPage.getCharacter (start); ! switch (c) ! { ! case '\t': ! ret.append ("\\t"); ! break; ! case '\n': ! ret.append ("\\n"); ! break; ! case '\r': ! ret.append ("\\r"); ! break; ! default: ! ret.append (c); ! } ! } ! catch (ParserException pe) ! { ! // not really expected, but we're only doing toString, so ignore ! } ! if (77 <= ret.length ()) ! { ! ret.append ("..."); ! break; ! } ! } ! } ! else ! { ! ret.append ("Txt ("); ! ret.append (startpos); ! ret.append (","); ! ret.append (endpos); ! ret.append ("): "); ! while (startpos < endpos) ! { ! c = mText.charAt (startpos); switch (c) { *************** *** 135,147 **** ret.append (c); } ! } ! catch (ParserException pe) ! { ! // not really expected, but we'return only doing toString, so ignore ! } ! if (77 <= ret.length ()) ! { ! ret.append ("..."); ! break; } } --- 184,193 ---- ret.append (c); } ! if (77 <= ret.length ()) ! { ! ret.append ("..."); ! break; ! } ! startpos++; } } |
From: <der...@us...> - 2004-02-29 01:56:19
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv19426 Modified Files: ScriptTag.java Log Message: Correct booboo in ScriptTag toHtml() injected by fix to bug #902121. Index: ScriptTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ScriptTag.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** ScriptTag.java 28 Feb 2004 15:52:43 -0000 1.36 --- ScriptTag.java 29 Feb 2004 01:38:36 -0000 1.37 *************** *** 27,31 **** --- 27,33 ---- package org.htmlparser.tags; + import org.htmlparser.Node; import org.htmlparser.scanners.ScriptScanner; + import org.htmlparser.util.SimpleNodeIterator; /** *************** *** 136,161 **** } ! /** ! * Render the tag as HTML. ! * @return The tag as an HTML fragment. ! * @see org.htmlparser.Node#toHtml() ! */ ! public String toHtml() { ! StringBuffer ret; ! ! ret = new StringBuffer (); ! ret.append (super.toHtml ()); ! if (!isEmptyXmlTag ()) ! { ! if (null != getScriptCode ()) ! ret.append (getScriptCode ()); ! else ! putChildrenInto (ret); ! if (null != getEndTag ()) ! putEndTagInto (ret); ! } ! return (ret.toString()); } --- 138,155 ---- } ! protected void putChildrenInto(StringBuffer sb) { ! Node node; ! if (null != getScriptCode ()) ! sb.append (getScriptCode ()); ! else ! for (SimpleNodeIterator e = children (); e.hasMoreNodes ();) ! { ! node = e.nextNode (); ! // eliminate virtual tags ! // if (!(node.getStartPosition () == node.getEndPosition ())) ! sb.append (node.toHtml ()); ! } } |
From: <der...@us...> - 2004-02-28 16:10:13
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv977/tests/scannersTests Modified Files: ScriptScannerTest.java Log Message: Fix bug #902121 StringBean throws NullPointerException. Added ScriptDecoder to handle Microsoft Script Encoder encrypted tags. Added accessor to ScriptTag's scriptCode property to be able to override it. Ensured that a Tag always has a non-null name. Skip STYLE tags in StringBean, just like SCRIPT. Index: ScriptScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java,v retrieving revision 1.52 retrieving revision 1.53 diff -C2 -d -r1.52 -r1.53 *** ScriptScannerTest.java 14 Jan 2004 02:53:47 -0000 1.52 --- ScriptScannerTest.java 28 Feb 2004 15:52:44 -0000 1.53 *************** *** 27,38 **** --- 27,44 ---- package org.htmlparser.tests.scannersTests; + import java.io.IOException; + import java.io.StringReader; import java.util.Hashtable; import org.htmlparser.Node; import org.htmlparser.Parser; + import org.htmlparser.filters.TagNameFilter; + import org.htmlparser.lexer.Lexer; + import org.htmlparser.scanners.ScriptDecoder; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.ScriptTag; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.util.NodeIterator; + import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; *************** *** 579,581 **** --- 585,673 ---- assertStringEquals ("bad html", teststring, htmlBuffer.toString ()); } + + /** + * See bug #902121 StringBean throws NullPointerException + * Contributed by Reza Motori (rezamotori) + */ + public void testDecodeScript () + throws ParserException + { + String plaintext = + "<HTML>\n" + + "<HEAD>\n" + + "<TITLE>Script Encoder Sample Page</TITLE>\n" + + "<SCRIPT LANGUAGE=\"JScript.Encode\">\n" + + "<!--//\n" + + "//Copyright© 1998 
Microsoft Corporation. All Rights Reserved.\n" + + "//**Start Encode**\r\n" + + "function verifyCorrectBrowser(){\r\n" + + " if(navigator.appName == \"Microsoft Internet Explorer\")\r\n" + + " if (navigator.appVersion.indexOf (\"5.\") >= 0)\r\n" + + " return(true);\r\n" + + " else\r\n" + + " return(false);\r\n" + + "}\r\n" + + "function getAppropriatePage(){\r\n" + + " var str1 = \"Had this been an actual Web site, a page compatible with \";\r\n" + + " var str2 = \"browsers other than \";\r\n" + + " var str3 = \"Microsoft Internet Explorer 5.0 \";\r\n" + + " var str4 = \"would have been loaded.\";\r\n" + + " if (verifyCorrectBrowser())\r\n" + + " document.write(str1 + str3 + str4);\r\n" + + " else\r\n" + + " document.write(str1 + str2 + str3 + str4);\r\n" + + "}\r\n" + + "//-->\r\n" + + "</SCRIPT>\n" + + "</HEAD>\n" + + "<BODY onload=\"getAppropriatePage()\">\n" + + "</BODY>\n" + + "</HTML>"; + String cryptext = + "<HTML>\n" + + "<HEAD>\n" + + "<TITLE>Script Encoder Sample Page</TITLE>\n" + + "<SCRIPT LANGUAGE=\"JScript.Encode\">\n" + + "<!--//\n" + + "//Copyright© 1998 Microsoft Corporation. All Rights Reserved.\n" + + "//**Start Encode**#@~^ZwIAAA==@#@&0; mDkW P7nDb0zZKD.n1YAMGhk+Dvb`@#@&P,kW`UC7kLlDGDcl22gl:n~{'~Jtr1DGkW6YP&xDnD +OPA62sKD+ME#@#@&P,~~k6PvxC\\rLmYGDcCwa.n.kkWU bx[+X66Pcr*cJ#,@*{~!*@#@&P,P~~,D+D;D `YM;n#p@#@&P~P~n^/n@#@&~P,P~~M+Y;. `Wl^d#I@#@&)@#@&6E ^YbWUPT+O)awDK2DblYKCo`* @#@&~~7l.PkOD8Px~rCl[~Dtr/,8+U,l Pl1Y!CV,n4,/rO~Pm~wmo+,^G:alDk8Vn~SkOt,Ei@#@&~~7lD~dDD+P{~r4.Khk+DkPKOtD~Y4lU~ri@#@&~P7lD,dOD2P{PEHr^MWdW6OP&xOnMx+O~A62VK.D~lRZPJp@#@&~P7l.PkY.*,'PrAW!VN,4C\\P(+nx~sKl[+9 Jp@#@&~,k0~c7+.k6z;W.M+1YAMWSd+M`b#@#@&~~,PNK^Es+xD ADbY`dY.q,_~/D.&,_~dDDcbI@#@&~Psk+@#@&P,PP9W1;:xORSDrO`/D.F,_PkO. 
,_,/ODf~3PdYM*#p@#@&N@#@&z&R @*@#@&qrIAAA==^#~@</SCRIPT>\n" + + "</HEAD>\n" + + "<BODY onload=\"getAppropriatePage()\">\n" + + "</BODY>\n" + + "</HTML>"; + Lexer lexer; + + lexer = new Lexer (cryptext); + ScriptDecoder.LAST_STATE = ScriptDecoder.STATE_INITIAL; // read everything + try + { + String result = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ()); + assertStringEquals ("decoding failed", plaintext, result); + } + finally + { + ScriptDecoder.LAST_STATE = ScriptDecoder.STATE_DONE; + } + } + + /** + * See bug #902121 StringBean throws NullPointerException + * Contributed by Reza Motori (rezamotori) + */ + public void testDecodePage () + throws ParserException + { + String url = "http://htmlparser.sourceforge.net/test/EncryptedScriptExample.html"; + String plaintext = + "\r\n" + + "var nows = new Date();\r\n" + + "var nIndexs = nows.getTime();\r\n" + + "document.write(\"<img src=\\\"http://www.parsads.com/adserve/scriptinject.asp?F=4&Z=3,4,5,10,12&N=1&U=644&O=&nocache=\" + nIndexs + \"\\\" width=\\\"1\\\" hight=\\\"1\\\"><img src=\\\"http://www.parsads.com/adserve/scriptinject.asp?F=4&Z=3,4,5,10,12&N=1&U=643&O=&nocache=\" + nIndexs + \"\\\" width=\\\"1\\\" hight=\\\"1\\\"><img src=\\\"http://www.parsads.com/adserve/scriptinject.asp?F=4&Z=3,4,5,10,12&N=1&U=324&O=&nocache=\" + nIndexs + \"\\\" width=\\\"1\\\" hight=\\\"1\\\">\");\r\n"; + + parser = new Parser (url); + NodeList scripts = parser.extractAllNodesThatMatch (new TagNameFilter ("SCRIPT")); + assertEquals ("wrong number of scripts found", 2, scripts.size ()); + ScriptTag script = (ScriptTag)scripts.elementAt (1); + assertStringEquals ("script not decoded correctly", plaintext, script.getScriptCode ()); + } } |
From: <der...@us...> - 2004-02-28 16:10:13
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv977/tests Modified Files: ParserTestCase.java Log Message: Fix bug #902121 StringBean throws NullPointerException. Added ScriptDecoder to handle Microsoft Script Encoder encrypted tags. Added accessor to ScriptTag's scriptCode property to be able to override it. Ensured that a Tag always has a non-null name. Skip STYLE tags in StringBean, just like SCRIPT. Index: ParserTestCase.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTestCase.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** ParserTestCase.java 14 Jan 2004 02:53:47 -0000 1.44 --- ParserTestCase.java 28 Feb 2004 15:52:43 -0000 1.45 *************** *** 108,122 **** if (expected.length() < actual.length()) { mismatchInfo = "\n\nACTUAL result has "+(actual.length()-expected.length())+" extra characters at the end. They are :"; ! ! for (int i = expected.length(); i < actual.length(); i++) { mismatchInfo += ("\nPosition : " + i + " , Code = " + (int) actual.charAt(i)); ! } } else if(expected.length() > actual.length()) { mismatchInfo = "\n\nEXPECTED result has "+(expected.length()-actual.length())+" extra characters at the end. They are :"; ! ! for (int i = actual.length(); i < expected.length(); i++) { mismatchInfo += ("\nPosition : " + i + " , Code = " + (int) expected.charAt(i)); ! } ! } for (int i = 0; i < expected.length(); i++) { --- 108,123 ---- if (expected.length() < actual.length()) { mismatchInfo = "\n\nACTUAL result has "+(actual.length()-expected.length())+" extra characters at the end. They are :"; ! int limit = Math.min (expected.length() + 10, actual.length()); ! for (int i = expected.length(); i < limit; i++) mismatchInfo += ("\nPosition : " + i + " , Code = " + (int) actual.charAt(i)); ! if (limit != actual.length()) ! 
mismatchInfo += "\netc."; } else if(expected.length() > actual.length()) { mismatchInfo = "\n\nEXPECTED result has "+(expected.length()-actual.length())+" extra characters at the end. They are :"; ! int limit = Math.min (actual.length() + 10, expected.length()); ! for (int i = actual.length(); i < expected.length(); i++) mismatchInfo += ("\nPosition : " + i + " , Code = " + (int) expected.charAt(i)); ! if (limit != expected.length ()) ! mismatchInfo += "\netc."; } for (int i = 0; i < expected.length(); i++) { |
From: <der...@us...> - 2004-02-28 16:10:09
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv977/lexer/nodes Modified Files: TagNode.java Log Message: Fix bug #902121 StringBean throws NullPointerException. Added ScriptDecoder to handle Microsoft Script Encoder encrypted tags. Added accessor to ScriptTag's scriptCode property to be able to override it. Ensured that a Tag always has a non-null name. Skip STYLE tags in StringBean, just like SCRIPT. Index: TagNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/TagNode.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** TagNode.java 9 Feb 2004 02:09:44 -0000 1.30 --- TagNode.java 28 Feb 2004 15:52:43 -0000 1.31 *************** *** 601,605 **** /** ! * A call to a tag's toHTML() method will render it in HTML. * @see org.htmlparser.Node#toHtml() */ --- 601,607 ---- /** ! * Render the tag as HTML. ! * A call to a tag's <code>toHtml()</code> method will render it in HTML. ! * @return The tag as an HTML fragment. * @see org.htmlparser.Node#toHtml() */ *************** *** 819,823 **** raw = getRawTagName (); ! return ((null == raw) ? false : ('/' == raw.charAt (0))); } --- 821,825 ---- raw = getRawTagName (); ! return ((null == raw) ? false : ((0 != raw.length ()) && ('/' == raw.charAt (0)))); } |
From: <der...@us...> - 2004-02-28 16:10:09
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/filters In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv977/filters Modified Files: package.html Log Message: Fix bug #902121 StringBean throws NullPointerException. Added ScriptDecoder to handle Microsoft Script Encoder encrypted tags. Added accessor to ScriptTag's scriptCode property to be able to override it. Ensured that a Tag always has a non-null name. Skip STYLE tags in StringBean, just like SCRIPT. Index: package.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/filters/package.html,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** package.html 2 Jan 2004 16:24:53 -0000 1.4 --- package.html 28 Feb 2004 15:52:43 -0000 1.5 *************** *** 38,42 **** parser.parse (new HasAttributeFilter ("id")); </pre> ! These filters can be combined to yield powerfull extraction capabilities. For example, to get a list of links where the contents is an image, you could use: <pre> --- 38,42 ---- parser.parse (new HasAttributeFilter ("id")); </pre> ! These filters can be combined to yield powerful extraction capabilities. For example, to get a list of links where the contents is an image, you could use: <pre> |
From: <der...@us...> - 2004-02-28 16:10:08
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv977/scanners Modified Files: ScriptScanner.java Added Files: ScriptDecoder.java Log Message: Fix bug #902121 StringBean throws NullPointerException. Added ScriptDecoder to handle Microsoft Script Encoder encrypted tags. Added accessor to ScriptTag's scriptCode property to be able to override it. Ensured that a Tag always has a non-null name. Skip STYLE tags in StringBean, just like SCRIPT. --- NEW FILE: ScriptDecoder.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptDecoder.java,v $ // $Author: derrickoswald $ // $Date: 2004/02/28 15:52:43 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.scanners; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Page; import org.htmlparser.util.ParserException; /** * Decode script. 
* Script obfuscated by the <A href="http://www.microsoft.com/downloads/details.aspx?FamilyId=E7877F67-C447-4873-B1B0-21F0626A6329&displaylang=en" target="_parent">Windows Script Encoder</A> * provided by Microsoft, is converted to plaintext. This code is based loosely * on example code provided by MrBrownstone with changes by Joe Steele, see * <A href="http://www.virtualconspiracy.com/download/scrdec14.c" target="_parent">scrdec14.c</A>. */ public class ScriptDecoder { /** * Termination state. */ public static final int STATE_DONE = 0; /** * State on entry. */ public static final int STATE_INITIAL = 1; /** * State while reading the encoded length. */ protected static final int STATE_LENGTH = 2; /** * State when reading up to decoded text. */ protected static final int STATE_PREFIX = 3; /** * State while decoding. */ protected static final int STATE_DECODE = 4; /** * State when reading an escape sequence. */ protected static final int STATE_ESCAPE = 5; /** * State when reading the checksum. */ protected static final int STATE_CHECKSUM = 6; /** * State while exiting. */ protected static final int STATE_FINAL = 7; /** * The state to enter when decrypting is complete. * If this is STATE_DONE, the decryption will return with any characters * following the encoded text still unconsumed. * Otherwise, if this is STATE_INITIAL, the input will be exhausted and * all following characters will be contained in the return value * of the <code>Decode()</code> method. */ public static int LAST_STATE = STATE_DONE; /** * Table of lookup choice. * The decoding cycles between three flavours determined * by this sequence of 64 choices, corresponding to the * first dimension of the lookup table. */ protected static byte mEncodingIndex[] = { 1, 2, 0, 1, 2, 0, 2, 0, 0, 2, 0, 2, 1, 0, 2, 0, 1, 0, 2, 0, 1, 1, 2, 0, 0, 2, 1, 0, 2, 0, 0, 2, 1, 1, 0, 2, 0, 2, 0, 1, 0, 1, 1, 2, 0, 1, 0, 2, 1, 0, 2, 0, 1, 1, 2, 0, 0, 1, 1, 2, 0, 1, 0, 2, }; /** * Two dimensional lookup table. 
* The decoding uses this table to determine the plaintext for * characters that aren't mEscaped. */ protected static char mLookupTable[][] = { { '{', '2', '0', '!', ')', '[', '8', '3', '=', 'X', ':', '5', 'e', '9', '\\', 'V', 's', 'f', 'N', 'E', 'k', 'b', 'Y', 'x', '^', '}', 'J', 'm', 'q', 0, '`', 0, 'S', 0, 'B', '\'', 'H', 'r', 'u', '1', '7', 'M', 'R', '"', 'T', 'j', 'G', 'd', '-', ' ', '', '.', 'L', ']', '~', 'l', 'o', 'y', 't', 'C', '&', 'v', '%', '$', '+', '(', '#', 'A', '4', '\t', '*', 'D', '?', 'w', ';', 'U', 'i', 'a', 'c', 'P', 'g', 'Q', 'I', 'O', 'F', 'h', '|', '6', 'p', 'n', 'z', '/', '_', 'K', 'Z', ',', 'W', }, { 'W', '.', 'G', 'z', 'V', 'B', 'j', '/', '&', 'I', 'A', '4', '2', '[', 'v', 'r', 'C', '8', '9', 'p', 'E', 'h', 'q', 'O', '\t', 'b', 'D', '#', 'u', 0, '~', 0, '^', 0, 'w', 'J', 'a', ']', '"', 'K', 'o', 'N', ';', 'L', 'P', 'g', '*', '}', 't', 'T', '+', '-', ',', '0', 'n', 'k', 'f', '5', '%', '!', 'd', 'M', 'R', 'c', '?', '{', 'x', ')', '(', 's', 'Y', '3', '', 'm', 'U', 'S', '|', ':', '_', 'e', 'F', 'X', '1', 'i', 'l', 'Z', 'H', '\'', '\\', '=', '$', 'y', '7', '`', 'Q', ' ', '6', }, { 'n', '-', 'u', 'R', '`', 'q', '^', 'I', '\\', 'b', '}', ')', '6', ' ', '|', 'z', '', 'k', 'c', '3', '+', 'h', 'Q', 'f', 'v', '1', 'd', 'T', 'C', 0, ':', 0, '~', 0, 'E', ',', '*', 't', '\'', '7', 'D', 'y', 'Y', '/', 'o', '&', 'r', 'j', '9', '{', '?', '8', 'w', 'g', 'S', 'G', '4', 'x', ']', '0', '#', 'Z', '[', 'l', 'H', 'U', 'p', 'i', '.', 'L', '!', '$', 'N', 'P', '\t', 'V', 's', '5', 'a', 'K', 'X', ';', 'W', '"', 'm', 'M', '%', '(', 'F', 'J', '2', 'A', '=', '_', 'O', 'B', 'e', }, }; /** * The base 64 decoding table. * This array determines the value of decoded base 64 elements. */ protected static int mDigits[]; static { mDigits = new int[0x7b]; for (int i = 0; i < 26; i++) { mDigits['A' + i] = i; mDigits['a' + i] = i + 26; } for (int i = 0; i < 10; i++) mDigits['0' + i] = i + 52; mDigits[0x2b] = '>'; mDigits[0x2f] = '?'; } /** * The leader. 
* The prefix to the encoded script is #@~^nnnnnn== where the n are the * length digits in base64. */ protected static char mLeader[] = { '#', '@', '~', '^', }; /** * The prefix. * The prfix separates the encoded text from the length. */ protected static char mPrefix[] = { '=', '=', }; /** * The trailer. * The suffix to the encoded script is nnnnnn==^#~@ where the n are the * checksum digits in base64. These characters are the part after the checksum. */ protected static char mTrailer[] = { '=', '=', '^', '#', '~', '@', }; /** * Escape sequence characters. */ protected static char mEscapes[] = { '#', '&', '!', '*', '$', }; /** * The escaped characters corresponding to the each escape sequence. */ protected static char mEscaped[] = //"\r\n<>@"; { '\r', '\n', '<', '>', '@', }; /** * Extract the base 64 encoded number. * This is a very limited subset of base 64 encoded characters. * Six characters are expected. These are translated into a single long * value. For a more complete base 64 codec see for example the base64 * package of <A href="http://sourceforge.net/projects/iharder/" target="_parent">iHarder.net</A> * @param p Six base 64 encoded digits. * @return The value of the decoded number. */ protected static long decodeBase64 (char[] p) { long ret; ret = 0; ret += (mDigits[p[0]] << 2); ret += (mDigits[p[1]] >> 4); ret += (mDigits[p[1]] & 0xf) << 12; ret += ((mDigits[p[2]] >> 2) << 8); ret += ((mDigits[p[2]] & 0x3) << 22); ret += (mDigits[p[3]] << 16); ret += ((mDigits[p[4]] << 2) << 24); ret += ((mDigits[p[5]] >> 4) << 24); return (ret); } /** * Decode script encoded by the Microsoft obfuscator. * @param page The source for encoded text. * @param cursor The position at which to start decoding. * This is advanced to the end of the encoded text. * @return The plaintext. * @exception ParserException If an error is discovered while decoding. 
*/ public static String Decode (Page page, Cursor cursor) throws ParserException { int state; int substate_initial; int substate_length; int substate_prefix; int substate_checksum; int substate_final; long checksum; long length; char buffer[]; buffer = new char[6]; int index; char character; int input_character; boolean found; StringBuffer ret; ret = new StringBuffer (1024); state = STATE_INITIAL; substate_initial = 0; substate_length = 0; substate_prefix = 0; substate_checksum = 0; substate_final = 0; length = 0L; checksum = 0L; index = 0; while (STATE_DONE != state) { input_character = page.getCharacter (cursor); character = (char)input_character; if (0 == input_character) { if ( (STATE_INITIAL != state) || (0 != substate_initial) || (0 != substate_length) || (0 != substate_prefix) || (0 != substate_checksum) || (0 != substate_final)) throw new ParserException ("illegal state for exit"); state = STATE_DONE; } else switch (state) { case STATE_INITIAL: if (character == mLeader[substate_initial]) { substate_initial++; if (substate_initial == mLeader.length) { substate_initial = 0; state = STATE_LENGTH; } } else { // oops, flush for (int k = 0; 0 < substate_initial; k++) { ret.append (mLeader[k++]); substate_initial--; } ret.append (character); } break; case STATE_LENGTH: buffer[substate_length] = character; substate_length++; if (substate_length >= buffer.length) { length = decodeBase64 (buffer); if (0 > length) throw new ParserException ("illegal length: " + length); substate_length = 0; state = STATE_PREFIX; } break; case STATE_PREFIX: if (character == mPrefix[substate_prefix]) substate_prefix++; else throw new ParserException ("illegal character encountered: " + (int)character + " ('" + character + "')"); if (substate_prefix >= mPrefix.length) { substate_prefix = 0; state = STATE_DECODE; } break; case STATE_DECODE: if ('@' == character) state = STATE_ESCAPE; else { if (input_character < 0x80) { if (input_character == '\t') input_character = 0; else if 
(input_character >= ' ') input_character -= ' ' - 1; else throw new ParserException ("illegal encoded character: " + input_character + " ('" + character + "')"); char ch = mLookupTable[mEncodingIndex[index % 64]][input_character]; ret.append (ch); checksum += ch; index++; } else ret.append (character); } length--; if (0 == length) { index = 0; state = STATE_CHECKSUM; } break; case STATE_ESCAPE: found = false; for (int i = 0; i < mEscapes.length; i++) if (character == mEscapes[i]) { found = true; character = mEscaped[i]; } if (!found) throw new ParserException ("unexpected escape character: " + (int)character + " ('" + character + "')"); ret.append (character); checksum += character; index++; state = STATE_DECODE; length--; if (0 == length) { index = 0; state = STATE_CHECKSUM; } break; case STATE_CHECKSUM: buffer[substate_checksum] = character; substate_checksum++; if (substate_checksum >= buffer.length) { long check = decodeBase64 (buffer); if (check != checksum) throw new ParserException ("incorrect checksum, expected " + check + ", calculated " + checksum); checksum = 0; substate_checksum = 0; state = STATE_FINAL; } break; case STATE_FINAL: if (character == mTrailer[substate_final]) substate_final++; else throw new ParserException ("illegal character encountered: " + (int)character + " ('" + character + "')"); if (substate_final >= mTrailer.length) { substate_final = 0; state = LAST_STATE; } break; default: throw new ParserException ("invalid state: " + state); } } return (ret.toString ()); } // /** // * Example mainline for decrypting script. // * Change a file with encrypted script into one without. // * <em>WARNING: This does not preserve DOS type line endings.</em> // * @param args Command line arguments. Two file names, input and output. // * Optionally, the character set to use as a third argument. // * @exception IOException If the input file doesn't exist, or the output // * file cannot be created. 
// * @exception ParserException If there is a decryption problem. // */ // public static void main (String[] args) // throws // IOException, // ParserException // { // String charset; // FileInputStream in; // Page page; // Cursor cursor; // String string; // int ret; // // if (args.length < 2) // { // System.out.println ("Usage: java org.htmlparser.scanners.ScriptDecoder <infile> <outfile> [charset]"); // ret = 1; // } // else // { // if (2 < args.length) // charset = args[2]; // else // charset = "ISO-8859-1"; // in = new FileInputStream (args[0]); // page = new Page (in, charset); // cursor = new Cursor (page, 0); // ScriptDecoder.LAST_STATE = STATE_INITIAL; // string = ScriptDecoder.Decode (page, cursor); // in.close (); // // FileOutputStream outfile = new FileOutputStream (args[1]); // outfile.write (string.getBytes (charset)); // outfile.close (); // ret = (0 != string.length ()) ? 0 : 1; // } // // System.exit (ret); // } } Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.55 retrieving revision 1.56 diff -C2 -d -r1.55 -r1.56 *** ScriptScanner.java 14 Jan 2004 02:53:46 -0000 1.55 --- ScriptScanner.java 28 Feb 2004 15:52:43 -0000 1.56 *************** *** 33,39 **** --- 33,42 ---- import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; + import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.nodes.NodeFactory; + import org.htmlparser.scanners.ScriptDecoder; import org.htmlparser.tags.CompositeTag; + import org.htmlparser.tags.ScriptTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeList; *************** *** 68,71 **** --- 71,75 ---- throws ParserException { + String language; Node node; boolean done; *************** *** 80,83 **** --- 84,100 ---- end = null; factory = lexer.getNodeFactory (); + if (tag instanceof ScriptTag) + { + 
language = ((ScriptTag)tag).getLanguage (); + if ((null != language) && + (language.equalsIgnoreCase ("JScript.Encode") || + language.equalsIgnoreCase ("VBScript.Encode"))) + { + int start = lexer.getPosition (); + String code = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ()); + ((ScriptTag)tag).setScriptCode (code); + last = (StringNode)factory.createStringNode (lexer.getPage (), start, lexer.getPosition ()); + } + } lexer.setNodeFactory (new PrototypicalNodeFactory (true)); try |
From: <der...@us...> - 2004-02-28 16:10:08
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv977/tags Modified Files: ScriptTag.java Tag.java Log Message: Fix bug #902121 StringBean throws NullPointerException. Added ScriptDecoder to handle Microsoft Script Encoder encrypted tags. Added accessor to ScriptTag's scriptCode property to be able to override it. Ensured that a Tag always has a non-null name. Skip STYLE tags in StringBean, just like SCRIPT. Index: ScriptTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ScriptTag.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** ScriptTag.java 2 Jan 2004 16:24:55 -0000 1.35 --- ScriptTag.java 28 Feb 2004 15:52:43 -0000 1.36 *************** *** 45,48 **** --- 45,53 ---- /** + * Script code if different from the page contents. + */ + protected String mCode; + + /** * Create a new script tag. */ *************** *** 79,87 **** /** ! * Get the contents of the tag's children. */ ! public String getScriptCode() { ! return (getChildrenHTML ()); } --- 84,111 ---- /** ! * Get the script code. ! * Normally this is the contents of the children, but in the rare case that ! * the script is encoded, this is the plaintext decrypted code. ! * @return The plaintext or overridden code contents of the tag. */ ! public String getScriptCode () { ! String ret; ! ! if (null != mCode) ! ret = mCode; ! else ! ret = getChildrenHTML (); ! ! return (ret); ! } ! ! /** ! * Set the code contents. ! * @param code The new code contents of this tag. ! */ ! public void setScriptCode (String code) ! { ! mCode = code; } *************** *** 113,116 **** --- 137,164 ---- /** + * Render the tag as HTML. + * @return The tag as an HTML fragment. 
+ * @see org.htmlparser.Node#toHtml() + */ + public String toHtml() + { + StringBuffer ret; + + ret = new StringBuffer (); + ret.append (super.toHtml ()); + if (!isEmptyXmlTag ()) + { + if (null != getScriptCode ()) + ret.append (getScriptCode ()); + else + putChildrenInto (ret); + if (null != getEndTag ()) + putEndTagInto (ret); + } + + return (ret.toString()); + } + + /** * Print the contents of the script tag. */ Index: Tag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/Tag.java,v retrieving revision 1.61 retrieving revision 1.62 diff -C2 -d -r1.61 -r1.62 *** Tag.java 14 Jan 2004 02:53:46 -0000 1.61 --- Tag.java 28 Feb 2004 15:52:43 -0000 1.62 *************** *** 64,67 **** --- 64,69 ---- if ((null != names) && (0 != names.length)) setTagName (names[0]); + else + setTagName (""); // make sure it's not null setThisScanner (mDefaultScanner); } *************** *** 71,74 **** --- 73,78 ---- super (node.getPage (), node.getTagBegin (), node.getTagEnd (), node.getAttributesEx ()); mScanner = scanner; + if (null == getTagName ()) + setTagName (""); // make sure it's not null } *************** *** 77,80 **** --- 81,86 ---- super (page, start, end, attributes); mScanner = null; + if (null == getTagName ()) + setTagName (""); // make sure it's not null } |
From: <der...@us...> - 2004-02-28 16:10:06
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv977/beans Modified Files: StringBean.java Log Message: Fix bug #902121 StringBean throws NullPointerException. Added ScriptDecoder to handle Microsoft Script Encoder encrypted tags. Added accessor to ScriptTag's scriptCode property to be able to override it. Ensured that a Tag always has a non-null name. Skip STYLE tags in StringBean, just like SCRIPT. Index: StringBean.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** StringBean.java 11 Feb 2004 12:37:59 -0000 1.37 --- StringBean.java 28 Feb 2004 15:52:42 -0000 1.38 *************** *** 162,165 **** --- 162,170 ---- /** + * Set <code>true</code> when traversing a STYLE tag. + */ + protected boolean mIsStyle; + + /** * Create a StringBean object. * Default property values are set to 'do the right thing': *************** *** 185,188 **** --- 190,194 ---- mIsScript = false; mIsPre = false; + mIsStyle = false; } *************** *** 322,325 **** --- 328,332 ---- mIsPre = false; mIsScript = false; + mIsStyle = false; try { // try again with the encoding now in force *************** *** 616,620 **** public void visitStringNode (StringNode string) { ! if (!mIsScript) { String text = string.getText (); --- 623,627 ---- public void visitStringNode (StringNode string) { ! if (!mIsScript && !mIsStyle) { String text = string.getText (); *************** *** 647,650 **** --- 654,659 ---- else if (name.equalsIgnoreCase ("SCRIPT")) mIsScript = true; + else if (name.equalsIgnoreCase ("STYLE")) + mIsStyle = true; if (tag.breaksFlow ()) carriage_return (); *************** *** 664,667 **** --- 673,678 ---- else if (name.equalsIgnoreCase ("SCRIPT")) mIsScript = false; + else if (name.equalsIgnoreCase ("STYLE")) + mIsStyle = false; } |
From: <der...@us...> - 2004-02-18 12:43:36
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27481/lexer Modified Files: Lexer.java Log Message: Fix bug #899413 bug in javascript end detection. Patch submitted by Gernot Fricke handles escaped quotes in strings when lexing with smartquote turned on. Added test case in LexerTests. Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** Lexer.java 7 Feb 2004 12:53:09 -0000 1.26 --- Lexer.java 18 Feb 2004 12:34:04 -0000 1.27 *************** *** 401,404 **** --- 401,413 ---- else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch))) quote = ch; // enter quoted state + // patch contributed by Gernot Fricke to handle escaped closing quote + else if (quotesmart && (0 != quote) && ('\\' == ch)) + { + ch = mPage.getCharacter (cursor); //try to consume escaped character + if ( (ch != '\\') // escaped backslash + && (ch != quote)) // escaped quote character + // ( reflects ["] or ['] whichever opened the quotation) + cursor.retreat(); // unconsume char if character was not an escapable char. + } else if (quotesmart && (ch == quote)) quote = 0; // exit quoted state |
From: <der...@us...> - 2004-02-18 12:43:36
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27481/tests/lexerTests Modified Files: LexerTests.java Log Message: Fix bug #899413 bug in javascript end detection. Patch submitted by Gernot Fricke handles escaped quotes in strings when lexing with smartquote turned on. Added test case in LexerTests. Index: LexerTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v retrieving revision 1.18 retrieving revision 1.19 diff -C2 -d -r1.18 -r1.19 *** LexerTests.java 24 Jan 2004 17:14:20 -0000 1.18 --- LexerTests.java 18 Feb 2004 12:34:04 -0000 1.19 *************** *** 789,792 **** --- 789,815 ---- assertNull ("too many nodes", lexer.nextNode ()); } + + /** + * See bug #899413 bug in javascript end detection. + */ + public void testEscapedQuote () throws ParserException + { + String string; + String html; + Lexer lexer; + Node node; + + string = "\na='\\'';\n"; + html = string + "</script>"; + lexer = new Lexer (html); + node = lexer.nextNode (true); + if (node == null) + fail ("too few nodes"); + else + assertStringEquals ("bad string", string, node.toHtml()); + assertNotNull ("too few nodes", lexer.nextNode (true)); + assertNull ("too many nodes", lexer.nextNode (true)); + } + } |
From: <der...@us...> - 2004-02-16 22:54:28
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25002/docs Modified Files: changes.txt contributors.html release.txt Log Message: Update version to 1.4-20040216. Index: changes.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/changes.txt,v retrieving revision 1.196 retrieving revision 1.197 diff -C2 -d -r1.196 -r1.197 *** changes.txt 26 Jan 2004 01:01:56 -0000 1.196 --- changes.txt 16 Feb 2004 22:46:07 -0000 1.197 *************** *** 13,16 **** --- 13,77 ---- ******************************************************************************* + Integration Build 1.4 - 20040216 + -------------------------------- + + 2004-02-11 07:37 derrickoswald + + * docs/contributors.html, src/org/htmlparser/beans/StringBean.java: + + Incorporate patch from Nick Burch to make StringBean a NodeVisistor for other parsers. + See task #93155 StringBean driven by visitor. + + 2004-02-08 21:09 derrickoswald + + * build.xml, src/org/htmlparser/lexer/nodes/Attribute.java, + src/org/htmlparser/lexer/nodes/TagNode.java, + src/org/htmlparser/tests/tagTests/TagTest.java, + src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java, + bin/translate, bin/translate.bat, + src/org/htmlparser/util/CharacterReference.java, + src/org/htmlparser/util/Generate.java, + src/org/htmlparser/util/Translate.java, + src/org/htmlparser/util/package.html: + + Rework character entity translation. + See task 58599 enhance character reference translation. + Decode now handles missing semi colons, encoding is more efficient, + hexadecimal numeric character entity references are handled and + both encoding and decoding make minimal use of substring(). + Augmented the tests in CharacterTranslationTest significantly, and + merged the Generate class into the tests. + Added translate command scripts in bin, which read from stdin and write to stdout. 
+ + 2004-02-07 07:53 derrickoswald + + * src/org/htmlparser/: lexer/Lexer.java, + tests/lexerTests/AttributeTests.java: + + Fix bug #891058 Bug in lexer. + Patch submitted by Gernot Fricke. + This change causes attribute parsing to be more 'greedy' resulting in 'empty' attributes + consuming the next attribute. This brings the lexer parsing more in line with other + (browser) interpretations and simplifies it immensely. + + 2004-01-31 15:51 derrickoswald + + * src/org/htmlparser/lexer/Page.java: + + Compare encoding names without case sensitivity. + From HTML spec (http://www.w3.org/TR/html4/charset.html section 5.2.1): + Names for character encodings are case-insensitive, so that for + example "SHIFT_JIS", "Shift_JIS", and "shift_jis" are equivalent. + and from to IANA(http://www.iana.org/assignments/character-sets): + The character set names may be up to 40 characters taken from the + printable characters of US-ASCII. However, no distinction is made + between use of upper and lower case letters. + + 2004-01-31 11:31 derrickoswald + + * src/doc-files/: overview.html, todo.html: + + Move ToDo list to SourceForge trackers and tasks. + Integration Build 1.4 - 20040125 -------------------------------- Index: contributors.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/contributors.html,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** contributors.html 11 Feb 2004 12:37:52 -0000 1.5 --- contributors.html 16 Feb 2004 22:46:08 -0000 1.6 *************** *** 353,360 **** </tr> </table> ! <p>Thanks to Stephen Harrington, Domenico Lordi, Kamen, John Zook, Nick Burch, ! Cheng Jun, Mazlan Mat, Rob Shields, Wolfgang Germund, Raj Sharma, Robert Kausch, ! Gordon Deudney, Serge Kruppa, Roger Kjensrud, Rodney S Foley and Manpreet Singh ! for suggestions, bug reports and feature ideas. <br> </body> --- 353,360 ---- </tr> </table> ! 
<p>Thanks to Gernot Fricke, Nick Burch, Stephen Harrington, Domenico Lordi, Kamen, ! John Zook, Cheng Jun, Mazlan Mat, Rob Shields, Wolfgang Germund, Raj Sharma, ! Robert Kausch, Gordon Deudney, Serge Kruppa, Roger Kjensrud, Rodney S Foley ! and Manpreet Singh for suggestions, bug reports and feature ideas. <br> </body> Index: release.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/release.txt,v retrieving revision 1.55 retrieving revision 1.56 diff -C2 -d -r1.55 -r1.56 *** release.txt 26 Jan 2004 01:02:09 -0000 1.55 --- release.txt 16 Feb 2004 22:46:08 -0000 1.56 *************** *** 1,3 **** ! HTMLParser Version 1.4 (Integration Build Jan 25, 2004) ********************************************* --- 1,3 ---- ! HTMLParser Version 1.4 (Integration Build Feb 16, 2004) ********************************************* *************** *** 21,24 **** --- 21,29 ---- Changes since Version 1.3 ------------------------- + Translation + Character entity encoding and decoding has been revamped, leading to + higher throughput and less memory churn. + Beans + The StringBean can now be used as a visitor for parsers external to the bean. Decorators The node decorator package has been added to provide support for the *************** *** 57,63 **** --- 62,71 ---- Applications New example applications Thumbelina and SiteCapturer. + A mainline has been added to the Translate class to encode/decode stdin to + stdout. Bug Fixes --------- + 891058 Bug in lexer 865279 Documentation 851882 zero length alt tag causes bug in ImageScanner *************** *** 121,124 **** --- 129,135 ---- [26] Stephen Nightingale [27] Donnla Nic Gearailt + [28] Pim Schrama + [29] Nick Burch + [30] Gernot Fricke If you find any bugs, please go to |
From: <der...@us...> - 2004-02-16 22:54:27
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv25002/src/org/htmlparser Modified Files: Parser.java Log Message: Update version to 1.4-20040216. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.86 retrieving revision 1.87 diff -C2 -d -r1.86 -r1.87 *** Parser.java 26 Jan 2004 01:02:10 -0000 1.86 --- Parser.java 16 Feb 2004 22:46:08 -0000 1.87 *************** *** 88,92 **** */ public final static String ! VERSION_DATE = "Jan 25, 2004" ; --- 88,92 ---- */ public final static String ! VERSION_DATE = "Feb 16, 2004" ; |
From: <der...@us...> - 2004-02-11 12:42:07
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3754/src/org/htmlparser/beans Modified Files: StringBean.java Log Message: Incorporate patch from Nick Burch to make StringBean a NodeVisitor for other parsers. See task #93155 StringBean driven by visitor. Index: StringBean.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** StringBean.java 10 Jan 2004 15:23:33 -0000 1.36 --- StringBean.java 11 Feb 2004 12:37:59 -0000 1.37 *************** *** 55,60 **** * String s = sb.getStrings (); * </pre> ! * @author Derrick Oswald ! * Created on December 23, 2002, 5:01 PM */ public class StringBean extends NodeVisitor implements Serializable --- 55,74 ---- * String s = sb.getStrings (); * </pre> ! * You can also use the StringBean as a NodeVisitor on your own parser, ! * in which case you have to refetch your page if you change one of the ! * properties because it resets the Strings property:</p> ! * <pre> ! * StringBean sb = new StringBean (); ! * Parser parser = new Parser ("http://cbc.ca"); ! * parser.visitAllNodesWith (sb); ! * String s = sb.getStrings (); ! * sb.setLinks (true); ! * parser.reset (); ! * parser.visitAllNodesWith (sb); ! * String sl = sb.getStrings (); ! * </pre> ! * According to Nick Burch, who contributed the patch, this is handy if you ! * don't want StringBean to wander off and get the content itself, either ! * because you already have it, it's not on a website etc. 
*/ public class StringBean extends NodeVisitor implements Serializable *************** *** 168,171 **** --- 182,188 ---- mReplaceSpace = true; mCollapse = true; + mBuffer = new StringBuffer (4096); + mIsScript = false; + mIsPre = false; } *************** *** 259,268 **** String ret; - mIsPre = false; - mIsScript = false; - mBuffer = new StringBuffer (4096); mParser.visitAllNodesWith (this); ret = mBuffer.toString (); ! mBuffer = null; return (ret); --- 276,282 ---- String ret; mParser.visitAllNodesWith (this); ret = mBuffer.toString (); ! mBuffer = new StringBuffer(4096); return (ret); *************** *** 294,302 **** try { - mIsPre = false; - mIsScript = false; try { - mBuffer = new StringBuffer (4096); mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); --- 308,313 ---- *************** *** 304,308 **** finally { ! mBuffer = null; } } --- 315,319 ---- finally { ! mBuffer = new StringBuffer (4096); } } *************** *** 331,334 **** --- 342,352 ---- updateStrings (pe.toString ()); } + else + { + // reset in case this StringBean is used as a visitor + // on another parser, not it's own + mStrings = null; + mBuffer = new StringBuffer (4096); + } } *************** *** 388,392 **** { if (null == mStrings) ! setStrings (); return (mStrings); --- 406,413 ---- { if (null == mStrings) ! if (0 == mBuffer.length ()) ! setStrings (); ! else ! updateStrings (mBuffer.toString ()); return (mStrings); |
From: <der...@us...> - 2004-02-11 12:42:07
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3754/docs Modified Files: contributors.html Log Message: Incorporate patch from Nick Burch to make StringBean a NodeVisitor for other parsers. See task #93155 StringBean driven by visitor. Index: contributors.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/contributors.html,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** contributors.html 4 Jan 2004 03:23:08 -0000 1.4 --- contributors.html 11 Feb 2004 12:37:52 -0000 1.5 *************** *** 353,357 **** </tr> </table> ! <p>Thanks to Stephen Harrington, Domenico Lordi, Kamen, John Zook, Cheng Jun, Mazlan Mat, Rob Shields, Wolfgang Germund, Raj Sharma, Robert Kausch, Gordon Deudney, Serge Kruppa, Roger Kjensrud, Rodney S Foley and Manpreet Singh --- 353,357 ---- </tr> </table> ! <p>Thanks to Stephen Harrington, Domenico Lordi, Kamen, John Zook, Nick Burch, Cheng Jun, Mazlan Mat, Rob Shields, Wolfgang Germund, Raj Sharma, Robert Kausch, Gordon Deudney, Serge Kruppa, Roger Kjensrud, Rodney S Foley and Manpreet Singh |
From: <der...@us...> - 2004-02-09 02:12:56
|
Update of /cvsroot/htmlparser/htmlparser/bin In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9169/bin Added Files: translate translate.bat Log Message: Rework character entity translation. See task 58599 enhance character reference translation. Decode now handles missing semi colons, encoding is more efficient, hexadecimal numeric character entity references are handled and both encoding and decoding make minimal use of substring(). Augmented the tests in CharacterTranslationTest significantly, and merged the Generate class into the tests. Added translate command scripts in bin, which read from stdin and write to stdout. --- NEW FILE: translate --- #! /bin/sh if [ -z "$HTMLPARSER_HOME" ] ; then ## resolve links - $0 may be a link to the home PRG="$0" progname=`basename "$0"` saveddir=`pwd` # need this for relative symlinks dirname_prg=`dirname "$PRG"` cd "$dirname_prg" while [ -h "$PRG" ] ; do ls=`ls -ld "$PRG"` link=`expr "$ls" : '.*-> \(.*\)$'` if expr "$link" : '/.*' > /dev/null; then PRG="$link" else PRG=`dirname "$PRG"`"/$link" fi done HTMLPARSER_HOME=`dirname "$PRG"`/.. cd "$saveddir" # make it fully qualified HTMLPARSER_HOME=`cd "$HTMLPARSER_HOME" && pwd` fi if [ -z "$JAVACMD" ] ; then if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables JAVACMD="$JAVA_HOME/jre/sh/java" else JAVACMD="$JAVA_HOME/bin/java" fi else JAVACMD=`which java 2> /dev/null ` if [ -z "$JAVACMD" ] ; then JAVACMD=java fi fi fi if [ ! -x "$JAVACMD" ] ; then echo "Error: JAVA_HOME is not defined correctly." 
echo " We cannot execute $JAVACMD" exit 1 fi if [ -n "$CLASSPATH" ] ; then LOCALCLASSPATH="$CLASSPATH" fi HTMLPARSER_LIB="${HTMLPARSER_HOME}/lib" # add in the parser .jar file if [ -z "$LOCALCLASSPATH" ] ; then LOCALCLASSPATH="${HTMLPARSER_LIB}/htmlparser.jar" else LOCALCLASSPATH="${HTMLPARSER_LIB}/htmlparser.jar":"$LOCALCLASSPATH" fi # handle 1.1x JDKs if [ -n "$JAVA_HOME" ] ; then if [ -f "$JAVA_HOME/lib/classes.zip" ] ; then LOCALCLASSPATH="$LOCALCLASSPATH:$JAVA_HOME/lib/classes.zip" fi fi "$JAVACMD" -classpath "$LOCALCLASSPATH" org.htmlparser.util.Translate "$@" --- NEW FILE: translate.bat --- java -classpath ..\lib\htmlparser.jar org.htmlparser.util.Translate %1 %2 |
From: <der...@us...> - 2004-02-09 02:12:55
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9169/src/org/htmlparser/tests/utilTests Modified Files: CharacterTranslationTest.java Log Message: Rework character entity translation. See task 58599 enhance character reference translation. Decode now handles missing semi colons, encoding is more efficient, hexadecimal numeric character entity references are handled and both encoding and decoding make minimal use of substring(). Augmented the tests in CharacterTranslationTest significantly, and merged the Generate class into the tests. Added translate command scripts in bin, which read from stdin and write to stdout. Index: CharacterTranslationTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** CharacterTranslationTest.java 14 Jan 2004 03:20:01 -0000 1.41 --- CharacterTranslationTest.java 9 Feb 2004 02:09:44 -0000 1.42 *************** *** 29,41 **** import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; import java.net.URL; import java.net.URLConnection; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.util.Translate; [...1507 lines suppressed...] ! stimulus.append (character); ! response.append (">"); ! } ! else ! { ! stimulus.append (character); ! response.append (character); ! } ! } ! } ! } ! string = Translate.decode (response.toString ()); ! if (!string.equals (stimulus.toString ())) ! fail ("decoding incorrect:\nexpected \"" + stimulus.toString () + "\"\n decoded \"" + string + "\"\n encoded \"" + response.toString () + "\""); ! stimulus.setLength (0); ! response.setLength (0); ! } ! } } |
From: <der...@us...> - 2004-02-09 02:12:55
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9169/src/org/htmlparser/tests/tagTests Modified Files: TagTest.java Log Message: Rework character entity translation. See task 58599 enhance character reference translation. Decode now handles missing semi colons, encoding is more efficient, hexadecimal numeric character entity references are handled and both encoding and decoding make minimal use of substring(). Augmented the tests in CharacterTranslationTest significantly, and merged the Generate class into the tests. Added translate command scripts in bin, which read from stdin and write to stdout. Index: TagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/TagTest.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** TagTest.java 2 Jan 2004 16:24:57 -0000 1.56 --- TagTest.java 9 Feb 2004 02:09:44 -0000 1.57 *************** *** 368,372 **** assertEquals("font sans-serif parameter","sans-serif",table.get("SANS-SERIF")); // an alternate interpretation: assertEquals("font face parameter","Arial,helvetica,",table.get("FACE")); ! assertEquals("font face parameter","Arial,\"helvetica,",table.get("FACE")); } --- 368,373 ---- assertEquals("font sans-serif parameter","sans-serif",table.get("SANS-SERIF")); // an alternate interpretation: assertEquals("font face parameter","Arial,helvetica,",table.get("FACE")); ! // another: assertEquals("font face parameter","Arial,\"helvetica,",table.get("FACE")); ! assertEquals("font face parameter","Arial,",table.get("FACE")); } |
From: <der...@us...> - 2004-02-09 02:12:55
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9169/src/org/htmlparser/lexer/nodes Modified Files: Attribute.java TagNode.java Log Message: Rework character entity translation. See task 58599 enhance character reference translation. Decode now handles missing semi colons, encoding is more efficient, hexadecimal numeric character entity references are handled and both encoding and decoding make minimal use of substring(). Augmented the tests in CharacterTranslationTest significantly, and merged the Generate class into the tests. Added translate command scripts in bin, which read from stdin and write to stdout. Index: Attribute.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/Attribute.java,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** Attribute.java 2 Jan 2004 16:24:53 -0000 1.17 --- Attribute.java 9 Feb 2004 02:09:44 -0000 1.18 *************** *** 580,584 **** // references, so convert all double quotes into " quote = '"'; ! ref = Translate.convertToString (quote); // JDK 1.4: value = value.replaceAll ("\"", ref); buffer = new StringBuffer (value.length() * 5); --- 580,584 ---- // references, so convert all double quotes into " quote = '"'; ! ref = Translate.encode (quote); // JDK 1.4: value = value.replaceAll ("\"", ref); buffer = new StringBuffer (value.length() * 5); Index: TagNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/TagNode.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** TagNode.java 25 Jan 2004 21:32:59 -0000 1.29 --- TagNode.java 9 Feb 2004 02:09:44 -0000 1.30 *************** *** 186,190 **** // convert all double quotes into " quote = '"'; ! 
ref = Translate.convertToString (quote); // JDK 1.4: value = value.replaceAll ("\"", ref); buffer = new StringBuffer (value.length() * 5); --- 186,190 ---- // convert all double quotes into " quote = '"'; ! ref = Translate.encode (quote); // JDK 1.4: value = value.replaceAll ("\"", ref); buffer = new StringBuffer (value.length() * 5); |
From: <der...@us...> - 2004-02-09 02:12:55
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9169/src/org/htmlparser/util Modified Files: Translate.java package.html Added Files: CharacterReference.java Removed Files: Generate.java Log Message: Rework character entity translation. See task 58599 enhance character reference translation. Decode now handles missing semi colons, encoding is more efficient, hexadecimal numeric character entity references are handled and both encoding and decoding make minimal use of substring(). Augmented the tests in CharacterTranslationTest significantly, and merged the Generate class into the tests. Added translate command scripts in bin, which read from stdin and write to stdout. --- NEW FILE: CharacterReference.java --- /* * CharacterReference.java * * Created on February 5, 2004, 9:40 PM */ package org.htmlparser.util; import java.io.Serializable; import org.htmlparser.util.sort.Ordered; /** * Structure to hold a character and it's equivalent entity reference kernel. * For the character reference &copy; the character would be '©' and * the kernel would be "copy", for example.<p> * Character references are described at <a href="Character references">http://www.w3.org/TR/REC-html40/charset.html#entities</a> * Supports the Ordered interface so it's easy to create a list sorted by * kernel, to perform binary searches on.<p> */ public class CharacterReference implements Serializable, Cloneable, Ordered { /** * The character value as an integer. */ protected int mCharacter; /** * This entity reference kernel. * The text between the ampersand and the semicolon. */ protected String mKernel; /** * Construct a <code>CharacterReference</code> with the character and kernel given. * @param kernel The kernel in the equivalent character entity reference. * @param character The character needing encoding. 
*/ public CharacterReference (String kernel, int character) { mKernel = kernel; mCharacter = character; if (null == mKernel) mKernel = ""; } /** * Get this CharacterReference's kernel. * @return The kernel in the equivalent character entity reference. */ public String getKernel () { return (mKernel); } /** * Set this CharacterReference's kernel. * This is used to avoid creating a new object to perform a binary search. * @param kernel The kernel in the equivalent character entity reference. */ void setKernel (String kernel) { mKernel = kernel; } /** * Get the character needing translation. * @return The character. */ public int getCharacter () { return (mCharacter); } /** * Set the character. * This is used to avoid creating a new object to perform a binary search. * @param character The character needing translation. */ void setCharacter (int character) { mCharacter = character; } /** * Visualize this character reference as a string. * @return A string with the character and kernel. */ public String toString () { String hex; StringBuffer ret; ret = new StringBuffer (6 + 8 + 2); // max 8 in string hex = Integer.toHexString ((int)getCharacter ()); ret.append ("\\u"); for (int i = hex.length (); i < 4; i++) ret.append ("0"); ret.append (hex); ret.append ("["); ret.append (getKernel ()); ret.append ("]"); return (ret.toString ()); } // // Ordered interface // /** * Compare one reference to another. 
* @see org.htmlparser.util.sort.Ordered */ public int compare (Object that) { CharacterReference r; r = (CharacterReference)that; return (getKernel ().compareTo (r.getKernel ())); } } Index: Translate.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Translate.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** Translate.java 2 Jan 2004 16:24:58 -0000 1.42 --- Translate.java 9 Feb 2004 02:09:45 -0000 1.43 *************** *** 27,57 **** package org.htmlparser.util; import java.util.HashMap; import java.util.Iterator; import java.util.Map; /** * Translate numeric character references and character entity references to unicode characters. * Based on tables found at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html"> * http://www.w3.org/TR/REC-html40/sgml/entities.html</a> [...1684 lines suppressed...] + * Numeric character reference and character entity reference to unicode codec. + * Translate the <code>System.in</code> input into an encoded or decoded + * stream and send the results to <code>System.out</code>. + * @param args If arg[0] is <code>-encode</code> perform an encoding on + * <code>System.in</code>, otherwise perform a decoding. + */ + public static void main (String[] args) + { + boolean encode; + + if (0 < args.length && args[0].equalsIgnoreCase ("-encode")) + encode = true; + else + encode = false; + if (encode) + encode (System.in, System.out); + else + decode (System.in, System.out); + } } Index: package.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/package.html,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** package.html 2 Jan 2004 16:24:58 -0000 1.19 --- package.html 9 Feb 2004 02:09:45 -0000 1.20 *************** *** 29,46 **** --> </head> ! <body bgcolor="white"> ! 
The util package is intended for holding utility classes that dont directly help with the parsing, ! but can take responsibilities out from some classes. Resuable code which can be reused by many classes, should be located ! in this package. ! ! <h2>Related Documentation</h2> ! ! For overviews, tutorials, examples, guides, and tool documentation, please see: ! <ul> ! <li><a href="http://htmlparser.sourceforge.net">HTML Parser Home Page</a> ! </ul> ! ! <!-- Put @see and @since tags down here. --> ! </body> </html> --- 29,36 ---- --> </head> ! <body> ! Code which can be reused by many classes, is located in this package. ! The util package is intended for holding utility classes that don't directly ! help with parsing, but can take responsibilities out of some classes. </body> </html> --- Generate.java DELETED --- |