htmlparser-cvs Mailing List for HTML Parser (Page 2)
Brought to you by:
derrickoswald
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(141) |
Jun
(108) |
Jul
(66) |
Aug
(127) |
Sep
(155) |
Oct
(149) |
Nov
(72) |
Dec
(72) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(100) |
Feb
(36) |
Mar
(21) |
Apr
(3) |
May
(87) |
Jun
(28) |
Jul
(84) |
Aug
(5) |
Sep
(14) |
Oct
|
Nov
|
Dec
|
2005 |
Jan
(1) |
Feb
(39) |
Mar
(26) |
Apr
(38) |
May
(14) |
Jun
(10) |
Jul
|
Aug
|
Sep
(13) |
Oct
(8) |
Nov
(10) |
Dec
|
2006 |
Jan
|
Feb
(1) |
Mar
(17) |
Apr
(20) |
May
(28) |
Jun
(24) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2015 |
Jan
|
Feb
|
Mar
(1) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: Derrick O. <der...@us...> - 2006-05-31 02:10:21
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodeDecorators In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27353/nodeDecorators Modified Files: AbstractNodeDecorator.java Log Message: implement task #93148 toHtml(boolean verbatim) To avoid printing generated end tags use toHtml(true). Index: AbstractNodeDecorator.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodeDecorators/AbstractNodeDecorator.java,v retrieving revision 1.24 retrieving revision 1.25 diff -C2 -d -r1.24 -r1.25 *** AbstractNodeDecorator.java 1 Nov 2005 08:55:24 -0000 1.24 --- AbstractNodeDecorator.java 31 May 2006 02:10:15 -0000 1.25 *************** *** 269,272 **** --- 269,276 ---- } + public String toHtml(boolean verbatim) { + return delegate.toHtml(verbatim); + } + public String toPlainTextString() { return delegate.toPlainTextString(); |
From: Derrick O. <der...@us...> - 2006-05-31 02:10:20
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27353/tags Modified Files: CompositeTag.java ScriptTag.java Log Message: implement task #93148 toHtml(boolean verbatim) To avoid printing generated end tags use toHtml(true). Index: ScriptTag.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ScriptTag.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** ScriptTag.java 10 Apr 2005 23:20:45 -0000 1.38 --- ScriptTag.java 31 May 2006 02:10:15 -0000 1.39 *************** *** 142,148 **** /** * Places the script contents into the provided buffer. * @param sb The buffer to add the script to. */ ! protected void putChildrenInto (StringBuffer sb) { Node node; --- 142,150 ---- /** * Places the script contents into the provided buffer. + * @param verbatim If <code>true</code> return as close to the original + * page text as possible. * @param sb The buffer to add the script to. */ ! protected void putChildrenInto (StringBuffer sb, boolean verbatim) { Node node; *************** *** 155,160 **** node = e.nextNode (); // eliminate virtual tags ! // if (!(node.getStartPosition () == node.getEndPosition ())) ! sb.append (node.toHtml ()); } } --- 157,162 ---- node = e.nextNode (); // eliminate virtual tags ! if (!verbatim || !(node.getStartPosition () == node.getEndPosition ())) ! sb.append (node.toHtml (verbatim)); } } Index: CompositeTag.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v retrieving revision 1.81 retrieving revision 1.82 diff -C2 -d -r1.81 -r1.82 *** CompositeTag.java 20 Jun 2005 01:56:32 -0000 1.81 --- CompositeTag.java 31 May 2006 02:10:15 -0000 1.82 *************** *** 143,149 **** /** * Add the textual contents of the children of this node to the buffer. 
* @param sb The buffer to append to. */ ! protected void putChildrenInto(StringBuffer sb) { Node node; --- 143,151 ---- /** * Add the textual contents of the children of this node to the buffer. + * @param verbatim If <code>true</code> return as close to the original + * page text as possible. * @param sb The buffer to append to. */ ! protected void putChildrenInto (StringBuffer sb, boolean verbatim) { Node node; *************** *** 152,156 **** node = e.nextNode (); // eliminate virtual tags ! // if (!(node.getStartPosition () == node.getEndPosition ())) sb.append (node.toHtml ()); } --- 154,158 ---- node = e.nextNode (); // eliminate virtual tags ! if (!verbatim || !(node.getStartPosition () == node.getEndPosition ())) sb.append (node.toHtml ()); } *************** *** 159,186 **** /** * Add the textual contents of the end tag of this node to the buffer. * @param sb The buffer to append to. */ ! protected void putEndTagInto(StringBuffer sb) { // eliminate virtual tags ! // if (!(endTag.getStartPosition () == endTag.getEndPosition ())) ! sb.append(getEndTag ().toHtml()); } /** * Return this tag as HTML code. * @return This tag and it's contents (children) and the end tag * as HTML code. */ ! public String toHtml() { ! StringBuffer sb = new StringBuffer(); ! sb.append (super.toHtml ()); ! if (!isEmptyXmlTag()) { ! putChildrenInto(sb); if (null != getEndTag ()) ! putEndTagInto(sb); } ! return sb.toString(); } --- 161,195 ---- /** * Add the textual contents of the end tag of this node to the buffer. + * @param verbatim If <code>true</code> return as close to the original + * page text as possible. * @param sb The buffer to append to. */ ! protected void putEndTagInto (StringBuffer sb, boolean verbatim) { // eliminate virtual tags ! if (!verbatim || !(mEndTag.getStartPosition () == mEndTag.getEndPosition ())) ! sb.append (getEndTag ().toHtml()); } /** * Return this tag as HTML code. 
+ * @param verbatim If <code>true</code> return as close to the original + * page text as possible. * @return This tag and it's contents (children) and the end tag * as HTML code. */ ! public String toHtml (boolean verbatim) ! { ! StringBuffer ret; ! ! ret = new StringBuffer (); ! ret.append (super.toHtml (verbatim)); ! if (!isEmptyXmlTag ()) { ! putChildrenInto (ret, verbatim); if (null != getEndTag ()) ! putEndTagInto (ret, verbatim); } ! return (ret.toString ()); } *************** *** 566,570 **** String ret; ! ret = super.toHtml (); ret = ret.substring (1, ret.length () - 1); --- 575,579 ---- String ret; ! ret = super.toHtml (true); // need TagNode.toHtml(boolean) ret = ret.substring (1, ret.length () - 1); |
From: Derrick O. <der...@us...> - 2006-05-31 02:10:20
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27353/nodes Modified Files: TagNode.java RemarkNode.java AbstractNode.java TextNode.java Log Message: implement task #93148 toHtml(boolean verbatim) To avoid printing generated end tags use toHtml(true). Index: RemarkNode.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/RemarkNode.java,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** RemarkNode.java 27 May 2006 14:03:52 -0000 1.5 --- RemarkNode.java 31 May 2006 02:10:15 -0000 1.6 *************** *** 120,126 **** /** * Return The full HTML remark. * @return The comment, i.e. {@.html <!-- this is a comment -->}. */ ! public String toHtml () { StringBuffer buffer; --- 120,128 ---- /** * Return The full HTML remark. + * @param verbatim If <code>true</code> return as close to the original + * page text as possible. * @return The comment, i.e. {@.html <!-- this is a comment -->}. */ ! public String toHtml (boolean verbatim) { StringBuffer buffer; Index: TextNode.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/TextNode.java,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** TextNode.java 4 Nov 2005 15:49:45 -0000 1.5 --- TextNode.java 31 May 2006 02:10:15 -0000 1.6 *************** *** 103,109 **** /** * Returns the text of the node. * @return The contents of this text node. */ ! public String toHtml () { String ret; --- 103,111 ---- /** * Returns the text of the node. + * @param verbatim If <code>true</code> return as close to the original + * page text as possible. * @return The contents of this text node. */ ! public String toHtml (boolean verbatim) { String ret; *************** *** 111,115 **** ret = mText; if (null == ret) ! 
ret = mPage.getText (getStartPosition (), getEndPosition ()); return (ret); --- 113,117 ---- ret = mText; if (null == ret) ! ret = mPage.getText (getStartPosition (), getEndPosition ()); return (ret); Index: AbstractNode.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/AbstractNode.java,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** AbstractNode.java 26 Oct 2005 22:01:23 -0000 1.5 --- AbstractNode.java 31 May 2006 02:10:15 -0000 1.6 *************** *** 117,128 **** /** ! * Return the HTML that generated this node. ! * This method will make it easier when using html parser to reproduce html ! * pages (with or without modifications). * Applications reproducing html can use this method on nodes which are to ! * be used or transferred as they were recieved, with the original html. ! * @return The HTML code for this node. */ ! public abstract String toHtml (); /** --- 117,147 ---- /** ! * Return the HTML for this node. ! * This should be the sequence of characters that were encountered by ! * the parser that caused this node to be created. Where this breaks down is ! * where broken nodes (tags and remarks) have been encountered and fixed. * Applications reproducing html can use this method on nodes which are to ! * be used or transferred as they were received or created. ! * @return The sequence of characters that would cause this node ! * to be returned by the parser or lexer. */ ! public String toHtml () ! { ! return (toHtml (false)); ! } ! ! /** ! * Return the HTML for this node. ! * This should be the exact sequence of characters that were encountered by ! * the parser that caused this node to be created. Where this breaks down is ! * where broken nodes (tags and remarks) have been encountered and fixed. ! * Applications reproducing html can use this method on nodes which are to ! * be used or transferred as they were received or created. ! 
* @param verbatim If <code>true</code> return as close to the original ! * page text as possible. ! * @return The (exact) sequence of characters that would cause this node ! * to be returned by the parser or lexer. ! */ ! public abstract String toHtml (boolean verbatim); /** Index: TagNode.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/TagNode.java,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** TagNode.java 10 Apr 2005 23:20:44 -0000 1.6 --- TagNode.java 31 May 2006 02:10:15 -0000 1.7 *************** *** 660,667 **** * Render the tag as HTML. * A call to a tag's <code>toHtml()</code> method will render it in HTML. * @return The tag as an HTML fragment. * @see org.htmlparser.Node#toHtml() */ ! public String toHtml () { int length; --- 660,669 ---- * Render the tag as HTML. * A call to a tag's <code>toHtml()</code> method will render it in HTML. + * @param verbatim If <code>true</code> return as close to the original + * page text as possible. * @return The tag as an HTML fragment. * @see org.htmlparser.Node#toHtml() */ ! public String toHtml (boolean verbatim) { int length; |
From: Derrick O. <der...@us...> - 2006-05-31 02:10:20
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27353/util Modified Files: NodeList.java Log Message: implement task #93148 toHtml(boolean verbatim) To avoid printing generated end tags use toHtml(true). Index: NodeList.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/NodeList.java,v retrieving revision 1.61 retrieving revision 1.62 diff -C2 -d -r1.61 -r1.62 *** NodeList.java 14 Apr 2006 22:18:47 -0000 1.61 --- NodeList.java 31 May 2006 02:10:15 -0000 1.62 *************** *** 159,172 **** /** * Convert this nodelist into the equivalent HTML. * @return The contents of the list as HTML text. */ public String toHtml () { ! StringBuffer buff = new StringBuffer (); ! for (int i=0;i<size;i++) ! buff.append (nodeData[i].toHtml ()); ! return buff.toString (); } ! /** * Remove the node at index. --- 159,186 ---- /** * Convert this nodelist into the equivalent HTML. + * @param verbatim If <code>true</code> return as close to the original + * page text as possible. + * @return The contents of the list as HTML text. + */ + public String toHtml (boolean verbatim) + { + StringBuffer ret; + + ret = new StringBuffer (); + for (int i = 0; i < size; i++) + ret.append (nodeData[i].toHtml (verbatim)); + + return (ret.toString ()); + } + + /** + * Convert this nodelist into the equivalent HTML. * @return The contents of the list as HTML text. */ public String toHtml () { ! return (toHtml (false)); } ! /** * Remove the node at index. |
From: Derrick O. <der...@us...> - 2006-05-31 02:10:20
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27353/tests/utilTests Modified Files: NodeListTest.java Log Message: implement task #93148 toHtml(boolean verbatim) To avoid printing generated end tags use toHtml(true). Index: NodeListTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/NodeListTest.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** NodeListTest.java 18 Sep 2005 23:00:27 -0000 1.28 --- NodeListTest.java 31 May 2006 02:10:14 -0000 1.29 *************** *** 123,127 **** } ! private Node createHTMLNodeObject() { Node node = new AbstractNode(null,10,20) { public void accept(NodeVisitor visitor) { --- 123,128 ---- } ! private Node createHTMLNodeObject () ! { Node node = new AbstractNode(null,10,20) { public void accept(NodeVisitor visitor) { *************** *** 132,135 **** --- 133,140 ---- } + public String toHtml(boolean verbatim) { + return null; + } + public String toPlainTextString() { return null; |
From: Derrick O. <der...@us...> - 2006-05-31 02:10:20
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27353 Modified Files: Node.java Log Message: implement task #93148 toHtml(boolean verbatim) To avoid printing generated end tags use toHtml(true). Index: Node.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/Node.java,v retrieving revision 1.55 retrieving revision 1.56 diff -C2 -d -r1.55 -r1.56 *** Node.java 15 Nov 2005 02:09:10 -0000 1.55 --- Node.java 31 May 2006 02:10:15 -0000 1.56 *************** *** 69,72 **** --- 69,84 ---- /** * Return the HTML for this node. + * This should be the sequence of characters that were encountered by + * the parser that caused this node to be created. Where this breaks down is + * where broken nodes (tags and remarks) have been encountered and fixed. + * Applications reproducing html can use this method on nodes which are to + * be used or transferred as they were received or created. + * @return The sequence of characters that would cause this node + * to be returned by the parser or lexer. + */ + String toHtml (); + + /** + * Return the HTML for this node. * This should be the exact sequence of characters that were encountered by * the parser that caused this node to be created. Where this breaks down is *************** *** 74,81 **** * Applications reproducing html can use this method on nodes which are to * be used or transferred as they were received or created. * @return The (exact) sequence of characters that would cause this node * to be returned by the parser or lexer. */ ! String toHtml (); /** --- 86,95 ---- * Applications reproducing html can use this method on nodes which are to * be used or transferred as they were received or created. + * @param verbatim If <code>true</code> return as close to the original + * page text as possible. 
* @return The (exact) sequence of characters that would cause this node * to be returned by the parser or lexer. */ ! String toHtml (boolean verbatim); /** |
From: Derrick O. <der...@us...> - 2006-05-30 03:11:07
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv25700 Modified Files: Parser.java Log Message: Update javadoc for new Parser constructor behaviour. Index: Parser.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.115 retrieving revision 1.116 diff -C2 -d -r1.115 -r1.116 *** Parser.java 30 May 2006 02:53:56 -0000 1.115 --- Parser.java 30 May 2006 03:11:02 -0000 1.116 *************** *** 51,59 **** * constructors that take a {@link #Parser(String) String}, * a {@link #Parser(URLConnection) URLConnection}, or a ! * {@link #Parser(Lexer) Lexer}. In the case of a String, an * attempt is made to open it as a URL, and if that fails it assumes it is a ! * local disk file. If you want to actually parse a String, use ! * {@link #setInputHTML setInputHTML()} after using the ! * {@link #Parser() no-args} constructor, or use {@link #createParser}. * <p>The Parser provides access to the contents of the * page, via a {@link #elements() NodeIterator}, a --- 51,61 ---- * constructors that take a {@link #Parser(String) String}, * a {@link #Parser(URLConnection) URLConnection}, or a ! * {@link #Parser(Lexer) Lexer}. In the case of a String, ! * a check is made to see if the first non-whitespace character is a <, in ! * which case it is assumed to be HTML. Otherwise an * attempt is made to open it as a URL, and if that fails it assumes it is a ! * local disk file. If you want to parse a String after using the ! * {@link #Parser() no-args} constructor, use ! * {@link #setInputHTML setInputHTML()}, or you can use {@link #createParser}. * <p>The Parser provides access to the contents of the * page, via a {@link #elements() NodeIterator}, a |
From: Derrick O. <der...@us...> - 2006-05-30 02:53:59
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv18017 Modified Files: Parser.java Log Message: Allow passing HTML in the Parser constructor. So now it allows HTML, a URL or a file name. Index: Parser.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.114 retrieving revision 1.115 diff -C2 -d -r1.114 -r1.115 *** Parser.java 27 May 2006 18:43:25 -0000 1.114 --- Parser.java 30 May 2006 02:53:56 -0000 1.115 *************** *** 298,302 **** * it in. * @see #Parser(URLConnection,ParserFeedback) ! * @param resourceLocn Either the URL or the filename (autodetects). * A standard HTTP GET is performed to read the content of the URL. * @param feedback The HTMLParserFeedback object to use when information, --- 298,306 ---- * it in. * @see #Parser(URLConnection,ParserFeedback) ! * @param resource Either a URL, a filename or a string of HTML. ! * The string is considered HTML if the first non-whitespace character ! * is a <. The use of a url or file is autodetected by first attempting ! * to open the resource as a URL, if that fails it is assumed to be a file ! * name. * A standard HTTP GET is performed to read the content of the URL. * @param feedback The HTMLParserFeedback object to use when information, *************** *** 305,313 **** * @throws ParserException If the URL is invalid. */ ! public Parser (String resourceLocn, ParserFeedback feedback) throws ParserException { ! this (getConnectionManager ().openConnection (resourceLocn), feedback); } --- 309,340 ---- * @throws ParserException If the URL is invalid. */ ! public Parser (String resource, ParserFeedback feedback) throws ParserException { ! int length; ! boolean html; ! char ch; ! ! if (null == resource) ! throw new IllegalArgumentException ("resource cannot be null"); ! setFeedback (feedback); ! length = resource.length (); ! 
html = false; ! for (int i = 0; i < length; i++) ! { ! ch = resource.charAt (i); ! if (!Character.isWhitespace (ch)) ! { ! if ('<' == ch) ! html = true; ! break; ! } ! } ! if (html) ! setLexer (new Lexer (new Page (resource))); ! else ! setLexer (new Lexer (getConnectionManager ().openConnection (resource))); ! setNodeFactory (new PrototypicalNodeFactory ()); } *************** *** 315,325 **** * Creates a Parser object with the location of the resource (URL or file). * A DefaultHTMLParserFeedback object is used for feedback. ! * @param resourceLocn Either the URL or the filename (autodetects). * @throws ParserException If the resourceLocn argument does not resolve * to a valid page or file. */ ! public Parser (String resourceLocn) throws ParserException { ! this (resourceLocn, STDOUT); } --- 342,353 ---- * Creates a Parser object with the location of the resource (URL or file). * A DefaultHTMLParserFeedback object is used for feedback. ! * @param resource Either HTML, a URL or a filename (autodetects). * @throws ParserException If the resourceLocn argument does not resolve * to a valid page or file. + * @see #Parser(string,ParserFeedback) */ ! public Parser (String resource) throws ParserException { ! this (resource, STDOUT); } *************** *** 808,821 **** try { - parser = new Parser (); if (1 < args.length) filter = new TagNameFilter (args[1]); else - { // for a simple dump, use more verbose settings filter = null; parser.setFeedback (Parser.STDOUT); getConnectionManager ().setMonitor (parser); } - parser.setURL (args[0]); System.out.println (parser.parse (filter)); } --- 836,850 ---- try { if (1 < args.length) filter = new TagNameFilter (args[1]); else filter = null; + parser = new Parser (args[0]); + if (1 < args.length) + { + // for a simple dump, use more verbose settings parser.setFeedback (Parser.STDOUT); getConnectionManager ().setMonitor (parser); } System.out.println (parser.parse (filter)); } |
From: Derrick O. <der...@us...> - 2006-05-30 01:30:20
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/http In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv12701 Modified Files: ConnectionManager.java Log Message: Handle bad cookie names. Traps cookie name problems, but ignores any following cookies. Index: ConnectionManager.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/http/ConnectionManager.java,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** ConnectionManager.java 14 Apr 2006 22:18:47 -0000 1.10 --- ConnectionManager.java 30 May 2006 01:30:16 -0000 1.11 *************** *** 1067,1072 **** if (null == cookie) { ! cookie = new Cookie (name, value); ! cookies.addElement (cookie); } else --- 1067,1081 ---- if (null == cookie) { ! try ! { ! cookie = new Cookie (name, value); ! cookies.addElement (cookie); ! } ! catch (IllegalArgumentException iae) ! { ! // should print a warning ! // for now just bail ! break; ! } } else *************** *** 1114,1124 **** } else ! { // error,? unknown attribute, // maybe just another cookie // not separated by a comma ! cookie = new Cookie (name, ! value); ! cookies.addElement (cookie); ! } } } --- 1123,1141 ---- } else ! // error,? unknown attribute, // maybe just another cookie // not separated by a comma ! try ! { ! cookie = new Cookie (name, ! value); ! cookies.addElement (cookie); ! } ! catch (IllegalArgumentException iae) ! { ! // should print a warning ! // for now just bail ! break; ! } } } |
From: Derrick O. <der...@us...> - 2006-05-30 01:07:19
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv32405/tests/utilTests Modified Files: BeanTest.java Log Message: fix bug#1496863 StringBean collapse() adds extra whitespace Keep collapsing state machine state as member variable. Index: BeanTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/BeanTest.java,v retrieving revision 1.50 retrieving revision 1.51 diff -C2 -d -r1.50 -r1.51 *** BeanTest.java 31 Jul 2004 16:42:32 -0000 1.50 --- BeanTest.java 30 May 2006 01:07:15 -0000 1.51 *************** *** 496,500 **** * Test output with pre and script tags */ ! public void xtestOutputWithPreAndScriptTags() { StringBean sb; sb = new StringBean (); --- 496,501 ---- * Test output with pre and script tags */ ! public void xtestOutputWithPreAndScriptTags () ! { StringBean sb; sb = new StringBean (); *************** *** 511,514 **** --- 512,535 ---- } + /** + * Test output with non-breaking tag within text. + */ + public void testTagWhitespace () + { + StringBean sb; + sb = new StringBean (); + + String pre = "AAAAA BBBBB AAA"; + String mid = "AA"; + String post = " BBBBB"; + String html = + "<HTML>\r\n" + + "<body>\r\n" + + "<p>" + pre + "<font color='red'>" + mid + "</font>" + post + "</p>\r\n" + + "</body>\r\n" + + "</HTML>\r\n"; + + check (sb, html, pre + mid + post); + } } |
From: Derrick O. <der...@us...> - 2006-05-30 01:07:18
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv32405/beans Modified Files: StringBean.java Log Message: fix bug#1496863 StringBean collapse() adds extra whitespace Keep collapsing state machine state as member variable. Index: StringBean.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** StringBean.java 15 May 2005 11:49:03 -0000 1.44 --- StringBean.java 30 May 2006 01:07:14 -0000 1.45 *************** *** 149,152 **** --- 149,157 ---- /** + * The state of the collapse processiung state machine. + */ + protected int mCollapseState; + + /** * The buffer text is stored in while traversing the HTML. */ *************** *** 189,192 **** --- 194,198 ---- mReplaceSpace = true; mCollapse = true; + mCollapseState = 0; mBuffer = new StringBuffer (4096); mIsScript = false; *************** *** 213,216 **** --- 219,223 ---- length - NEWLINE_SIZE, length).equals (NEWLINE)))) mBuffer.append (NEWLINE); + mCollapseState = 0; } *************** *** 238,243 **** { int chars; - int length; - int state; char character; --- 245,248 ---- *************** *** 245,255 **** if (0 != chars) { - length = buffer.length (); - state = ((0 == length) - || (buffer.charAt (length - 1) == ' ') - || ((NEWLINE_SIZE <= length) - && buffer.substring ( - length - NEWLINE_SIZE, length).equals (NEWLINE))) - ? 0 : 1; for (int i = 0; i < chars; i++) { --- 250,253 ---- *************** *** 265,275 **** case '\r': case '\n': ! if (0 != state) ! state = 1; break; default: ! if (1 == state) buffer.append (' '); ! state = 2; buffer.append (character); } --- 263,273 ---- case '\r': case '\n': ! if (0 != mCollapseState) ! mCollapseState = 1; break; default: ! if (1 == mCollapseState) buffer.append (' '); ! 
mCollapseState = 2; buffer.append (character); } *************** *** 289,292 **** --- 287,291 ---- String ret; + mCollapseState = 0; mParser.visitAllNodesWith (this); ret = mBuffer.toString (); *************** *** 319,322 **** --- 318,322 ---- protected void setStrings () { + mCollapseState = 0; if (null != getURL ()) try *************** *** 341,344 **** --- 341,345 ---- mParser.reset (); mBuffer = new StringBuffer (4096); + mCollapseState = 0; mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); *************** *** 558,561 **** --- 559,565 ---- * If the setting is changed after the URL has been set, the text from the * URL will be reacquired, which is possibly expensive. + * The internal state of the collapse state machine can be reset with + * code like this: + * <code>setCollapse (getCollapse ());</code> * @param collapse If <code>true</code>, sequences of whitespace * will be reduced to a single space. *************** *** 563,566 **** --- 567,571 ---- public void setCollapse (boolean collapse) { + mCollapseState = 0; boolean oldValue = mCollapse; if (oldValue != collapse) |
From: Derrick O. <der...@us...> - 2006-05-27 18:43:31
|
Update of /cvsroot/htmlparser/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv14113 Modified Files: build.xml Log Message: Update version to 1.6-20060527 Index: build.xml =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/build.xml,v retrieving revision 1.83 retrieving revision 1.84 diff -C2 -d -r1.83 -r1.84 *** build.xml 14 Apr 2006 22:18:47 -0000 1.83 --- build.xml 27 May 2006 18:43:25 -0000 1.84 *************** *** 11,15 **** is only accurate to the day since it comes from the version coded in the Parser.java file, that's why this step can't be automated ! - incorporate changes from ChangeLog into htmlparser/docs/changes under a heading like "Integration Build 1.5 - 20040522" - 'ant versionSource' updates the version in Parser.java, Lexer.java and release.txt --- 11,15 ---- is only accurate to the day since it comes from the version coded in the Parser.java file, that's why this step can't be automated ! - incorporate changes from ChangeLog into htmlparser/docs/changes.txt under a heading like "Integration Build 1.5 - 20040522" - 'ant versionSource' updates the version in Parser.java, Lexer.java and release.txt *************** *** 18,22 **** - perform a CVS update on htmlparser/ to identify new and changed files - commit changed files (i.e. Parser.java, docs/release.txt, docs/changes.txt, ! and docs/release.txt) to the head revision using a reason of the form: Update version to 1.5-20040522. - use CVS to tag the current head revisions with a name like v1_5_20040522. --- 18,22 ---- - perform a CVS update on htmlparser/ to identify new and changed files - commit changed files (i.e. Parser.java, docs/release.txt, docs/changes.txt, ! and lexer/Lexer.java) to the head revision using a reason of the form: Update version to 1.5-20040522. - use CVS to tag the current head revisions with a name like v1_5_20040522. |
From: Derrick O. <der...@us...> - 2006-05-27 18:43:29
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv14113/src/org/htmlparser/lexer Modified Files: Lexer.java Log Message: Update version to 1.6-20060527 Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** Lexer.java 27 May 2006 17:06:28 -0000 1.47 --- Lexer.java 27 May 2006 18:43:24 -0000 1.48 *************** *** 80,84 **** */ public static final String ! VERSION_DATE = "Mar 19, 2006" ; --- 80,84 ---- */ public static final String ! VERSION_DATE = "May 27, 2006" ; |
From: Derrick O. <der...@us...> - 2006-05-27 18:43:28
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv14113/docs Modified Files: changes.txt release.txt Log Message: Update version to 1.6-20060527 Index: release.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/release.txt,v retrieving revision 1.73 retrieving revision 1.74 diff -C2 -d -r1.73 -r1.74 *** release.txt 20 Mar 2006 00:26:01 -0000 1.73 --- release.txt 27 May 2006 18:43:24 -0000 1.74 *************** *** 1,3 **** ! HTMLParser Version 1.6 (Integration Build Mar 19, 2006) ********************************************* --- 1,3 ---- ! HTMLParser Version 1.6 (Integration Build May 27, 2006) ********************************************* *************** *** 5,11 **** ---------------------------- (i) jar files - lib directory ! HTML Parser jars: htmlparser.jar, lexer.jar, thumbelina.jar and filterbuilder.jar. ! Also thirdparty jar files checkstyle-all-3.1.jar, fit.jar and junit.jar. (ii) source code - src.zip --- 5,11 ---- ---------------------------- (i) jar files - lib directory ! HTML Parser jars: htmlparser.jar, htmllexer.jar, thumbelina.jar and filterbuilder.jar. ! Also thirdparty jar files sax2.jar and junit.jar. (ii) source code - src.zip *************** *** 39,42 **** --- 39,45 ---- NodeTreeWalker, a utility class to traverse a tree of Node objects using either depth-first or breadth-first tree order has been added. + An XorFilter has been added to round out our NOT, AND and OR filters, + along with new constructors to OrFilter/AndFilter that take an array of + NodeFilter's. Refactoring *************** *** 46,56 **** --- 49,66 ---- The NodeList class is a little more standard now with a remove(node) method. Some refactoring to allow the htmllexer jar file to be compiled by gcj. + Moved non-JUnit test code to Request For Enhancement (RFE) as attachments, + so all the code in the tests package should now compile. 
Bug Fixes --------- + #1488951 RemarkNode.toPlainTextString() incorrect behaviour + #1467712 Page#getCharset never works + #1461473 Relative links starting with ? + #1457371 Script tag consumes too much from document being parsed #1445795 return as TextNode when processing jsp #1445309 XML processing instructions are returned as text #1376851 Null-valued cookies cause exception #1375230 some javascript breaks stringbean + #1345049 HTMLParser should not terminate a comment with ---> #1344687 A bug when set cookies #1334408 Exception occurs based on string length *************** *** 186,189 **** --- 196,200 ---- [40] Jamie McCrindle [41] John Derrick + [42] Ian MacFarlane If you find any bugs, please go to Index: changes.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/changes.txt,v retrieving revision 1.209 retrieving revision 1.210 diff -C2 -d -r1.209 -r1.210 *** changes.txt 20 Mar 2006 00:26:01 -0000 1.209 --- changes.txt 27 May 2006 18:43:24 -0000 1.210 *************** *** 16,19 **** --- 16,146 ---- ******************************************************************************* + Integration Build 1.6 - 20060527 + -------------------------------- + 2006-05-27 10:36 derrickoswald + + * src/org/htmlparser/: scanners/ScriptScanner.java, + tests/scannersTests/ScriptScannerTest.java: + + fix bug #1457371 Script tag consumes too much from document being parsed + The default for ScriptScanner.STRICT was set to true. + If you want the older, more lax, script parsing, set it to false. 
+ + 2006-05-27 10:03 derrickoswald + + * src/org/htmlparser/: nodes/RemarkNode.java, + tests/tagTests/FormTagTest.java: + + fix bug #1488951 RemarkNode.toPlainTextString() incorrect behaviour + RemarkNode.toPlainTextString() now always returns an empty string + if you want the remark text use getText() + + 2006-05-27 10:02 derrickoswald + + * src/org/htmlparser/: lexer/Lexer.java, + tests/FunctionalTests.java, tests/lexerTests/LexerTests.java, + tests/parserHelperTests/RemarkNodeParserTest.java: + + fix bug #1345049 HTMLParser should not terminate a comment with ---> + add static STRICT_REMARKS to Lexer class, which when true follows the specification for remarks + + 2006-05-16 05:14 ian_macfarlane + + * src/org/htmlparser/filters/: AndFilter.java, OrFilter.java: + + Incorrect grammar in javadoc. Changed [it's] to [its]. + + 2006-05-16 05:11 ian_macfarlane + + * src/org/htmlparser/filters/XorFilter.java: + + New class that does XOR logic (to round out our NOT, AND and OR filters). + + 2006-05-16 03:58 ian_macfarlane + + * src/org/htmlparser/filters/: AndFilter.java, OrFilter.java: + + Added constructors to OrFilter/AndFilter that take an array of NodeFilter's. + + 2006-04-24 18:12 derrickoswald + + * src/org/htmlparser/Parser.java: + + Fix incorrect example. + + 2006-04-23 07:59 derrickoswald + + * src/org/htmlparser/tags/TableHeader.java: + + Change copyright as per request by P.I.M. Schrama + + 2006-04-17 20:08 derrickoswald + + * src/org/htmlparser/tests/: lexerTests/KitTest.java, + PerformanceTest.java: + + Move non-junit test code to Request For Enhancement (RFE) as attachments. + + 2006-04-17 19:45 derrickoswald + + * src/org/htmlparser/tests/: ParserTestCase.java, + PerformanceTest.java: + + Fix unit tests. + + 2006-04-17 09:53 derrickoswald + + * src/org/htmlparser/tests/: ParserTest.java, + lexerTests/LexerTests.java, tagTests/InputTagTest.java, + tagTests/TableTagTest.java, + utilTests/CharacterTranslationTest.java: + + Fix unit tests. 
Move failing test cases to downloads on corresponding RFE artifacts. + + 2006-04-17 09:51 derrickoswald + + * bin/: translate.cmd, beanybaby.cmd, filterbuilder.cmd, lexer.cmd, + linkextractor.cmd, parser.cmd, sitecapturer.cmd, + stringextractor.cmd, thumbelina.cmd: + + Allow execution from directory name containing spaces on Windows. + + 2006-04-14 18:18 derrickoswald + + * build.xml, src/org/htmlparser/Parser.java, + src/org/htmlparser/http/ConnectionManager.java, + src/org/htmlparser/lexer/Lexer.java, + src/org/htmlparser/util/NodeList.java: + + Cleanup to isolate htmllexer jar build. + + 2006-04-11 08:03 derrickoswald + + * src/org/htmlparser/tests/: AllTests.java, MemoryTest.java: + + Move failing unit test to RFE as a download. + + 2006-04-10 17:38 derrickoswald + + * src/org/htmlparser/lexer/Page.java: + + Fix Bug #1467712 Page#getCharset never works + Use Content-Type header field instead of connection's getContentType method. + + 2006-04-08 09:33 derrickoswald + + * src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java: + + Typo. + + 2006-04-06 20:58 derrickoswald + + * src/org/htmlparser/: lexer/Page.java, + tests/lexerTests/PageTests.java: + + Fix Bug #1461473 Relative links starting with ? + Added overloaded methods taking boolean 'strict' flag on URL manipulators. + Default is loose interpretation like most browsers. + Integration Build 1.6 - 20060319 -------------------------------- |
From: Derrick O. <der...@us...> - 2006-05-27 18:43:28
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv14113/src/org/htmlparser Modified Files: Parser.java Log Message: Update version to 1.6-20060527 Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.113 retrieving revision 1.114 diff -C2 -d -r1.113 -r1.114 *** Parser.java 24 Apr 2006 22:12:05 -0000 1.113 --- Parser.java 27 May 2006 18:43:25 -0000 1.114 *************** *** 133,137 **** */ public static final String ! VERSION_DATE = "Mar 19, 2006" ; --- 133,137 ---- */ public static final String ! VERSION_DATE = "May 27, 2006" ; |
From: Derrick O. <der...@us...> - 2006-05-27 17:06:37
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv6402/lexer Modified Files: Lexer.java Page.java Log Message: fix bug #1493884 Lexer returns a TagNode with a 'null' name Use a more careful cursor retreat - Page.ungetCharacter(). Index: Page.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.55 retrieving revision 1.56 diff -C2 -d -r1.55 -r1.56 *** Page.java 10 Apr 2006 21:38:41 -0000 1.55 --- Page.java 27 May 2006 17:06:28 -0000 1.56 *************** *** 680,684 **** * current source position. * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n, ! * and updates the end-of-line index accordingly * Advances the cursor position by one (or two in the \r\n case). * @param cursor The position to read at. --- 680,684 ---- * current source position. * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n, ! * and updates the end-of-line index accordingly. * Advances the cursor position by one (or two in the \r\n case). * @param cursor The position to read at. *************** *** 686,690 **** * prepare for the next read. If the source is exhausted a zero is returned. * @exception ParserException If an IOException on the underlying source ! * occurs, or an attemp is made to read characters in the future (the * cursor position is ahead of the underlying stream) */ --- 686,690 ---- * prepare for the next read. If the source is exhausted a zero is returned. * @exception ParserException If an IOException on the underlying source ! * occurs, or an attempt is made to read characters in the future (the * cursor position is ahead of the underlying stream) */ *************** *** 793,796 **** --- 793,832 ---- /** + * Return a character. + * Handles end of lines (EOL) specially, retreating the cursor twice for + * the '\r\n' case. 
+ * The cursor position is moved back by one (or two in the \r\n case). + * @param cursor The position to 'unread' at. + * @exception ParserException If an IOException on the underlying source + * occurs. + */ + public void ungetCharacter (Cursor cursor) + throws + ParserException + { + int i; + char ch; + + cursor.retreat (); + i = cursor.getPosition (); + try + { + ch = mSource.getCharacter (i); + if (('\n' == ch) && (0 != i)) + { + ch = mSource.getCharacter (i - 1); + if ('\r' == ch) + cursor.retreat (); + } + } + catch (IOException ioe) + { + throw new ParserException ( + "can't read a character at position " + + cursor.getPosition (), ioe); + } + } + + /** * Get the current encoding being used. * @return The encoding used to convert characters. Index: Lexer.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** Lexer.java 27 May 2006 14:02:27 -0000 1.46 --- Lexer.java 27 May 2006 17:06:28 -0000 1.47 *************** *** 356,370 **** else if ('%' == ch) { ! mCursor.retreat (); ret = parseJsp (start); } else if ('?' == ch) { ! mCursor.retreat (); ret = parsePI (start); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { ! mCursor.retreat (); ret = parseTag (start); } --- 356,370 ---- else if ('%' == ch) { ! mPage.ungetCharacter (mCursor); ret = parseJsp (start); } else if ('?' == ch) { ! mPage.ungetCharacter (mCursor); ret = parsePI (start); } else if ('/' == ch || '%' == ch || Character.isLetter (ch)) { ! mPage.ungetCharacter (mCursor); ret = parseTag (start); } *************** *** 380,389 **** else { ! mCursor.retreat (); // remark/tag need this char if ('-' == ch) ret = parseRemark (start, quotesmart); else { ! mCursor.retreat (); // tag needs prior one too ret = parseTag (start); } --- 380,389 ---- else { ! 
mPage.ungetCharacter (mCursor); // remark/tag need this char if ('-' == ch) ret = parseRemark (start, quotesmart); else { ! mPage.ungetCharacter (mCursor); // tag needs prior one too ret = parseTag (start); } *************** *** 395,399 **** break; default: ! mCursor.retreat (); // string needs to see leading foreslash ret = parseString (start, quotesmart); break; --- 395,399 ---- break; default: ! mPage.ungetCharacter (mCursor); // string needs to see leading foreslash ret = parseString (start, quotesmart); break; *************** *** 489,493 **** done = true; else if ( (ch != '\\') && (ch != quote)) ! mCursor.retreat (); // unconsume char if character was not an escapable char. } break; --- 489,494 ---- done = true; else if ( (ch != '\\') && (ch != quote)) ! // unconsume char if character was not an escapable char. ! mPage.ungetCharacter (mCursor); } break; *************** *** 511,520 **** ch = mPage.getCharacter (mCursor); if (ch == '*') ! mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mCursor.retreat (); } break; --- 512,521 ---- ch = mPage.getCharacter (mCursor); if (ch == '*') ! mPage.ungetCharacter (mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mPage.ungetCharacter (mCursor); } break; *************** *** 574,580 **** done = true; // back up to the start of ETAGO ! mCursor.retreat (); ! mCursor.retreat (); ! mCursor.retreat (); } else --- 575,581 ---- done = true; // back up to the start of ETAGO ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); } else *************** *** 599,608 **** else { ! mCursor.retreat (); ! mCursor.retreat (); } } else ! mCursor.retreat (); } break; --- 600,609 ---- else { ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); } } else ! mPage.ungetCharacter (mCursor); } break; *************** *** 749,758 **** else { ! mCursor.retreat (); ! mCursor.retreat (); } } else ! 
mCursor.retreat (); } else if (quotesmart && (0 == quote) --- 750,759 ---- else { ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); } } else ! mPage.ungetCharacter (mCursor); } else if (quotesmart && (0 == quote) *************** *** 767,771 **** && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) ! mCursor.retreat(); // unconsume char if char not an escape } else if (quotesmart && (ch == quote)) --- 768,772 ---- && (ch != quote)) // escaped quote character // ( reflects ["] or ['] whichever opened the quotation) ! mPage.ungetCharacter (mCursor); // unconsume char if char not an escape } else if (quotesmart && (ch == quote)) *************** *** 794,803 **** ch = mPage.getCharacter (mCursor); if (ch == '*') ! mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mCursor.retreat (); } else if ((0 == quote) && ('<' == ch)) --- 795,804 ---- ch = mPage.getCharacter (mCursor); if (ch == '*') ! mPage.ungetCharacter (mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mPage.ungetCharacter (mCursor); } else if ((0 == quote) && ('<' == ch)) *************** *** 811,821 **** { done = true; ! mCursor.retreat (); ! mCursor.retreat (); } else { // it's not a tag, so keep going, but check for quotes ! mCursor.retreat (); } } --- 812,822 ---- { done = true; ! mPage.ungetCharacter (mCursor); ! mPage.ungetCharacter (mCursor); } else { // it's not a tag, so keep going, but check for quotes ! mPage.ungetCharacter (mCursor); } } *************** *** 1013,1017 **** { // don't consume the opening angle ! mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } --- 1014,1018 ---- { // don't consume the opening angle ! mPage.ungetCharacter (mCursor); bookmarks[state + 1] = mCursor.getPosition (); } *************** *** 1031,1035 **** { // don't consume the opening angle ! 
mCursor.retreat (); bookmarks[state + 1] = mCursor.getPosition (); } --- 1032,1036 ---- { // don't consume the opening angle ! mPage.ungetCharacter (mCursor); bookmarks[state + 1] = mCursor.getPosition (); } *************** *** 1121,1125 **** standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mCursor.retreat(); state=0; } --- 1122,1126 ---- standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mPage.ungetCharacter (mCursor); state=0; } *************** *** 1143,1147 **** standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mCursor.retreat(); state=0; } --- 1144,1148 ---- standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; ! mPage.ungetCharacter (mCursor); state=0; } *************** *** 1263,1267 **** else { ! mCursor.retreat (); state = 2; } --- 1264,1268 ---- else { ! mPage.ungetCharacter (mCursor); state = 2; } *************** *** 1442,1453 **** ch = mPage.getCharacter (mCursor); if (ch == '*') ! mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else ! { ! mCursor.retreat (); ! } break; default: // <%???x --- 1443,1452 ---- ch = mPage.getCharacter (mCursor); if (ch == '*') ! mPage.ungetCharacter (mCursor); } while ((Page.EOF != ch) && ('/' != ch)); } else ! mPage.ungetCharacter (mCursor); break; default: // <%???x |
From: Derrick O. <der...@us...> - 2006-05-27 17:06:37
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv6402/tests/lexerTests Modified Files: LexerTests.java StreamTests.java Log Message: fix bug #1493884 Lexer returns a TagNode with a 'null' name Use a more careful cursor retreat - Page.ungetCharacter(). Index: LexerTests.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** LexerTests.java 27 May 2006 14:02:28 -0000 1.30 --- LexerTests.java 27 May 2006 17:06:28 -0000 1.31 *************** *** 930,933 **** --- 930,958 ---- assertNull ("too many nodes", iterator.nextNode ()); } + + /** + * See bug #1493884 Lexer returns a TagNode with a 'null' name + */ + public void testDosLineEndingInName () throws ParserException + { + String html; + NodeIterator iterator; + Node node; + + html = "<!\r\nMSIE->"; + parser = new Parser (); + parser.setInputHTML (html); + iterator = parser.elements (); + node = iterator.nextNode (); + if (node == null) + fail ("too few nodes"); + else + { + assertNotNull ("null node", node); + assertTrue (node instanceof Tag); + assertNotNull ("null name", ((Tag)node).getTagName ()); + assertStringEquals ("bad parse", "!", ((Tag)node).getTagName ()); + } + } } Index: StreamTests.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/StreamTests.java,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** StreamTests.java 14 Jan 2004 02:53:47 -0000 1.16 --- StreamTests.java 27 May 2006 17:06:28 -0000 1.17 *************** *** 102,107 **** int index; ! // pick a big file ! link = "http://htmlparser.sourceforge.net/HTMLParser_Coverage.html"; try { --- 102,106 ---- int index; ! 
link = "http://htmlparser.sourceforge.net"; try { |
From: Derrick O. <der...@us...> - 2006-05-27 14:36:49
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv13002/tests/scannersTests Modified Files: ScriptScannerTest.java Log Message: fix bug #1457371 Script tag consumes too much from document being parsed The default for ScriptScanner.STRICT was set to true. If you want the older, more lax, script parsing, set it to false. Index: ScriptScannerTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java,v retrieving revision 1.59 retrieving revision 1.60 diff -C2 -d -r1.59 -r1.60 *** ScriptScannerTest.java 12 Mar 2005 17:53:11 -0000 1.59 --- ScriptScannerTest.java 27 May 2006 14:36:46 -0000 1.60 *************** *** 185,477 **** * @throws Exception */ ! public void testScriptTagsGeneratedByScriptCode() throws Exception { ! createParser( ! "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 " + ! "Transitional//EN\">" + ! "<html>" + ! "<head>" + ! "<title>Untitled Document</title>" + ! "<meta http-equiv=\"Content-Type\" content=\"text/html; " + ! "charset=iso-8859-1\">" + ! "</head>" + ! "<script language=\"JavaScript\">" + ! "document.write(\"<script " + ! "language=\\\"JavaScript\\\">\");" + ! "document.write(\"function onmousedown" + ! "(event)\");" + ! "document.write(\"{ // do something\"); " + ! "document.write(\"}\"); " + ! "// parser thinks this is the end tag.\n" + ! "document.write(\"</script>\");" + ! "</script>" + ! "<body>" + ! "</body>" + ! "</html>" ! ); ! Node scriptNodes [] = ! parser.extractAllNodesThatAre(ScriptTag.class); ! assertType( ! "scriptnode", ! ScriptTag.class, ! scriptNodes[0] ! ); ! ScriptTag scriptTag = (ScriptTag)scriptNodes[0]; ! assertStringEquals( ! "script code", ! "document.write(\"<script " + ! "language=\\\"JavaScript\\\">\");" + ! "document.write(\"function onmousedown" + ! "(event)\");" + ! 
"document.write(\"{ // do something\"); " + ! "document.write(\"}\"); " + ! "// parser thinks this is the end tag.\n" + ! "document.write(\"</script>\");", ! scriptTag.getScriptCode() ! ); ! } ! public void testScriptCodeExtraction() throws ParserException { ! createParser( ! "<SCRIPT language=JavaScript>" + ! "document.write(\"<a href=\"1.htm\"><img src=\"1.jpg\" " + ! "width=\"80\" height=\"20\" border=\"0\"></a>\");" + ! "</SCRIPT>" ! ); ! parseAndAssertNodeCount(1); ! assertType("script",ScriptTag.class,node[0]); ! ScriptTag scriptTag = (ScriptTag)node[0]; ! assertStringEquals( ! "script code", ! "document.write(\"<a href=\"1.htm\"><img src=\"1.jpg\" " + ! "width=\"80\" height=\"20\" border=\"0\"></a>\");", ! scriptTag.getScriptCode() ! ); } ! public void testScriptCodeExtractionWithMultipleQuotes() throws ParserException { ! createParser( ! "<SCRIPT language=JavaScript>" + ! "document.write(\"<a href=\\\"1.htm\\\"><img src=\\\"1.jpg\\\" " + ! "width=\\\"80\\\" height=\\\"20\\\" border=\\\"0\\\"></a>\");" + ! "</SCRIPT>" ! ); ! parseAndAssertNodeCount(1); ! assertType("script",ScriptTag.class,node[0]); ! ScriptTag scriptTag = (ScriptTag)node[0]; ! assertStringEquals( ! "script code", ! "document.write(\"<a href=\\\"1.htm\\\"><img src=\\\"1.jpg\\\" " + ! "width=\\\"80\\\" height=\\\"20\\\" border=\\\"0\\\"></a>\");", ! scriptTag.getScriptCode() ! ); } ! public void testScriptWithinComments() throws Exception { ! createParser( ! "<script language=\"JavaScript1.2\">" + ! "\n" + ! "var linkset=new Array()" + ! "\n" + ! "var ie4=document.all&&navigator.userAgent.indexOf(\"Opera\")==-1" + ! "\n" + ! "var ns6=document.getElementById&&!document.all" + ! "\n" + ! "var ns4=document.layers" + ! "\n" + ! "\n" + ! "\n" + ! "function showmenu(e,which){" + ! "\n" + ! "\n" + ! "\n" + ! "if (!document.all&&!document.getElementById&&!document.layers)" + ! "\n" + ! "return" + ! "\n" + ! "\n" + ! "\n" + ! "clearhidemenu()" + ! "\n" + ! "\n" + ! "\n" + ! "menuobj=ie4? 
document.all.popmenu : ns6? document.getElementById(\"popmenu\") : ns4? document.popmenu : \"\"\n" + ! "\n" + ! "menuobj.thestyle=(ie4||ns6)? menuobj.style : menuobj" + ! "\n" + ! "\n" + ! "\n" + ! "if (ie4||ns6)" + ! "\n" + ! "menuobj.innerHTML=which" + ! "\n" + ! "else{" + ! "\n" + ! "menuobj.document.write('<layer name=gui bgColor=#E6E6E6 width=165 onmouseover=\"clearhidemenu()\" onmouseout=\"hidemenu()\">'+which+'</layer>')" + ! "\n" + ! "menuobj.document.close()" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "menuobj.contentwidth=(ie4||ns6)? menuobj.offsetWidth : menuobj.document.gui.document.width" + ! "\n" + ! "menuobj.contentheight=(ie4||ns6)? menuobj.offsetHeight : menuobj.document.gui.document.height" + ! "\n" + ! "eventX=ie4? event.clientX : ns6? e.clientX : e.x" + ! "\n" + ! "eventY=ie4? event.clientY : ns6? e.clientY : e.y" + ! "\n" + ! "\n" + ! "\n" + ! "//Find out how close the mouse is to the corner of the window" + ! "\n" + ! "var rightedge=ie4? document.body.clientWidth-eventX : window.innerWidth-eventX" + ! "\n" + ! "var bottomedge=ie4? document.body.clientHeight-eventY : window.innerHeight-eventY" + ! "\n" + ! "\n" + ! "\n" + ! "//if the horizontal distance isn't enough to accomodate the width of the context menu" + ! "\n" + ! "if (rightedge < menuobj.contentwidth)" + ! "\n" + ! "//move the horizontal position of the menu to the left by it's width" + ! "\n" + ! "menuobj.thestyle.left=ie4? document.body.scrollLeft+eventX-menuobj.contentwidth : ns6? window.pageXOffset+eventX-menuobj.contentwidth : eventX-menuobj.contentwidth" + ! "\n" + ! "else" + ! "\n" + ! "//position the horizontal position of the menu where the mouse was clicked" + ! "\n" + ! "menuobj.thestyle.left=ie4? document.body.scrollLeft+eventX : ns6? window.pageXOffset+eventX : eventX" + ! "\n" + ! "\n" + ! "\n" + ! "//same concept with the vertical position" + ! "\n" + ! "if (bottomedge<menuobj.contentheight)" + ! "\n" + ! "menuobj.thestyle.top=ie4? 
document.body.scrollTop+eventY-menuobj.contentheight : ns6? window.pageYOffset+eventY-menuobj.contentheight : eventY-menuobj.contentheight" + ! "\n" + ! "else" + ! "\n" + ! "menuobj.thestyle.top=ie4? document.body.scrollTop+event.clientY : ns6? window.pageYOffset+eventY : eventY" + ! "\n" + ! "menuobj.thestyle.visibility=\"visible\"\n" + ! "\n" + ! "return false" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function contains_ns6(a, b) {" + ! "\n" + ! "//Determines if 1 element in contained in another- by Brainjar.com" + ! "\n" + ! "while (b.parentNode)" + ! "\n" + ! "if ((b = b.parentNode) == a)" + ! "\n" + ! "return true;" + ! "\n" + ! "return false;" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function hidemenu(){" + ! "\n" + ! "if (window.menuobj)" + ! "\n" + ! "menuobj.thestyle.visibility=(ie4||ns6)? \"hidden\" : \"hide\"\n" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function dynamichide(e){" + ! "\n" + ! "if (ie4&&!menuobj.contains(e.toElement))" + ! "\n" + ! "hidemenu()" + ! "\n" + ! "else if (ns6&&e.currentTarget!= e.relatedTarget&& !contains_ns6(e.currentTarget, e.relatedTarget))" + ! "\n" + ! "hidemenu()" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function delayhidemenu(){" + ! "\n" + ! "if (ie4||ns6||ns4)" + ! "\n" + ! "delayhide=setTimeout(\"hidemenu()\",500)" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function clearhidemenu(){" + ! "\n" + ! "if (window.delayhide)" + ! "\n" + ! "clearTimeout(delayhide)" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function highlightmenu(e,state){" + ! "\n" + ! "if (document.all)" + ! "\n" + ! "source_el=event.srcElement" + ! "\n" + ! "else if (document.getElementById)" + ! "\n" + ! "source_el=e.target" + ! "\n" + ! "if (source_el.className==\"menuitems\"){" + ! "\n" + ! "source_el.id=(state==\"on\")? \"mouseoverstyle\" : \"\"\n" + ! "\n" + ! "}" + ! "\n" + ! "else{" + ! "\n" + ! "while(source_el.id!=\"popmenu\"){" + ! "\n" + ! "source_el=document.getElementById? 
source_el.parentNode : source_el.parentElement" + ! "\n" + ! "if (source_el.className==\"menuitems\"){" + ! "\n" + ! "source_el.id=(state==\"on\")? \"mouseoverstyle\" : \"\"\n" + ! "\n" + ! "}" + ! "\n" + ! "}" + ! "\n" + ! "}" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "if (ie4||ns6)" + ! "\n" + ! "document.onclick=hidemenu" + ! "\n" + ! "\n" + ! "\n" + ! "</script>" ! ); ! parseAndAssertNodeCount(1); ! } --- 185,515 ---- * @throws Exception */ ! public void testScriptTagsGeneratedByScriptCode() throws Exception ! { ! boolean old_strict = org.htmlparser.scanners.ScriptScanner.STRICT; ! try ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = false; ! createParser( ! "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 " + ! "Transitional//EN\">" + ! "<html>" + ! "<head>" + ! "<title>Untitled Document</title>" + ! "<meta http-equiv=\"Content-Type\" content=\"text/html; " + ! "charset=iso-8859-1\">" + ! "</head>" + ! "<script language=\"JavaScript\">" + ! "document.write(\"<script " + ! "language=\\\"JavaScript\\\">\");" + ! "document.write(\"function onmousedown" + ! "(event)\");" + ! "document.write(\"{ // do something\"); " + ! "document.write(\"}\"); " + ! "// parser thinks this is the end tag.\n" + ! "document.write(\"</script>\");" + ! "</script>" + ! "<body>" + ! "</body>" + ! "</html>" ! ); ! Node scriptNodes [] = ! parser.extractAllNodesThatAre(ScriptTag.class); ! assertType( ! "scriptnode", ! ScriptTag.class, ! scriptNodes[0] ! ); ! ScriptTag scriptTag = (ScriptTag)scriptNodes[0]; ! assertStringEquals( ! "script code", ! "document.write(\"<script " + ! "language=\\\"JavaScript\\\">\");" + ! "document.write(\"function onmousedown" + ! "(event)\");" + ! "document.write(\"{ // do something\"); " + ! "document.write(\"}\"); " + ! "// parser thinks this is the end tag.\n" + ! "document.write(\"</script>\");", ! scriptTag.getScriptCode() ! ); ! } ! finally ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = old_strict; ! } } ! 
public void testScriptCodeExtraction() throws ParserException ! { ! boolean old_strict = org.htmlparser.scanners.ScriptScanner.STRICT; ! try ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = false; ! createParser( ! "<SCRIPT language=JavaScript>" + ! "document.write(\"<a href=\"1.htm\"><img src=\"1.jpg\" " + ! "width=\"80\" height=\"20\" border=\"0\"></a>\");" + ! "</SCRIPT>" ! ); ! parseAndAssertNodeCount(1); ! assertType("script",ScriptTag.class,node[0]); ! ScriptTag scriptTag = (ScriptTag)node[0]; ! assertStringEquals( ! "script code", ! "document.write(\"<a href=\"1.htm\"><img src=\"1.jpg\" " + ! "width=\"80\" height=\"20\" border=\"0\"></a>\");", ! scriptTag.getScriptCode() ! ); ! } ! finally ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = old_strict; ! } } ! public void testScriptCodeExtractionWithMultipleQuotes() throws ParserException ! { ! boolean old_strict = org.htmlparser.scanners.ScriptScanner.STRICT; ! try ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = false; ! createParser( ! "<SCRIPT language=JavaScript>" + ! "document.write(\"<a href=\\\"1.htm\\\"><img src=\\\"1.jpg\\\" " + ! "width=\\\"80\\\" height=\\\"20\\\" border=\\\"0\\\"></a>\");" + ! "</SCRIPT>" ! ); ! parseAndAssertNodeCount(1); ! assertType("script",ScriptTag.class,node[0]); ! ScriptTag scriptTag = (ScriptTag)node[0]; ! assertStringEquals( ! "script code", ! "document.write(\"<a href=\\\"1.htm\\\"><img src=\\\"1.jpg\\\" " + ! "width=\\\"80\\\" height=\\\"20\\\" border=\\\"0\\\"></a>\");", ! scriptTag.getScriptCode() ! ); ! } ! finally ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = old_strict; ! } } ! public void testScriptWithinComments() throws Exception ! { ! boolean old_strict = org.htmlparser.scanners.ScriptScanner.STRICT; ! try ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = false; ! createParser( ! "<script language=\"JavaScript1.2\">" + ! "\n" + ! "var linkset=new Array()" + ! "\n" + ! "var ie4=document.all&&navigator.userAgent.indexOf(\"Opera\")==-1" + ! 
"\n" + ! "var ns6=document.getElementById&&!document.all" + ! "\n" + ! "var ns4=document.layers" + ! "\n" + ! "\n" + ! "\n" + ! "function showmenu(e,which){" + ! "\n" + ! "\n" + ! "\n" + ! "if (!document.all&&!document.getElementById&&!document.layers)" + ! "\n" + ! "return" + ! "\n" + ! "\n" + ! "\n" + ! "clearhidemenu()" + ! "\n" + ! "\n" + ! "\n" + ! "menuobj=ie4? document.all.popmenu : ns6? document.getElementById(\"popmenu\") : ns4? document.popmenu : \"\"\n" + ! "\n" + ! "menuobj.thestyle=(ie4||ns6)? menuobj.style : menuobj" + ! "\n" + ! "\n" + ! "\n" + ! "if (ie4||ns6)" + ! "\n" + ! "menuobj.innerHTML=which" + ! "\n" + ! "else{" + ! "\n" + ! "menuobj.document.write('<layer name=gui bgColor=#E6E6E6 width=165 onmouseover=\"clearhidemenu()\" onmouseout=\"hidemenu()\">'+which+'</layer>')" + ! "\n" + ! "menuobj.document.close()" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "menuobj.contentwidth=(ie4||ns6)? menuobj.offsetWidth : menuobj.document.gui.document.width" + ! "\n" + ! "menuobj.contentheight=(ie4||ns6)? menuobj.offsetHeight : menuobj.document.gui.document.height" + ! "\n" + ! "eventX=ie4? event.clientX : ns6? e.clientX : e.x" + ! "\n" + ! "eventY=ie4? event.clientY : ns6? e.clientY : e.y" + ! "\n" + ! "\n" + ! "\n" + ! "//Find out how close the mouse is to the corner of the window" + ! "\n" + ! "var rightedge=ie4? document.body.clientWidth-eventX : window.innerWidth-eventX" + ! "\n" + ! "var bottomedge=ie4? document.body.clientHeight-eventY : window.innerHeight-eventY" + ! "\n" + ! "\n" + ! "\n" + ! "//if the horizontal distance isn't enough to accomodate the width of the context menu" + ! "\n" + ! "if (rightedge < menuobj.contentwidth)" + ! "\n" + ! "//move the horizontal position of the menu to the left by it's width" + ! "\n" + ! "menuobj.thestyle.left=ie4? document.body.scrollLeft+eventX-menuobj.contentwidth : ns6? window.pageXOffset+eventX-menuobj.contentwidth : eventX-menuobj.contentwidth" + ! "\n" + ! "else" + ! "\n" + ! 
"//position the horizontal position of the menu where the mouse was clicked" + ! "\n" + ! "menuobj.thestyle.left=ie4? document.body.scrollLeft+eventX : ns6? window.pageXOffset+eventX : eventX" + ! "\n" + ! "\n" + ! "\n" + ! "//same concept with the vertical position" + ! "\n" + ! "if (bottomedge<menuobj.contentheight)" + ! "\n" + ! "menuobj.thestyle.top=ie4? document.body.scrollTop+eventY-menuobj.contentheight : ns6? window.pageYOffset+eventY-menuobj.contentheight : eventY-menuobj.contentheight" + ! "\n" + ! "else" + ! "\n" + ! "menuobj.thestyle.top=ie4? document.body.scrollTop+event.clientY : ns6? window.pageYOffset+eventY : eventY" + ! "\n" + ! "menuobj.thestyle.visibility=\"visible\"\n" + ! "\n" + ! "return false" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function contains_ns6(a, b) {" + ! "\n" + ! "//Determines if 1 element in contained in another- by Brainjar.com" + ! "\n" + ! "while (b.parentNode)" + ! "\n" + ! "if ((b = b.parentNode) == a)" + ! "\n" + ! "return true;" + ! "\n" + ! "return false;" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function hidemenu(){" + ! "\n" + ! "if (window.menuobj)" + ! "\n" + ! "menuobj.thestyle.visibility=(ie4||ns6)? \"hidden\" : \"hide\"\n" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function dynamichide(e){" + ! "\n" + ! "if (ie4&&!menuobj.contains(e.toElement))" + ! "\n" + ! "hidemenu()" + ! "\n" + ! "else if (ns6&&e.currentTarget!= e.relatedTarget&& !contains_ns6(e.currentTarget, e.relatedTarget))" + ! "\n" + ! "hidemenu()" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function delayhidemenu(){" + ! "\n" + ! "if (ie4||ns6||ns4)" + ! "\n" + ! "delayhide=setTimeout(\"hidemenu()\",500)" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function clearhidemenu(){" + ! "\n" + ! "if (window.delayhide)" + ! "\n" + ! "clearTimeout(delayhide)" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "function highlightmenu(e,state){" + ! "\n" + ! "if (document.all)" + ! "\n" + ! "source_el=event.srcElement" + ! "\n" + ! 
"else if (document.getElementById)" + ! "\n" + ! "source_el=e.target" + ! "\n" + ! "if (source_el.className==\"menuitems\"){" + ! "\n" + ! "source_el.id=(state==\"on\")? \"mouseoverstyle\" : \"\"\n" + ! "\n" + ! "}" + ! "\n" + ! "else{" + ! "\n" + ! "while(source_el.id!=\"popmenu\"){" + ! "\n" + ! "source_el=document.getElementById? source_el.parentNode : source_el.parentElement" + ! "\n" + ! "if (source_el.className==\"menuitems\"){" + ! "\n" + ! "source_el.id=(state==\"on\")? \"mouseoverstyle\" : \"\"\n" + ! "\n" + ! "}" + ! "\n" + ! "}" + ! "\n" + ! "}" + ! "\n" + ! "}" + ! "\n" + ! "\n" + ! "\n" + ! "if (ie4||ns6)" + ! "\n" + ! "document.onclick=hidemenu" + ! "\n" + ! "\n" + ! "\n" + ! "</script>" ! ); ! parseAndAssertNodeCount(1); ! } ! finally ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = old_strict; ! } } *************** *** 519,530 **** public void testScanQuotedEndTag() throws ParserException { ! String html = "<SCRIPT language=\"JavaScript\">document.write('</SCRIPT>');</SCRIPT>"; ! createParser(html); ! parseAndAssertNodeCount(1); ! assertStringEquals ("Parse error", html, node[0].toHtml ()); } - - public void testScanScriptWithTagsInComment() throws ParserException { String javascript = "\n// This is javascript with <li> tag in the comment\n"; --- 557,575 ---- public void testScanQuotedEndTag() throws ParserException { ! boolean old_strict = org.htmlparser.scanners.ScriptScanner.STRICT; ! try ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = false; ! String html = "<SCRIPT language=\"JavaScript\">document.write('</SCRIPT>');</SCRIPT>"; ! createParser(html); ! parseAndAssertNodeCount(1); ! assertStringEquals ("Parse error", html, node[0].toHtml ()); ! } ! finally ! { ! org.htmlparser.scanners.ScriptScanner.STRICT = old_strict; ! } } public void testScanScriptWithTagsInComment() throws ParserException { String javascript = "\n// This is javascript with <li> tag in the comment\n"; |
From: Derrick O. <der...@us...> - 2006-05-27 14:36:49
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv13002/scanners Modified Files: ScriptScanner.java Log Message: fix bug #1457371: Script tag consumes too much from document being parsed. The default for ScriptScanner.STRICT is now true (it was previously false). If you want the older, more lax script parsing, set it to false. Index: ScriptScanner.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.63 retrieving revision 1.64 diff -C2 -d -r1.63 -r1.64 *** ScriptScanner.java 12 Mar 2005 17:53:10 -0000 1.63 --- ScriptScanner.java 27 May 2006 14:36:46 -0000 1.64 *************** *** 74,78 **** * out in the wild. */ ! public static boolean STRICT = false; /** --- 74,78 ---- * out in the wild. */ ! public static boolean STRICT = true; /** |
From: Derrick O. <der...@us...> - 2006-05-27 14:04:03
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27983/nodes Modified Files: RemarkNode.java Log Message: fix bug #1488951: RemarkNode.toPlainTextString() incorrect behaviour. RemarkNode.toPlainTextString() now always returns an empty string; if you want the remark text, use getText(). Index: RemarkNode.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/RemarkNode.java,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** RemarkNode.java 10 Apr 2005 23:20:44 -0000 1.4 --- RemarkNode.java 27 May 2006 14:03:52 -0000 1.5 *************** *** 74,78 **** * @return The contents of the text inside the comment delimiters. */ ! public String getText() { int start; --- 74,78 ---- * @return The contents of the text inside the comment delimiters. */ ! public String getText () { int start; *************** *** 115,119 **** public String toPlainTextString () { ! return (getText()); } --- 115,119 ---- public String toPlainTextString () { ! return (""); } |
From: Derrick O. <der...@us...> - 2006-05-27 14:03:55
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27983/tests/tagTests Modified Files: FormTagTest.java Log Message: fix bug #1488951: RemarkNode.toPlainTextString() incorrect behaviour. RemarkNode.toPlainTextString() now always returns an empty string; if you want the remark text, use getText(). Index: FormTagTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/FormTagTest.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** FormTagTest.java 12 Nov 2005 16:44:54 -0000 1.47 --- FormTagTest.java 27 May 2006 14:03:52 -0000 1.48 *************** *** 227,231 **** e.nextNode ().collectInto (remarkNodes, filter); assertEquals("Remark Node Count",1,remarkNodes.size ()); ! assertEquals("First Remark Node"," Hello World ",remarkNodes.elementAt (0).toPlainTextString()); } /** --- 227,231 ---- e.nextNode ().collectInto (remarkNodes, filter); assertEquals("Remark Node Count",1,remarkNodes.size ()); ! assertEquals("First Remark Node"," Hello World ",remarkNodes.elementAt (0).getText ()); } /** |
From: Derrick O. <der...@us...> - 2006-05-27 14:02:44
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27338/tests Modified Files: FunctionalTests.java Log Message: fix bug #1345049 HTMLParser should not terminate a comment with ---> add static STRICT_REMARKS to Lexer class, which when true follows the specification for remarks Index: FunctionalTests.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/FunctionalTests.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** FunctionalTests.java 31 Jul 2004 16:42:33 -0000 1.56 --- FunctionalTests.java 27 May 2006 14:02:28 -0000 1.57 *************** *** 36,39 **** --- 36,40 ---- import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; + import org.htmlparser.lexer.Lexer; import org.htmlparser.tags.ImageTag; import org.htmlparser.util.DefaultParserFeedback; *************** *** 57,66 **** * identified by the parser */ ! public void testNumImageTagsInYahooWithoutRegisteringScanners() throws ParserException { ! // First count the image tags as is ! int imgTagCount; ! int parserImgTagCount = countImageTagsWithHTMLParser(); ! imgTagCount = findImageTagCount(getParser ()); ! assertEquals("Image Tag Count",imgTagCount,parserImgTagCount); } --- 58,78 ---- * identified by the parser */ ! public void testNumImageTagsInYahooWithoutRegisteringScanners() throws ParserException ! { ! boolean old_remark_handling = Lexer.STRICT_REMARKS; ! try ! { ! // this page is full of bad comments like <!---resources---> ! Lexer.STRICT_REMARKS = false; ! // First count the image tags as is ! int imgTagCount; ! int parserImgTagCount = countImageTagsWithHTMLParser(); ! imgTagCount = findImageTagCount(getParser ()); ! assertEquals("Image Tag Count",imgTagCount,parserImgTagCount); ! } ! finally ! { ! Lexer.STRICT_REMARKS = old_remark_handling; ! } } |
From: Derrick O. <der...@us...> - 2006-05-27 14:02:41
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/parserHelperTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27338/tests/parserHelperTests Modified Files: RemarkNodeParserTest.java Log Message: fix bug #1345049 HTMLParser should not terminate a comment with ---> add static STRICT_REMARKS to Lexer class, which when true follows the specification for remarks Index: RemarkNodeParserTest.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/parserHelperTests/RemarkNodeParserTest.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** RemarkNodeParserTest.java 2 Sep 2004 02:28:16 -0000 1.48 --- RemarkNodeParserTest.java 27 May 2006 14:02:28 -0000 1.49 *************** *** 31,34 **** --- 31,35 ---- import org.htmlparser.Tag; import org.htmlparser.Text; + import org.htmlparser.lexer.Lexer; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.util.ParserException; *************** *** 85,89 **** } ! public void testToPlainTextString() throws ParserException { createParser( "<!-- saved from url=(0022)http://internet.e-mail -->\n"+ --- 86,90 ---- } ! public void testGetText () throws ParserException { createParser( "<!-- saved from url=(0022)http://internet.e-mail -->\n"+ *************** *** 101,105 **** assertTrue("First node should be a Remark",node[0] instanceof Remark); Remark Remark = (Remark)node[0]; ! assertEquals("Plain Text of the Remark #1"," saved from url=(0022)http://internet.e-mail ",Remark.toPlainTextString()); // The tenth node should be a Remark assertTrue("Tenth node should be a Remark",node[9] instanceof Remark); --- 102,106 ---- assertTrue("First node should be a Remark",node[0] instanceof Remark); Remark Remark = (Remark)node[0]; ! 
assertEquals("Plain Text of the Remark #1"," saved from url=(0022)http://internet.e-mail ",Remark.getText ()); // The tenth node should be a Remark assertTrue("Tenth node should be a Remark",node[9] instanceof Remark); *************** *** 364,382 **** ParserException { ! createParser ( ! "<html>\n" ! + "<head>\n" ! + "<title>foobar</title>\n" ! + "</head>\n" ! + "<body>\n" ! + "<!-- foobar --!>\n" ! + "</body>\n" ! + "</html>\n" ! ); parser.setNodeFactory (new PrototypicalNodeFactory (true)); ! parseAndAssertNodeCount (18); ! assertTrue("Node should be a Remark but was " + node[12], node[12] instanceof Remark); ! assertStringEquals ("remark text", "<!-- foobar --!>", node[12].toHtml ()); } } --- 365,429 ---- ParserException { ! boolean old_remark_handling = Lexer.STRICT_REMARKS; ! try ! { ! // handling this requires non-strict handling ! Lexer.STRICT_REMARKS = false; ! createParser ( ! "<html>\n" ! + "<head>\n" ! + "<title>foobar</title>\n" ! + "</head>\n" ! + "<body>\n" ! + "<!-- foobar --!>\n" ! + "</body>\n" ! + "</html>\n" ! ); ! parser.setNodeFactory (new PrototypicalNodeFactory (true)); ! parseAndAssertNodeCount (18); ! assertTrue("Node should be a Remark but was " + node[12], node[12] instanceof Remark); ! assertStringEquals ("remark text", "<!-- foobar --!>", node[12].toHtml ()); ! } ! finally ! { ! Lexer.STRICT_REMARKS = old_remark_handling; ! } ! } ! ! /** ! * Test a comment ending with -. ! * See also the Acid2 test at http://www.webstandards.org/act/acid2/test.html. ! */ ! public void testDashEnding () ! throws ! ParserException ! { ! String preamble = "<div class=\"parser\">"; ! String remark = "<!-- ->ERROR<!- -->"; ! String rest = "</div></div> <!-- two dashes is what delimits a comment, so the text \"->ERROR<!-\" earlier on this line is actually part of a comment -->"; ! createParser (preamble + remark + rest); parser.setNodeFactory (new PrototypicalNodeFactory (true)); ! parseAndAssertNodeCount (6); ! 
assertTrue("Node should be a Remark but was " + node[1], node[1] instanceof Remark); ! assertStringEquals ("remark text", remark, node[1].toHtml ()); } + /** + * Test a comment ending with ---. + * See bug #1345049 HTMLParser should not terminate a comment with ---> + * See also the Acid2 test at http://www.webstandards.org/act/acid2/test.html. + */ + public void test3DashesEnding () + throws + ParserException + { + String preamble = "<div class=\"parser\">"; + String remark = "<!-- --->ERROR<!- -->"; + String rest = "</div></div> <!-- two dashes is what delimits a comment, so the text \"->ERROR<!-\" earlier on this line is actually part of a comment -->"; + createParser (preamble + remark + rest); + parser.setNodeFactory (new PrototypicalNodeFactory (true)); + parseAndAssertNodeCount (6); + assertTrue("Node should be a Remark but was " + node[1], node[1] instanceof Remark); + assertStringEquals ("remark text", remark, node[1].toHtml ()); + } } |
From: Derrick O. <der...@us...> - 2006-05-27 14:02:34
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27338/tests/lexerTests Modified Files: LexerTests.java Log Message: fix bug #1345049 HTMLParser should not terminate a comment with ---> add static STRICT_REMARKS to Lexer class, which when true follows the specification for remarks Index: LexerTests.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** LexerTests.java 17 Apr 2006 13:53:12 -0000 1.29 --- LexerTests.java 27 May 2006 14:02:28 -0000 1.30 *************** *** 622,625 **** --- 622,628 ---- mAcceptable.add ("H1"); mAcceptable.add ("H3"); + mAcceptable.add ("OBJECT"); + mAcceptable.add ("PARAM"); + mAcceptable.add ("EMBED"); } |
From: Derrick O. <der...@us...> - 2006-05-27 14:02:33
|
Update of //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs5.sourceforge.net:/tmp/cvs-serv27338/lexer Modified Files: Lexer.java Log Message: fix bug #1345049 HTMLParser should not terminate a comment with ---> add static STRICT_REMARKS to Lexer class, which when true follows the specification for remarks Index: Lexer.java =================================================================== RCS file: //cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** Lexer.java 14 Apr 2006 22:18:47 -0000 1.45 --- Lexer.java 27 May 2006 14:02:27 -0000 1.46 *************** *** 93,96 **** --- 93,106 ---- /** + * Process remarks strictly flag. + * If <code>true</code>, remarks are not terminated by ---&gt; + * or --!&gt;, i.e. more than two dashes. If <code>false</code>, + * a more lax (and closer to typical browser handling) remark parsing + * is used. + * Default <code>{@value}</code>. + */ + public static boolean STRICT_REMARKS = true; + + /** * The page lexemes are retrieved from. */ *************** *** 1201,1208 **** * This method uses a state machine with the following states: * <ol> ! * <li>state 0 - prior to the first open delimiter</li> ! * <li>state 1 - prior to the second open delimiter</li> ! * <li>state 2 - prior to the first closing delimiter</li> ! * <li>state 3 - prior to the second closing delimiter</li> * <li>state 4 - prior to the terminating ></li> * </ol> --- 1211,1218 ---- * This method uses a state machine with the following states: * <ol> ! * <li>state 0 - prior to the first open delimiter (first dash)</li> ! * <li>state 1 - prior to the second open delimiter (second dash)</li> ! * <li>state 2 - prior to the first closing delimiter (first dash)</li> ! 
* <li>state 3 - prior to the second closing delimiter (second dash)</li> * <li>state 4 - prior to the terminating ></li> * </ol> *************** *** 1275,1284 **** if ('>' == ch) done = true; ! else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch)) { // stay in state 4 } else ! state = 2; break; default: --- 1285,1301 ---- if ('>' == ch) done = true; ! else if (Character.isWhitespace (ch)) { // stay in state 4 } else ! if (!STRICT_REMARKS && (('-' == ch) || ('!' == ch))) ! { ! // stay in state 4 ! } ! else ! // bug #1345049 HTMLParser should not terminate a comment with ---> ! // should maybe issue a warning mentioning STRICT_REMARKS ! state = 2; break; default: |