[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners ScriptScanner.java,1.61,1.62 StyleScanner.java,1.38,1.39
Brought to you by:
derrickoswald
From: Derrick O. <der...@us...> - 2005-03-07 02:18:57
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5186/scanners Modified Files: ScriptScanner.java StyleScanner.java Log Message: Bug #1104627 Parser Crash reading javascript Bug #1024045 StringBean crashes on an URL Bug #1021925 StyleTag with missing linefeed prevents page from parsing Corrected operation with script and style scanners to recognize the ETAGO when parsing CDATA -- see http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data. Original solution to bug #741769 ScriptScanner doesn't handle quoted </script> tags, was erroneous; it should have been recognized as faulty HTML. Several test cases changed to follow this advice: "Authors should therefore escape "</" within the content." Index: StyleScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/StyleScanner.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** StyleScanner.java 31 Jul 2004 16:42:32 -0000 1.38 --- StyleScanner.java 7 Mar 2005 02:18:46 -0000 1.39 *************** *** 29,32 **** --- 29,33 ---- import java.util.Vector; + import org.htmlparser.Attribute; import org.htmlparser.Node; import org.htmlparser.NodeFactory; *************** *** 54,63 **** /** * Scan for style definitions. ! * Accumulates nodes returned from the lexer, until </STYLE>, ! * <BODY> or <HTML> is encountered. Replaces the node factory ! * in the lexer with a new (empty) one to avoid other scanners missing their ! * end tags and accumulating even the </STYLE> tag. * @param tag The tag this scanner is responsible for. ! * @param lexer The source of subsequent nodes. * @param stack The parse stack, <em>not used</em>. */ --- 55,61 ---- /** * Scan for style definitions. ! * Accumulates text from the page, until </[a-zA-Z] is encountered. * @param tag The tag this scanner is responsible for. ! 
* @param lexer The source of CDATA. * @param stack The parse stack, <em>not used</em>. */ *************** *** 65,134 **** throws ParserException { ! Node node; ! boolean done; int position; ! int startpos; ! int endpos; ! Tag end; ! NodeFactory factory; ! Text content; ! Tag ret; ! done = false; ! startpos = lexer.getPosition (); ! endpos = startpos; ! end = null; ! factory = lexer.getNodeFactory (); ! lexer.setNodeFactory (new PrototypicalNodeFactory (true)); ! try ! { ! do { ! position = lexer.getPosition (); ! node = lexer.nextNode (true); ! if (null == node) ! done = true; ! else ! if (node instanceof Tag) ! if ( ((Tag)node).isEndTag () ! && ((Tag)node).getTagName ().equals (tag.getIds ()[0])) ! { ! end = (Tag)node; ! done = true; ! } ! else if (isTagToBeEndedFor (tag, (Tag)node)) ! { ! lexer.setPosition (position); ! done = true; ! } ! else ! // must be a string, even though it looks like a tag ! endpos = node.getEndPosition (); ! else if (node instanceof Remark) ! endpos = node.getEndPosition (); ! else // Text ! endpos = node.getEndPosition (); ! } - while (!done); ! content = factory.createStringNode (lexer.getPage (), startpos, endpos); ! // build new end tag if required ! if (null == end) ! end = lexer.getNodeFactory ().createTagNode ( ! lexer.getPage (), endpos, endpos, new Vector ()); ! ret = tag; ! ret.setEndTag (end); ! ret.setChildren (new NodeList (content)); ! content.setParent (ret); ! end.setParent (ret); ! ret.doSemanticAction (); } ! finally { ! lexer.setNodeFactory (factory); } ! return (ret); } } --- 63,102 ---- throws ParserException { ! Node content; int position; ! Node node; ! Attribute attribute; ! Vector vector; ! content = lexer.parseCDATA (); ! position = lexer.getPosition (); ! node = lexer.nextNode (false); ! if (null != node) ! if (!(node instanceof Tag) || !( ((Tag)node).isEndTag () ! && ((Tag)node).getTagName ().equals (tag.getIds ()[0]))) { ! lexer.setPosition (position); ! node = null; } ! // build new end tag if required ! 
if (null == node) ! { ! attribute = new Attribute ("/style", null); ! vector = new Vector (); ! vector.addElement (attribute); ! node = lexer.getNodeFactory ().createTagNode ( ! lexer.getPage (), position, position, vector); } ! tag.setEndTag ((Tag)node); ! if (null != content) { ! tag.setChildren (new NodeList (content)); ! content.setParent (tag); } + node.setParent (tag); + tag.doSemanticAction (); ! return (tag); } } Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.61 retrieving revision 1.62 diff -C2 -d -r1.61 -r1.62 *** ScriptScanner.java 31 Jul 2004 16:42:32 -0000 1.61 --- ScriptScanner.java 7 Mar 2005 02:18:46 -0000 1.62 *************** *** 29,32 **** --- 29,33 ---- import java.util.Vector; + import org.htmlparser.Attribute; import org.htmlparser.Node; import org.htmlparser.NodeFactory; *************** *** 35,39 **** --- 36,42 ---- import org.htmlparser.Tag; import org.htmlparser.Text; + import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; + import org.htmlparser.lexer.Page; import org.htmlparser.scanners.ScriptDecoder; import org.htmlparser.tags.ScriptTag; *************** *** 42,47 **** /** ! * The ScriptScanner handles script code. ! * It gathers all interior nodes into one undifferentiated string node. */ public class ScriptScanner --- 45,49 ---- /** ! * The ScriptScanner handles script CDATA. */ public class ScriptScanner *************** *** 58,67 **** /** * Scan for script. ! * Accumulates nodes returned from the lexer, until </SCRIPT>, ! * <BODY> or <HTML> is encountered. Replaces the node factory ! * in the lexer with a new (empty) one to avoid other scanners missing their ! * end tags and accumulating even the </SCRIPT> tag. * @param tag The tag this scanner is responsible for. ! * @param lexer The source of subsequent nodes. * @param stack The parse stack, <em>not used</em>. 
*/ --- 60,66 ---- /** * Scan for script. ! * Accumulates text from the page, until </[a-zA-Z] is encountered. * @param tag The tag this scanner is responsible for. ! * @param lexer The source of CDATA. * @param stack The parse stack, <em>not used</em>. */ *************** *** 70,88 **** { String language; ! Node node; ! boolean done; int position; ! int startpos; ! int endpos; ! Tag end; ! NodeFactory factory; ! Text content; ! Tag ret; - done = false; - startpos = lexer.getPosition (); - endpos = startpos; - end = null; - factory = lexer.getNodeFactory (); if (tag instanceof ScriptTag) { --- 69,79 ---- { String language; ! String code; ! Node content; int position; ! Node node; ! Attribute attribute; ! Vector vector; if (tag instanceof ScriptTag) { *************** *** 92,150 **** language.equalsIgnoreCase ("VBScript.Encode"))) { ! String code = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ()); ((ScriptTag)tag).setScriptCode (code); - endpos = lexer.getPosition (); } } ! lexer.setNodeFactory (new PrototypicalNodeFactory (true)); ! try ! { ! do { ! position = lexer.getPosition (); ! node = lexer.nextNode (true); ! if (null == node) ! done = true; ! else ! if (node instanceof Tag) ! if ( ((Tag)node).isEndTag () ! && ((Tag)node).getTagName ().equals (tag.getIds ()[0])) ! { ! end = (Tag)node; ! done = true; ! } ! else if (isTagToBeEndedFor (tag, (Tag)node)) ! { ! lexer.setPosition (position); ! done = true; ! } ! else ! // must be a string, even though it looks like a tag ! endpos = node.getEndPosition (); ! else if (node instanceof Remark) ! endpos = node.getEndPosition (); ! else // Text ! endpos = node.getEndPosition (); } - while (!done); ! content = factory.createStringNode (lexer.getPage (), startpos, endpos); ! // build new end tag if required ! if (null == end) ! end = lexer.getNodeFactory ().createTagNode ( ! lexer.getPage (), endpos, endpos, new Vector ()); ! ret = tag; ! ret.setEndTag (end); ! ret.setChildren (new NodeList (content)); ! 
content.setParent (ret); ! end.setParent (ret); ! ret.doSemanticAction (); } ! finally { ! lexer.setNodeFactory (factory); } ! return (ret); } } --- 83,120 ---- language.equalsIgnoreCase ("VBScript.Encode"))) { ! code = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ()); ((ScriptTag)tag).setScriptCode (code); } } ! content = lexer.parseCDATA (); ! position = lexer.getPosition (); ! node = lexer.nextNode (false); ! if (null != node) ! if (!(node instanceof Tag) || !( ((Tag)node).isEndTag () ! && ((Tag)node).getTagName ().equals (tag.getIds ()[0]))) { ! lexer.setPosition (position); ! node = null; } ! // build new end tag if required ! if (null == node) ! { ! attribute = new Attribute ("/script", null); ! vector = new Vector (); ! vector.addElement (attribute); ! node = lexer.getNodeFactory ().createTagNode ( ! lexer.getPage (), position, position, vector); } ! tag.setEndTag ((Tag)node); ! if (null != content) { ! tag.setChildren (new NodeList (content)); ! content.setParent (tag); } + node.setParent (tag); + tag.doSemanticAction (); ! return (tag); } } |