[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners ScriptScanner.java,1.61,1.62 StyleScanner.java,1.38,1.39

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5186/scanners

Modified Files:
	ScriptScanner.java StyleScanner.java 
Log Message:
Bug #1104627 Parser Crash reading javascript
Bug #1024045 StringBean crashes on an URL
Bug #1021925 StyleTag with missing linefeed prevents page from parsing
Corrected the script and style scanners so that they recognize the ETAGO (the end-tag open
delimiter, "</") when parsing CDATA -- see http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data.
The original solution to bug #741769 (ScriptScanner doesn't handle quoted </script> tags)
was erroneous; such input should have been recognized as faulty HTML.
Several test cases were changed to follow this advice from the HTML 4 specification:
   "Authors should therefore escape "</" within the content."


Index: StyleScanner.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/StyleScanner.java,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** StyleScanner.java	31 Jul 2004 16:42:32 -0000	1.38
--- StyleScanner.java	7 Mar 2005 02:18:46 -0000	1.39
***************
*** 29,32 ****
--- 29,33 ----
  import java.util.Vector;
  
+ import org.htmlparser.Attribute;
  import org.htmlparser.Node;
  import org.htmlparser.NodeFactory;
***************
*** 54,63 ****
      /**
       * Scan for style definitions.
!      * Accumulates nodes returned from the lexer, until &lt;/STYLE&gt;,
!      * &lt;BODY&gt; or &lt;HTML&gt; is encountered. Replaces the node factory
!      * in the lexer with a new (empty) one to avoid other scanners missing their 
!      * end tags and accumulating even the &lt;/STYLE&gt; tag.
       * @param tag The tag this scanner is responsible for.
!      * @param lexer The source of subsequent nodes.
       * @param stack The parse stack, <em>not used</em>.
       */
--- 55,61 ----
      /**
       * Scan for style definitions.
!      * Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
       * @param tag The tag this scanner is responsible for.
!      * @param lexer The source of CDATA.
       * @param stack The parse stack, <em>not used</em>.
       */
***************
*** 65,134 ****
          throws ParserException
      {
!         Node node;
!         boolean done;
          int position;
!         int startpos;
!         int endpos;
!         Tag end;
!         NodeFactory factory;
!         Text content;
!         Tag ret;
  
!         done = false;
!         startpos = lexer.getPosition ();
!         endpos = startpos;
!         end = null;
!         factory = lexer.getNodeFactory ();
!         lexer.setNodeFactory (new PrototypicalNodeFactory (true));
!         try
!         {
!             do
              {
!                 position = lexer.getPosition ();
!                 node = lexer.nextNode (true);
!                 if (null == node)
!                     done = true;
!                 else
!                     if (node instanceof Tag)
!                         if (   ((Tag)node).isEndTag ()
!                             && ((Tag)node).getTagName ().equals (tag.getIds ()[0]))
!                         {
!                             end = (Tag)node;
!                             done = true;
!                         }
!                         else if (isTagToBeEndedFor (tag, (Tag)node))
!                         {
!                             lexer.setPosition (position);
!                             done = true;
!                         }
!                         else
!                             // must be a string, even though it looks like a tag
!                             endpos = node.getEndPosition ();
!                     else if (node instanceof Remark)
!                         endpos = node.getEndPosition ();
!                     else // Text
!                         endpos = node.getEndPosition ();
! 
              }
-             while (!done);
  
!             content = factory.createStringNode (lexer.getPage (), startpos, endpos);
!             // build new end tag if required
!             if (null == end)
!                 end = lexer.getNodeFactory ().createTagNode (
!                     lexer.getPage (), endpos, endpos, new Vector ());
!             ret = tag;
!             ret.setEndTag (end);
!             ret.setChildren (new NodeList (content));
!             content.setParent (ret);
!             end.setParent (ret);
!             ret.doSemanticAction ();
          }
!         finally
          {
!             lexer.setNodeFactory (factory);
          }
  
!         return (ret);
      }
  }
--- 63,102 ----
          throws ParserException
      {
!         Node content;
          int position;
!         Node node;
!         Attribute attribute;
!         Vector vector;
  
!         content = lexer.parseCDATA ();
!         position = lexer.getPosition ();
!         node = lexer.nextNode (false);
!         if (null != node)
!             if (!(node instanceof Tag) || !(   ((Tag)node).isEndTag ()
!                 && ((Tag)node).getTagName ().equals (tag.getIds ()[0])))
              {
!                 lexer.setPosition (position);
!                 node = null;
              }
  
!         // build new end tag if required
!         if (null == node)
!         {
!             attribute = new Attribute ("/style", null);
!             vector = new Vector ();
!             vector.addElement (attribute);
!             node = lexer.getNodeFactory ().createTagNode (
!                 lexer.getPage (), position, position, vector);
          }
!         tag.setEndTag ((Tag)node);
!         if (null != content)
          {
!             tag.setChildren (new NodeList (content));
!             content.setParent (tag);
          }
+         node.setParent (tag);
+         tag.doSemanticAction ();
  
!         return (tag);
      }
  }

Index: ScriptScanner.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v
retrieving revision 1.61
retrieving revision 1.62
diff -C2 -d -r1.61 -r1.62
*** ScriptScanner.java	31 Jul 2004 16:42:32 -0000	1.61
--- ScriptScanner.java	7 Mar 2005 02:18:46 -0000	1.62
***************
*** 29,32 ****
--- 29,33 ----
  import java.util.Vector;
  
+ import org.htmlparser.Attribute;
  import org.htmlparser.Node;
  import org.htmlparser.NodeFactory;
***************
*** 35,39 ****
--- 36,42 ----
  import org.htmlparser.Tag;
  import org.htmlparser.Text;
+ import org.htmlparser.lexer.Cursor;
  import org.htmlparser.lexer.Lexer;
+ import org.htmlparser.lexer.Page;
  import org.htmlparser.scanners.ScriptDecoder;
  import org.htmlparser.tags.ScriptTag;
***************
*** 42,47 ****
  
  /**
!  * The ScriptScanner handles script code.
!  * It gathers all interior nodes into one undifferentiated string node.
   */
  public class ScriptScanner
--- 45,49 ----
  
  /**
!  * The ScriptScanner handles script CDATA.
   */
  public class ScriptScanner
***************
*** 58,67 ****
      /**
       * Scan for script.
!      * Accumulates nodes returned from the lexer, until &lt;/SCRIPT&gt;,
!      * &lt;BODY&gt; or &lt;HTML&gt; is encountered. Replaces the node factory
!      * in the lexer with a new (empty) one to avoid other scanners missing their 
!      * end tags and accumulating even the &lt;/SCRIPT&gt; tag.
       * @param tag The tag this scanner is responsible for.
!      * @param lexer The source of subsequent nodes.
       * @param stack The parse stack, <em>not used</em>.
       */
--- 60,66 ----
      /**
       * Scan for script.
!      * Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
       * @param tag The tag this scanner is responsible for.
!      * @param lexer The source of CDATA.
       * @param stack The parse stack, <em>not used</em>.
       */
***************
*** 70,88 ****
      {
          String language;
!         Node node;
!         boolean done;
          int position;
!         int startpos;
!         int endpos;
!         Tag end;
!         NodeFactory factory;
!         Text content;
!         Tag ret;
  
-         done = false;
-         startpos = lexer.getPosition ();
-         endpos = startpos;
-         end = null;
-         factory = lexer.getNodeFactory ();
          if (tag instanceof ScriptTag)
          {
--- 69,79 ----
      {
          String language;
!         String code;
!         Node content;
          int position;
!         Node node;
!         Attribute attribute;
!         Vector vector;
  
          if (tag instanceof ScriptTag)
          {
***************
*** 92,150 ****
                   language.equalsIgnoreCase ("VBScript.Encode")))
              {
!                 String code = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ());
                  ((ScriptTag)tag).setScriptCode (code);
-                 endpos = lexer.getPosition ();
              }
          }
!         lexer.setNodeFactory (new PrototypicalNodeFactory (true));
!         try
!         {
!             do
              {
!                 position = lexer.getPosition ();
!                 node = lexer.nextNode (true);
!                 if (null == node)
!                     done = true;
!                 else
!                     if (node instanceof Tag)
!                         if (   ((Tag)node).isEndTag ()
!                             && ((Tag)node).getTagName ().equals (tag.getIds ()[0]))
!                         {
!                             end = (Tag)node;
!                             done = true;
!                         }
!                         else if (isTagToBeEndedFor (tag, (Tag)node))
!                         {
!                             lexer.setPosition (position);
!                             done = true;
!                         }
!                         else
!                             // must be a string, even though it looks like a tag
!                             endpos = node.getEndPosition ();
!                     else if (node instanceof Remark)
!                         endpos = node.getEndPosition ();
!                     else // Text
!                         endpos = node.getEndPosition ();
              }
-             while (!done);
  
!             content = factory.createStringNode (lexer.getPage (), startpos, endpos);
!             // build new end tag if required
!             if (null == end)
!                 end = lexer.getNodeFactory ().createTagNode (
!                     lexer.getPage (), endpos, endpos, new Vector ());
!             ret = tag;
!             ret.setEndTag (end);
!             ret.setChildren (new NodeList (content));
!             content.setParent (ret);
!             end.setParent (ret);
!             ret.doSemanticAction ();
          }
!         finally
          {
!             lexer.setNodeFactory (factory);
          }
  
!         return (ret);
      }
  }
--- 83,120 ----
                   language.equalsIgnoreCase ("VBScript.Encode")))
              {
!                 code = ScriptDecoder.Decode (lexer.getPage (), lexer.getCursor ());
                  ((ScriptTag)tag).setScriptCode (code);
              }
          }
!         content = lexer.parseCDATA ();
!         position = lexer.getPosition ();
!         node = lexer.nextNode (false);
!         if (null != node)
!             if (!(node instanceof Tag) || !(   ((Tag)node).isEndTag ()
!                 && ((Tag)node).getTagName ().equals (tag.getIds ()[0])))
              {
!                 lexer.setPosition (position);
!                 node = null;
              }
  
!         // build new end tag if required
!         if (null == node)
!         {
!             attribute = new Attribute ("/script", null);
!             vector = new Vector ();
!             vector.addElement (attribute);
!             node = lexer.getNodeFactory ().createTagNode (
!                 lexer.getPage (), position, position, vector);
          }
!         tag.setEndTag ((Tag)node);
!         if (null != content)
          {
!             tag.setChildren (new NodeList (content));
!             content.setParent (tag);
          }
+         node.setParent (tag);
+         tag.doSemanticAction ();
  
!         return (tag);
      }
  }

[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners ScriptScanner.java,1.61,1.62 StyleScanner.java,1.38,1.39

[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners ScriptScanner.java,1.61,1.62 StyleScanner.java,1.38,1.39