Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser AbstractNode.java,1.24,1.25 Node.java,1.48,1.49 Remar

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31556

Modified Files:
	AbstractNode.java Node.java RemarkNode.java StringNode.java 
Log Message:
Part two of a multiphase refactoring. Part one added the Tag interface.
This submission eliminates some of the duplication between the lexer.nodes package
and the htmlparser package by removing the tag-specific signatures (visitTitleTag,
visitLinkTag and visitImageTag) from the NodeVisitor class. This allows the lexer to
return htmlparser-level classes for StringNode and RemarkNode. The TagNode is
still present in the lexer.nodes package, but will be moved next.
This means that classes derived from NodeVisitor that rely on the above signatures
*will not* work; instead, a check for the tag class (or name) should be performed in visitTag.
A document will be added to the visitors package with comprehensive porting instructions.

Index: StringNode.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v
retrieving revision 1.49
retrieving revision 1.50
diff -C2 -d -r1.49 -r1.50
*** StringNode.java	14 Jan 2004 02:53:46 -0000	1.49
--- StringNode.java	24 May 2004 00:38:15 -0000	1.50
***************
*** 1,5 ****
  // HTMLParser Library $Name$ - A java-based parser for HTML
  // http://sourceforge.org/projects/htmlparser
! // Copyright (C) 2004 Somik Raha
  //
  // Revision Control Information
--- 1,5 ----
  // HTMLParser Library $Name$ - A java-based parser for HTML
  // http://sourceforge.org/projects/htmlparser
! // Copyright (C) 2004 Derrick Oswald
  //
  // Revision Control Information
***************
*** 27,42 ****
  package org.htmlparser;

  import org.htmlparser.lexer.Page;
  import org.htmlparser.visitors.NodeVisitor;

  /**
!  * Normal text in the html document is identified and represented by this class.
   */
! public class StringNode
!     extends
!         org.htmlparser.lexer.nodes.StringNode
  {
      /**
!      * Constructor takes in the text string, beginning and ending posns.
       * @param page The page this string is on.
       * @param start The beginning position of the string.
--- 27,60 ----
  package org.htmlparser;

+ import org.htmlparser.AbstractNode;
+ import org.htmlparser.lexer.Cursor;
  import org.htmlparser.lexer.Page;
+ import org.htmlparser.util.NodeList;
+ import org.htmlparser.util.ParserException;
  import org.htmlparser.visitors.NodeVisitor;

  /**
!  * Normal text in the HTML document is represented by this class.
   */
! public class StringNode extends AbstractNode
  {
      /**
!      * The contents of the string node, or override text.
!      */
!     protected String mText;
! 
!     /**
!      * Constructor takes in the text string.
!      * @param text The string node text. For correct generation of HTML, this
!      * should not contain representations of tags (unless they are balanced).
!      */
!     public StringNode (String text)
!     {
!         super (null, 0, 0);
!         setText (text);
!     }
! 
!     /**
!      * Constructor takes in the page and beginning and ending posns.
       * @param page The page this string is on.
       * @param start The beginning position of the string.
***************
*** 46,49 ****
--- 64,198 ----
      {
          super (page, start, end);
+         mText = null;
+     }
+ 
+     /**
+      * Returns the text of the string line.
+      */
+     public String getText ()
+     {
+         return (toHtml ());
+     }
+ 
+     /**
+      * Sets the string contents of the node.
+      * @param text The new text for the node.
+      */
+     public void setText (String text)
+     {
+         mText = text;
+         nodeBegin = 0;
+         nodeEnd = mText.length ();
+     }
+ 
+     public String toPlainTextString ()
+     {
+         return (toHtml ());
+     }
+ 
+     public String toHtml ()
+     {
+         String ret;
+         
+         ret = mText;
+         if (null == ret)
+             ret = mPage.getText (getStartPosition (), getEndPosition ());
+ 
+         return (ret);
+     }
+ 
+     /**
+      * Express this string node as a printable string
+      * This is suitable for display in a debugger or output to a printout.
+      * Control characters are replaced by their equivalent escape
+      * sequence and contents is truncated to 80 characters.
+      * @return A string representation of the string node.
+      */
+     public String toString ()
+     {
+         int startpos;
+         int endpos;
+         Cursor start;
+         Cursor end;
+         char c;
+         StringBuffer ret;
+ 
+         startpos = getStartPosition ();
+         endpos = getEndPosition ();
+         ret = new StringBuffer (endpos - startpos + 20);
+         if (null == mText)
+         {
+             start = new Cursor (getPage (), startpos);
+             end = new Cursor (getPage (), endpos);
+             ret.append ("Txt (");
+             ret.append (start);
+             ret.append (",");
+             ret.append (end);
+             ret.append ("): ");
+             while (start.getPosition () < endpos)
+             {
+                 try
+                 {
+                     c = mPage.getCharacter (start);
+                     switch (c)
+                     {
+                         case '\t':
+                             ret.append ("\\t");
+                             break;
+                         case '\n':
+                             ret.append ("\\n");
+                             break;
+                         case '\r':
+                             ret.append ("\\r");
+                             break;
+                         default:
+                             ret.append (c);
+                     }
+                 }
+                 catch (ParserException pe)
+                 {
+                     // not really expected, but we're only doing toString, so ignore
+                 }
+                 if (77 <= ret.length ())
+                 {
+                     ret.append ("...");
+                     break;
+                 }
+             }
+         }
+         else
+         {
+             ret.append ("Txt (");
+             ret.append (startpos);
+             ret.append (",");
+             ret.append (endpos);
+             ret.append ("): ");
+             while (startpos < endpos)
+             {
+                 c = mText.charAt (startpos);
+                 switch (c)
+                 {
+                     case '\t':
+                         ret.append ("\\t");
+                         break;
+                     case '\n':
+                         ret.append ("\\n");
+                         break;
+                     case '\r':
+                         ret.append ("\\r");
+                         break;
+                     default:
+                         ret.append (c);
+                 }
+                 if (77 <= ret.length ())
+                 {
+                     ret.append ("...");
+                     break;
+                 }
+                 startpos++;
+             }
+         }
+ 
+         return (ret.toString ());
      }

***************
*** 53,59 ****
       * <code>visitStringNode()</code> on.
       */
!     public void accept (Object visitor)
      {
!         ((NodeVisitor)visitor).visitStringNode (this);
      }
  }
--- 202,208 ----
       * <code>visitStringNode()</code> on.
       */
!     public void accept (NodeVisitor visitor)
      {
!         visitor.visitStringNode (this);
      }
  }

Index: RemarkNode.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** RemarkNode.java	14 Jan 2004 02:53:46 -0000	1.41
--- RemarkNode.java	24 May 2004 00:38:15 -0000	1.42
***************
*** 1,5 ****
  // HTMLParser Library $Name$ - A java-based parser for HTML
  // http://sourceforge.org/projects/htmlparser
! // Copyright (C) 2004 Somik Raha
  //
  // Revision Control Information
--- 1,5 ----
  // HTMLParser Library $Name$ - A java-based parser for HTML
  // http://sourceforge.org/projects/htmlparser
! // Copyright (C) 2004 Derrick Oswald
  //
  // Revision Control Information
***************
*** 27,31 ****
--- 27,35 ----
  package org.htmlparser;

+ import org.htmlparser.AbstractNode;
+ import org.htmlparser.lexer.Cursor;
  import org.htmlparser.lexer.Page;
+ import org.htmlparser.util.NodeList;
+ import org.htmlparser.util.ParserException;
  import org.htmlparser.visitors.NodeVisitor;

***************
*** 33,49 ****
   * The remark tag is identified and represented by this class.
   */
! public class RemarkNode
!     extends
!         org.htmlparser.lexer.nodes.RemarkNode
  {
      /**
!      * Constructor takes in the text string, beginning and ending posns.
!      * @param page The page this string is on.
!      * @param start The beginning position of the string.
!      * @param end The ending positiong of the string.
       */
      public RemarkNode (Page page, int start, int end)
      {
          super (page, start, end);
      }

--- 37,228 ----
   * The remark tag is identified and represented by this class.
   */
! public class RemarkNode extends AbstractNode
  {
      /**
!      * The contents of the remark node, or override text.
!      */
!     protected String mText;
! 
!     /**
!      * Constructor takes in the text string.
!      * @param text The string node text. For correct generation of HTML, this
!      * should not contain representations of tags (unless they are balanced).
!      */
!     public RemarkNode (String text)
!     {
!         super (null, 0, 0);
!         setText (text);
!     }
! 
!     /**
!      * Constructor takes in the page and beginning and ending posns.
!      * @param page The page this remark is on.
!      * @param start The beginning position of the remark.
!      * @param end The ending positiong of the remark.
       */
      public RemarkNode (Page page, int start, int end)
      {
          super (page, start, end);
+         mText = null;
+     }
+ 
+     /**
+      * Returns the text contents of the comment tag.
+      * @return The contents of the text inside the comment delimiters.
+      */
+     public String getText()
+     {
+         int start;
+         int end;
+         String ret;
+ 
+         if (null == mText)
+         {
+             start = getStartPosition () + 4; // <!--
+             end = getEndPosition () - 3; // -->
+             if (start >= end)
+                 ret = "";
+             else
+                 ret = mPage.getText (start, end);
+         }
+         else
+             ret = mText;
+ 
+         return (ret);
+     }
+ 
+     /**
+      * Sets the string contents of the node.
+      * If the text has the remark delimiters (&lt;!-- --&gt;), these are stripped off.
+      * @param text The new text for the node.
+      */
+     public void setText (String text)
+     {
+         mText = text;
+         if (text.startsWith ("<!--") && text.endsWith ("-->"))
+             mText = text.substring (4, text.length () - 3);
+         nodeBegin = 0;
+         nodeEnd = mText.length ();
+     }
+ 
+     public String toPlainTextString()
+     {
+         return (getText());
+     }
+     
+     public String toHtml()
+     {
+         StringBuffer buffer;
+         String ret;
+         
+         if (null == mText)
+             ret = mPage.getText (getStartPosition (), getEndPosition ());
+         else
+         {
+             buffer = new StringBuffer (mText.length () + 7);
+             buffer.append ("<!--");
+             buffer.append (mText);
+             buffer.append ("-->");
+             ret = buffer.toString ();
+         }
+ 
+         return (ret);
+     }
+ 
+     /**
+      * Print the contents of the remark tag.
+      * This is suitable for display in a debugger or output to a printout.
+      * Control characters are replaced by their equivalent escape
+      * sequence and contents is truncated to 80 characters.
+      * @return A string representation of the remark node.
+      */
+     public String toString()
+     {
+         int startpos;
+         int endpos;
+         Cursor start;
+         Cursor end;
+         char c;
+         StringBuffer ret;
+ 
+         startpos = getStartPosition ();
+         endpos = getEndPosition ();
+         ret = new StringBuffer (endpos - startpos + 20);
+         if (null == mText)
+         {
+             start = new Cursor (getPage (), startpos);
+             end = new Cursor (getPage (), endpos);
+             ret.append ("Rem (");
+             ret.append (start);
+             ret.append (",");
+             ret.append (end);
+             ret.append ("): ");
+             start.setPosition (startpos + 4); // <!--
+             endpos -= 3; // -->
+             while (start.getPosition () < endpos)
+             {
+                 try
+                 {
+                     c = mPage.getCharacter (start);
+                     switch (c)
+                     {
+                         case '\t':
+                             ret.append ("\\t");
+                             break;
+                         case '\n':
+                             ret.append ("\\n");
+                             break;
+                         case '\r':
+                             ret.append ("\\r");
+                             break;
+                         default:
+                             ret.append (c);
+                     }
+                 }
+                 catch (ParserException pe)
+                 {
+                     // not really expected, but we're only doing toString, so ignore
+                 }
+                 if (77 <= ret.length ())
+                 {
+                     ret.append ("...");
+                     break;
+                 }
+             }
+         }
+         else
+         {
+             ret.append ("Rem (");
+             ret.append (startpos);
+             ret.append (",");
+             ret.append (endpos);
+             ret.append ("): ");
+             while (startpos < endpos)
+             {
+                 c = mText.charAt (startpos);
+                 switch (c)
+                 {
+                     case '\t':
+                         ret.append ("\\t");
+                         break;
+                     case '\n':
+                         ret.append ("\\n");
+                         break;
+                     case '\r':
+                         ret.append ("\\r");
+                         break;
+                     default:
+                         ret.append (c);
+                 }
+                 if (77 <= ret.length ())
+                 {
+                     ret.append ("...");
+                     break;
+                 }
+                 startpos++;
+             }
+         }
+ 
+         return (ret.toString ());
      }

***************
*** 53,59 ****
       * <code>visitRemarkNode()</code> on.
       */
!     public void accept (Object visitor)
      {
!         ((NodeVisitor)visitor).visitRemarkNode (this);
      }
  }
--- 232,238 ----
       * <code>visitRemarkNode()</code> on.
       */
!     public void accept (NodeVisitor visitor)
      {
!         visitor.visitRemarkNode (this);
      }
  }

Index: Node.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Node.java,v
retrieving revision 1.48
retrieving revision 1.49
diff -C2 -d -r1.48 -r1.49
*** Node.java	2 Jan 2004 16:24:52 -0000	1.48
--- Node.java	24 May 2004 00:38:15 -0000	1.49
***************
*** 29,32 ****
--- 29,33 ----
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;
+ import org.htmlparser.visitors.NodeVisitor;

  public interface Node
***************
*** 135,139 ****
       * Apply the visitor object (of type NodeVisitor) to this node.
       */
!     public abstract void accept(Object visitor);

      /**
--- 136,140 ----
       * Apply the visitor object (of type NodeVisitor) to this node.
       */
!     public abstract void accept (NodeVisitor visitor);

      /**

Index: AbstractNode.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/AbstractNode.java,v
retrieving revision 1.24
retrieving revision 1.25
diff -C2 -d -r1.24 -r1.25
*** AbstractNode.java	2 Jan 2004 16:24:52 -0000	1.24
--- AbstractNode.java	24 May 2004 00:38:15 -0000	1.25
***************
*** 32,35 ****
--- 32,36 ----
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;
+ import org.htmlparser.visitors.NodeVisitor;

  /**
***************
*** 219,223 ****
      }

!     public abstract void accept(Object visitor);

      /**
--- 220,224 ----
      }

!     public abstract void accept (NodeVisitor visitor);

      /**

Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser AbstractNode.java,1.24,1.25 Node.java,1.48,1.49 Remar

htmlparser-cvs