[Htmlparser-cvs] htmlparser/src/org/htmlparser/visitors HtmlPage.java,1.38,1.39 NodeVisitor.java,1.33,1.34 UrlModifyingVisitor.java,1.39,1.40

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors
In directory sc8-pr-cvs1:/tmp/cvs-serv16537/visitors

Modified Files:
	HtmlPage.java NodeVisitor.java UrlModifyingVisitor.java 
Log Message:
Remove most of the scanners.
The only scanners left are ones that really do something different (script and jsp).
Instead of registering a scanner to enable returning a specific tag, you now add a
tag to a PrototypicalNodeFactory. All known tags are 'registered' by default
in a new Parser, which is similar to having called the old 'registerDOMScanners()',
so tags are fully nested. This is different behaviour, and specifically,
you will need to recurse into returned nodes to get at what you want.
I've tried to adjust the applications accordingly, but worked examples are still scarce.
If you want to return only some of the derived tags while keeping most as generic tags,
there are various constructors and manipulators on the factory. See the javadocs
and examples in the tests package.
Nearly all the old scanner tests are folded into the tag tests.

toString() has been revamped.
This means that the default Parser mainline now returns an indented listing of tags,
making it easy to see the structure of a page. The downside is the text of the page
had to have newlines, tabs etc. turned into escape sequences. But if you were really
interested in content you would be using toHtml() or toPlainTextString().

Index: HtmlPage.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/HtmlPage.java,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** HtmlPage.java	9 Nov 2003 17:07:17 -0000	1.38
--- HtmlPage.java	7 Dec 2003 23:41:43 -0000	1.39
***************
*** 33,37 ****
  import org.htmlparser.RemarkNode;
  import org.htmlparser.StringNode;
! import org.htmlparser.scanners.TableScanner;
  import org.htmlparser.tags.TableTag;
  import org.htmlparser.tags.Tag;
--- 33,37 ----
  import org.htmlparser.RemarkNode;
  import org.htmlparser.StringNode;
! import org.htmlparser.tags.BodyTag;
  import org.htmlparser.tags.TableTag;
  import org.htmlparser.tags.Tag;
***************
*** 43,55 ****
      private NodeList nodesInBody;
      private NodeList tables;
-     private boolean bodyTagBegin;

      public HtmlPage(Parser parser) {
!         super(false);
!         parser.registerScanners();
!         parser.addScanner(new TableScanner(parser));
          nodesInBody = new NodeList();
          tables = new NodeList();
-         bodyTagBegin = false;
      }

--- 43,52 ----
      private NodeList nodesInBody;
      private NodeList tables;

      public HtmlPage(Parser parser) {
!         super(true);
!         title = "";
          nodesInBody = new NodeList();
          tables = new NodeList();
      }

***************
*** 64,104 ****
      public void visitTag(Tag tag)
      {
!         addTagToBodyIfApplicable(tag);
! 
!         if (isTable(tag)) {
              tables.add(tag);
!         }
!         else {
!             if (isBodyTag(tag))
!                 bodyTagBegin = true;
!         }
      }

!     public void visitEndTag(Tag tag)
      {
!         if (isBodyTag(tag))
!             bodyTagBegin = false;
!         addTagToBodyIfApplicable(tag);
!     }
! 
!     private boolean isTable(Tag tag) {
!         return tag instanceof TableTag;
!     }
! 
!     private void addTagToBodyIfApplicable(Node node) {
!         if (bodyTagBegin)
!             nodesInBody.add(node);
!     }
! 
!     public void visitRemarkNode(RemarkNode remarkNode) {
!         addTagToBodyIfApplicable(remarkNode);
!     }
! 
!     public void visitStringNode(StringNode stringNode) {
!         addTagToBodyIfApplicable(stringNode);
      }

!     private boolean isBodyTag(Tag tag) {
!         return tag.getTagName().equals("BODY");
      }

--- 61,78 ----
      public void visitTag(Tag tag)
      {
!         if (isTable(tag))
              tables.add(tag);
!         else if (isBodyTag(tag))
!             nodesInBody = tag.getChildren ();
      }

!     private boolean isTable(Tag tag)
      {
!         return (tag instanceof TableTag);
      }

!     private boolean isBodyTag(Tag tag)
!     {
!         return (tag instanceof BodyTag);
      }

***************
*** 107,122 ****
      }

!     public TableTag [] getTables() {
          TableTag [] tableArr = new TableTag[tables.size()];
!         for (int i=0;i<tables.size();i++)
!             tableArr[i] = (TableTag)tables.elementAt(i);
          return tableArr;
      }

! 
! 
!     public void visitTitleTag(TitleTag titleTag) {
          title = titleTag.getTitle();
      }
- 
  }
--- 81,94 ----
      }

!     public TableTag [] getTables()
!     {
          TableTag [] tableArr = new TableTag[tables.size()];
!         tables.copyToNodeArray (tableArr);
          return tableArr;
      }

!     public void visitTitleTag(TitleTag titleTag)
!     {
          title = titleTag.getTitle();
      }
  }

Index: NodeVisitor.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/NodeVisitor.java,v
retrieving revision 1.33
retrieving revision 1.34
diff -C2 -d -r1.33 -r1.34
*** NodeVisitor.java	9 Nov 2003 17:07:18 -0000	1.33
--- NodeVisitor.java	7 Dec 2003 23:41:43 -0000	1.34
***************
*** 67,71 ****
   *     {
   *         Parser parser = new Parser ("http://cbc.ca");
-  *         parser.registerScanners ();
   *         Visitor visitor = new Visitor ();
   *         parser.visitAllNodesWith (visitor);
--- 67,70 ----

Index: UrlModifyingVisitor.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/UrlModifyingVisitor.java,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** UrlModifyingVisitor.java	9 Nov 2003 17:07:18 -0000	1.39
--- UrlModifyingVisitor.java	7 Dec 2003 23:41:43 -0000	1.40
***************
*** 34,39 ****
  import org.htmlparser.Parser;
  import org.htmlparser.StringNode;
- import org.htmlparser.scanners.ImageScanner;
- import org.htmlparser.scanners.LinkScanner;
  import org.htmlparser.tags.CompositeTag;
  import org.htmlparser.tags.ImageTag;
--- 34,37 ----
***************
*** 49,54 ****
          super(true,true);
          this.parser = parser;
-         parser.addScanner(new LinkScanner());
-         parser.addScanner(new ImageScanner(ImageTag.IMAGE_TAG_FILTER));
          this.linkPrefix =linkPrefix;
          modifiedResult = new StringBuffer();
--- 47,50 ----
***************
*** 82,89 ****

          parent = tag.getParent ();
          if (null == parent)
              modifiedResult.append(tag.toHtml());
          else
!             modifiedResult.append(parent.toHtml());
      }

--- 78,89 ----

          parent = tag.getParent ();
+         // process only those nodes not processed by a parent
          if (null == parent)
+             // an orphan end tag
              modifiedResult.append(tag.toHtml());
          else
!             if (null == parent.getParent ())
!                 // a top level tag with no parents
!                 modifiedResult.append(parent.toHtml());
      }

[Htmlparser-cvs] htmlparser/src/org/htmlparser/visitors HtmlPage.java,1.38,1.39 NodeVisitor.java,1.33,1.34 UrlModifyingVisitor.java,1.39,1.40

[Htmlparser-cvs] htmlparser/src/org/htmlparser/visitors HtmlPage.java,1.38,1.39 NodeVisitor.java,1.33,1.34 UrlModifyingVisitor.java,1.39,1.40