[Htmlparser-cvs] htmlparser/docs/docs CustomTagLinks.html,NONE,1.1 CustomVisitorLinks.html,NONE,1.1

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/docs/docs
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26789/docs/docs

Modified Files:
	CustomTagExtraction.html EmailExtraction.html 
	FactoryMethod.html ImageExtraction.html LinkExtraction.html 
	PostOperation.html ReverseHtml.html SamplePrograms.html 
	SearchingForData.html StringExtraction.html 
	TemplateMethod.html WritingYourOwnScanners.html index.html 
Added Files:
	CustomTagLinks.html CustomVisitorLinks.html FilterLinks.html 
	LexerLinks.html LinkBeanLinks.html VisitorLinks.html 
Log Message:
Update version to 1.4-20040125

--- NEW FILE: CustomTagLinks.html ---
<html><head><title>Custom Tag Links</title></head><body>

<div class="wikitext">
<p><b>Using Custom Tags to Extract Links</b>

<p>The use of custom tags provides for altered behaviour during the parse:

<pre>
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.ParserException;

/**
 * Link tag whose semantic action prints the link it represents.
 */
class MyLinkTag extends LinkTag {
    /**
     * Writes the link text wrapped in double quotes, a separator, and then
     * the link itself, all on one line of standard output.
     *
     * @throws ParserException declared by the method signature
     */
    public void doSemanticAction() throws ParserException {
        String label = "\"" + getLinkText() + "\" =&gt; ";
        System.out.print(label);
        System.out.println(getLink());
    }
}

/**
 * Link extraction demo that relies on a custom tag registered with the
 * parser's node factory.
 */
public class LinkDemo {
    /**
     * Installs a factory that knows about MyLinkTag and then iterates over
     * every node the parser produces.
     *
     * @param args command line arguments (not used)
     * @throws ParserException propagated from the parser calls
     */
    public static void main(String[] args) throws ParserException {
        Parser parser = new Parser("http://urlIWantToParse.com");
        PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
        nodeFactory.registerTag(new MyLinkTag());
        parser.setNodeFactory(nodeFactory);

        NodeIterator nodes = parser.elements();
        while (nodes.hasMoreNodes()) {
            // just parsing the nodes executes doSemanticAction
            nodes.nextNode();
        }
    }
}

</pre>
<div id="actionbar" class="toolbar">

<hr class="printer" noshade="noshade" />

<p class="editdate">Last edited on Tuesday, January 13, 2004  5:39:34 pm.

<hr class="toolbar" noshade="noshade" />
</body></html>
--- NEW FILE: CustomVisitorLinks.html ---
<html><head><title>Custom Visitor Links</title></head><body>

<div class="wikitext">
<p><b>Using a Custom Visitor to Extract Links</b>

<p>Creating a custom visitor is more powerful than just the processing of links demonstrated here:

<pre>
import org.htmlparser.Parser;
import org.htmlparser.RemarkNode;
import org.htmlparser.StringNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;

/**
 * Visitor that prints every link tag it is shown and leaves hooks for the
 * other node kinds.
 */
class MyCustomizedVisitor extends NodeVisitor {
    /**
     * Creates the visitor, asking the base class to recurse into children.
     */
    public MyCustomizedVisitor() {
        super(true);
    }

    /**
     * Handles a tag; link tags are printed as quoted text, a separator and
     * the link, other tags are ignored.
     *
     * @param tag the tag being visited
     */
    public void visitTag(Tag tag) {
        // process tags here
        if (!(tag instanceof LinkTag)) {
            return;
        }
        LinkTag link = (LinkTag) tag;
        String label = "\"" + link.getLinkText() + "\" =&gt; ";
        System.out.print(label);
        System.out.println(link.getLink());
    }

    /**
     * Hook for text found in the page; intentionally empty in this example.
     *
     * @param stringNode the text node being visited
     */
    public void visitStringNode(StringNode stringNode) {
        // process text in the page here
    }

    /**
     * Hook for end tags; intentionally empty in this example.
     *
     * @param endTag the end tag being visited
     */
    public void visitEndTag(Tag endTag) {
        // process end tags here,
        // checking for end tags can be useful when performing
        // more involved page processing
    }

    /**
     * Hook for remark nodes; intentionally empty in this example.
     *
     * @param remarkNode the remark node being visited
     */
    public void visitRemarkNode(RemarkNode remarkNode) {
        // process remark nodes here
    }
}

/**
 * Link extraction demo driven by a custom visitor.
 */
public class LinkDemo {
    /**
     * Creates a parser for the page and lets a MyCustomizedVisitor visit
     * all of its nodes.
     *
     * @param args command line arguments (not used)
     * @throws ParserException propagated from the parser calls
     */
    public static void main(String[] args) throws ParserException {
        new Parser("http://urlIWantToParse.com")
            .visitAllNodesWith(new MyCustomizedVisitor());
    }
}

</pre>
<div id="actionbar" class="toolbar">

<hr class="printer" noshade="noshade" />

<p class="editdate">Last edited on Wednesday, January  7, 2004  5:24:34 pm.

<hr class="toolbar" noshade="noshade" />
</body></html>
--- NEW FILE: FilterLinks.html ---
<html><head><title>Filter Links</title></head><body>

<div class="wikitext">
<p><b>Using a NodeFilter to Extract Links</b>

<p>The filter capability is much more powerful than the simple link extraction illustrated here:

<pre>
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Link extraction demo that collects link tags through a NodeFilter.
 */
public class LinkDemo {
    /**
     * Gathers every node accepted by a LinkTag class filter into a list and
     * prints each one as quoted text, a separator and the link.
     *
     * @param args command line arguments (not used)
     * @throws ParserException propagated from the parser calls
     */
    public static void main(String[] args) throws ParserException {
        Parser parser = new Parser("http://urlIWantToParse.com");
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeList collected = new NodeList();

        NodeIterator nodes = parser.elements();
        while (nodes.hasMoreNodes()) {
            nodes.nextNode().collectInto(collected, linkFilter);
        }

        int index = 0;
        while (index &lt; collected.size()) {
            LinkTag link = (LinkTag) collected.elementAt(index);
            String label = "\"" + link.getLinkText() + "\" =&gt; ";
            System.out.print(label);
            System.out.println(link.getLink());
            index++;
        }
    }
}

</pre>

<p>In fact, this is so useful that there is a convenience method to apply a NodeClassFilter directly from the parser:

<pre>
import org.htmlparser.Parser;
import org.htmlparser.util.ParserException;
import org.htmlparser.Node;
import org.htmlparser.tags.LinkTag;

/**
 * Link extraction demo using the parser's class-based extraction method.
 */
public class LinkDemo {
    /**
     * Asks the parser for all LinkTag nodes and prints each one as quoted
     * text, a separator and the link.
     *
     * @param args command line arguments (not used)
     * @throws ParserException propagated from the parser calls
     */
    public static void main(String[] args) throws ParserException {
        Parser parser = new Parser("http://urlIWantToParse.com");
        Node[] found = parser.extractAllNodesThatAre(LinkTag.class);

        int index = 0;
        while (index &lt; found.length) {
            LinkTag link = (LinkTag) found[index];
            String label = "\"" + link.getLinkText() + "\" =&gt; ";
            System.out.print(label);
            System.out.println(link.getLink());
            index++;
        }
    }
}

</pre>
<div id="actionbar" class="toolbar">

<hr class="printer" noshade="noshade" />

<p class="editdate">Last edited on Wednesday, January  7, 2004  4:48:39 pm.

<hr class="toolbar" noshade="noshade" />
</body></html>
--- NEW FILE: LexerLinks.html ---
<html><head><title>Lexer Links</title></head><body>

<div class="wikitext">
<p><b>Using a Lexer to Extract Links</b>

<p>If you are after raw link text only, then you can use a Lexer to access the links:

<pre>
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.nodes.TagNode;
import org.htmlparser.util.ParserException;

public class LinkDemo
{
    public static void main (String[] args) throws ParserException, IOException
    {
        Node node;

        URL url = new URL ("http://urlIWantToParse.com");
        URLConnection connection = url.openConnection ();
        Lexer lexer = new Lexer (connection);
        while (null != (node = lexer.nextNode ()))
            if (node instanceof TagNode)
            {
                TagNode tag = (TagNode)node;
                if (tag.getTagName ().equals ("A") &amp;&amp; !tag.isEndTag ())
                {
                    String href = tag.getAttribute ("href");
                    if (null != href)
                        System.out.println (href);
                }
            }
     }
}

</pre>
<div id="actionbar" class="toolbar">

<hr class="printer" noshade="noshade" />

<p class="editdate">Last edited on Thursday, January  8, 2004  4:06:57 am.

<hr class="toolbar" noshade="noshade" />
</body></html>
--- NEW FILE: LinkBeanLinks.html ---
<html><head><title>Link Bean Links</title></head><body>

<div class="wikitext">
<p><b>Using a LinkBean to Extract Links</b>

<p>A LinkBean is a pretty easy way to get just the links:

<pre>
import java.net.URL;
import org.htmlparser.beans.LinkBean;

/**
 * Link extraction demo built on LinkBean.
 */
public class LinkDemo {
    /**
     * Points a LinkBean at the page and prints every URL it returns, one
     * per line.
     *
     * @param args command line arguments (not used)
     */
    public static void main(String[] args) {
        LinkBean bean = new LinkBean();
        bean.setURL("http://urlIWantToParse.com");
        URL[] found = bean.getLinks();

        int index = 0;
        while (index &lt; found.length) {
            System.out.println(found[index]);
            index++;
        }
    }
}

</pre>
<div id="actionbar" class="toolbar">

<hr class="printer" noshade="noshade" />

<p class="editdate">Last edited on Wednesday, January  7, 2004  4:10:21 pm.

<hr class="toolbar" noshade="noshade" />
</body></html>
--- NEW FILE: VisitorLinks.html ---
<html><head><title>Visitor Links</title></head><body>

<div class="wikitext">
<p><b>Using an ObjectFindingVisitor to Extract Links</b>

<p>A visitor visits all nodes, and an ObjectFindingVisitor is designed to find one specific class of nodes, in this case LinkTag tags:

<pre>
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.ObjectFindingVisitor;

/**
 * Link extraction demo using an ObjectFindingVisitor.
 */
public class LinkDemo {
    /**
     * Lets an ObjectFindingVisitor for LinkTag visit all nodes of the page,
     * then prints each tag it found as quoted text, a separator and the link.
     *
     * @param args command line arguments (not used)
     * @throws ParserException propagated from the parser calls
     */
    public static void main(String[] args) throws ParserException {
        Parser parser = new Parser("http://urlIWantToParse.com");
        ObjectFindingVisitor finder = new ObjectFindingVisitor(LinkTag.class);
        parser.visitAllNodesWith(finder);

        Node[] found = finder.getTags();
        int index = 0;
        while (index &lt; found.length) {
            LinkTag link = (LinkTag) found[index];
            String label = "\"" + link.getLinkText() + "\" =&gt; ";
            System.out.print(label);
            System.out.println(link.getLink());
            index++;
        }
    }
}

</pre>
<div id="actionbar" class="toolbar">

<hr class="printer" noshade="noshade" />

<p class="editdate">Last edited on Wednesday, January  7, 2004  4:09:50 pm.

<hr class="toolbar" noshade="noshade" />
</body></html>
Index: CustomTagExtraction.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/CustomTagExtraction.html,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** CustomTagExtraction.html	9 Nov 2003 17:07:07 -0000	1.7
--- CustomTagExtraction.html	26 Jan 2004 01:02:09 -0000	1.8
***************
*** 6,25 ****
  <p><b>Custom Tag Extraction

! Custom tag extraction is easy. Simply create an array of tag names that you want to extract from a page, and pass it in to <a href="TagFindingVisitor.html" class="wiki">TagFindingVisitor</a>, like so :

  <pre>
! Parser parser = new Parser(..);
! String [] tagsToBeFound = {"P","BR","MYTAG"};
! TagFindingVisitor visitor = new TagFindingVisitor(tagsToBeFound);
! parser.visitAllNodesWith(visitor);
! // First tag specified in search
! Node [] allPTags = visitor.getTags(0);
! // Second tag specified in search
! Node [] allBRTags = visitor.getTags(1);
! // Third tag specified in search
! Node [] allMyTags = visitor.getTags(2);

! <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a>
! // Just a test of wiki

--- 6,33 ----
  <p><b>Custom Tag Extraction

! Custom tag extraction is easy. Simply create an array of tag names that you want to extract from a page, and pass it in to a TagFindingVisitor, like so:

  <pre>
! import org.htmlparser.Node;
! import org.htmlparser.Parser;
! import org.htmlparser.util.ParserException;
! import org.htmlparser.visitors.TagFindingVisitor;

! public class CustomTagDemo
! {
!     public static void main (String[] args) throws ParserException
!     {
!         Parser parser = new Parser ("http://urlIWantToParse.com");
!         String [] tagsToBeFound = {"P","BR","MYTAG"};
!         TagFindingVisitor visitor = new TagFindingVisitor (tagsToBeFound);
!         parser.visitAllNodesWith (visitor);
!         // First tag specified in search
!         Node [] allPTags = visitor.getTags(0);
!         // Second tag specified in search
!         Node [] allBRTags = visitor.getTags(1);
!         // Third tag specified in search
!         Node [] allMyTags = visitor.getTags(2);
!     }
! }

***************
*** 29,33 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, April  2, 2003  1:38:24 pm.

  <hr class="toolbar" noshade="noshade" />
--- 37,41 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, January  7, 2004  6:22:39 pm.

  <hr class="toolbar" noshade="noshade" />

Index: EmailExtraction.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/EmailExtraction.html,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** EmailExtraction.html	9 Nov 2003 17:07:07 -0000	1.5
--- EmailExtraction.html	26 Jan 2004 01:02:09 -0000	1.6
***************
*** 6,24 ****
  <p><b>Email Extraction

! This is very similar to link extraction. You have to extract links from a page and verify that they are email addresses. Link tags have a method - isMailLink()

  <pre>
!    Parser parser = new Parser(..);
!    parser.registerScanners();
!    Node links [] = parser.extractAllNodesThatAre(LinkTag.class);
!    for (int i=0;i&lt;links.length;i++) {
!      LinkTag linkTag = links[i];
!      if (linkTag[i].isMailLink()) {
!         // Yes, its an email id
!         System.out.println("Email address: "+linkTag.getLink());
!      }
!    }

! <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a>, February 16, 2003 11:41 am

--- 6,48 ----
  <p><b>Email Extraction

! This is very similar to <a href="LinkExtraction.html" class="named-wiki" title="LinkExtraction">link extraction</a>. You have to extract links from a page and verify that they are email addresses. Link tags have a method - isMailLink() to check if the HREF starts with "mailto:". Using an inner class in the NodeFilter example:

  <pre>
! import org.htmlparser.Node;
! import org.htmlparser.NodeFilter;
! import org.htmlparser.Parser;
! import org.htmlparser.tags.LinkTag;
! import org.htmlparser.util.NodeIterator;
! import org.htmlparser.util.NodeList;
! import org.htmlparser.util.ParserException;

! public class EmailLinkDemo
! {
!     public static void main (String[] args) throws ParserException
!     {
!         Parser parser = new Parser ("http://urlIWantToParse.com");
!         NodeFilter filter = new NodeFilter ()
!         {
!             /**
!              * Accept nodes that are mail links.
!              * @param node The node to check.
!              */
!             public boolean accept (Node node)
!             {
!                  return (LinkTag.class.isAssignableFrom (node.getClass ())
!                     &amp;&amp; ((LinkTag)node).isMailLink ());
!             }
!         };
!         NodeList links = new NodeList ();
!         for (NodeIterator e = parser.elements (); e.hasMoreNodes (); )
!             e.nextNode ().collectInto (links, filter);
!         for (int i = 0; i &lt; links.size (); i++)
!         {
!             LinkTag linkTag = (LinkTag)links.elementAt (i);
!             System.out.print ("\"" + linkTag.getLinkText () + "\" =&gt; ");
!             System.out.println (linkTag.getLink ());
!         }
!     }
! }

***************
*** 28,32 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Sunday, February 23, 2003  5:24:25 pm.

  <hr class="toolbar" noshade="noshade" />
--- 52,56 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, January  7, 2004  5:26:12 pm.

  <hr class="toolbar" noshade="noshade" />

Index: FactoryMethod.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/FactoryMethod.html,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** FactoryMethod.html	9 Nov 2003 17:07:07 -0000	1.6
--- FactoryMethod.html	26 Jan 2004 01:02:09 -0000	1.7
***************
*** 6,10 ****
  <p><b>Factory Method

! <a href="TagScanner.html" class="wiki">TagScanner</a> possess an FM for the creation of a tag.

  <pre>
--- 6,10 ----
  <p><b>Factory Method

! TagScanner possess an FM for the creation of a tag.

  <pre>

Index: ImageExtraction.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/ImageExtraction.html,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** ImageExtraction.html	9 Nov 2003 17:07:07 -0000	1.6
--- ImageExtraction.html	26 Jan 2004 01:02:09 -0000	1.7
***************
*** 4,47 ****

  <div class="wikitext">
! <p><b>Image Extractions
! 
! <p>This is very similar to <a href="LinkExtraction.html" class="wiki">LinkExtraction</a>.
! 
! <p>1. Use the <i><span class="wikiunknown"><u>ObjectFindingVisitor like so :
! 
! <pre>
! Parser parser = new Parser("http://urlIWantToParse.com");
!    // Create a visitor, specify that you want to recurse through its children
!    // Recursion is needed only if you register all scanners, and a link tag could be embedded
!    // within a form tag. But if you register only the link scanner, you don't need recursion.
!    ObjectFindingVisitor visitor =
!      new ObjectFindingVisitor(ImageTag.class,true);
! 
!    parser.registerScanners();
! 
!    // Instead of registering all scanners,
!    // you could also do - parser.addScanner(new ImageScanner(""));
!    parser.visitAllNodesWith(visitor);
!    Node [] images = visitor.getTags();
!    for (int i=0;i&lt;images.length;i++) {
!       ImageTag imageTag = (ImageTag)images[i];
!       System.out.println(imageTag.getImageLocation());
!    }

! <p>2: Use <i>extractAllNodesThatAre()

  <pre>
!    Parser parser = new Parser("http://urlIWantToParse.com");
!    parser.registerScanners();
!    // Instead of registering all scanners,
!    // you could also do - parser.addScanner(new ImageScanner(""));
! 
!    Node [] images = parser.extractAllNodesThatAre(ImageTag.class);
!    for (int i=0;i&lt;images.length;i++) {
!       ImageTag imageTag = (ImageTag)images[i];
!       System.out.println(imageTag.getImageLocation());
!    }

! <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a>, Sunday, February 16, 2003 2:02:18 pm.

--- 4,30 ----

<div class="wikitext">
! Image Extraction

! This is very similar to <a href="LinkExtraction.html" class="named-wiki" title="LinkExtraction">link extraction</a>. Instead of looking for LinkTag nodes you look for ImageTag nodes:

  <pre>
! import org.htmlparser.Parser;
! import org.htmlparser.util.ParserException;
! import org.htmlparser.Node;
! import org.htmlparser.tags.ImageTag;

! public class ImageDemo
! {
!     public static void main (String[] args) throws ParserException
!     {
!         Parser parser = new Parser ("http://urlIWantToParse.com");
!         Node [] images = parser.extractAllNodesThatAre (ImageTag.class);
!         for (int i = 0; i &lt; images.length; i++)
!         {
!             ImageTag imageTag = (ImageTag)images[i];
!             System.out.println (imageTag.getImageURL ());
!         }
!     }
! }

***************
*** 51,55 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, June 25, 2003  9:11:46 am.

  <hr class="toolbar" noshade="noshade" />
--- 34,38 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, January  7, 2004  5:33:01 pm.

  <hr class="toolbar" noshade="noshade" />

Index: LinkExtraction.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/LinkExtraction.html,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** LinkExtraction.html	9 Nov 2003 17:07:07 -0000	1.5
--- LinkExtraction.html	26 Jan 2004 01:02:09 -0000	1.6
***************
*** 8,101 ****
  <p>There are many ways of extracting links.

! <p>1. Use the <span class="wikiunknown"><u>ObjectFindingVisitor to extract links, like so:
! 
! <pre>
!    Parser parser = new Parser("http://urlIWantToParse.com");
!    // Create a visitor, specify that you want to recurse through its children
!    // Recursion is needed only if you register all scanners, and a link tag could be embedded
!    // within a form tag. But if you register only the link scanner, you don't need recursion.
!    ObjectFindingVisitor visitor =
!      new ObjectFindingVisitor(LinkTag.class,true);
! 
!    parser.registerScanners();
! 
!    // Instead of registering all scanners,
!    // you could also do - parser.addScanner(new LinkScanner(""));
!    parser.visitAllNodesWith(visitor);
!    Node [] links = visitor.getTags();
!    for (int i=0;i&lt;links.length;i++) {
!       LinkTag linkTag = (LinkTag)links[i];
!       System.out.println(linkTag.getLink());
!       System.out.println(linkTag.getLinkText());
!    }
! 
! <p>2. Use the parser utility method - extractAllNodesThatAre().
! 
! <pre>
!    Parser parser = new Parser("http://urlIWantToParse.com");
!    parser.registerScanners();
!    Node [] links = parser.extractAllNodesThatAre(LinkTag.class);
!    // Instead of registering all scanners,
!    // you could also do - parser.addScanner(new LinkScanner(""));
!    for (int i=0;i&lt;links.length;i++) {
!       LinkTag linkTag = (LinkTag)links[i];
!       System.out.println(linkTag.getLink());
!       System.out.println(linkTag.getLinkText());
!    }
! 
! <p>3. It is possible that you are interested in extracting more than just links. In order to customize extraction, write your own visitor. Extend the Visitor class (in the package org.htmlparser.visitors - Parser v1.3 upwards) like so :
! 
! <pre>
!    public class MyCustomizedVisitor extends Visitor {
!       public MyCustomizedVisitor(Parser parser) {
!          super(true);  /// Its usually a good idea to perform recursion
!          // Add the scanners you want.
!          // This decouples your application from having to know which scanners are required
!          parser.addScanner(new LinkScanner(""));
!          parser.addScanner(new ImageScanner(""));
!          // or add all scanners with registerScanners()
!       }
! 
!       public void visitTag(Tag tag) {
!         // Collect any tags you want
!         // You can also do type checking like so:
!         if (tag instanceof MetaTag) {
!            // This tag is a meta tag
!            MetaTag metaTag = (MetaTag)tag;
!         }
!       }
! 
!       public void visitStringNode(StringNode stringNode) {
!         // Collect text in the page here
!       }
! 
!       public void visitLinkTag(LinkTag linkTag) {
!         // Collect links here
!       }

!       public void visitImageTag(ImageTag imageTag) {
!         // Collect images here
!       }

!       public void visitEndTag(EndTag endTag) {
!         // Checking for end tags can be useful when performing more involved
!         // searches in a page
!       }

!       public void visitRemarkNode(RemarkNode remarkNode) {
!          // Collect remark nodes here
!       }

!       // Add getters to get the data you have collected..
! }

! In your app..
! Parser parser = new Parser(...);
! MyCustomizedVisitor visitor = new MyCustomizedVisitor(parser);
! parser.visitAllNodesWith(visitor);
! // You can now get the data from the visitor interface.

- <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a>

--- 8,25 ----
  <p>There are many ways of extracting links.

! <ul>

! <li><a href="VisitorLinks.html" class="named-wiki" title="VisitorLinks">Use an ObjectFindingVisitor</a>

! <li><a href="CustomVisitorLinks.html" class="named-wiki" title="CustomVisitorLinks">Use a custom Visitor</a>

! <li><a href="LinkBeanLinks.html" class="named-wiki" title="LinkBeanLinks">Use a LinkBean</a>

! <li><a href="CustomTagLinks.html" class="named-wiki" title="CustomTagLinks">Use a custom Tag</a>

! <li><a href="FilterLinks.html" class="named-wiki" title="FilterLinks">Use a NodeFilter</a>

+ <li><a href="LexerLinks.html" class="named-wiki" title="LexerLinks">Use a low level Lexer</a>

***************
*** 105,109 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Tuesday, September  2, 2003  1:59:15 pm.

  <hr class="toolbar" noshade="noshade" />
--- 29,33 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, January  7, 2004  5:22:23 pm.

  <hr class="toolbar" noshade="noshade" />

Index: PostOperation.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/PostOperation.html,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** PostOperation.html	9 Nov 2003 17:07:07 -0000	1.4
--- PostOperation.html	26 Jan 2004 01:02:09 -0000	1.5
***************
*** 31,35 ****
          // ... do parser operations

! <a href="images/Zip.java" class="namedurl"><img src="/docs/themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Source Code.</a><a href="images/Zip.java" class="namedurl">Source Code.</a> <a href="images/Zip.html" class="namedurl"><img src="/docs/themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Pretty Print Source Code</a><a href="images/Zip.html" class="namedurl">Pretty Print Source Code</a>

  <pre>
--- 31,35 ----
          // ... do parser operations

! <a href="images/Zip.java" class="namedurl"><img src="/wiki/themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Source Code.</a><a href="images/Zip.java" class="namedurl">Source Code.</a> <a href="images/Zip.html" class="namedurl"><img src="/wiki/themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Pretty Print Source Code</a><a href="images/Zip.html" class="namedurl">Pretty Print Source Code</a>

  <pre>

Index: ReverseHtml.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/ReverseHtml.html,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** ReverseHtml.html	9 Nov 2003 17:07:07 -0000	1.5
--- ReverseHtml.html	26 Jan 2004 01:02:09 -0000	1.6
***************
*** 6,41 ****
  <p><b>Reverse Html Rendering

! In order to get back the html representation of a web page, you may use toHTML() recursively. Here's one way to get it:

  <pre>
!   Parser parser = new Parser(..);
!   parser.registerScanners();
!   StringBuffer htmlBuffer = new StringBuffer();
!   for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
!      htmlBuffer.append(i.nextNode().toHTML());
!   }
!   System.out.println("reverse html rendered after parse : "+htmlBuffer.toString());

! This usually goes through child nodes of composite tags (like links, forms, etc..)

! Often, it might be desired to modify the html being reconstructed. In such a case, you must change the tag's attributes prior to calling toHTML().

! e.g. if the tag in question is a link tag, and you wish to modify the href, do this :

  <pre>
! linkTag.setAttribute("SRC",newUrlString);
! doSomethingWith(linkTag.toHTML());

! toHtml() is basically a reconstruction of the tag using its attributes (at the atomic level) and its children (at the macro/composite level).

! You can also change the name of the tag by setting its TAGNAME attribute, like so:

  <pre>
! tag.setAttribute(Tag.TAGNAME,newTagName);

! This should enable you to perform any transformations on the html.
! Take a look at another way of modifying tags in <a href="WebRipper.html" class="wiki">WebRipper</a>.

! <p>--<a href="SomikRaha.html" class="wiki">SomikRaha</a>

--- 6,62 ----
  <p><b>Reverse Html Rendering

! In order to get back the html representation of a web page, you may use toHtml() recursively. Here's one way to get it:

<pre>
! import org.htmlparser.Parser;
! import org.htmlparser.util.NodeIterator;
! import org.htmlparser.util.ParserException;

! public class ToHtmlDemo
! {
!     public static void main (String[] args) throws ParserException
!     {
!         Parser parser = new Parser ("http://urlIWantToParse.com");
!         StringBuffer html = new StringBuffer (4096);
!         for (NodeIterator i = parser.elements();i.hasMoreNodes();)
!              html.append (i.nextNode().toHtml ());
!         System.out.println (html);
!     }
! }

! Often, it might be desired to modify the html being reconstructed. In such a case, you must change the tag's attributes prior to calling toHtml().
! For example, if the tag in question is a link tag, and you wish to modify the href, do this:

! <pre>
!     linkTag.setLink ("http://newUrlString");
!     linkTag.toHtml ();
! 
! <p>This is equivalent to:

  <pre>
!     linkTag.setAttribute ("href", "http://newUrlString");
!     linkTag.toHtml ();

! This latter would work on any tag, but few other tags have an HREF attribute according to the <a href="http://www.w3.org/TR/html4/" class="namedurl">HTML specification</a>.
! The toHtml() method applies to all nodes, not just tags. For tags it is basically a reconstruction of the tag using its attributes (at the atomic level) and its children (at the macro/composite level).

! You can also change the name of the tag like so:

<pre>
! tag.setTagName (newTagName);

! and there are numerous ways to add, remove or change the attributes of a tag. For example, to add or change the ID attribute to "EditArea" use:

! <pre>
! tag.setAttribute ("id", "EditArea", '"');
! 
! Whole tags can be added and removed from the list of children held by each tag. For example, to add a &lt;P&gt; tag at the same level as another tag:
! 
! <pre>
! newTag = new Tag ();
! newTag.setTagName ("P");
! tag.getParent ().getChildren ().add (newTag);
! 
! Be careful, getChildren () may return null for an arbitrary tag.

***************
*** 45,49 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Sunday, February 23, 2003  5:34:12 pm.

  <hr class="toolbar" noshade="noshade" />
--- 66,70 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, January  7, 2004  6:14:37 pm.

  <hr class="toolbar" noshade="noshade" />

Index: SamplePrograms.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/SamplePrograms.html,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** SamplePrograms.html	9 Nov 2003 17:07:07 -0000	1.6
--- SamplePrograms.html	26 Jan 2004 01:02:09 -0000	1.7
***************
*** 10,14 ****
  <li><a href="StringExtraction.html" class="wiki">StringExtraction</a>

! <li><a href="LinkExtraction.html" class="wiki">LinkExtraction</a> (includes example of customized parsing with HTMLVisitor)

  <li><a href="EmailExtraction.html" class="wiki">EmailExtraction</a>
--- 10,14 ----
  <li><a href="StringExtraction.html" class="wiki">StringExtraction</a>

! <li><a href="LinkExtraction.html" class="wiki">LinkExtraction</a>

  <li><a href="EmailExtraction.html" class="wiki">EmailExtraction</a>
***************
*** 16,23 ****
  <li><a href="ImageExtraction.html" class="wiki">ImageExtraction</a>

- <li><a href="WebCrawler.html" class="wiki">WebCrawler</a>
- 
- <li><a href="WebRipper.html" class="wiki">WebRipper</a>
- 
  <li><a href="ReverseHtml.html" class="named-wiki" title="ReverseHtml">ReverseHtml rendering</a>

--- 16,19 ----
***************
*** 26,29 ****
--- 22,29 ----
  <li><a href="JavaBeans.html" class="wiki">JavaBeans</a>

+ <li><a href="WebCrawler.html" class="wiki">WebCrawler</a> - ignore this, it's old
+ 
+ <li><a href="WebRipper.html" class="wiki">WebRipper</a> - ignore this, it's old
+ 

***************
*** 33,37 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Thursday, April 24, 2003  4:45:21 am.

  <hr class="toolbar" noshade="noshade" />
--- 33,37 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Wednesday, January  7, 2004  6:12:30 pm.

  <hr class="toolbar" noshade="noshade" />

Index: SearchingForData.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/SearchingForData.html,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** SearchingForData.html	26 Oct 2003 19:46:17 -0000	1.3
--- SearchingForData.html	26 Jan 2004 01:02:09 -0000	1.4
***************
*** 29,33 ****

  <pre>
- parser.registerScanners();
  Node nodes [] = parser.extractAllNodesThatAre(TableTag.class);
  // Get the first table found
--- 29,32 ----
***************
*** 73,77 ****

  <pre>
- parser.registerScanners();
  Node nodes [] = parser.extractAllNodesThatAre(TableTag.class);
  // Get the first table found
--- 72,75 ----
***************
*** 103,107 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Saturday, April 19, 2003 10:38:30 pm.

  <hr class="toolbar" noshade="noshade" />
--- 101,105 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Thursday, January  8, 2004  4:15:12 am.

  <hr class="toolbar" noshade="noshade" />

Index: StringExtraction.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/StringExtraction.html,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** StringExtraction.html	9 Nov 2003 17:07:07 -0000	1.6
--- StringExtraction.html	26 Jan 2004 01:02:09 -0000	1.7
***************
*** 6,24 ****
  <p><b>String Extraction

! To get all the text content from a web page, use the <a href="TextExtractingVisitor.html" class="wiki">TextExtractingVisitor</a>, like so :

  <pre>
!    Parser parser = new Parser("http://pageIwantToParse.com");
!    TextExtractingVisitor visitor = new TextExtractingVisitor();
!    parser.visitAllNodesWith(visitor);
!    System.out.println(visitor.getExtractedText());

! If you want to strip all escape characters, do:

  <pre>
!    String cleanText =
!       ParserUtils.removeEscapeCharacters(
!          visitor.getExtractedText()
!       );

--- 6,42 ----
  <p><b>String Extraction

! To get all the text content from a web page, use the TextExtractingVisitor, like so:

  <pre>
! import org.htmlparser.Parser;
! import org.htmlparser.util.ParserException;
! import org.htmlparser.visitors.TextExtractingVisitor;
! public class StringDemo
! {
!     public static void main (String[] args) throws ParserException
!     {
!         Parser parser = new Parser ("http://pageIwantToParse.com");
!         TextExtractingVisitor visitor = new TextExtractingVisitor ();
!         parser.visitAllNodesWith (visitor);
!         System.out.println (visitor.getExtractedText());
!     }
! }

! If you want a more browser-like behaviour, use the StringBean like so:

  <pre>
! import org.htmlparser.beans.StringBean;
! public class StringDemo
! {
!     public static void main (String[] args)
!     {
!         StringBean sb = new StringBean ();
!         sb.setLinks (false);
!         sb.setReplaceNonBreakingSpaces (true);
!         sb.setCollapse (true);
!         sb.setURL ("http://pageIwantToParse.com");
!         System.out.println (sb.getStrings ());
!     }
! }

***************
*** 28,32 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Sunday, February 23, 2003  5:20:23 pm.

  <hr class="toolbar" noshade="noshade" />
--- 46,50 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Tuesday, January  6, 2004  6:36:18 pm.

  <hr class="toolbar" noshade="noshade" />

Index: TemplateMethod.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/TemplateMethod.html,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** TemplateMethod.html	9 Nov 2003 17:07:07 -0000	1.6
--- TemplateMethod.html	26 Jan 2004 01:02:09 -0000	1.7
***************
*** 6,10 ****
  <p><b>Template Method

! <a href="TagScanner.html" class="wiki">TagScanner</a> uses a template method to create a scanned node - it calls a matching tag scanner to do its job and produce a scanned node in a series of steps.

  <pre>
--- 6,10 ----
  <p><b>Template Method

! TagScanner uses a template method to create a scanned node - it calls a matching tag scanner to do its job and produce a scanned node in a series of steps.

  <pre>

Index: WritingYourOwnScanners.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/WritingYourOwnScanners.html,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** WritingYourOwnScanners.html	9 Nov 2003 17:07:07 -0000	1.7
--- WritingYourOwnScanners.html	26 Jan 2004 01:02:09 -0000	1.8
***************
*** 5,14 ****
  <div class="wikitext">
  <p><b>Writing Your Own Scanners
! 
! <p>There are two types of scanners, depending on the type of tags that you wish to parse:

  <ul>

! <li><a href="TagScanner.html" class="wiki">TagScanner</a> - for parsing tags that have no child elements

  <li>CompositeTagScanner - for parsing tags with children
--- 5,14 ----
  <div class="wikitext">
  <p><b>Writing Your Own Scanners
! <b>Warning: this is out of date and needs to be completely rewritten.</b>
! There are two types of scanners, depending on the type of tags that you wish to parse:

  <ul>

! <li>TagScanner - for parsing tags that have no child elements

  <li>CompositeTagScanner - for parsing tags with children
***************
*** 29,33 ****
  <br />
  <br />
! 3. If a match was found, call the scan() method. For both <a href="TagScanner.html" class="wiki">TagScanner</a> and CompositeTagScanner, overriding this method is optional, and NOT recommended for standard cases. The default scan() methods will make a call to createTag.
  <br />
  <br />
--- 29,33 ----
  <br />
  <br />
! 3. If a match was found, call the scan() method. For both <span class="wikiunknown"><u>TagScanner</u></span> and CompositeTagScanner, overriding this method is optional, and NOT recommended for standard cases. The default scan() methods will make a call to createTag.
  <br />
  <br />
***************
*** 109,113 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Thursday, May  1, 2003  6:54:01 pm.

  <hr class="toolbar" noshade="noshade" />
--- 109,113 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Thursday, January  8, 2004  4:13:18 am.

  <hr class="toolbar" noshade="noshade" />

Index: index.html
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/docs/docs/index.html,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** index.html	2 Jan 2004 16:24:52 -0000	1.11
--- index.html	26 Jan 2004 01:02:09 -0000	1.12
***************
*** 6,9 ****
--- 6,11 ----
  <p><b>HTMLParser documentation

+ <p><a href="http://htmlparser.sourceforge.net/wiki/" class="namedurl"><span style="white-space: nowrap">This</span> page has moved to http://htmlparser.sourceforge.net/wiki</a>
+ 
  <p>Welcome to the HTMLParser documentation page.
  You may visit
***************
*** 13,17 ****
  <li><a href="SamplePrograms.html" class="wiki">SamplePrograms</a> - A quick tutorial on getting started with the parser

! <li><a href="WritingYourOwnScanners.html" class="wiki">WritingYourOwnScanners</a> - Learn how to write your own scanners to extend the capability of the parser

  <li><a href="SearchingForData.html" class="wiki">SearchingForData</a> - Learn how to perform powerful searches in html pages
--- 15,19 ----
  <li><a href="SamplePrograms.html" class="wiki">SamplePrograms</a> - A quick tutorial on getting started with the parser

! <li><a href="WritingYourOwnScanners.html" class="wiki">WritingYourOwnScanners</a> - ignore this, this is old

  <li><a href="SearchingForData.html" class="wiki">SearchingForData</a> - Learn how to perform powerful searches in html pages
***************
*** 29,43 ****
  <li><a href="TestDrivenDevelopment.html" class="wiki">TestDrivenDevelopment</a>

! <li><a href="ParsingXml.html" class="wiki">ParsingXml</a>
! 
! <li><a href="UnitTestingXsl.html" class="wiki">UnitTestingXsl</a>
! 
! <li><a href="UnitTestingPdf.html" class="wiki">UnitTestingPdf</a>

! <li><a href="http://htmlparser.sourceforge.net/javadoc/" class="namedurl"><span style="white-space: nowrap">Javadocs</span> for Version 1.2</a>

  <li><a CLASS="namedurl" HREF="../javadoc/index.html"><span STYLE="white-space: nowrap">Javadocs</span></a>

! <li><a href="Benchmarks.html" class="named-wiki" title="Benchmarks">Benchmarks vs. JTidy</a>

--- 31,41 ----
  <li><a href="TestDrivenDevelopment.html" class="wiki">TestDrivenDevelopment</a>

! <li><a href="Benchmarks.html" class="named-wiki" title="Benchmarks">Benchmarks vs. JTidy</a>

! <li><a href="http://htmlparser.sourceforge.net/javadoc/" class="namedurl"><span style="white-space: nowrap">Javadocs</span></a>

  <li><a CLASS="namedurl" HREF="../javadoc/index.html"><span STYLE="white-space: nowrap">Javadocs</span></a>

! <li><a href="http://htmlparser.sourceforge.net/javadoc_1_2/" class="namedurl"><span style="white-space: nowrap">Javadocs</span> for Version 1.2</a>

***************
*** 48,52 ****
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Tuesday, November 25, 2003  4:50:49 am.

  <hr class="toolbar" noshade="noshade" />
--- 46,50 ----
  <hr class="printer" noshade="noshade" />

! <p class="editdate">Last edited on Thursday, January  8, 2004  4:14:03 am.

  <hr class="toolbar" noshade="noshade" />