[Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications WikiCapturer.java,NONE,1.1 SiteCap
From: <der...@us...> - 2004-01-10 00:06:06
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications
In directory sc8-pr-cvs1:/tmp/cvs-serv24025/parserapplications

Modified Files:
	SiteCapturer.java
Added Files:
	WikiCapturer.java
Log Message:
First pass at the wiki capturer.
Added useful extensions to the HasAttributeFilter, SiteCapturer and NodeList

--- NEW FILE: WikiCapturer.java ---

// HTMLParser Library $Name: $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/WikiCapturer.java,v $
// $Author: derrickoswald $
// $Date: 2004/01/10 00:06:03 $
// $Revision: 1.1 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.parserapplications;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import javax.swing.JFileChooser;
import javax.swing.JOptionPane;

import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;

/**
 * Save a wikiwikiweb locally.
 * Illustrative program to save a wiki locally.
 */
public class WikiCapturer extends SiteCapturer
{
    /**
     * Create a wikicapturer.
     */
    public WikiCapturer ()
    {
    }

    /**
     * Mainline to capture a web site locally.
     * @param args The command line arguments.
     * There are three arguments: the web site to capture, the local directory
     * to save it to, and a flag (true or false) to indicate whether resources
     * such as images and video are to be captured as well.
     * These are requested via dialog boxes if not supplied.
     */
    public static void main (String[] args)
        throws MalformedURLException, IOException
    {
        WikiCapturer worker;
        String url;
        JFileChooser chooser;
        URL source;
        String path;
        File target;
        Boolean capture;
        int ret;

        worker = new WikiCapturer ();
        if (0 >= args.length)
        {
            url = (String)JOptionPane.showInputDialog (
                null,
                "Enter the URL to capture:",
                "Web Site",
                JOptionPane.PLAIN_MESSAGE,
                null,
                null,
                "http://htmlparser.sourceforge.net/wiki");
            if (null != url)
                worker.setSource (url);
            else
                System.exit (1);
        }
        else
            worker.setSource (args[0]);
        if (1 >= args.length)
        {
            url = worker.getSource ();
            source = new URL (url);
            path = new File (new File ("." + File.separator),
                source.getHost () + File.separator).getCanonicalPath ();
            target = new File (path);
            chooser = new JFileChooser (target);
            chooser.setDialogType (JFileChooser.SAVE_DIALOG);
            chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
            chooser.setSelectedFile (target); // this doesn't frickin' work
            chooser.setMultiSelectionEnabled (false);
            chooser.setDialogTitle ("Target Directory");
            ret = chooser.showSaveDialog (null);
            if (ret == JFileChooser.APPROVE_OPTION)
                worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
            else
                System.exit (1);
        }
        else
            worker.setTarget (args[1]);
        if (2 >= args.length)
        {
            capture = (Boolean)JOptionPane.showInputDialog (
                null,
                "Should resources be captured:",
                "Capture Resources",
                JOptionPane.PLAIN_MESSAGE,
                null,
                new Object[] { Boolean.TRUE, Boolean.FALSE },
                Boolean.TRUE);
            if (null != capture)
                worker.setCaptureResources (capture.booleanValue ());
            else
                System.exit (1);
        }
        else
            worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));
        worker.setFilter (
            new NotFilter (
                new OrFilter (
                    new AndFilter (
                        new TagNameFilter ("DIV"),
                        new HasAttributeFilter ("id", "navbar")),
                    new OrFilter (
                        new AndFilter (
                            new TagNameFilter ("DIV"),
                            new HasAttributeFilter ("id", "actionbar")),
                        new AndFilter (
                            new TagNameFilter ("DIV"),
                            new HasAttributeFilter ("id", "xhtml-validator"))))));
        worker.capture ();

        System.exit (0);
    }
}

Index: SiteCapturer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** SiteCapturer.java	4 Jan 2004 03:23:09 -0000	1.2
--- SiteCapturer.java	10 Jan 2004 00:06:03 -0000	1.3
***************
*** 42,51 ****
--- 42,57 ----
  import org.htmlparser.Node;
+ import org.htmlparser.NodeFilter;
  import org.htmlparser.Parser;
  import org.htmlparser.PrototypicalNodeFactory;
+ import org.htmlparser.filters.AndFilter;
+ import org.htmlparser.filters.HasAttributeFilter;
+ import org.htmlparser.filters.NodeClassFilter;
+ import org.htmlparser.lexer.nodes.Attribute;
  import org.htmlparser.tags.BaseHrefTag;
  import org.htmlparser.tags.FrameTag;
  import org.htmlparser.tags.ImageTag;
  import org.htmlparser.tags.LinkTag;
+ import org.htmlparser.tags.MetaTag;
  import org.htmlparser.util.NodeIterator;
  import org.htmlparser.util.NodeList;
***************
*** 54,58 ****
  /**
   * Save a web site locally.
!  * Illustrative prgram to save a web site contents locally.
   * It was created to demonstrate URL rewriting in it's simplest form.
   * It uses customized tags in the NodeFactory to alter the URLs.
--- 60,64 ----
  /**
   * Save a web site locally.
!  * Illustrative program to save a web site contents locally.
   * It was created to demonstrate URL rewriting in it's simplest form.
   * It uses customized tags in the NodeFactory to alter the URLs.
***************
*** 125,128 ****
--- 131,139 ----
      /**
+      * The filter to apply to the nodes retrieved.
+      */
+     protected NodeFilter mFilter;
+ 
+     /**
       * Copy buffer size.
       * Resources are moved to disk in chunks this size or less.
***************
*** 137,140 ****
--- 148,153 ----
          PrototypicalNodeFactory factory;
  
+         mSource = null;
+         mTarget = null;
          mPages = new ArrayList ();
          mFinished = new HashSet ();
***************
*** 148,151 ****
--- 161,166 ----
          factory.registerTag (new LocalImageTag ());
          mParser.setNodeFactory (factory);
+         mCaptureResources = true;
+         mFilter = null;
      }
***************
*** 214,217 ****
--- 229,251 ----
      }
+ 
+     /** Getter for property filter.
+      * @return Value of property filter.
+      *
+      */
+     public NodeFilter getFilter ()
+     {
+         return (mFilter);
+     }
+ 
+     /** Setter for property filter.
+      * @param filter New value of property filter.
+      *
+      */
+     public void setFilter (NodeFilter filter)
+     {
+         mFilter = filter;
+     }
+ 
      /**
       * Returns <code>true</code> if the link is one we are interested in.
***************
*** 282,286 ****
          String ret;
  
!         if (link.equals (getSource ()))
              ret = "index.html"; // handle the root page specially
          else if (link.startsWith (getSource ())
--- 316,320 ----
          String ret;
  
!         if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
              ret = "index.html"; // handle the root page specially
          else if (link.startsWith (getSource ())
***************
*** 383,392 ****
       * Process a single page.
       */
!     protected void process ()
          throws ParserException
      {
          String url;
          NodeList list;
          File file;
          File dir;
--- 417,430 ----
       * Process a single page.
       */
!     protected void process (NodeFilter filter)
          throws ParserException
      {
          String url;
+         int bookmark;
          NodeList list;
+         NodeList robots;
+         MetaTag robot;
+         String content;
          File file;
          File dir;
***************
*** 399,403 ****
          try
!         {   // fetch the page and gather the list of nodes
              mParser.setURL (url);
              list = new NodeList ();
--- 437,443 ----
          try
!         {
!             bookmark = mPages.size ();
!             // fetch the page and gather the list of nodes
              mParser.setURL (url);
              list = new NodeList ();
***************
*** 405,408 ****
--- 445,470 ----
              list.add (e.nextNode ()); // URL conversion occurs in the tags
  
+             // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
+             // <meta name="robots" content="index,follow" />
+             // <meta name="robots" content="noindex,nofollow" />
+             robots = list.extractAllNodesThatMatch (
+                 new AndFilter (
+                     new NodeClassFilter (MetaTag.class),
+                     new HasAttributeFilter ("name", "robots")), true);
+             if (0 != robots.size ())
+             {
+                 robot = (MetaTag)robots.elementAt (0);
+                 content = robot.getAttribute ("content").toLowerCase ();
+                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
+                     // reset mPages
+                     for (int i = bookmark; i < mPages.size (); i++)
+                         mPages.remove (i);
+                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
+                     return;
+             }
+ 
+             if (null != filter)
+                 list.keepAllNodesThatMatch (filter, true);
+ 
              // save the page locally
              file = new File (getTarget (), makeLocalLink (url, ""));
***************
*** 410,413 ****
--- 472,483 ----
              if (!dir.exists ())
                  dir.mkdirs ();
+             else if (!dir.isDirectory ())
+             {
+                 dir = new File (dir.getParentFile (), dir.getName () + ".content");
+                 if (!dir.exists ())
+                     dir.mkdirs ();
+                 file = new File (dir, file.getName ());
+             }
+ 
              try
              {
***************
*** 582,586 ****
          try
          {
!             process ();
              while (0 != mImages.size ())
                  copy ();
--- 652,656 ----
          try
          {
!             process (getFilter ());
              while (0 != mImages.size ())
                  copy ();
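
As a rough illustration of the recursive NodeList filtering this change relies on (the two-argument keepAllNodesThatMatch called in process() above), here is a minimal, self-contained sketch. The URL and the "navbar" id are placeholders borrowed from WikiCapturer, not part of the commit, and the snippet assumes revision 1.3 of the library with the filter classes shown above on the classpath.

import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class StripNavbarExample
{
    public static void main (String[] args) throws ParserException
    {
        // fetch the page and gather the list of nodes, same pattern as SiteCapturer.process()
        Parser parser = new Parser ("http://htmlparser.sourceforge.net/wiki"); // example URL
        NodeList list = new NodeList ();
        for (NodeIterator e = parser.elements (); e.hasMoreNodes (); )
            list.add (e.nextNode ());

        // keep everything except DIV tags whose id is "navbar";
        // the second argument requests a recursive descent into child nodes
        list.keepAllNodesThatMatch (
            new NotFilter (
                new AndFilter (
                    new TagNameFilter ("DIV"),
                    new HasAttributeFilter ("id", "navbar"))),
            true);

        // dump the filtered markup
        for (int i = 0; i < list.size (); i++)
            System.out.println (list.elementAt (i).toHtml ());
    }
}

WikiCapturer hands the same kind of NotFilter composition to setFilter(), so each captured page is stripped of the wiki chrome before it is written to disk.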