Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications SiteCapturer.java,1.4,1.5

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications
In directory sc8-pr-cvs1:/tmp/cvs-serv32229/src/org/htmlparser/parserapplications

Modified Files:
	SiteCapturer.java 
Log Message:
Update version to 1.4-20040119.


Index: SiteCapturer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** SiteCapturer.java	14 Jan 2004 02:53:46 -0000	1.4
--- SiteCapturer.java	19 Jan 2004 23:14:18 -0000	1.5
***************
*** 41,50 ****
--- 41,55 ----
  import javax.swing.JOptionPane;
  
+ import org.htmlparser.NodeFilter;
  import org.htmlparser.Parser;
  import org.htmlparser.PrototypicalNodeFactory;
+ import org.htmlparser.filters.AndFilter;
+ import org.htmlparser.filters.HasAttributeFilter;
+ import org.htmlparser.filters.NodeClassFilter;
  import org.htmlparser.tags.BaseHrefTag;
  import org.htmlparser.tags.FrameTag;
  import org.htmlparser.tags.ImageTag;
  import org.htmlparser.tags.LinkTag;
+ import org.htmlparser.tags.MetaTag;
  import org.htmlparser.util.NodeIterator;
  import org.htmlparser.util.NodeList;
***************
*** 53,57 ****
  /**
   * Save a web site locally.
!  * Illustrative prgram to save a web site contents locally.
   * It was created to demonstrate URL rewriting in it's simplest form.
   * It uses customized tags in the NodeFactory to alter the URLs.
--- 58,62 ----
  /**
   * Save a web site locally.
!  * Illustrative program to save a web site contents locally.
   * It was created to demonstrate URL rewriting in it's simplest form.
   * It uses customized tags in the NodeFactory to alter the URLs.
***************
*** 124,127 ****
--- 129,137 ----
  
      /**
+      * The filter to apply to the nodes retrieved.
+      */
+     protected NodeFilter mFilter;
+ 
+     /**
       * Copy buffer size.
       * Resources are moved to disk in chunks this size or less.
***************
*** 136,139 ****
--- 146,151 ----
          PrototypicalNodeFactory factory;
  
+         mSource = null;
+         mTarget = null;
          mPages = new ArrayList ();
          mFinished = new HashSet ();
***************
*** 147,150 ****
--- 159,164 ----
          factory.registerTag (new LocalImageTag ());
          mParser.setNodeFactory (factory);
+         mCaptureResources = true;
+         mFilter = null;
      }
  
***************
*** 213,216 ****
--- 227,249 ----
      }
      
+     
+     /** Getter for property filter.
+      * @return Value of property filter.
+      *
+      */
+     public NodeFilter getFilter ()
+     {
+         return (mFilter);
+     }
+     
+     /** Setter for property filter.
+      * @param filter New value of property filter.
+      *
+      */
+     public void setFilter (NodeFilter filter)
+     {
+         mFilter = filter;
+     }
+     
      /**
       * Returns <code>true</code> if the link is one we are interested in.
***************
*** 281,285 ****
          String ret;
  
!         if (link.equals (getSource ()))
              ret = "index.html"; // handle the root page specially
          else if (link.startsWith (getSource ())
--- 314,318 ----
          String ret;
  
!         if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
              ret = "index.html"; // handle the root page specially
          else if (link.startsWith (getSource ())
***************
*** 382,391 ****
       * Process a single page.
       */
!     protected void process ()
          throws
              ParserException
      {
          String url;
          NodeList list;
          File file;
          File dir;
--- 415,428 ----
       * Process a single page.
       */
!     protected void process (NodeFilter filter)
          throws
              ParserException
      {
          String url;
+         int bookmark;
          NodeList list;
+         NodeList robots;
+         MetaTag robot;
+         String content;
          File file;
          File dir;
***************
*** 398,402 ****
  
          try
!         {   // fetch the page and gather the list of nodes
              mParser.setURL (url);
              list = new NodeList ();
--- 435,441 ----
  
          try
!         {
!             bookmark = mPages.size ();
!             // fetch the page and gather the list of nodes
              mParser.setURL (url);
              list = new NodeList ();
***************
*** 404,407 ****
--- 443,468 ----
                  list.add (e.nextNode ()); // URL conversion occurs in the tags
  
+             // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
+             // <meta name="robots" content="index,follow" />
+             // <meta name="robots" content="noindex,nofollow" />
+             robots = list.extractAllNodesThatMatch (
+                 new AndFilter (
+                     new NodeClassFilter (MetaTag.class),
+                     new HasAttributeFilter ("name", "robots")), true);
+             if (0 != robots.size ())
+             {
+                 robot = (MetaTag)robots.elementAt (0);
+                 content = robot.getAttribute ("content").toLowerCase ();
+                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
+                     // reset mPages
+                     for (int i = bookmark; i < mPages.size (); i++)
+                         mPages.remove (i);
+                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
+                     return;
+             }
+     
+             if (null != filter)
+                 list.keepAllNodesThatMatch (filter, true);
+ 
              // save the page locally
              file = new File (getTarget (), makeLocalLink (url, ""));
***************
*** 409,412 ****
--- 470,481 ----
              if (!dir.exists ())
                  dir.mkdirs ();
+             else if (!dir.isDirectory ())
+             {
+                 dir = new File (dir.getParentFile (), dir.getName () + ".content");
+                 if (!dir.exists ())
+                     dir.mkdirs ();
+                 file = new File (dir, file.getName ());
+             }
+                 
              try
              {
***************
*** 581,585 ****
              try
              {
!                 process ();
                  while (0 != mImages.size ())
                      copy ();
--- 650,654 ----
              try
              {
!                 process (getFilter ());
                  while (0 != mImages.size ())
                      copy ();

Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications SiteCapturer.java,1.4,1.5

htmlparser-cvs