[Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications SiteCapturer.java,1.4,1.5
Brought to you by:
derrickoswald
|
From: <der...@us...> - 2004-01-19 23:14:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications
In directory sc8-pr-cvs1:/tmp/cvs-serv32229/src/org/htmlparser/parserapplications
Modified Files:
SiteCapturer.java
Log Message:
Update version to 1.4-20040119.
Index: SiteCapturer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** SiteCapturer.java 14 Jan 2004 02:53:46 -0000 1.4
--- SiteCapturer.java 19 Jan 2004 23:14:18 -0000 1.5
***************
*** 41,50 ****
--- 41,55 ----
import javax.swing.JOptionPane;
+ import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
+ import org.htmlparser.filters.AndFilter;
+ import org.htmlparser.filters.HasAttributeFilter;
+ import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
+ import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
***************
*** 53,57 ****
/**
* Save a web site locally.
! * Illustrative prgram to save a web site contents locally.
* It was created to demonstrate URL rewriting in it's simplest form.
* It uses customized tags in the NodeFactory to alter the URLs.
--- 58,62 ----
/**
* Save a web site locally.
! * Illustrative program to save a web site contents locally.
* It was created to demonstrate URL rewriting in it's simplest form.
* It uses customized tags in the NodeFactory to alter the URLs.
***************
*** 124,127 ****
--- 129,137 ----
/**
+ * The filter to apply to the nodes retrieved.
+ */
+ protected NodeFilter mFilter;
+
+ /**
* Copy buffer size.
* Resources are moved to disk in chunks this size or less.
***************
*** 136,139 ****
--- 146,151 ----
PrototypicalNodeFactory factory;
+ mSource = null;
+ mTarget = null;
mPages = new ArrayList ();
mFinished = new HashSet ();
***************
*** 147,150 ****
--- 159,164 ----
factory.registerTag (new LocalImageTag ());
mParser.setNodeFactory (factory);
+ mCaptureResources = true;
+ mFilter = null;
}
***************
*** 213,216 ****
--- 227,249 ----
}
+
+ /** Getter for property filter.
+ * @return Value of property filter.
+ *
+ */
+ public NodeFilter getFilter ()
+ {
+ return (mFilter);
+ }
+
+ /** Setter for property filter.
+ * @param filter New value of property filter.
+ *
+ */
+ public void setFilter (NodeFilter filter)
+ {
+ mFilter = filter;
+ }
+
/**
* Returns <code>true</code> if the link is one we are interested in.
***************
*** 281,285 ****
String ret;
! if (link.equals (getSource ()))
ret = "index.html"; // handle the root page specially
else if (link.startsWith (getSource ())
--- 314,318 ----
String ret;
! if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
ret = "index.html"; // handle the root page specially
else if (link.startsWith (getSource ())
***************
*** 382,391 ****
* Process a single page.
*/
! protected void process ()
throws
ParserException
{
String url;
NodeList list;
File file;
File dir;
--- 415,428 ----
* Process a single page.
*/
! protected void process (NodeFilter filter)
throws
ParserException
{
String url;
+ int bookmark;
NodeList list;
+ NodeList robots;
+ MetaTag robot;
+ String content;
File file;
File dir;
***************
*** 398,402 ****
try
! { // fetch the page and gather the list of nodes
mParser.setURL (url);
list = new NodeList ();
--- 435,441 ----
try
! {
! bookmark = mPages.size ();
! // fetch the page and gather the list of nodes
mParser.setURL (url);
list = new NodeList ();
***************
*** 404,407 ****
--- 443,468 ----
list.add (e.nextNode ()); // URL conversion occurs in the tags
+ // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
+ // <meta name="robots" content="index,follow" />
+ // <meta name="robots" content="noindex,nofollow" />
+ robots = list.extractAllNodesThatMatch (
+ new AndFilter (
+ new NodeClassFilter (MetaTag.class),
+ new HasAttributeFilter ("name", "robots")), true);
+ if (0 != robots.size ())
+ {
+ robot = (MetaTag)robots.elementAt (0);
+ content = robot.getAttribute ("content").toLowerCase ();
+ if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
+ // reset mPages
+ for (int i = bookmark; i < mPages.size (); i++)
+ mPages.remove (i);
+ if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
+ return;
+ }
+
+ if (null != filter)
+ list.keepAllNodesThatMatch (filter, true);
+
// save the page locally
file = new File (getTarget (), makeLocalLink (url, ""));
***************
*** 409,412 ****
--- 470,481 ----
if (!dir.exists ())
dir.mkdirs ();
+ else if (!dir.isDirectory ())
+ {
+ dir = new File (dir.getParentFile (), dir.getName () + ".content");
+ if (!dir.exists ())
+ dir.mkdirs ();
+ file = new File (dir, file.getName ());
+ }
+
try
{
***************
*** 581,585 ****
try
{
! process ();
while (0 != mImages.size ())
copy ();
--- 650,654 ----
try
{
! process (getFilter ());
while (0 != mImages.size ())
copy ();
|