Thread: [Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications SiteCapturer.java,1.4,1.5
Brought to you by:
derrickoswald
From: <der...@us...> - 2004-01-19 23:14:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications In directory sc8-pr-cvs1:/tmp/cvs-serv32229/src/org/htmlparser/parserapplications Modified Files: SiteCapturer.java Log Message: Update version to 1.4-20040119. Index: SiteCapturer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** SiteCapturer.java 14 Jan 2004 02:53:46 -0000 1.4 --- SiteCapturer.java 19 Jan 2004 23:14:18 -0000 1.5 *************** *** 41,50 **** --- 41,55 ---- import javax.swing.JOptionPane; + import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; + import org.htmlparser.filters.AndFilter; + import org.htmlparser.filters.HasAttributeFilter; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.BaseHrefTag; import org.htmlparser.tags.FrameTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; + import org.htmlparser.tags.MetaTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; *************** *** 53,57 **** /** * Save a web site locally. ! * Illustrative prgram to save a web site contents locally. * It was created to demonstrate URL rewriting in it's simplest form. * It uses customized tags in the NodeFactory to alter the URLs. --- 58,62 ---- /** * Save a web site locally. ! * Illustrative program to save a web site contents locally. * It was created to demonstrate URL rewriting in it's simplest form. * It uses customized tags in the NodeFactory to alter the URLs. *************** *** 124,127 **** --- 129,137 ---- /** + * The filter to apply to the nodes retrieved. + */ + protected NodeFilter mFilter; + + /** * Copy buffer size. * Resources are moved to disk in chunks this size or less. *************** *** 136,139 **** --- 146,151 ---- PrototypicalNodeFactory factory; + mSource = null; + mTarget = null; mPages = new ArrayList (); mFinished = new HashSet (); *************** *** 147,150 **** --- 159,164 ---- factory.registerTag (new LocalImageTag ()); mParser.setNodeFactory (factory); + mCaptureResources = true; + mFilter = null; } *************** *** 213,216 **** --- 227,249 ---- } + + /** Getter for property filter. + * @return Value of property filter. + * + */ + public NodeFilter getFilter () + { + return (mFilter); + } + + /** Setter for property filter. + * @param filter New value of property filter. + * + */ + public void setFilter (NodeFilter filter) + { + mFilter = filter; + } + /** * Returns <code>true</code> if the link is one we are interested in. *************** *** 281,285 **** String ret; ! if (link.equals (getSource ())) ret = "index.html"; // handle the root page specially else if (link.startsWith (getSource ()) --- 314,318 ---- String ret; ! if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/"))) ret = "index.html"; // handle the root page specially else if (link.startsWith (getSource ()) *************** *** 382,391 **** * Process a single page. */ ! protected void process () throws ParserException { String url; NodeList list; File file; File dir; --- 415,428 ---- * Process a single page. */ ! protected void process (NodeFilter filter) throws ParserException { String url; + int bookmark; NodeList list; + NodeList robots; + MetaTag robot; + String content; File file; File dir; *************** *** 398,402 **** try ! { // fetch the page and gather the list of nodes mParser.setURL (url); list = new NodeList (); --- 435,441 ---- try ! { ! bookmark = mPages.size (); ! // fetch the page and gather the list of nodes mParser.setURL (url); list = new NodeList (); *************** *** 404,407 **** --- 443,468 ---- list.add (e.nextNode ()); // URL conversion occurs in the tags + // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html + // <meta name="robots" content="index,follow" /> + // <meta name="robots" content="noindex,nofollow" /> + robots = list.extractAllNodesThatMatch ( + new AndFilter ( + new NodeClassFilter (MetaTag.class), + new HasAttributeFilter ("name", "robots")), true); + if (0 != robots.size ()) + { + robot = (MetaTag)robots.elementAt (0); + content = robot.getAttribute ("content").toLowerCase (); + if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow"))) + // reset mPages + for (int i = bookmark; i < mPages.size (); i++) + mPages.remove (i); + if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex"))) + return; + } + + if (null != filter) + list.keepAllNodesThatMatch (filter, true); + // save the page locally file = new File (getTarget (), makeLocalLink (url, "")); *************** *** 409,412 **** --- 470,481 ---- if (!dir.exists ()) dir.mkdirs (); + else if (!dir.isDirectory ()) + { + dir = new File (dir.getParentFile (), dir.getName () + ".content"); + if (!dir.exists ()) + dir.mkdirs (); + file = new File (dir, file.getName ()); + } + try { *************** *** 581,585 **** try { ! process (); while (0 != mImages.size ()) copy (); --- 650,654 ---- try { ! process (getFilter ()); while (0 != mImages.size ()) copy (); |