[Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications SiteCapturer.java,1.3,1.4
From: <der...@us...> - 2004-01-14 02:53:51
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications
In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/parserapplications

Modified Files:
	SiteCapturer.java 
Log Message:

Index: SiteCapturer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** SiteCapturer.java	10 Jan 2004 00:06:03 -0000	1.3
--- SiteCapturer.java	14 Jan 2004 02:53:46 -0000	1.4
***************
*** 41,57 ****
  import javax.swing.JOptionPane;
  
- import org.htmlparser.Node;
- import org.htmlparser.NodeFilter;
  import org.htmlparser.Parser;
  import org.htmlparser.PrototypicalNodeFactory;
- import org.htmlparser.filters.AndFilter;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.NodeClassFilter;
- import org.htmlparser.lexer.nodes.Attribute;
  import org.htmlparser.tags.BaseHrefTag;
  import org.htmlparser.tags.FrameTag;
  import org.htmlparser.tags.ImageTag;
  import org.htmlparser.tags.LinkTag;
- import org.htmlparser.tags.MetaTag;
  import org.htmlparser.util.NodeIterator;
  import org.htmlparser.util.NodeList;
--- 41,50 ----
***************
*** 60,64 ****
  /**
   * Save a web site locally.
!  * Illustrative program to save a web site contents locally.
   * It was created to demonstrate URL rewriting in it's simplest form.
   * It uses customized tags in the NodeFactory to alter the URLs.
--- 53,57 ----
  /**
   * Save a web site locally.
!  * Illustrative prgram to save a web site contents locally.
   * It was created to demonstrate URL rewriting in it's simplest form.
   * It uses customized tags in the NodeFactory to alter the URLs.
***************
*** 131,139 ****
  
      /**
-      * The filter to apply to the nodes retrieved.
-      */
-     protected NodeFilter mFilter;
- 
-     /**
       * Copy buffer size.
       * Resources are moved to disk in chunks this size or less.
--- 124,127 ----
***************
*** 148,153 ****
          PrototypicalNodeFactory factory;
  
-         mSource = null;
-         mTarget = null;
          mPages = new ArrayList ();
          mFinished = new HashSet ();
--- 136,139 ----
***************
*** 161,166 ****
          factory.registerTag (new LocalImageTag ());
          mParser.setNodeFactory (factory);
-         mCaptureResources = true;
-         mFilter = null;
      }
  
--- 147,150 ----
***************
*** 229,251 ****
  
      }
- 
-     /** Getter for property filter.
-      * @return Value of property filter.
-      *
-      */
-     public NodeFilter getFilter ()
-     {
-         return (mFilter);
-     }
- 
-     /** Setter for property filter.
-      * @param filter New value of property filter.
-      *
-      */
-     public void setFilter (NodeFilter filter)
-     {
-         mFilter = filter;
-     }
- 
      /**
       * Returns <code>true</code> if the link is one we are interested in.
--- 213,216 ----
***************
*** 316,320 ****
          String ret;
  
!         if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
              ret = "index.html"; // handle the root page specially
          else if (link.startsWith (getSource ())
--- 281,285 ----
          String ret;
  
!         if (link.equals (getSource ()))
              ret = "index.html"; // handle the root page specially
          else if (link.startsWith (getSource ())
***************
*** 417,430 ****
       * Process a single page.
       */
!     protected void process (NodeFilter filter)
          throws ParserException
      {
          String url;
-         int bookmark;
          NodeList list;
-         NodeList robots;
-         MetaTag robot;
-         String content;
          File file;
          File dir;
  
--- 382,391 ----
       * Process a single page.
       */
!     protected void process ()
          throws ParserException
      {
          String url;
          NodeList list;
          File file;
          File dir;
  
***************
*** 437,443 ****
          try
!         {
!             bookmark = mPages.size ();
! 
              // fetch the page and gather the list of nodes
              mParser.setURL (url);
              list = new NodeList ();
--- 398,402 ----
          try
!         {
              // fetch the page and gather the list of nodes
              mParser.setURL (url);
              list = new NodeList ();
***************
*** 445,470 ****
                  list.add (e.nextNode ()); // URL conversion occurs in the tags
  
-             // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
-             // <meta name="robots" content="index,follow" />
-             // <meta name="robots" content="noindex,nofollow" />
-             robots = list.extractAllNodesThatMatch (
-                 new AndFilter (
-                     new NodeClassFilter (MetaTag.class),
-                     new HasAttributeFilter ("name", "robots")), true);
-             if (0 != robots.size ())
-             {
-                 robot = (MetaTag)robots.elementAt (0);
-                 content = robot.getAttribute ("content").toLowerCase ();
-                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
-                     // reset mPages
-                     for (int i = bookmark; i < mPages.size (); i++)
-                         mPages.remove (i);
-                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
-                     return;
-             }
- 
-             if (null != filter)
-                 list.keepAllNodesThatMatch (filter, true);
- 
              // save the page locally
              file = new File (getTarget (), makeLocalLink (url, ""));
--- 404,407 ----
***************
*** 472,483 ****
              if (!dir.exists ())
                  dir.mkdirs ();
-             else if (!dir.isDirectory ())
-             {
-                 dir = new File (dir.getParentFile (), dir.getName () + ".content");
-                 if (!dir.exists ())
-                     dir.mkdirs ();
-                 file = new File (dir, file.getName ());
-             }
- 
              try
              {
--- 409,412 ----
***************
*** 652,656 ****
          try
          {
!             process (getFilter ());
              while (0 != mImages.size ())
                  copy ();
--- 581,585 ----
          try
          {
!             process ();
              while (0 != mImages.size ())
                  copy ();
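For readers unfamiliar with the URL rewriting technique the class javadoc mentions, the pattern looks roughly like the sketch below (not code from this commit): a tag subclass overrides doSemanticAction () to rewrite its own URL, and registering an instance with a PrototypicalNodeFactory makes the Parser instantiate that subclass for matching tags, so conversion happens while the page is parsed. The RewritingLinkTag class, the example.com URL, and the rewrite rule are illustrative stand-ins; SiteCapturer itself registers customized tags such as the LocalImageTag visible in the constructor excerpt above.

    import org.htmlparser.Parser;
    import org.htmlparser.PrototypicalNodeFactory;
    import org.htmlparser.tags.LinkTag;
    import org.htmlparser.util.NodeIterator;
    import org.htmlparser.util.ParserException;

    public class RewriteSketch
    {
        // hypothetical subclass, analogous in spirit to SiteCapturer's
        // customized tags: the parser creates one of these for each <A> tag,
        // and doSemanticAction () rewrites the URL in place during parsing
        static class RewritingLinkTag extends LinkTag
        {
            public void doSemanticAction ()
                throws ParserException
            {
                String link = getLink ();
                // illustrative rewrite rule only: strip the site prefix
                if (link.startsWith ("http://example.com/"))
                    setLink (link.substring ("http://example.com/".length ()));
            }
        }

        public static void main (String[] args)
            throws ParserException
        {
            Parser parser = new Parser ("http://example.com/index.html");
            // replace the prototype for link tags with the rewriting subclass
            PrototypicalNodeFactory factory = new PrototypicalNodeFactory ();
            factory.registerTag (new RewritingLinkTag ());
            parser.setNodeFactory (factory);
            // URL conversion occurs in the tags as they are parsed
            for (NodeIterator e = parser.elements (); e.hasMoreNodes (); )
                System.out.println (e.nextNode ().toHtml ());
        }
    }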