[Htmlparser-cvs] htmlparser/src/org/htmlparser/parserapplications WikiCapturer.java,NONE,1.1 SiteCap
From: <der...@us...> - 2004-01-10 00:06:06
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications
In directory sc8-pr-cvs1:/tmp/cvs-serv24025/parserapplications

Modified Files:
	SiteCapturer.java
Added Files:
	WikiCapturer.java
Log Message:
First pass at the wiki capturer.
Added useful extensions to the HasAttributeFilter, SiteCapturer and NodeList

--- NEW FILE: WikiCapturer.java ---

// HTMLParser Library $Name: $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/WikiCapturer.java,v $
// $Author: derrickoswald $
// $Date: 2004/01/10 00:06:03 $
// $Revision: 1.1 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.parserapplications;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import javax.swing.JFileChooser;
import javax.swing.JOptionPane;

import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;

/**
 * Save a wikiwikiweb locally.
 * Illustrative program to save a wiki locally.
 */
public class WikiCapturer extends SiteCapturer
{
    /**
     * Create a wikicapturer.
     */
    public WikiCapturer ()
    {
    }

    /**
     * Mainline to capture a web site locally.
     * @param args The command line arguments.
     * There are three arguments: the web site to capture, the local directory
     * to save it to, and a flag (true or false) to indicate whether resources
     * such as images and video are to be captured as well.
     * These are requested via dialog boxes if not supplied.
     */
    public static void main (String[] args)
        throws MalformedURLException, IOException
    {
        WikiCapturer worker;
        String url;
        JFileChooser chooser;
        URL source;
        String path;
        File target;
        Boolean capture;
        int ret;

        worker = new WikiCapturer ();
        if (0 >= args.length)
        {
            url = (String)JOptionPane.showInputDialog (
                null,
                "Enter the URL to capture:",
                "Web Site",
                JOptionPane.PLAIN_MESSAGE,
                null,
                null,
                "http://htmlparser.sourceforge.net/wiki");
            if (null != url)
                worker.setSource (url);
            else
                System.exit (1);
        }
        else
            worker.setSource (args[0]);
        if (1 >= args.length)
        {
            url = worker.getSource ();
            source = new URL (url);
            path = new File (new File ("." + File.separator),
                source.getHost () + File.separator).getCanonicalPath ();
            target = new File (path);
            chooser = new JFileChooser (target);
            chooser.setDialogType (JFileChooser.SAVE_DIALOG);
            chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
            chooser.setSelectedFile (target); // this doesn't frickin' work
            chooser.setMultiSelectionEnabled (false);
            chooser.setDialogTitle ("Target Directory");
            ret = chooser.showSaveDialog (null);
            if (ret == JFileChooser.APPROVE_OPTION)
                worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
            else
                System.exit (1);
        }
        else
            worker.setTarget (args[1]);
        if (2 >= args.length)
        {
            capture = (Boolean)JOptionPane.showInputDialog (
                null,
                "Should resources be captured:",
                "Capture Resources",
                JOptionPane.PLAIN_MESSAGE,
                null,
                new Object[] { Boolean.TRUE, Boolean.FALSE },
                Boolean.TRUE);
            if (null != capture)
                worker.setCaptureResources (capture.booleanValue ());
            else
                System.exit (1);
        }
        else
            worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));
        worker.setFilter (
            new NotFilter (
                new OrFilter (
                    new AndFilter (
                        new TagNameFilter ("DIV"),
                        new HasAttributeFilter ("id", "navbar")),
                    new OrFilter (
                        new AndFilter (
                            new TagNameFilter ("DIV"),
                            new HasAttributeFilter ("id", "actionbar")),
                        new AndFilter (
                            new TagNameFilter ("DIV"),
                            new HasAttributeFilter ("id", "xhtml-validator"))))));
        worker.capture ();

        System.exit (0);
    }
}

Index: SiteCapturer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** SiteCapturer.java	4 Jan 2004 03:23:09 -0000	1.2
--- SiteCapturer.java	10 Jan 2004 00:06:03 -0000	1.3
***************
*** 42,51 ****
--- 42,57 ----
  import org.htmlparser.Node;
+ import org.htmlparser.NodeFilter;
  import org.htmlparser.Parser;
  import org.htmlparser.PrototypicalNodeFactory;
+ import org.htmlparser.filters.AndFilter;
+ import org.htmlparser.filters.HasAttributeFilter;
+ import org.htmlparser.filters.NodeClassFilter;
+ import org.htmlparser.lexer.nodes.Attribute;
  import org.htmlparser.tags.BaseHrefTag;
  import org.htmlparser.tags.FrameTag;
  import org.htmlparser.tags.ImageTag;
  import org.htmlparser.tags.LinkTag;
+ import org.htmlparser.tags.MetaTag;
  import org.htmlparser.util.NodeIterator;
  import org.htmlparser.util.NodeList;
***************
*** 54,58 ****
  /**
   * Save a web site locally.
!  * Illustrative prgram to save a web site contents locally.
   * It was created to demonstrate URL rewriting in it's simplest form.
   * It uses customized tags in the NodeFactory to alter the URLs.
--- 60,64 ----
  /**
   * Save a web site locally.
!  * Illustrative program to save a web site contents locally.
   * It was created to demonstrate URL rewriting in it's simplest form.
   * It uses customized tags in the NodeFactory to alter the URLs.
***************
*** 125,128 ****
--- 131,139 ----
      /**
+      * The filter to apply to the nodes retrieved.
+      */
+     protected NodeFilter mFilter;
+ 
+     /**
       * Copy buffer size.
       * Resources are moved to disk in chunks this size or less.
***************
*** 137,140 ****
--- 148,153 ----
          PrototypicalNodeFactory factory;
  
+         mSource = null;
+         mTarget = null;
          mPages = new ArrayList ();
          mFinished = new HashSet ();
***************
*** 148,151 ****
--- 161,166 ----
          factory.registerTag (new LocalImageTag ());
          mParser.setNodeFactory (factory);
+         mCaptureResources = true;
+         mFilter = null;
      }
***************
*** 214,217 ****
--- 229,251 ----
      }
+ 
+     /** Getter for property filter.
+      * @return Value of property filter.
+      *
+      */
+     public NodeFilter getFilter ()
+     {
+         return (mFilter);
+     }
+ 
+     /** Setter for property filter.
+      * @param filter New value of property filter.
+      *
+      */
+     public void setFilter (NodeFilter filter)
+     {
+         mFilter = filter;
+     }
+ 
      /**
       * Returns <code>true</code> if the link is one we are interested in.
***************
*** 282,286 ****
          String ret;
  
!         if (link.equals (getSource ()))
              ret = "index.html"; // handle the root page specially
          else if (link.startsWith (getSource ())
--- 316,320 ----
          String ret;
  
!         if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
              ret = "index.html"; // handle the root page specially
          else if (link.startsWith (getSource ())
***************
*** 383,392 ****
       * Process a single page.
       */
!     protected void process ()
          throws ParserException
      {
          String url;
          NodeList list;
          File file;
          File dir;
--- 417,430 ----
       * Process a single page.
       */
!     protected void process (NodeFilter filter)
          throws ParserException
      {
          String url;
+         int bookmark;
          NodeList list;
+         NodeList robots;
+         MetaTag robot;
+         String content;
          File file;
          File dir;
***************
*** 399,403 ****
          try
!         {   // fetch the page and gather the list of nodes
              mParser.setURL (url);
              list = new NodeList ();
--- 437,443 ----
          try
!         {
!             bookmark = mPages.size ();
!             // fetch the page and gather the list of nodes
              mParser.setURL (url);
              list = new NodeList ();
***************
*** 405,408 ****
--- 445,470 ----
              list.add (e.nextNode ()); // URL conversion occurs in the tags
  
+             // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
+             // <meta name="robots" content="index,follow" />
+             // <meta name="robots" content="noindex,nofollow" />
+             robots = list.extractAllNodesThatMatch (
+                 new AndFilter (
+                     new NodeClassFilter (MetaTag.class),
+                     new HasAttributeFilter ("name", "robots")), true);
+             if (0 != robots.size ())
+             {
+                 robot = (MetaTag)robots.elementAt (0);
+                 content = robot.getAttribute ("content").toLowerCase ();
+                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
+                     // reset mPages
+                     for (int i = bookmark; i < mPages.size (); i++)
+                         mPages.remove (i);
+                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
+                     return;
+             }
+ 
+             if (null != filter)
+                 list.keepAllNodesThatMatch (filter, true);
+ 
              // save the page locally
              file = new File (getTarget (), makeLocalLink (url, ""));
***************
*** 410,413 ****
--- 472,483 ----
              if (!dir.exists ())
                  dir.mkdirs ();
+             else if (!dir.isDirectory ())
+             {
+                 dir = new File (dir.getParentFile (), dir.getName () + ".content");
+                 if (!dir.exists ())
+                     dir.mkdirs ();
+                 file = new File (dir, file.getName ());
+             }
+ 
              try
              {
***************
*** 582,586 ****
          try
          {
!             process ();
              while (0 != mImages.size ())
                  copy ();
--- 652,656 ----
          try
          {
!             process (getFilter ());
              while (0 != mImages.size ())
                  copy ();
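
As a rough illustration of the recursive NodeList filtering this change relies on (the two-argument keepAllNodesThatMatch called in process() above), here is a minimal, self-contained sketch. The URL and the "navbar" id are placeholders borrowed from WikiCapturer, not part of the commit, and the snippet assumes revision 1.3 of the library with the filter classes shown above on the classpath.

import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class StripNavbarExample
{
    public static void main (String[] args) throws ParserException
    {
        // fetch the page and gather the list of nodes, same pattern as SiteCapturer.process()
        Parser parser = new Parser ("http://htmlparser.sourceforge.net/wiki"); // example URL
        NodeList list = new NodeList ();
        for (NodeIterator e = parser.elements (); e.hasMoreNodes (); )
            list.add (e.nextNode ());

        // keep everything except DIV tags whose id is "navbar";
        // the second argument requests a recursive descent into child nodes
        list.keepAllNodesThatMatch (
            new NotFilter (
                new AndFilter (
                    new TagNameFilter ("DIV"),
                    new HasAttributeFilter ("id", "navbar"))),
            true);

        // dump the filtered markup
        for (int i = 0; i < list.size (); i++)
            System.out.println (list.elementAt (i).toHtml ());
    }
}

WikiCapturer hands the same kind of NotFilter composition to setFilter(), so each captured page is stripped of the wiki chrome before it is written to disk.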