[tek-cvs] tek/server/util WgetFetchURLs.java,NONE,1.1 NaiveFetchURLs.java,1.17,1.18
Status: Beta
Brought to you by:
billthies
From: Bill T. <bil...@us...> - 2005-10-21 21:01:12
|
Update of /cvsroot/tek/tek/server/util
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24312/util

Modified Files:
    NaiveFetchURLs.java
Added Files:
    WgetFetchURLs.java
Log Message:
- improved page simplification (integrated loband on server)
- added pdf-to-html, ps-to-html
- you can send and receive searches without registering
- server sends exactly 10 pages, unless it overflows ~150K
- fewer pages can sometimes appear if search terms not present
- URLs and domain names appear properly in the client

Index: NaiveFetchURLs.java =================================================================== RCS file: /cvsroot/tek/tek/server/util/NaiveFetchURLs.java,v retrieving revision 1.17 retrieving revision 1.18 diff -u -d -r1.17 -r1.18 --- NaiveFetchURLs.java 5 Jan 2004 01:29:38 -0000 1.17 +++ NaiveFetchURLs.java 21 Oct 2005 21:01:03 -0000 1.18 @@ -2,18 +2,11 @@ package tek.server.util ; +import tek.server.data.TEKConstants; import java.io.*; import java.net.*; import java.util.*; -/** - * @author Bill Thies, Tazeen Mahtab - * Last Modified: Oct 19, 2001 - * - * NaiveFetchURLs is - * <p> - **/ - // ISSUES // // Prints weird font error when timer goes off, but only on my machine. 
@@ -32,13 +25,13 @@ public class NaiveFetchURLs extends Thread { private static final boolean DEBUG = true; - private static final int TIMEOUT = 20000; // In milliseconds - private static final int MAX_SIZE = 100000; // In characters/bytes + private static final int TIMEOUT = 120000; // In milliseconds + private static int MAX_SIZE = 100*TEKConstants.DEFAULT_DOWNLOAD_SIZE(); // In characters/bytes private static Object lock1 = new Object(); private static Object lock2 = new Object(); - private static Hashtable result; //Maps URL strings to their contents + private static String[] result; private static int count; private static int numURLs; private static boolean images; @@ -46,6 +39,7 @@ private int index; private URL url; + private String filename; private HttpURLConnection uc; private boolean badURL; // Indicates whether a URL can be downloaded from @@ -56,11 +50,12 @@ * * @requires url != null **/ - private NaiveFetchURLs(int index, String url) { + private NaiveFetchURLs(int index, String url, String filename) { super(); this.index = index; try { this.url = new URL(url); + this.filename = filename; badURL = false; }catch (MalformedURLException e) { System.out.println("NaiveFetchURLs:constructor:"); @@ -69,25 +64,27 @@ } /** - * For all urls, checks the URL for errors and if error-free, - * downloads the contents of the corresponding HTML file + * For all urls, checks the URL for errors and if error-free, + * downloads the contents and stores in the corresponding + * <filename>. * * @requires that <urls> contains URL Strings of pages to fetch. 
- * @returns hashtable of URL Strings mapped to their corresponing - * String contents + * @returns Vector of urls (strings) that successfully downloaded **/ - public static Hashtable fetch(Vector urls, boolean wantImages) + public static Vector fetch(Vector urls, Vector filenames, boolean wantImages) throws IOException { count = 0; numURLs = urls.size(); threads = new NaiveFetchURLs[numURLs]; - result = new Hashtable(); + result = new String[urls.size()]; images = wantImages; // Spawn threads System.out.println("Downloading files..."); for (int i = 0; i < numURLs; i++) { - threads[i] = new NaiveFetchURLs(i, (String)urls.elementAt(i)); + threads[i] = new NaiveFetchURLs(i, + (String)urls.elementAt(i), + (String)filenames.elementAt(i)); threads[i].start(); } @@ -97,11 +94,14 @@ synchronized(lock1) { lock1.wait(TIMEOUT); } + // sleep for a second so we don't kill threads just as + // they're exiting (in the event they all notified us) + Thread.sleep(1); } catch (InterruptedException e) { System.out.println("NaiveFetchURLs:fetch: "); e.printStackTrace(); } - System.out.println("Time's up!!!"); + System.out.println("All threads done, or timeout."); synchronized(lock2){ // Mark all threads as done count = numURLs; @@ -118,7 +118,15 @@ } } } - return result; + + // build a result vector out of non-null array elements + Vector vec = new Vector(); + for (int i=0; i<result.length; i++) { + if (result[i]!=null) { + vec.add(result[i]); + } + } + return vec; } /** @@ -143,7 +151,7 @@ if(!badURL){ // Check URL file type - badURL = !isRightFileType(uc); + badURL = !isRightFileType(uc, url.toString()); if(DEBUG && badURL){ System.out.print("Wrong File Type! 
" + url + " , "); } @@ -191,7 +199,8 @@ // also find and download frames if(!badURL){ - result.put(url.toString(), sb.toString()); + FileWrite.call(sb.toString().getBytes(), filename); + result[index] = url.toString(); } }catch(SocketException e){ badURL = true; @@ -219,7 +228,7 @@ } if(count != numURLs){ markThreadFinished(); - } + } } /** @@ -238,13 +247,16 @@ * @requires uc != null * @returns true if uc points to right file type, false otherwise **/ - private boolean isRightFileType(HttpURLConnection uc){ + private boolean isRightFileType(HttpURLConnection uc, String url){ boolean rightType = true; try{ // Try to determine if file type is text String type = uc.getContentType(); - if((type != null) && (type.indexOf("text") == -1)){ + if((type != null) && + type.indexOf("text") == -1 && + !(url.toLowerCase().endsWith("pdf")) && + !(url.toLowerCase().endsWith("ps"))) { rightType = false; } }catch(Exception e){ @@ -319,34 +331,37 @@ e.printStackTrace(); } } - + // For testing public static void main(String[] args){ try{ boolean wantImages = false; Vector newURLs = new Vector(); - + Vector filenames = new Vector(); + filenames.add("result1.html"); + filenames.add("result2.html"); + filenames.add("result3.html"); + // No URLs - Hashtable downloadedURLs = NaiveFetchURLs.fetch(newURLs, wantImages); + Vector downloadedURLs = NaiveFetchURLs.fetch(newURLs, + filenames, + wantImages); // 1 URL newURLs.add("http://www.cnn.com/"); - downloadedURLs = NaiveFetchURLs.fetch(newURLs, wantImages); - - Enumeration e = downloadedURLs.keys(); - while(e.hasMoreElements()){ - String url = (String) e.nextElement(); - - String content = (String) downloadedURLs.get(url); - //System.out.println(content); - } + downloadedURLs = NaiveFetchURLs.fetch(newURLs, filenames, wantImages); + + Enumeration e = downloadedURLs.elements(); + while(e.hasMoreElements()){ + String url = (String) e.nextElement(); + System.out.println(url); + } // 3URLs newURLs.add("http://web.mit.edu/"); 
newURLs.add("http://www.ebay.com/"); - downloadedURLs = NaiveFetchURLs.fetch(newURLs, wantImages); + downloadedURLs = NaiveFetchURLs.fetch(newURLs, filenames, wantImages); }catch(Exception e){ e.printStackTrace(); } } - } --- NEW FILE: WgetFetchURLs.java --- package tek.server.util ; import tek.server.data.TEKConstants; import tek.server.protocol.tputils.ServerConstants; import java.io.*; import java.net.*; import java.util.*; public class WgetFetchURLs extends Thread { private static final int TIMEOUT = 60; // In seconds private static Object lock1 = new Object(); private static Object lock2 = new Object(); private static String[] result; private static int count; private static int numURLs; private static boolean images; private static WgetFetchURLs[] threads; // to store all the threads private int index; private URL url; private String filename; private WgetFetchURLs(int index, String url, String filename) { this.index = index; try { this.url = new URL(url); this.filename = filename; }catch (MalformedURLException e) { e.printStackTrace(); } } /** * For all urls, downloads contents and stores in the * corresponding <filename>. * * @requires that <urls> contains URL Strings of pages to fetch. 
* @returns Vector of urls (strings) that successfully downloaded **/ public static Vector fetch(Vector urls, Vector filenames, boolean wantImages) throws IOException { count = 0; numURLs = urls.size(); threads = new WgetFetchURLs[numURLs]; result = new String[urls.size()]; images = wantImages; // Spawn threads System.out.println("Downloading files..."); for (int i = 0; i < numURLs; i++) { threads[i] = new WgetFetchURLs(i, (String)urls.elementAt(i), (String)filenames.elementAt(i)); threads[i].start(); } // Wait for threads to finish if (numURLs > 0){ try { synchronized(lock1) { lock1.wait(); // sleep for a second so we don't kill threads // just as they are finishing Thread.sleep(1000); } } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("All threads done."); } // build a result vector out of non-null array elements Vector vec = new Vector(); for (int i=0; i<result.length; i++) { if (result[i]!=null) { vec.add(result[i]); } } return vec; } public void run() { System.out.println("Starting to download #" + index + " " + url); int returnVal = 1; try { Process proc = Runtime.getRuntime().exec(ServerConstants.TExec + " -t " + TIMEOUT + " " + "wget --output-document=" + filename + " " + url); proc.waitFor(); returnVal = proc.exitValue(); /* Debug */ /* System.out.println(ServerConstants.TExec + " -t " + TIMEOUT + " " + "wget --output-document=" + filename + " " + url); BufferedReader jOutStream = new BufferedReader(new InputStreamReader(proc.getInputStream())); BufferedReader jErrorStream = new BufferedReader(new InputStreamReader(proc.getErrorStream())); while (jOutStream.ready()) { System.out.println(jOutStream.readLine()); } while (jErrorStream.ready()) { System.out.println(jErrorStream.readLine()); */ } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } if (returnVal == 0) { // signal that it worked System.out.println("Finished #" + index); result[index] = url.toString(); } else { // didn't 
work -- delete any part of file that was downloaded System.out.println("Aborting download of #" + index); File file = new File(filename); if (file.exists()) { file.delete(); } } markThreadFinished(); } /** * Increments the count of finished NaiveFetchURL threads **/ private void markThreadFinished(){ try{ synchronized(lock2) { ++count; if (count == numURLs) { synchronized(lock1) { lock1.notify(); } } } }catch(Exception e){ e.printStackTrace(); } } } |