[tek-cvs] tek/server/util WgetFetchURLs.java,NONE,1.1 NaiveFetchURLs.java,1.17,1.18

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/tek/tek/server/util
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24312/util

Modified Files:
	NaiveFetchURLs.java 
Added Files:
	WgetFetchURLs.java 
Log Message:
 - improved page simplification (integrated loband on server)
 - added pdf-to-html, ps-to-html
 - you can send and receive searches without registering
 - server sends exactly 10 pages, unless it overflows ~150K
   - fewer pages can sometimes appear if the search terms are not present
 - URLs and domain names appear properly in the client



Index: NaiveFetchURLs.java
===================================================================
RCS file: /cvsroot/tek/tek/server/util/NaiveFetchURLs.java,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -d -r1.17 -r1.18

--- NaiveFetchURLs.java	5 Jan 2004 01:29:38 -0000	1.17
+++ NaiveFetchURLs.java	21 Oct 2005 21:01:03 -0000	1.18
@@ -2,18 +2,11 @@
 
 package tek.server.util ;
 
+import tek.server.data.TEKConstants;
 import java.io.*;
 import java.net.*;
 import java.util.*;
 
-/**
- * @author Bill Thies, Tazeen Mahtab
- * Last Modified: Oct 19, 2001
- *
- * NaiveFetchURLs is
- * <p>
- **/
-
 // ISSUES
 //
 // Prints weird font error when timer goes off, but only on my machine.
@@ -32,13 +25,13 @@
 public class NaiveFetchURLs extends Thread {
 
     private static final boolean DEBUG = true;
-    private static final int TIMEOUT = 20000;   // In milliseconds 
-    private static final int MAX_SIZE = 100000; // In characters/bytes
+    private static final int TIMEOUT = 120000;   // In milliseconds 
+    private static int MAX_SIZE = 100*TEKConstants.DEFAULT_DOWNLOAD_SIZE(); // In characters/bytes
 
     private static Object lock1 = new Object();
     private static Object lock2 = new Object();
   
-    private static Hashtable result; //Maps URL strings to their contents
+    private static String[] result; 
     private static int count;
     private static int numURLs;
     private static boolean images;
@@ -46,6 +39,7 @@
   
     private int index;
     private URL url;
+    private String filename;
     private HttpURLConnection uc; 
     private boolean badURL;  // Indicates whether a URL can be downloaded from
   
@@ -56,11 +50,12 @@
      *
      * @requires url != null
      **/
-    private NaiveFetchURLs(int index, String url) {
+    private NaiveFetchURLs(int index, String url, String filename) {
 	super();
 	this.index = index;
 	try {
 	    this.url = new URL(url);
+	    this.filename = filename;
 	    badURL = false;
 	}catch (MalformedURLException e) {
 	    System.out.println("NaiveFetchURLs:constructor:");
@@ -69,25 +64,27 @@
     }
   
     /**
-     * For all urls, checks the URL for errors and if error-free, 
-     * downloads the contents of the corresponding HTML file
+     * For all urls, checks the URL for errors and if error-free,
+     * downloads the contents and stores in the corresponding
+     * <filename>.
      *
      * @requires that <urls> contains URL Strings of pages to fetch.
-     * @returns hashtable of URL Strings mapped to their corresponing
-     * String contents 
+     * @returns Vector of urls (strings) that successfully downloaded
      **/
-    public static Hashtable fetch(Vector urls, boolean wantImages) 
+    public static Vector fetch(Vector urls, Vector filenames, boolean wantImages) 
 	throws IOException {
 	count = 0;
 	numURLs = urls.size();
 	threads = new NaiveFetchURLs[numURLs];
-	result = new Hashtable();
+	result = new String[urls.size()];
 	images = wantImages;
 
 	// Spawn threads
 	System.out.println("Downloading files...");
 	for (int i = 0; i < numURLs; i++) {
-	    threads[i] = new NaiveFetchURLs(i, (String)urls.elementAt(i));
+	    threads[i] = new NaiveFetchURLs(i, 
+					    (String)urls.elementAt(i),
+					    (String)filenames.elementAt(i));
 	    threads[i].start();
 	}
 
@@ -97,11 +94,14 @@
 		synchronized(lock1) {
 		    lock1.wait(TIMEOUT);
 		}
+		// sleep for a second so we don't kill threads just as
+		// they're exiting (in the event they all notified us)
+		Thread.sleep(1);
 	    } catch (InterruptedException e) {
 		System.out.println("NaiveFetchURLs:fetch: ");
 		e.printStackTrace();
 	    }
-	    System.out.println("Time's up!!!");
+	    System.out.println("All threads done, or timeout.");
 	    synchronized(lock2){
 		// Mark all threads as done
 		count = numURLs;
@@ -118,7 +118,15 @@
 		}
 	    }
 	}
-	return result;
+
+	// build a result vector out of non-null array elements
+	Vector vec = new Vector();
+	for (int i=0; i<result.length; i++) {
+	    if (result[i]!=null) {
+		vec.add(result[i]);
+	    }
+	}
+	return vec;
     }
 
     /**
@@ -143,7 +151,7 @@
 
 		if(!badURL){
 		    // Check URL file type
-		    badURL = !isRightFileType(uc);
+		    badURL = !isRightFileType(uc, url.toString());
 		    if(DEBUG && badURL){
 			System.out.print("Wrong File Type! " + url + " , ");
 		    }		   
@@ -191,7 +199,8 @@
 			// also find and download frames
 	
 			if(!badURL){
-			    result.put(url.toString(), sb.toString());
+			    FileWrite.call(sb.toString().getBytes(), filename);
+			    result[index] = url.toString();
 			}
 		    }catch(SocketException e){
 			badURL = true;
@@ -219,7 +228,7 @@
 	}	
 	if(count != numURLs){
 	    markThreadFinished(); 	
-	}	  
+	}
     }
 
     /**
@@ -238,13 +247,16 @@
      * @requires uc != null
      * @returns true if uc points to right file type, false otherwise
      **/
-    private boolean isRightFileType(HttpURLConnection uc){
+    private boolean isRightFileType(HttpURLConnection uc, String url){
 	boolean rightType = true;
 
 	try{
 	    // Try to determine if file type is text	
 	    String type = uc.getContentType();
-	    if((type != null) && (type.indexOf("text") == -1)){
+	    if((type != null) && 
+	       type.indexOf("text") == -1 &&
+	       !(url.toLowerCase().endsWith("pdf")) &&
+	       !(url.toLowerCase().endsWith("ps"))) {
 		rightType = false;
 	    }
 	}catch(Exception e){
@@ -319,34 +331,37 @@
 	    e.printStackTrace();
 	}
     }
-
+    
     // For testing
     public static void main(String[] args){
 	try{
 	    boolean wantImages = false;
 	    Vector newURLs = new Vector();
-
+	    Vector filenames = new Vector();
+	    filenames.add("result1.html");
+	    filenames.add("result2.html");
+	    filenames.add("result3.html");
+	    
 	    // No URLs
-	    Hashtable downloadedURLs = NaiveFetchURLs.fetch(newURLs, wantImages); 
+	    Vector downloadedURLs = NaiveFetchURLs.fetch(newURLs, 
+							    filenames,
+							    wantImages); 
 
 	    // 1 URL
 	    newURLs.add("http://www.cnn.com/");
-	    downloadedURLs = NaiveFetchURLs.fetch(newURLs, wantImages); 
-
-		Enumeration e = downloadedURLs.keys();
-		while(e.hasMoreElements()){
-		    String url = (String) e.nextElement();
-		 
-		    String content = (String) downloadedURLs.get(url);
-		    //System.out.println(content);
-		}
+	    downloadedURLs = NaiveFetchURLs.fetch(newURLs, filenames, wantImages); 
+	    
+	    Enumeration e = downloadedURLs.elements();
+	    while(e.hasMoreElements()){
+		String url = (String) e.nextElement();
+		System.out.println(url);
+	    }
 	    // 3URLs
 	    newURLs.add("http://web.mit.edu/");
 	    newURLs.add("http://www.ebay.com/");
-	    downloadedURLs = NaiveFetchURLs.fetch(newURLs, wantImages); 
+	    downloadedURLs = NaiveFetchURLs.fetch(newURLs, filenames, wantImages); 
 	}catch(Exception e){
 	    e.printStackTrace();
 	}
     }
-
 }

--- NEW FILE: WgetFetchURLs.java ---
package tek.server.util ;

import tek.server.data.TEKConstants;
import tek.server.protocol.tputils.ServerConstants;
import java.io.*;
import java.net.*;
import java.util.*;

/**
 * Downloads a batch of URLs in parallel.  Each URL is handled by one
 * WgetFetchURLs thread, which runs an external <code>wget</code>
 * (wrapped by the <code>ServerConstants.TExec</code> command with a
 * <code>-t TIMEOUT</code> argument) and stores the page in the file
 * name paired with that URL.
 *
 * All bookkeeping is per call to <code>fetch</code>: every worker
 * holds a reference to the result array of the call that created it,
 * so overlapping calls do not disturb each other's results.
 **/
public class WgetFetchURLs extends Thread {
    private static final int TIMEOUT = 60;   // In seconds

    private final int index;         // slot of this download in <results>
    private final URL url;           // null if the URL string was malformed
    private final String filename;   // file the download is written to
    private final String[] results;  // shared by all workers of one fetch()

    /**
     * Creates (but does not start) the worker for one download.
     *
     * A malformed <url> is reported on stderr and remembered as a
     * null URL; the worker then reports failure without running wget.
     **/
    private WgetFetchURLs(int index, String url, String filename,
                          String[] results) {
        this.index = index;
        // Assigned unconditionally: the cleanup path in run() needs the
        // file name even when the URL below turns out to be malformed.
        this.filename = filename;
        this.results = results;

        URL parsed = null;
        try {
            parsed = new URL(url);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        this.url = parsed;
    }

    /**
     * For all urls, downloads contents and stores in the
     * corresponding <filename>.
     *
     * Blocks until every download has finished (successfully or not).
     * If the calling thread is interrupted while waiting, the wait is
     * abandoned, the interrupt flag is restored, and only the downloads
     * that had already succeeded are returned.
     *
     * @requires that <urls> contains URL Strings of pages to fetch, and
     * that <filenames> has at least as many Strings as <urls>.
     * @param wantImages accepted for interface compatibility; this
     * implementation does not use it.
     * @returns Vector of urls (strings) that successfully downloaded,
     * in the same relative order as <urls>
     * @throws IOException declared for interface compatibility; nothing
     * in this implementation throws it.
     **/
    public static Vector fetch(Vector urls, Vector filenames, boolean wantImages)
        throws IOException {
        int numURLs = urls.size();
        String[] results = new String[numURLs];
        WgetFetchURLs[] workers = new WgetFetchURLs[numURLs];

        System.out.println("Downloading files...");

        // Build every worker before starting any of them, so that a bad
        // argument (e.g. too few filenames) fails before any download
        // has been launched.
        for (int i = 0; i < numURLs; i++) {
            workers[i] = new WgetFetchURLs(i,
                                           (String)urls.elementAt(i),
                                           (String)filenames.elementAt(i),
                                           results);
        }
        for (int i = 0; i < numURLs; i++) {
            workers[i].start();
        }

        // Wait for threads to finish.  join() cannot miss a completion
        // the way a bare wait()/notify() handshake can, and it makes
        // each worker's write to <results> visible to this thread.
        if (numURLs > 0) {
            try {
                for (int i = 0; i < numURLs; i++) {
                    workers[i].join();
                }
            } catch (InterruptedException e) {
                e.printStackTrace();
                // Keep the interrupt visible to our caller.
                Thread.currentThread().interrupt();
            }
            System.out.println("All threads done.");
        }

        // build a result vector out of non-null array elements
        Vector vec = new Vector();
        for (int i = 0; i < results.length; i++) {
            if (results[i] != null) {
                vec.add(results[i]);
            }
        }
        return vec;
    }

    /**
     * Downloads this worker's URL.  On success (wget exit status 0)
     * records the URL in the shared result array; otherwise deletes
     * whatever part of the file was written.
     **/
    public void run() {
        System.out.println("Starting to download #" + index + " " + url);

        boolean succeeded = false;
        if (url != null && filename != null) {
            succeeded = (runWget() == 0);
        }

        if (succeeded) {
            // signal that it worked
            System.out.println("Finished #" + index);
            results[index] = url.toString();
        } else {
            // didn't work --  delete any part of file that was downloaded
            System.out.println("Aborting download of #" + index);

            if (filename != null) {
                File file = new File(filename);
                if (file.exists()) {
                    file.delete();
                }
            }
        }
    }

    /**
     * Runs the external download command and waits for it to exit.
     *
     * @requires url != null && filename != null
     * @returns the exit status of the command, or 1 if it could not be
     * started or the wait was interrupted
     **/
    private int runWget() {
        Process proc = null;
        try {
            proc = Runtime.getRuntime().exec(buildCommand());
            // The command's output is not used, but it has to be read:
            // a child that fills its output pipe blocks until someone
            // empties it.
            drain(proc.getInputStream());
            drain(proc.getErrorStream());
            return proc.waitFor();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Stop the child so it cannot keep writing the file that
            // run() is about to delete, and keep the interrupt visible.
            if (proc != null) {
                proc.destroy();
            }
            Thread.currentThread().interrupt();
        } finally {
            if (proc != null) {
                try {
                    proc.getOutputStream().close();
                } catch (IOException e) {
                    // nothing was written to the child's stdin; a
                    // failure to close it is harmless
                }
            }
        }
        return 1;
    }

    /**
     * Builds the argument vector for the download command.
     *
     * The TExec prefix is split on whitespace exactly as
     * Runtime.exec(String) would split it, but the file name and URL
     * are each passed as a single argument, so whitespace inside them
     * cannot break the command apart.
     **/
    private String[] buildCommand() {
        Vector args = new Vector();
        StringTokenizer prefix = new StringTokenizer("" + ServerConstants.TExec);
        while (prefix.hasMoreTokens()) {
            args.add(prefix.nextToken());
        }
        args.add("-t");
        args.add(String.valueOf(TIMEOUT));
        args.add("wget");
        args.add("--output-document=" + filename);
        args.add(url.toString());

        String[] argv = new String[args.size()];
        args.copyInto(argv);
        return argv;
    }

    /**
     * Starts a daemon thread that reads <in> to end-of-stream,
     * discarding the data, and then closes it.
     **/
    private static void drain(final InputStream in) {
        Thread drainer = new Thread() {
            public void run() {
                byte[] buf = new byte[1024];
                try {
                    while (in.read(buf) != -1) {
                        // discard
                    }
                } catch (IOException e) {
                    // the stream went away with the process; there is
                    // nothing left to read
                } finally {
                    try {
                        in.close();
                    } catch (IOException e) {
                        // already closed or broken; nothing to release
                    }
                }
            }
        };
        // Daemon: a pipe held open by a lingering child process must
        // not keep the JVM alive.
        drainer.setDaemon(true);
        drainer.start();
    }
}