Benjamin Hill - 2005-12-03

It seems to work great!  But I'm new to threads, so I'm looking for feedback.

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;

import org.xml.sax.SAXException;

import com.meterware.httpunit.HttpUnitOptions;
import com.meterware.httpunit.WebConversation;
import com.meterware.httpunit.WebLink;
import com.meterware.httpunit.WebResponse;

public abstract class AbstractSiteMiner extends Thread implements
        Iterator<WebLink>, Iterable<WebLink> {

    /**
     * How many links before quitting?
     */
    public int MAX_LINKS = 400;

    /**
     * The URL to start the search from
     */
    private String url;

    private final List<WebLink> links = new LinkedList<WebLink>();

    public AbstractSiteMiner(String theUrl) {
        super(theUrl);
        HttpUnitOptions.setExceptionsThrownOnScriptError(false);
        this.setUrl(theUrl);
    }

    /**
     * Continually try to get the next set of links, until out of links or
     * reaches max.
     */
    public void run() {
        WebConversation wc = new WebConversation();
        int page = 0;
        int totalLinks = 0;
        try {
            Thread.yield();
            WebResponse nextPage = wc.getResponse(this.getUrl());

            while (nextPage != null && links.size() < MAX_LINKS) {
                page++;
                WebResponse wr = nextPage;
                nextPage = null;
                totalLinks += wr.getLinks().length;
                for (WebLink link : wr.getLinks()) {
                    if (isValidLink(link)) {
                        this.links.add(link);
                    }

                    if (isNextPageLink(page, link)) {
                        nextPage = link.click();
                    }
                }
            }
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (Exception e) {
            // System.err.println(this.getName() + " resulted in a " +
            // e.getMessage());
        }

        if (!this.links.isEmpty()) {
            System.out.println("SiteMiner of " + this.getUrl() + " found "
                    + this.links.size() + " valid links out of " + totalLinks);
        }
        // System.out.println("AbstractSiteMiner finished:" + this.getName());
    }

    /**
     * @return if there is a next link (possibly blocking for the thread)
     * @see java.util.Iterator#hasNext()
     */
    public boolean hasNext() {
        this.ensureRunning();
        synchronized (this) {
            while (this.links.isEmpty() && this.isAlive()) {
                try {
                    this.wait(250);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
        return !this.links.isEmpty();
    }

    /**
     *
     */
    private void ensureRunning() {
        if (this.getState() == Thread.State.NEW) {
            this.start();
        }
    }

    /**
     * @param page
     *            Optional page number
     * @param link
     *            Link to test
     * @return if the current link is the link to the next page of results to
     *         scan.
     */
    abstract boolean isNextPageLink(int page, WebLink link);

    /**
     * @param link
     *            Link to test
     * @return if the current link should be included in the iterator
     */
    abstract boolean isValidLink(WebLink link);

    /**
     * @return Instance of self
     * @see java.lang.Iterable#iterator()
     */
    public Iterator<WebLink> iterator() {
        return this;
    }

    /**
     * @return Make sure it is running first.
     * @see java.util.Iterator#next()
     */
    public WebLink next() {
        ensureRunning();
        return this.links.remove(0);
    }

    /**
     * Make sure it is running first.
     *
     * @see java.util.Iterator#remove()
     */
    public void remove() {
        ensureRunning();
        this.links.remove(0);
    }

    /**
     * @return Returns the url.
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @param aUrl
     *            The url to set.
     */
    public void setUrl(String aUrl) {
        this.url = aUrl;
    }
}