// NOTE(review): this first copy of AbstractSiteMiner appears truncated. run()
// opens try/while/for blocks that are never closed, has no next-page handling
// or catch clauses, and ensureRunning() is called below but not declared in
// this copy. A complete copy of the class appears later in this file.
/**
 * Thread that gathers links starting from a URL and hands them out through
 * the Iterator / Iterable interfaces it implements.
 */
public abstract class AbstractSiteMiner extends Thread implements
Iterator<WebLink>, Iterable<WebLink> {
/**
 * How many links before quitting? run() only keeps looping while the number
 * of collected links is below this value.
 */
public int MAX_LINKS = 400;
/**
 * The URL to start the search from
 */
private String url;
/**
 * Links accepted by isValidLink(); appended by run() and removed from the
 * head by next() and remove().
 */
private final List<WebLink> links = new LinkedList<WebLink>();
/**
 * Passes the URL to the Thread constructor, calls
 * HttpUnitOptions.setExceptionsThrownOnScriptError(false) and stores the URL.
 *
 * @param theUrl
 * The URL to start the search from
 */
public AbstractSiteMiner(String theUrl) {
super(theUrl);
HttpUnitOptions.setExceptionsThrownOnScriptError(false);
this.setUrl(theUrl);
}
/**
 * Continually try to get the next set of links, until out of links or
 * reaches max.
 */
public void run() {
WebConversation wc = new WebConversation();
int page = 0;
int totalLinks = 0;
try {
Thread.yield();
WebResponse nextPage = wc.getResponse(this.getUrl());
while (nextPage != null && links.size() < MAX_LINKS) {
page++;
WebResponse wr = nextPage;
// Cleared here; nothing in this copy assigns it again.
nextPage = null;
totalLinks += wr.getLinks().length;
for (WebLink link : wr.getLinks()) {
// Keep only the links the subclass accepts.
if (isValidLink(link)) {
this.links.add(link);
}
// Progress report, printed only when at least one link is held.
if (!this.links.isEmpty()) {
System.out.println("SiteMiner of " + this.getUrl() + " found "
+ this.links.size() + " valid links out of " + totalLinks);
}
// System.out.println("AbstractSiteMiner finished:" + this.getName());
}
/**
 * Calls ensureRunning(), then waits in 250 ms slices for as long as no link
 * is held and this thread is still alive.
 *
 * @return if there is a next link (possibly blocking for the thread)
 * @see java.util.Iterator#hasNext()
 */
public boolean hasNext() {
this.ensureRunning();
synchronized (this) {
while (this.links.isEmpty() && this.isAlive()) {
try {
this.wait(250);
} catch (InterruptedException e) {
// The interrupt is only reported; the loop keeps waiting.
e.printStackTrace();
}
}
}
return !this.links.isEmpty();
}
/**
 * @param page
 * Optional page number
 * @param link
 * Link to test
 * @return if the current link is the link to the next page of results to
 * scan.
 */
abstract boolean isNextPageLink(int page, WebLink link);
/**
 * @param link
 * Link to test
 * @return if the current link should be included in the iterator
 */
abstract boolean isValidLink(WebLink link);
/**
 * @return Instance of self
 * @see java.lang.Iterable#iterator()
 */
public Iterator<WebLink> iterator() {
return this;
}
/**
 * Calls ensureRunning(), then removes the first held link and returns it.
 * Does not wait for a link to arrive.
 *
 * @return the first held link
 * @see java.util.Iterator#next()
 */
public WebLink next() {
ensureRunning();
return this.links.remove(0);
}
/**
 * Calls ensureRunning(), then discards the first held link (the one next()
 * would return), not a link that was already returned.
 *
 * @see java.util.Iterator#remove()
 */
public void remove() {
ensureRunning();
this.links.remove(0);
}
/**
 * @return Returns the url.
 */
public String getUrl() {
return this.url;
}
/**
 * @param aUrl
 * The url to set.
 */
public void setUrl(String aUrl) {
this.url = aUrl;
}
}
If you would like to refer to this comment somewhere else in this project, copy and paste the following link:
It seems to work great! But I'm new to threads, so I'm looking for feedback.
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import org.xml.sax.SAXException;
import com.meterware.httpunit.HttpUnitOptions;
import com.meterware.httpunit.WebConversation;
import com.meterware.httpunit.WebLink;
import com.meterware.httpunit.WebResponse;
/**
 * Worker thread that follows a chain of result pages starting at a URL and
 * queues every link accepted by {@link #isValidLink(WebLink)}. The queued
 * links are consumed through the {@link Iterator} / {@link Iterable}
 * interfaces; the worker is started lazily by the first iterator call.
 *
 * <p>
 * The queue is shared between the worker and the consuming thread(s), so
 * every access to it goes through a private lock. Subclass callbacks are
 * never invoked while that lock is held.
 *
 * <p>
 * {@link #iterator()} returns this object, so an instance can be iterated
 * only once.
 */
public abstract class AbstractSiteMiner extends Thread implements
        Iterator<WebLink>, Iterable<WebLink> {
    /**
     * How many links before quitting? The limit is compared, once per page,
     * against the number of links currently queued (not the total ever
     * found), so a single page may push the queue past this value and a
     * consumer that drains the queue lets the crawl continue.
     */
    public int MAX_LINKS = 400;
    /**
     * The URL to start the search from
     */
    private String url;
    /**
     * Guards {@link #links} and {@link #finished}. A dedicated object is
     * used instead of the thread itself because waiting on a Thread instance
     * is discouraged by the Thread documentation.
     */
    private final Object lock = new Object();
    /**
     * Accepted links not yet handed out; guarded by {@link #lock}.
     */
    private final List<WebLink> links = new LinkedList<WebLink>();
    /**
     * Set once {@link #run()} has ended; guarded by {@link #lock}.
     */
    private boolean finished = false;

    /**
     * Creates a miner named after, and starting at, the given URL. Also
     * switches off HttpUnit's exceptions on script errors through the static
     * {@link HttpUnitOptions} setting.
     *
     * @param theUrl
     *            The URL to start the search from
     */
    public AbstractSiteMiner(String theUrl) {
        super(theUrl);
        HttpUnitOptions.setExceptionsThrownOnScriptError(false);
        this.setUrl(theUrl);
    }

    /**
     * Continually try to get the next set of links, until out of links or
     * reaches max. Waiting consumers are always released when the crawl
     * ends, whether it ended normally or not.
     */
    @Override
    public void run() {
        try {
            mine();
        } finally {
            synchronized (this.lock) {
                this.finished = true;
                this.lock.notifyAll();
            }
        }
    }

    /**
     * Crawls page after page, queueing valid links and following the
     * next-page link, then reports how many links are still queued.
     */
    private void mine() {
        WebConversation wc = new WebConversation();
        int page = 0;
        int totalLinks = 0;
        try {
            Thread.yield();
            WebResponse nextPage = wc.getResponse(this.getUrl());
            while (nextPage != null && queuedLinkCount() < MAX_LINKS) {
                page++;
                WebResponse wr = nextPage;
                nextPage = null;
                // Fetch the link array once instead of once per use.
                WebLink[] pageLinks = wr.getLinks();
                totalLinks += pageLinks.length;
                for (WebLink link : pageLinks) {
                    if (isValidLink(link)) {
                        enqueue(link);
                    }
                    if (isNextPageLink(page, link)) {
                        nextPage = link.click();
                    }
                }
            }
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (Exception e) {
            // Best effort: the crawl simply stops, but the cause is reported
            // instead of being silently dropped.
            System.err.println(this.getName() + " resulted in a " + e);
        }
        int queued = queuedLinkCount();
        if (queued > 0) {
            System.out.println("SiteMiner of " + this.getUrl() + " found "
                    + queued + " valid links out of " + totalLinks);
        }
    }

    /**
     * Appends a link to the queue and wakes any waiting consumer.
     */
    private void enqueue(WebLink link) {
        synchronized (this.lock) {
            this.links.add(link);
            this.lock.notifyAll();
        }
    }

    /**
     * @return the number of links currently queued
     */
    private int queuedLinkCount() {
        synchronized (this.lock) {
            return this.links.size();
        }
    }

    /**
     * Blocks until a link is queued or the worker has ended. If the calling
     * thread is interrupted while waiting, its interrupt status is restored
     * and the method returns immediately with the current queue state.
     *
     * @return if there is a next link (possibly blocking for the thread)
     * @see java.util.Iterator#hasNext()
     */
    @Override
    public boolean hasNext() {
        this.ensureRunning();
        synchronized (this.lock) {
            // The timed wait and isAlive() check remain as a fallback in
            // case run() is overridden and never marks the miner finished.
            while (this.links.isEmpty() && !this.finished && this.isAlive()) {
                try {
                    this.lock.wait(250);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            return !this.links.isEmpty();
        }
    }

    /**
     * Starts the worker thread if it has not been started yet. The check
     * and the start happen under the lock so that two consumers cannot both
     * call {@link #start()}.
     */
    private void ensureRunning() {
        synchronized (this.lock) {
            if (this.getState() == Thread.State.NEW) {
                this.start();
            }
        }
    }

    /**
     * @param page
     *            Optional page number
     * @param link
     *            Link to test
     * @return if the current link is the link to the next page of results to
     *         scan.
     */
    abstract boolean isNextPageLink(int page, WebLink link);

    /**
     * @param link
     *            Link to test
     * @return if the current link should be included in the iterator
     */
    abstract boolean isValidLink(WebLink link);

    /**
     * @return Instance of self
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<WebLink> iterator() {
        return this;
    }

    /**
     * Removes and returns the oldest queued link, waiting for the worker
     * (as {@link #hasNext()} does) when the queue is momentarily empty.
     *
     * @return the next valid link
     * @throws NoSuchElementException
     *             if the queue is empty and no further link will arrive
     * @see java.util.Iterator#next()
     */
    @Override
    public WebLink next() {
        hasNext();
        synchronized (this.lock) {
            // Re-checked under the lock: another consumer may have taken
            // the link between hasNext() and here.
            if (this.links.isEmpty()) {
                throw new NoSuchElementException("No more links from "
                        + this.getUrl());
            }
            return this.links.remove(0);
        }
    }

    /**
     * Discards the oldest queued link, i.e. the one {@link #next()} would
     * return. Note that this differs from the usual {@link Iterator#remove()}
     * meaning, because links handed out by {@link #next()} have already left
     * the queue.
     *
     * @throws IndexOutOfBoundsException
     *             if no link is queued
     * @see java.util.Iterator#remove()
     */
    @Override
    public void remove() {
        ensureRunning();
        synchronized (this.lock) {
            this.links.remove(0);
        }
    }

    /**
     * @return Returns the url.
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @param aUrl
     *            The url to set.
     */
    public void setUrl(String aUrl) {
        this.url = aUrl;
    }
}