Share

Heritrix: Internet Archive Web Crawler

Tracker: Bugs

6 refactor/compact QuotaEnforcer code - ID: 1371326
Last Update: Comment added ( karl-ia )

Spawned from [ 1369619 ] NPE in QuotaEnforcer

Stack suggests this refactoring to reduce duplicated code:

Index: QuotaEnforcer.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/prefetch/QuotaEnforcer.java,v
retrieving revision 1.1
diff -u -r1.1 QuotaEnforcer.java
--- QuotaEnforcer.java 5 Nov 2005 03:10:17 -0000 1.1
+++ QuotaEnforcer.java 1 Dec 2005 21:08:47 -0000
@@ -114,63 +114,64 @@
}

protected void innerProcess(CrawlURI curi) {
- // check per-server quotas
+ // Check per-server quotas
CrawlServer server =

getController().getServerCache().getServerFor(curi);
- long fetchQuota =
((Long)getUncheckedAttribute(curi,ATTR_SERVER_MAX_FETCH_SUCCESSES)).longVal
ue();
- long bytesQuota =
1024*((Long)getUncheckedAttribute(curi,ATTR_SERVER_MAX_SUCCESS_KB)).longVal
ue();
- CrawlSubstats substats = server.getSubstats();
- if (checkQuota(curi, fetchQuota,
substats.getFetchSuccesses(),
- "Q:server-fetchSuccesses")) {
- return;
- }
- if (checkQuota(curi, bytesQuota,
substats.getSuccessBytes(),
- "Q:server-successBytes")) {
+ if (server != null && checkQuota(curi,
ATTR_SERVER_MAX_FETCH_SUCCESSES,
+ ATTR_SERVER_MAX_SUCCESS_KB,
server.getSubstats(), "server")) {
return;
}

- // check per-host quotas
- CrawlHost host =
-
getController().getServerCache().getHostFor(curi);
- fetchQuota =
((Long)getUncheckedAttribute(curi,ATTR_HOST_MAX_FETCH_SUCCESSES)).longValue
();
- bytesQuota =
1024*((Long)getUncheckedAttribute(curi,ATTR_HOST_MAX_SUCCESS_KB)).longValue
();
- substats = host.getSubstats();
- if (checkQuota(curi, fetchQuota,
substats.getFetchSuccesses(),
- "Q:host-fetchSuccesses")) {
- return;
- }
- if (checkQuota(curi, bytesQuota,
substats.getSuccessBytes(),
- "Q:host-successBytes")) {
+ // Check per-host quotas
+ CrawlHost host =
getController().getServerCache().getHostFor(curi);
+ if (host != null && checkQuota(curi,
ATTR_HOST_MAX_FETCH_SUCCESSES,
+ ATTR_HOST_MAX_SUCCESS_KB,
host.getSubstats(), "host")) {
return;
}

- // check per-frontier-group (queue) quotas
- FrontierGroup group =
- getController().getFrontier().getGroup(curi);
- fetchQuota =
((Long)getUncheckedAttribute(curi,ATTR_GROUP_MAX_FETCH_SUCCESSES)).longValu
e();
- bytesQuota =
1024*((Long)getUncheckedAttribute(curi,ATTR_GROUP_MAX_SUCCESS_KB)).longValu
e();
- substats = group.getSubstats();
- if (checkQuota(curi, fetchQuota,
substats.getFetchSuccesses(),
- "Q:group-fetchSuccesses")) {
+ // Check per-frontier-group (queue) quotas
+ FrontierGroup group =
getController().getFrontier().getGroup(curi);
+ if (group != null && checkQuota(curi,
ATTR_GROUP_MAX_FETCH_SUCCESSES,
+ ATTR_GROUP_MAX_SUCCESS_KB,
group.getSubstats(), "group")) {
return;
}
- if (checkQuota(curi, bytesQuota,
substats.getSuccessBytes(),
- "Q:group-successBytes")) {
- return;
+ }
+
+ protected boolean checkQuota(final CrawlURI curi,
final String successesKey,
+ final String maxKbKey, final CrawlSubstats
substats,
+ final String logKey) {
+ boolean aboveQuota = false;
+ long fetchQuota =
+ ((Long)getUncheckedAttribute(curi,
successesKey)).longValue();
+ long bytesQuota = 1024 *
+ ((Long) getUncheckedAttribute(curi,
maxKbKey)).longValue();
+ if (checkQuota(curi, fetchQuota,
substats.getFetchSuccesses(),
+ "Q:" + logKey + "-fetchSuccesses")) {
+ aboveQuota = true;
+ } else if (checkQuota(curi, bytesQuota,
substats.getSuccessBytes(),
+ "Q:" + logKey + "-successBytes")) {
+ aboveQuota = true;
}
+ return aboveQuota;
}

/**
- * @param serverFetchQuota
- * @param fetchSuccesses
- * @return
+ * Check if the given quota and actual values rule
out processing the
+ * given CrawlURI, and mark up the CrawlURI
appropriately if so.
+ *
+ * @param curi CrawlURI whose processing is
subject to a potential quota
+ * limitation
+ * @param quota quota value, or a negative value if no quota applies
+ * @param actual current value to compare to quota
+ * @param annotate String to mark CrawlURI if
blocked by quota
+ * @return true if CrawlURI is blocked by a quota, false otherwise
*/
- protected boolean checkQuota(CrawlURI curi, long
quota, long actual, String annotate) {
- if(quota >= 0 && actual >= quota) {
+ protected boolean checkQuota(CrawlURI curi, long
quota, long actual,
+ String annotate) {
+ if (quota >= 0 && actual >= quota) {
curi.setFetchStatus(S_BLOCKED_BY_QUOTA);
curi.addAnnotation(annotate);
- curi.skipToProcessorChain(getController().
- getPostprocessorChain());
+
curi.skipToProcessorChain(getController().getPostprocessorChain());
return true;
}
return false;




Gordon Mohr ( gojomo ) - 2005-12-01 23:53

6

Closed

Fixed

Michael Stack

General

1.8.0

Public


Comments ( 2 )

Date: 2007-03-14 01:03
Sender: karl-ia


This issue is now discussed in the new JIRA tracker at
http://webteam.archive.org/jira/browse/HER-525 -- please add further
comments at that location.


Date: 2005-12-07 02:12
Sender: stack-sf (Project Admin)

Logged In: YES
user_id=924942

Patch applied. Closing. Commit below:

Apply '[ 1371326 ] refactor/compact QuotaEnforcer code' patch.
* src/java/org/archive/crawler/prefetch/QuotaEnforcer.java
Remove duplicated code.



Attached File

No Files Currently Attached

Changes ( 5 )

Field Old Value Date By
status_id Open 2005-12-07 02:12 stack-sf
resolution_id None 2005-12-07 02:12 stack-sf
artifact_group_id None 2005-12-07 02:12 stack-sf
assigned_to nobody 2005-12-07 02:12 stack-sf
close_date - 2005-12-07 02:12 stack-sf