Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/filter In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27389/src/java/org/archive/crawler/filter Modified Files: PathDepthFilter.java URIRegExpFilter.java PathologicalPathFilter.java OrFilter.java Log Message: * Changed getUriString in UURI to getURIString to match CandidateURI * Added possibility to block URI in the preselector * Fixed bugs in Scope where excludefilter didn't work * Changed inverted attribute on filters to show what to return if filter matches. Index: PathDepthFilter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/filter/PathDepthFilter.java,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** PathDepthFilter.java 20 Feb 2004 00:56:08 -0000 1.7 --- PathDepthFilter.java 27 Feb 2004 02:00:54 -0000 1.8 *************** *** 41,48 **** */ public class PathDepthFilter extends Filter { ! public static final String ATTR_INVERTED = "path-deeper-than"; public static final String ATTR_MAX_PATH_DEPTH = "max-path-depth"; Integer maxPathDepth = new Integer(Integer.MAX_VALUE); ! char slash = '/'; String path; --- 41,49 ---- */ public class PathDepthFilter extends Filter { ! public static final String ATTR_MATCH_RETURN_VALUE = ! "path-less-or-equal-return"; public static final String ATTR_MAX_PATH_DEPTH = "max-path-depth"; Integer maxPathDepth = new Integer(Integer.MAX_VALUE); ! final static char slash = '/'; String path; *************** *** 51,64 **** */ public PathDepthFilter(String name) { ! super(name, "Path depth filter"); ! addElementToDefinition(new SimpleType(ATTR_MAX_PATH_DEPTH, "Max path depth", maxPathDepth)); ! addElementToDefinition( ! new SimpleType( ! ATTR_INVERTED, ! "Allow only paths deeper then max path depth. \nNormally max path" ! + "depth means that only URIs with shorter paths are accepted," ! 
+ "setting this to true means that max path depth becomes (in " ! + "effect) minimum path depth.", ! new Boolean(false))); } --- 52,61 ---- */ public PathDepthFilter(String name) { ! super(name, "Path depth less or equal filter"); ! addElementToDefinition(new SimpleType(ATTR_MAX_PATH_DEPTH, "Max path" + ! " depth for which this filter will match", maxPathDepth)); ! addElementToDefinition(new SimpleType(ATTR_MATCH_RETURN_VALUE, ! "What to return when path depth is less or equal to max path" + ! " depth. \n", new Boolean(true))); } *************** *** 87,91 **** if (o instanceof CrawlURI) { try { ! maxPathDepth = (Integer) getAttribute(ATTR_MAX_PATH_DEPTH, (CrawlURI) o); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); --- 84,89 ---- if (o instanceof CrawlURI) { try { ! maxPathDepth = (Integer) getAttribute( ! ATTR_MAX_PATH_DEPTH, (CrawlURI) o); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); *************** *** 98,109 **** * @see org.archive.crawler.framework.Filter#applyInversion() */ ! protected boolean applyInversion(CrawlURI curi) { ! boolean inverter = false; try { ! inverter = ((Boolean) getAttribute(ATTR_INVERTED, curi)).booleanValue(); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); } - return inverter; } } --- 96,107 ---- * @see org.archive.crawler.framework.Filter#applyInversion() */ ! protected boolean returnTrueIfMatches(CrawlURI curi) { try { ! return ((Boolean) getAttribute( ! 
ATTR_MATCH_RETURN_VALUE, curi)).booleanValue(); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); + return true; } } } Index: URIRegExpFilter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/filter/URIRegExpFilter.java,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** URIRegExpFilter.java 23 Feb 2004 19:18:49 -0000 1.8 --- URIRegExpFilter.java 27 Feb 2004 02:00:54 -0000 1.9 *************** *** 42,46 **** public class URIRegExpFilter extends Filter { public static final String ATTR_REGEXP = "regexp"; ! public static final String ATTR_INVERTED = "accept-matches"; /** --- 42,46 ---- public class URIRegExpFilter extends Filter { public static final String ATTR_REGEXP = "regexp"; ! public static final String ATTR_MATCH_RETURN_VALUE = "if-match-return"; /** *************** *** 50,61 **** super(name, "URI regexp filter."); addElementToDefinition( ! new SimpleType( ! ATTR_INVERTED, ! "Only allow matches. \nIf set to true all URIs matching the " ! + "regular expression will be allowed and only those that " ! + "don't match will be filtered out. If false then URIs " ! + "matching the regular expression will be filtered out " ! + "others will be accepted.", ! new Boolean(false))); addElementToDefinition( new SimpleType(ATTR_REGEXP, "Java regular expression.", "")); --- 50,55 ---- super(name, "URI regexp filter."); addElementToDefinition( ! new SimpleType(ATTR_MATCH_RETURN_VALUE, "What to return when" + ! " regular expression matches. \n", new Boolean(true))); addElementToDefinition( new SimpleType(ATTR_REGEXP, "Java regular expression.", "")); *************** *** 71,75 **** input = ((CandidateURI)o).getURIString(); } else if (o instanceof UURI ){ ! input = ((UURI)o).getUriString(); } else { //TODO handle other inputs --- 65,69 ---- input = ((CandidateURI)o).getURIString(); } else if (o instanceof UURI ){ ! 
input = ((UURI)o).getURIString(); } else { //TODO handle other inputs *************** *** 102,113 **** * @see org.archive.crawler.framework.Filter#applyInversion() */ ! protected boolean applyInversion(CrawlURI curi) { ! boolean inverter = false; try { ! inverter = ((Boolean) getAttribute(ATTR_INVERTED, curi)).booleanValue(); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); } - return inverter; } } --- 96,106 ---- * @see org.archive.crawler.framework.Filter#applyInversion() */ ! protected boolean returnTrueIfMatches(CrawlURI curi) { try { ! return ((Boolean) getAttribute(ATTR_MATCH_RETURN_VALUE, curi)).booleanValue(); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); + return true; } } } Index: PathologicalPathFilter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/filter/PathologicalPathFilter.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** PathologicalPathFilter.java 24 Feb 2004 01:00:49 -0000 1.2 --- PathologicalPathFilter.java 27 Feb 2004 02:00:54 -0000 1.3 *************** *** 51,59 **** public PathologicalPathFilter(String name) { super(name); ! setDescription("Pathological path filter. The Pathologicalpath filter" + " is used to avoid crawler traps by adding a constraint on" + " how many times a pattern in the URI could be repeated."); ! Type type = getElementFromDefinition(ATTR_INVERTED); type.setTransient(true); --- 51,59 ---- public PathologicalPathFilter(String name) { super(name); ! setDescription("Pathological path filter. \nThe Pathologicalpath filter" + " is used to avoid crawler traps by adding a constraint on" + " how many times a pattern in the URI could be repeated."); ! Type type = getElementFromDefinition(ATTR_MATCH_RETURN_VALUE); type.setTransient(true); *************** *** 62,66 **** addElementToDefinition(new SimpleType(ATTR_REPETITIONS, ! 
"Number of times the pattern should be allowed to occur.", DEFAULT_REPETITIONS)); } --- 62,68 ---- addElementToDefinition(new SimpleType(ATTR_REPETITIONS, ! "Number of times the pattern should be allowed to occur. \n" + ! "This filter returns true if number of repetitions of a" + ! " pattern exceeds this value", DEFAULT_REPETITIONS)); } Index: OrFilter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/filter/OrFilter.java,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** OrFilter.java 20 Feb 2004 00:56:08 -0000 1.12 --- OrFilter.java 27 Feb 2004 02:00:54 -0000 1.13 *************** *** 45,52 **** */ public class OrFilter extends Filter { ! public static final String ATTR_INVERTED = "make-filter-XOR"; ! ! private MapType filters; ! public OrFilter(String name, String description) { this(name); --- 45,51 ---- */ public class OrFilter extends Filter { ! public static final String ATTR_MATCH_RETURN_VALUE = "if-matches-return"; ! public static final String ATTR_FILTERS = "filters"; ! public OrFilter(String name, String description) { this(name); *************** *** 60,80 **** super( name, ! "OR Filter. \nA filter that serves as a placeholder for other filters who's functionality should be logically OR'ed together."); ! filters = ! new MapType( ! "filters", ! "This is a list of filters who's functionality should be logically or'ed together by the OrFilter.", ! Filter.class); addElementToDefinition( new SimpleType( ! ATTR_INVERTED, ! "Turn the filter into an XOR filter. \nIf true, instead of " ! + "filtering out anything that any of the filters added to it " ! + "matches, it will only filter out URIs that none of them " ! + "matches.", ! new Boolean(false))); ! addElementToDefinition(filters); } protected boolean innerAccepts(Object o) { if (isEmpty(o)) { --- 59,91 ---- super( name, ! "OR Filter. 
\nA filter that serves as a placeholder for other" + ! " filters who's functionality should be logically OR'ed together."); ! addElementToDefinition( new SimpleType( ! ATTR_MATCH_RETURN_VALUE, ! "What to return when one of the filters matches. \nIf true, " ! + "this filter will return true if one of the subfilters " ! + "return true, false otherwise. If false, this filter " ! + "will return false if one of the subfilters" ! + "return true, false otherwise.", ! new Boolean(true))); ! ! addElementToDefinition(new MapType(ATTR_FILTERS, ! "This is a list of filters who's functionality should be" + ! " logically or'ed together by the OrFilter.", Filter.class)); } + private MapType getFilters (Object o) { + try { + MapType filters = (MapType) getAttribute( + getSettingsFromObject(o), ATTR_FILTERS); + return filters; + } catch (AttributeNotFoundException e) { + logger.severe(e.getLocalizedMessage()); + return null; + } + } + protected boolean innerAccepts(Object o) { if (isEmpty(o)) { *************** *** 93,97 **** public void addFilter(CrawlerSettings settings, Filter f) { try { ! filters.addElement(settings, f); } catch (InvalidAttributeValueException e) { logger.severe(e.getMessage()); --- 104,108 ---- public void addFilter(CrawlerSettings settings, Filter f) { try { ! getFilters(settings).addElement(settings, f); } catch (InvalidAttributeValueException e) { logger.severe(e.getMessage()); *************** *** 100,108 **** public boolean isEmpty(Object o) { ! return filters.isEmpty(getSettingsFromObject(o)); } public Iterator iterator(Object o) { ! return filters.iterator(getSettingsFromObject(o)); } --- 111,119 ---- public boolean isEmpty(Object o) { ! return getFilters(o).isEmpty(getSettingsFromObject(o)); } public Iterator iterator(Object o) { ! return getFilters(o).iterator(getSettingsFromObject(o)); } *************** *** 110,121 **** * @see org.archive.crawler.framework.Filter#applyInversion() */ ! protected boolean applyInversion(CrawlURI curi) { ! 
boolean inverter = false; try { ! inverter = ((Boolean) getAttribute(ATTR_INVERTED, curi)).booleanValue(); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); } - return inverter; } } --- 121,131 ---- * @see org.archive.crawler.framework.Filter#applyInversion() */ ! protected boolean returnTrueIfMatches(CrawlURI curi) { try { ! return ((Boolean) getAttribute(ATTR_MATCH_RETURN_VALUE, curi)).booleanValue(); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); + return true; } } } |