You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <nl...@ar...> - 2012-02-21 04:11:15
|
Wayback-1 - Build # 118 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/118/ to view the results. |
From: <ikr...@us...> - 2012-02-21 04:05:45
|
Revision: 3618 http://archive-access.svn.sourceforge.net/archive-access/?rev=3618&view=rev Author: ikreymer Date: 2012-02-21 04:05:39 +0000 (Tue, 21 Feb 2012) Log Message: ----------- FEATURE: Add shutdown() method to ExclusionFilter, allowing for a filter to perform cleanup (such as closing resources) after a request is complete, not just after every capture check. The shutdown() method is optional and is a no-op by default, except in CompositeExclusionFilter which propagates the shutdown to other its members Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2012-02-21 01:39:14 UTC (rev 3617) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2012-02-21 04:05:39 UTC (rev 3618) @@ -30,9 +30,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.archive.util.ArchiveUtils; +import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.Resource; -import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.LiveDocumentNotAvailableException; import 
org.archive.wayback.exception.LiveWebCacheUnavailableException; import org.archive.wayback.exception.LiveWebTimeoutException; @@ -97,7 +96,7 @@ sb = new StringBuilder(100); } - private String hostToRobotUrlString(String host) { + protected String hostToRobotUrlString(String host) { sb.setLength(0); sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX); String robotUrl = sb.toString(); @@ -177,15 +176,15 @@ rulesCache.put(firstUrlString, rules); } } else { + long start = System.currentTimeMillis();; try { LOGGER.fine("ROBOT: NotCached - Downloading("+urlString+")"); tmpRules = new RobotRules(); - long start = System.currentTimeMillis(); Resource resource = webCache.getCachedResource(new URL(urlString), maxCacheMS,true); - long elapsed = System.currentTimeMillis() - start; - PerformanceLogger.noteElapsed("RobotRequest", elapsed, urlString); + //long elapsed = System.currentTimeMillis() - start; + //PerformanceLogger.noteElapsed("RobotRequest", elapsed, urlString); if(resource.getStatusCode() != 200) { LOGGER.info("ROBOT: NotAvailable("+urlString+")"); @@ -214,6 +213,9 @@ LOGGER.severe("ROBOT: LiveDocumentTimedOutException("+urlString+")"); filterGroup.setRobotTimedOut(); return null; + } finally { + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("RobotRequest", elapsed, urlString); } } } @@ -269,4 +271,8 @@ } return filterResult; } + + public LiveWebCache getWebCache() { + return webCache; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java 2012-02-21 01:39:14 UTC (rev 3617) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/CompositeExclusionFilter.java 2012-02-21 
04:05:39 UTC (rev 3618) @@ -69,4 +69,12 @@ } return FILTER_INCLUDE; } + + @Override + public void shutdown() + { + for (ExclusionFilter filter : filters) { + filter.shutdown(); + } + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ExclusionFilter.java 2012-02-21 01:39:14 UTC (rev 3617) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ExclusionFilter.java 2012-02-21 04:05:39 UTC (rev 3618) @@ -32,4 +32,9 @@ public void setFilterGroup(ExclusionCaptureFilterGroup filterGroup) { this.filterGroup = filterGroup; } + + public void shutdown() + { + + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2012-02-21 01:39:14 UTC (rev 3617) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2012-02-21 04:05:39 UTC (rev 3618) @@ -192,6 +192,7 @@ WaybackRequest wbRequest = null; boolean handled = false; + ExclusionFilter exclusionFilter = null; try { String inputPath = translateRequestPathQuery(httpRequest); @@ -219,7 +220,7 @@ } if(getExclusionFactory() != null) { - ExclusionFilter exclusionFilter = + exclusionFilter = getExclusionFactory().get(); if(exclusionFilter == null) { throw new AdministrativeAccessControlException( @@ -279,7 +280,12 @@ getException().renderException(httpRequest, httpResponse, wbRequest, e, getUriConverter()); } + } finally { + if (exclusionFilter != null) { + exclusionFilter.shutdown(); + } } + return handled; } This 
was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-02-21 01:46:48
|
Wayback-1 - Build # 117 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/117/ to view the results. |
From: <ikr...@us...> - 2012-02-21 01:39:23
|
Revision: 3617 http://archive-access.svn.sourceforge.net/archive-access/?rev=3617&view=rev Author: ikreymer Date: 2012-02-21 01:39:14 +0000 (Tue, 21 Feb 2012) Log Message: ----------- FIX: Make RobotExclusionFilter properties protected for subclasses overrides ADD: Missing ProxyInfo for ProxyAccessPoint support Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ProxyInfo.jsp Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2012-02-18 08:34:32 UTC (rev 3616) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2012-02-21 01:39:14 UTC (rev 3617) @@ -62,16 +62,16 @@ private final static Logger LOGGER = Logger.getLogger(RobotExclusionFilter.class.getName()); - private final static String HTTP_PREFIX = "http://"; - private final static String ROBOT_SUFFIX = "/robots.txt"; + protected final static String HTTP_PREFIX = "http://"; + protected final static String ROBOT_SUFFIX = "/robots.txt"; - private static String WWWN_REGEX = "^www[0-9]+\\."; - private final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX); + protected static String WWWN_REGEX = "^www[0-9]+\\."; + protected final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX); private LiveWebCache webCache = null; private HashMap<String,RobotRules> rulesCache = null; private long maxCacheMS = 0; private String userAgent = null; - private StringBuilder sb = null; + protected StringBuilder sb = null; 
private final static RobotRules emptyRules = new RobotRules(); private boolean notifiedSeen = false; private boolean notifiedPassed = false; Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ProxyInfo.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ProxyInfo.jsp (rev 0) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ProxyInfo.jsp 2012-02-21 01:39:14 UTC (rev 3617) @@ -0,0 +1,49 @@ +<%@ page language="java" pageEncoding="utf-8" contentType="text/html;charset=utf-8"%> +<%@ page import="java.util.ArrayList" %> +<%@ page import="org.archive.wayback.core.UIResults" %> +<%@ page import="org.archive.wayback.util.StringFormatter" %> +<%@ page import="org.archive.wayback.accesspoint.proxy.ProxyAccessPoint" %> +<%@ page import="org.archive.wayback.accesspoint.AccessPointConfigs" %> +<%@ page import="org.archive.wayback.accesspoint.AccessPointConfig" %> +<% +ProxyAccessPoint accessPoint = (ProxyAccessPoint)request.getAttribute("proxyAccessPoint"); +String contextRoot = accessPoint.getReplayPrefix(); +String referrer = request.getHeader("Referer"); +String logoPath = contextRoot + "images/logo_bw.gif"; +if (referrer == null) { + referrer = "Wayback"; +} +%> + +<p style="text-align: center"><img src="<%= logoPath %>"/><h2>Wayback Proxy Mode Configuration</h2><img src="<%= logoPath %>"/> +<h2>Go To: <a href="<%= contextRoot + ProxyAccessPoint.SWITCH_COLLECTION_PATH%>"><%= referrer %></a></h2> +</p> +<p>Your browser is running with Wayback Machine as proxy mode, but it doesn't know which archived collection to use</p> +<p>When prompted, the <i>Username</i> is the <i>Collection Id</i> or <i>Collection Name</i> for your collection</p> +<i>The password is ignored and may be left blank</i></p> +<p>The following collections are available:</p> + +<table> +<tr> +<td><i>Collection Id</i></td> 
+</tr> + +<% + +AccessPointConfigs accessPointConfigs = accessPoint.getAccessPointConfigs(); + +if (accessPointConfigs != null) { + for (AccessPointConfig theConfig : accessPointConfigs.getAccessPointConfigs().values()) { + %> + <tr> + <td> + <%= theConfig.getBeanName() %> + </td> + </tr> + <% + } +} + +%> + +</table> Property changes on: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/ProxyInfo.jsp ___________________________________________________________________ Added: svn:mime-type + text/plain This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-02-18 08:40:53
|
Wayback-1 - Build # 116 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/116/ to view the results. |
Revision: 3616 http://archive-access.svn.sourceforge.net/archive-access/?rev=3616&view=rev Author: ikreymer Date: 2012-02-18 08:34:32 +0000 (Sat, 18 Feb 2012) Log Message: ----------- BUGFIX: Check for null on oracle policy Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2012-02-18 07:08:39 UTC (rev 3615) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2012-02-18 08:34:32 UTC (rev 3616) @@ -344,7 +344,7 @@ if (tagName.equals("NOSCRIPT")) { String allPolicies = getOraclePolicies(context); - if (allPolicies.contains("force-noscript")) { + if ((allPolicies != null) && allPolicies.contains("force-noscript")) { return false; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-02-18 07:15:46
|
Wayback-1 - Build # 115 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/115/ to view the results. |
Revision: 3615 http://archive-access.svn.sourceforge.net/archive-access/?rev=3615&view=rev Author: ikreymer Date: 2012-02-18 07:08:39 +0000 (Sat, 18 Feb 2012) Log Message: ----------- BUGFIX: Fix typo due to unsaved file, calling getAccessPointPath() in BeanNameRegistrar Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java 2012-02-18 07:07:06 UTC (rev 3614) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java 2012-02-18 07:08:39 UTC (rev 3615) @@ -134,7 +134,7 @@ String name = null; if (handler instanceof AccessPoint) { - name = ((AccessPoint)handler).getAccessPointName(); + name = ((AccessPoint)handler).getAccessPointPath(); } if (name == null) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <ikr...@us...> - 2012-02-18 07:07:13
|
Revision: 3614 http://archive-access.svn.sourceforge.net/archive-access/?rev=3614&view=rev Author: ikreymer Date: 2012-02-18 07:07:06 +0000 (Sat, 18 Feb 2012) Log Message: ----------- FEATURE: Provide ability to set access point path separately from the Spring bean name. This is really useful for interpolation, as Spring bean names are not interpolated as properties. Allows for the following setup: <bean name="webAccessPoint" class="org.archive.wayback.webapp.AccessPoint"> <property name="accessPointPath" value="${wayback.host}:${wayback.port}:wayback"/> ... </bean> Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java 2012-02-16 19:07:40 UTC (rev 3613) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/webapp/BeanNameRegistrar.java 2012-02-18 07:07:06 UTC (rev 3614) @@ -23,6 +23,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.archive.wayback.webapp.AccessPoint; + /** * Helper static methods to implement registration of a RequestHandler with a * RequestMapper, based on the beanName() method. 
@@ -128,7 +130,17 @@ */ public static void registerHandler(RequestHandler handler, RequestMapper mapper) { - String name = handler.getBeanName(); + + String name = null; + + if (handler instanceof AccessPoint) { + name = ((AccessPoint)handler).getAccessPointName(); + } + + if (name == null) { + name = handler.getBeanName(); + } + if(name != null) { if(name.equals(RequestMapper.GLOBAL_PRE_REQUEST_HANDLER)) { LOGGER.info("Registering Global-pre request handler:" + Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2012-02-16 19:07:40 UTC (rev 3613) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2012-02-18 07:07:06 UTC (rev 3614) @@ -132,6 +132,8 @@ private BooleanOperator<WaybackRequest> authentication = null; private long embargoMS = 0; private CustomResultFilterFactory filterFactory = null; + + private String accessPointPath = null; public void init() { checkAccessPointAware(collection,exception,query,parser,replay, @@ -958,4 +960,12 @@ public CustomResultFilterFactory getFilterFactory() { return filterFactory; } + + public String getAccessPointPath() { + return accessPointPath; + } + + public void setAccessPointPath(String accessPointPath) { + this.accessPointPath = accessPointPath; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-02-16 19:16:36
|
Wayback-1 - Build # 114 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/114/ to view the results. |
From: <ikr...@us...> - 2012-02-16 19:07:53
|
Revision: 3613 http://archive-access.svn.sourceforge.net/archive-access/?rev=3613&view=rev Author: ikreymer Date: 2012-02-16 19:07:40 +0000 (Thu, 16 Feb 2012) Log Message: ----------- VERSION: Increment to 1.7.1-SNAPSHOT FEATURE: Adding CompositeAccessPoint and related classes (org.archive.wayback.accesspoint) which allow for a single access point to be used with different AccessPointConfigs. This AccessPoint is useful when there is a single WaybackCollection but multiple, possibly dynamic ways to access it. Each config consists mainly of file prefixes that can pass through that access point, as well as user properties. FEATURE: There is also a ProxyAccessPoint and related classes (org.archive.wayback.accesspoint.proxy) which allows Wayback to serve as Proxy on any config that is part of a CompositeAccessPoint. Which access point is the active proxy can be configured in a number of ways, including basic authentication, just IP checking or a cookie (not completed in this update). FIXES/UPDATE: FastArchivalUrlReplayParseEventHandler includes support for inserting JSP into <head> block, as well as improved checking for beginning of <body> block when inserting body jsp (including skipping through any <head>, <script> or <style> blocks that may have spurious tags) Also ability to respond to check rewrite policy from CustomOracleFilter FEATURE: CustomPolicyOracleFilter extends OracleExclusionFilter to provide custom policies beyond allow, block, robots. The policy is stored in the CaptureSearchResult and may be available during parsing. 
FIX: JSPExecutor now has getUiResults() ADDED: ArchivalUrlResultURIConverterFactory added for dynamic URI converters that need to change per AccessPointConfig Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/pom.xml trunk/archive-access/projects/wayback/pom.xml trunk/archive-access/projects/wayback/wayback-core/pom.xml trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/JSPExecutor.java trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml trunk/archive-access/projects/wayback/wayback-webapp/pom.xml Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilterFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfig.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfigs.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/CompositeAccessPoint.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/AuthProxyConfigSelector.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/IPProxyConfigSelector.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyAccessPoint.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyConfigSelector.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverterFactory.java Modified: trunk/archive-access/projects/wayback/dist/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/pom.xml 2012-02-07 19:36:59 UTC (rev 3612) +++ trunk/archive-access/projects/wayback/dist/pom.xml 2012-02-16 19:07:40 UTC (rev 3613) @@ -7,7 +7,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.7.0-SNAPSHOT</version> + <version>1.7.1-SNAPSHOT</version> </parent> <artifactId>dist</artifactId> Modified: trunk/archive-access/projects/wayback/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2012-02-07 19:36:59 UTC (rev 3612) +++ trunk/archive-access/projects/wayback/pom.xml 2012-02-16 19:07:40 UTC (rev 3613) @@ -7,7 +7,7 @@ <groupId>org.archive.wayback</groupId> <artifactId>wayback</artifactId> <packaging>pom</packaging> - <version>1.7.0-SNAPSHOT</version> + <version>1.7.1-SNAPSHOT</version> <name>Wayback</name> <modules> Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2012-02-07 19:36:59 UTC (rev 3612) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2012-02-16 
19:07:40 UTC (rev 3613) @@ -8,7 +8,7 @@ <parent> <artifactId>wayback</artifactId> <groupId>org.archive.wayback</groupId> - <version>1.7.0-SNAPSHOT</version> + <version>1.7.1-SNAPSHOT</version> </parent> <artifactId>wayback-core</artifactId> Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilter.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,91 @@ +/* + * Oracle Filter Implementation that supports custom policies in addition to + * allow, block, block-message and robots + * + * The policy is stored in the CaptureSearchResult + */ + +package org.archive.wayback.accesscontrol.oracleclient; + +import java.util.Date; +import java.util.logging.Logger; + +import org.archive.accesscontrol.RobotsUnavailableException; +import org.archive.accesscontrol.RuleOracleUnavailableException; +import org.archive.util.ArchiveUtils; +import org.archive.wayback.core.CaptureSearchResult; + +public class CustomPolicyOracleFilter extends OracleExclusionFilter { + + public static final String CAPTURE_ORACLE_POLICY = "oracle-policy"; + + private static final Logger LOGGER = Logger.getLogger( + CustomPolicyOracleFilter.class.getName()); + + enum Policy { + ALLOW("allow"), + BLOCK_HIDDEN("block"), + BLOCK_MESSAGE("block-message"), + ROBOTS("robots"); + + Policy(String policy) { + this.policy = policy; + } + + boolean matches(String other) + { + return (other.equals(this.policy)); + } + + String policy; + } + + protected int defaultFilter = FILTER_INCLUDE; + + public CustomPolicyOracleFilter(String oracleUrl, String accessGroup, String 
proxyHostPort) { + super(oracleUrl, accessGroup, proxyHostPort); + } + + @Override + public int filterObject(CaptureSearchResult o) { + String url = o.getOriginalUrl(); + Date captureDate = o.getCaptureDate(); + Date retrievalDate = new Date(); + + String policy; + try { + policy = client.getPolicy(ArchiveUtils.addImpliedHttpIfNecessary(url), captureDate, retrievalDate, accessGroup); + + o.put(CAPTURE_ORACLE_POLICY, policy); + + if (policy == null) { + return defaultFilter; + } + + if (Policy.ALLOW.matches(policy)) { + return handleAllow(); + } + + // Block page but silently, as if it wasn't found + if (Policy.BLOCK_HIDDEN.matches(policy)) { + return FILTER_EXCLUDE; + } + + // Block page bit and display "access blocked" message + if (Policy.BLOCK_MESSAGE.matches(policy)) { + return handleBlock(); + } + + if (Policy.ROBOTS.matches("policy")) { + return handleRobots(); + } + + } catch (RobotsUnavailableException e) { + e.printStackTrace(); + } catch (RuleOracleUnavailableException e) { + LOGGER.warning("Oracle Unavailable/not running, default to allow all until it responds. 
Details: " + e.toString()); + } + + return defaultFilter; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilter.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilterFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilterFactory.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,10 @@ +package org.archive.wayback.accesscontrol.oracleclient; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; + +public class CustomPolicyOracleFilterFactory extends OracleExclusionFilterFactory { + + @Override + public ExclusionFilter get() { + return new CustomPolicyOracleFilter(this.getOracleUrl(), this.getAccessGroup(), this.getProxyHostPort()); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/CustomPolicyOracleFilterFactory.java ___________________________________________________________________ Added: svn:mime-type + text/plain Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilter.java 2012-02-07 19:36:59 UTC (rev 3612) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilter.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -1,5 +1,4 @@ -/* - * This file is part of the Wayback archival access software +/* This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual @@ -33,8 +32,8 @@ * */ public class OracleExclusionFilter extends ExclusionFilter { - AccessControlClient client = null; - private String accessGroup = null; + protected AccessControlClient client = null; + protected String accessGroup = null; private final static String POLICY_ALLOW = "allow"; private final static String POLICY_BLOCK = "block"; @@ -70,7 +69,57 @@ this.accessGroup = accessGroup; } + protected int handleAllow() + { + if(!notifiedAdminSeen) { + notifiedAdminSeen = true; + if(filterGroup != null) { + filterGroup.setSawAdministrative(); + } + } + if(!notifiedAdminPassed) { + notifiedAdminPassed = true; + if(filterGroup != null) { + filterGroup.setPassedAdministrative(); + } + } + return FILTER_INCLUDE; + } + protected int handleBlock() + { + if(!notifiedAdminSeen) { + notifiedAdminSeen = true; + if(filterGroup != null) { + filterGroup.setSawAdministrative(); + } + } + return FILTER_EXCLUDE; + } + + protected int handleRobots() + { + if(!notifiedRobotSeen) { + notifiedRobotSeen = true; + if(filterGroup != null) { + filterGroup.setSawRobots(); + } + } + return FILTER_INCLUDE; +// if(robotFilter != null) { +// if(!notifiedRobotPassed) { +// notifiedRobotPassed = true; +// if(filterGroup != null) { +// filterGroup.setPassedRobot(); +// } +// } +// return robotFilter.filterObject(o); +// } else { +// return FILTER_EXCLUDE; +// } + } + + public int filterObject(CaptureSearchResult o) { String url = o.getOriginalUrl(); Date captureDate = o.getCaptureDate(); @@ -82,46 +131,11 @@ accessGroup); if(policy != null) { 
if(policy.equals(POLICY_ALLOW)) { - if(!notifiedAdminSeen) { - notifiedAdminSeen = true; - if(filterGroup != null) { - filterGroup.setSawAdministrative(); - } - } - if(!notifiedAdminPassed) { - notifiedAdminPassed = true; - if(filterGroup != null) { - filterGroup.setPassedAdministrative(); - } - } - return FILTER_INCLUDE; + return handleAllow(); } else if(policy.equals(POLICY_BLOCK)) { - if(!notifiedAdminSeen) { - notifiedAdminSeen = true; - if(filterGroup != null) { - filterGroup.setSawAdministrative(); - } - } - return FILTER_EXCLUDE; + return handleBlock(); } else if(policy.equals(POLICY_ROBOT)) { - if(!notifiedRobotSeen) { - notifiedRobotSeen = true; - if(filterGroup != null) { - filterGroup.setSawRobots(); - } - } - return FILTER_INCLUDE; -// if(robotFilter != null) { -// if(!notifiedRobotPassed) { -// notifiedRobotPassed = true; -// if(filterGroup != null) { -// filterGroup.setPassedRobot(); -// } -// } -// return robotFilter.filterObject(o); -// } else { -// return FILTER_EXCLUDE; -// } + return handleRobots(); } } } catch (RobotsUnavailableException e) { Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointAdapter.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,325 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.wayback.accesspoint; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Properties; + +import javax.servlet.ServletContext; + +import org.archive.wayback.ExceptionRenderer; +import org.archive.wayback.QueryRenderer; +import org.archive.wayback.ReplayDispatcher; +import org.archive.wayback.RequestParser; +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.accesscontrol.CompositeExclusionFilterFactory; +import org.archive.wayback.accesscontrol.ExclusionFilterFactory; +import org.archive.wayback.accesscontrol.oracleclient.CustomPolicyOracleFilter; +import org.archive.wayback.accesspoint.proxy.ProxyAccessPoint; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.replay.html.ContextResultURIConverterFactory; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.util.operator.BooleanOperator; +import org.archive.wayback.webapp.AccessPoint; +import org.archive.wayback.webapp.CustomResultFilterFactory; +import org.archive.wayback.webapp.WaybackCollection; + +public class AccessPointAdapter extends AccessPoint { + + private CompositeAccessPoint baseAccessPoint; + private AccessPointConfig config; + private ExclusionFilterFactory exclusionFactory; + private ResultURIConverter cacheUriConverter; + + private boolean switchable = false; + + private class DynamicExclusionFactory implements ExclusionFilterFactory + { + public ExclusionFilter get() + { + return new 
CustomPolicyOracleFilter(baseAccessPoint.getOracleUrl(), config.getBeanName(), null); + } + + public void shutdown() { + + } + } + + public AccessPointAdapter(CompositeAccessPoint baseAccessPoint, AccessPointConfig config) + { + this.baseAccessPoint = baseAccessPoint; + this.config = config; + this.exclusionFactory = null; + + this.switchable = true; + } + + public AccessPointAdapter(String accessPointName, CompositeAccessPoint baseAccessPoint) + { + this.baseAccessPoint = baseAccessPoint; + this.exclusionFactory = null; + this.config = baseAccessPoint.getAccessPointConfigs().getAccessPointConfigs().get(accessPointName); + + this.switchable = false; + } + + public CompositeAccessPoint getBaseAccessPoint() + { + return baseAccessPoint; + } + + public boolean isProxyMode() + { + return (baseAccessPoint instanceof ProxyAccessPoint); + } + + public boolean isProxySwitchable() + { + return switchable && isProxyMode(); + } + + public String getSwitchCollPath() + { + return ProxyAccessPoint.SWITCH_COLLECTION_PATH; + } + + public AccessPointConfig getAccessPointConfig() + { + return config; + } + + public Map<String, Object> getUserProps() + { + return baseAccessPoint.getUserProps(); + } + + @Override + public List<String> getFileIncludePrefixes() { + return config.getFileIncludePrefixes(); + } + + @Override + public List<String> getFileExcludePrefixes() { + return config.getFileExcludePrefixes(); + } + + @Override + public Properties getConfigs() { + // TODO Auto-generated method stub + return config.getConfigs(); + } + + @Override + public ExclusionFilterFactory getExclusionFactory() { + if (exclusionFactory == null) { + exclusionFactory = buildExclusionFactory(); + } + + return exclusionFactory; + } + + protected ExclusionFilterFactory buildExclusionFactory() + { + ArrayList<ExclusionFilterFactory> staticExclusions = baseAccessPoint.getStaticExclusions(); + + if (staticExclusions == null) { + return new DynamicExclusionFactory(); + } else { + 
CompositeExclusionFilterFactory composite = new CompositeExclusionFilterFactory(); + ArrayList<ExclusionFilterFactory> allExclusions = new ArrayList<ExclusionFilterFactory>(); + allExclusions.addAll(staticExclusions); + allExclusions.add(new DynamicExclusionFactory()); + composite.setFactories(allExclusions); + return composite; + } + } + + protected String getPrefix(String basePrefix) + { + if (isProxyMode()) { + return basePrefix; + } else { + return basePrefix + config.getBeanName() + "/"; + } + } + + @Override + public String getStaticPrefix() { + // TODO Auto-generated method stub + return getPrefix(baseAccessPoint.getStaticPrefix()); + } + + @Override + public String getReplayPrefix() { + // TODO Auto-generated method stub + return getPrefix(baseAccessPoint.getReplayPrefix()); + } + + @Override + public String getQueryPrefix() { + // TODO Auto-generated method stub + return getPrefix(baseAccessPoint.getQueryPrefix()); + } + + @Override + public boolean isExactHostMatch() { + // TODO Auto-generated method stub + return baseAccessPoint.isExactHostMatch(); + } + + @Override + public boolean isExactSchemeMatch() { + // TODO Auto-generated method stub + return baseAccessPoint.isExactSchemeMatch(); + } + + @Override + public boolean isUseAnchorWindow() { + // TODO Auto-generated method stub + return baseAccessPoint.isUseAnchorWindow(); + } + + @Override + public boolean isServeStatic() { + // TODO Auto-generated method stub + return baseAccessPoint.isServeStatic(); + } + + @Override + public ServletContext getServletContext() { + return baseAccessPoint.getServletContext(); + } + + @Override + public String getLiveWebPrefix() { + // TODO Auto-generated method stub + return baseAccessPoint.getLiveWebPrefix(); + } + + @Override + public String getInterstitialJsp() { + // TODO Auto-generated method stub + return baseAccessPoint.getInterstitialJsp(); + } + + @Override + public Locale getLocale() { + // TODO Auto-generated method stub + return 
baseAccessPoint.getLocale(); + } + + @Override + public List<String> getFilePatterns() { + // TODO Auto-generated method stub + return baseAccessPoint.getFilePatterns(); + } + + @Override + public WaybackCollection getCollection() { + // TODO Auto-generated method stub + return baseAccessPoint.getCollection(); + } + + @Override + public ExceptionRenderer getException() { + // TODO Auto-generated method stub + return baseAccessPoint.getException(); + } + + @Override + public QueryRenderer getQuery() { + // TODO Auto-generated method stub + return baseAccessPoint.getQuery(); + } + + @Override + public RequestParser getParser() { + // TODO Auto-generated method stub + return baseAccessPoint.getParser(); + } + + @Override + public ReplayDispatcher getReplay() { + // TODO Auto-generated method stub + return baseAccessPoint.getReplay(); + } + + @Override + public ResultURIConverter getUriConverter() { + + if (cacheUriConverter == null) { + ContextResultURIConverterFactory factory = baseAccessPoint.getUriConverterFactory(); + + if (factory != null) { + cacheUriConverter = factory.getContextConverter(getReplayPrefix()); + } else { + cacheUriConverter = baseAccessPoint.getUriConverter(); + } + } + + return cacheUriConverter; + } + + @Override + public BooleanOperator<WaybackRequest> getAuthentication() { + // TODO Auto-generated method stub + return baseAccessPoint.getAuthentication(); + } + + @Override + public String getRefererAuth() { + // TODO Auto-generated method stub + return baseAccessPoint.getRefererAuth(); + } + + @Override + public boolean isBounceToReplayPrefix() { + // TODO Auto-generated method stub + return baseAccessPoint.isBounceToReplayPrefix(); + } + + @Override + public boolean isBounceToQueryPrefix() { + // TODO Auto-generated method stub + return baseAccessPoint.isBounceToQueryPrefix(); + } + + @Override + public long getEmbargoMS() { + // TODO Auto-generated method stub + return baseAccessPoint.getEmbargoMS(); + } + + @Override + public boolean 
isForceCleanQueries() { + // TODO Auto-generated method stub + return baseAccessPoint.isForceCleanQueries(); + } + + @Override + public CustomResultFilterFactory getFilterFactory() { + // TODO Auto-generated method stub + return baseAccessPoint.getFilterFactory(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointAdapter.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfig.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfig.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfig.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,41 @@ +package org.archive.wayback.accesspoint; + +import java.util.List; +import java.util.Properties; + +import org.springframework.beans.factory.BeanNameAware; + +public class AccessPointConfig implements BeanNameAware { + + private Properties configs = null; + private List<String> fileIncludePrefixes = null; + private List<String> fileExcludePrefixes = null; + private String beanName; + + public Properties getConfigs() { + return configs; + } + public void setConfigs(Properties configs) { + this.configs = configs; + } + public List<String> getFileIncludePrefixes() { + return fileIncludePrefixes; + } + public void setFileIncludePrefixes(List<String> fileIncludePrefixes) { + this.fileIncludePrefixes = fileIncludePrefixes; + } + public List<String> getFileExcludePrefixes() { + return fileExcludePrefixes; + } + public void setFileExcludePrefixes(List<String> fileExcludePrefixes) { + this.fileExcludePrefixes = fileExcludePrefixes; + } + + public void setBeanName(String 
beanName) { + this.beanName = beanName; + } + + public String getBeanName() { + return this.beanName; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfig.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfigs.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfigs.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfigs.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,34 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.wayback.accesspoint; + +import java.util.HashMap; + +public class AccessPointConfigs { + private HashMap<String, AccessPointConfig> configs; + + public HashMap<String, AccessPointConfig> getAccessPointConfigs() { + return configs; + } + public void setAccessPointConfigs(HashMap<String, AccessPointConfig> configs) { + this.configs = configs; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/AccessPointConfigs.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/CompositeAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/CompositeAccessPoint.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/CompositeAccessPoint.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,162 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.wayback.accesspoint; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.accesscontrol.ExclusionFilterFactory; +import org.archive.wayback.replay.html.ContextResultURIConverterFactory; +import org.archive.wayback.webapp.AccessPoint; + +public class CompositeAccessPoint extends AccessPoint { + + protected final static String REQUEST_CONTEXT_PREFIX = + "webapp-request-context-path-prefix"; + + protected enum Status + { + ConfigNotFound, + ConfigHandled, + ConfigNotHandled, + } + + private HashMap<String, AccessPointAdapter> accessPointCache; + + public CompositeAccessPoint() + { + accessPointCache = new HashMap<String, AccessPointAdapter>(); + } + + @Override + public boolean handleRequest(HttpServletRequest request, + HttpServletResponse response) throws ServletException, + IOException { + + String configName = request.getRequestURI(); + + if (!configName.isEmpty() && (configName.charAt(0) == '/')) { + configName = configName.substring(1); + } + + int slash = configName.indexOf('/'); + + if (slash >= 0) { + configName = configName.substring(0, slash); + } + + request.setAttribute(REQUEST_CONTEXT_PREFIX, "/" + configName + "/"); + + Status status = handleRequest(configName, request, response); + return (status == Status.ConfigHandled); + } + + protected Status handleRequest(String realAccessPoint, HttpServletRequest request, + HttpServletResponse response) throws ServletException, + IOException { + + // First, check cached accessPoint + AccessPointAdapter adapter = accessPointCache.get(realAccessPoint); + + if ((adapter == null) && (accessPointConfigs != null)) { + AccessPointConfig config = accessPointConfigs.getAccessPointConfigs().get(realAccessPoint); + + if (config != null) { + adapter = new 
AccessPointAdapter(this, config); + accessPointCache.put(realAccessPoint, adapter); + } + } + + if (adapter == null) { + return Status.ConfigNotFound; + } + + boolean handled = adapter.handleRequest(request, response); + return (handled ? Status.ConfigHandled : Status.ConfigNotHandled); + } + + private String oracleUrl; + private ArrayList<ExclusionFilterFactory> staticExclusions; + + private ContextResultURIConverterFactory uriConverterFactory; + + public ContextResultURIConverterFactory getUriConverterFactory() { + return uriConverterFactory; + } + + public void setUriConverterFactory(ContextResultURIConverterFactory uriConverterFactory) { + this.uriConverterFactory = uriConverterFactory; + } + + public ArrayList<ExclusionFilterFactory> getStaticExclusions() { + return staticExclusions; + } + + public void setStaticExclusions( + ArrayList<ExclusionFilterFactory> staticExclusions) { + this.staticExclusions = staticExclusions; + } + + private Map<String, Object> userProps; + + private AccessPointConfigs accessPointConfigs; + + public String getOracleUrl() { + return oracleUrl; + } + public void setOracleUrl(String oracleUrl) { + this.oracleUrl = oracleUrl; + } + public Map<String, Object> getUserProps() { + return userProps; + } + + public void setUserProps(Map<String, Object> userProps) { + this.userProps = userProps; + } + + public AccessPointConfigs getAccessPointConfigs() { + return accessPointConfigs; + } + + public void setAccessPointConfigs(AccessPointConfigs accessPointConfigs) { + this.accessPointConfigs = accessPointConfigs; + } + + public AccessPointConfig findConfigForFile(String file) + { + for (AccessPointConfig config : accessPointConfigs.getAccessPointConfigs().values()) { + for (String prefix : config.getFileIncludePrefixes()) { + if (file.startsWith(prefix)) { + return config; + } + } + } + + return null; + } +} Property changes on: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/CompositeAccessPoint.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/AuthProxyConfigSelector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/AuthProxyConfigSelector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/AuthProxyConfigSelector.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,142 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.wayback.accesspoint.proxy; + +import java.io.IOException; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; + +import javax.servlet.RequestDispatcher; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import javax.servlet.http.HttpSession; + +import org.apache.commons.codec.binary.Base64; +import org.archive.wayback.replay.StringHttpServletResponseWrapper; + +public class AuthProxyConfigSelector implements ProxyConfigSelector { + + public final static String PROXY_REFERRER_KEY = "wayback-wombat-proxy-referrer"; + + private String proxyInfoJsp = "/WEB-INF/replay/ProxyInfo.jsp"; + + private String authMsg = "Please enter the collection number to see Wayback content from that collection. (You can leave the password blank)"; + + public String getProxyInfoJsp() { + return proxyInfoJsp; + } + + public void setProxyInfoJsp(String proxyInfoJsp) { + this.proxyInfoJsp = proxyInfoJsp; + } + + public String getAuthMsg() { + return authMsg; + } + + public void setAuthMsg(String authMsg) { + this.authMsg = authMsg; + } + + public String resolveConfig(HttpServletRequest request) { + String authenticate = request.getHeader("Proxy-Authorization"); + + if (authenticate != null) { + String auth = decodeBasic(authenticate); + if (auth != null) { + int userEnd = auth.indexOf(':'); + return auth.substring(0, userEnd); + } + } + + return null; + } + + public boolean selectConfigHandler(HttpServletRequest request, HttpServletResponse response, ProxyAccessPoint proxy) throws IOException + { + response.setStatus(HttpServletResponse.SC_PROXY_AUTHENTICATION_REQUIRED); //407 + response.setHeader("Proxy-Authenticate", "Basic realm=\"" + authMsg + "\""); + response.setContentType("text/html"); + + //TODO: Better way to pass this to jsp? 
+ request.setAttribute("proxyAccessPoint", proxy); + + StringHttpServletResponseWrapper wrappedResponse = + new StringHttpServletResponseWrapper(response); + RequestDispatcher dispatcher = request.getRequestDispatcher(proxyInfoJsp); + + try { + dispatcher.forward(request, wrappedResponse); + } catch (ServletException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + PrintWriter writer = response.getWriter(); + writer.println(wrappedResponse.getStringResponse()); + return true; + } + + private String decodeBasic(String authHeaderValue) { + if(authHeaderValue != null) { + if(authHeaderValue.startsWith("Basic ")) { + String b64 = authHeaderValue.substring(6); + byte[] decoded = Base64.decodeBase64(b64.getBytes()); + try { + return new String(decoded,"utf-8"); + } catch (UnsupportedEncodingException e) { + // really?... + return new String(decoded); + } + } + } + return null; + + } + + public void handleSwitch(HttpServletRequest request, + HttpServletResponse response, ProxyAccessPoint proxy) throws IOException { + + // Check reset cookie... 
+ HttpSession sess = request.getSession(); + String referrer = (String)sess.getAttribute(PROXY_REFERRER_KEY); + + // If referrer not set, we're sending the switch request + if (referrer == null) { + String httpReferrer = request.getHeader("Referer"); + if (httpReferrer == null) { + httpReferrer = proxy.getReplayPrefix(); + } + sess.setAttribute(PROXY_REFERRER_KEY, httpReferrer); + + selectConfigHandler(request, response, proxy); + } else { + sess.removeAttribute(PROXY_REFERRER_KEY); + response.sendRedirect(referrer); + } + } + + public void handleProxyPac(HttpServletRequest httpRequest, + HttpServletResponse httpResponse) { + //No Special Handling for Proxy Pac request + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/AuthProxyConfigSelector.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/IPProxyConfigSelector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/IPProxyConfigSelector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/IPProxyConfigSelector.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,116 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.wayback.accesspoint.proxy; + +import java.io.IOException; + +import javax.servlet.RequestDispatcher; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.util.bdb.BDBMap; + +public class IPProxyConfigSelector implements ProxyConfigSelector { + + protected String proxyInfoJsp = "/WEB-INF/replay/ProxyInfo.jsp"; + + public String resolveConfig(HttpServletRequest request) { + String context = request.getContextPath(); + BDBMap bdbMap = BDBMap.getContextMap(context); + + String key = genKey(request); + String coll = bdbMap.get(key); + return coll; + } + + protected String genKey(HttpServletRequest request) + { + return request.getRemoteAddr() + "$coll"; + } + + public boolean selectConfigHandler(HttpServletRequest request, + HttpServletResponse response, ProxyAccessPoint proxy) throws IOException { + + request.setAttribute("proxyAccessPoint", proxy); + + RequestDispatcher dispatcher = request.getRequestDispatcher(proxyInfoJsp); + + try { + dispatcher.forward(request, response); + } catch (ServletException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return true; + } + + public void handleSwitch(HttpServletRequest request, + HttpServletResponse response, ProxyAccessPoint proxy) + throws IOException { + + String config = request.getParameter("config"); + + if (config == null) { + selectConfigHandler(request, response, proxy); + return; + } + + setConfig(request, config); + + String referrer = 
request.getHeader("Referer"); + if (referrer == null) { + referrer = proxy.getReplayPrefix(); + } + response.sendRedirect(referrer); + } + + protected void setConfig(HttpServletRequest request, String config) + { + String context = request.getContextPath(); + BDBMap bdbMap = BDBMap.getContextMap(context); + + String key = genKey(request); + bdbMap.put(key, config); + } + + public void handleProxyPac(HttpServletRequest httpRequest, + HttpServletResponse httpResponse) { + + String uri = httpRequest.getRequestURI(); + int pacStrIndex = uri.indexOf(ProxyAccessPoint.PROXY_PAC_PATH); + + if (pacStrIndex >= 0) { + String config = uri.substring(1, pacStrIndex); + //System.out.println("config: " + config); + setConfig(httpRequest, config); + } + } + + public String getProxyInfoJsp() { + return proxyInfoJsp; + } + + public void setProxyInfoJsp(String proxyInfoJsp) { + this.proxyInfoJsp = proxyInfoJsp; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/IPProxyConfigSelector.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyAccessPoint.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyAccessPoint.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,190 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. 
+ * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.wayback.accesspoint.proxy; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.List; +import java.util.logging.Logger; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.wayback.accesspoint.CompositeAccessPoint; +import org.archive.wayback.webapp.AccessPoint; + +public class ProxyAccessPoint extends CompositeAccessPoint { + + private static final Logger LOGGER = + Logger.getLogger(ProxyAccessPoint.class.getName()); + + public final static String SWITCH_COLLECTION_PATH = "switchCollection"; + public final static String PROXY_PAC_PATH = "/proxy.pac"; + + private List<String> directHosts; + private AccessPoint nonProxyAccessPoint; + + private ProxyConfigSelector configSelector; + + public ProxyConfigSelector getConfigSelector() { + return configSelector; + } + + public void setConfigSelector(ProxyConfigSelector configSelector) { + this.configSelector = configSelector; + } + + public List<String> getDirectHosts() { + return directHosts; + } + + public void setDirectHosts(List<String> directHosts) { + this.directHosts = directHosts; + } + + public AccessPoint getNonProxyAccessPoint() { + return nonProxyAccessPoint; + } + + public void setNonProxyAccessPoint(AccessPoint nonProxyAccessPoint) { + this.nonProxyAccessPoint = 
nonProxyAccessPoint; + } + + @Override + public boolean handleRequest(HttpServletRequest request, + HttpServletResponse response) throws ServletException, + IOException { + + boolean isProxyReq = (request.getHeader("Proxy-Connection") != null); + + if (!isProxyReq) { + return handleNonProxy(request, response); + } else { + return handleProxy(request, response); + } + } + + protected boolean handleNonProxy(HttpServletRequest request, + HttpServletResponse response) throws ServletException, IOException + { + String uri = request.getRequestURI(); + + if (uri.endsWith(PROXY_PAC_PATH)) { + this.writeProxyPac(request, response); + return true; + } + + return baseHandleRequest(request, response); + } + + protected boolean handleProxy(HttpServletRequest request, + HttpServletResponse response) throws ServletException, IOException + { + StringBuffer urlBuff = request.getRequestURL(); + String url = urlBuff.toString(); + + boolean isProxyHost = url.startsWith(getReplayPrefix()); + + if (isProxyHost) { + // Special reset link + if (url.endsWith(SWITCH_COLLECTION_PATH)) { + configSelector.handleSwitch(request, response, this); + return true; + } + } + + String realAccessPoint = configSelector.resolveConfig(request); + + if (realAccessPoint != null) { + + // See if the archival url form was included and redirect to strip it + if (isProxyHost) { + String prefix = "/" + realAccessPoint + "/"; + String uri = request.getRequestURI(); + + + if (uri.length() > prefix.length()) { + String requestUrl = uri.substring(prefix.length()); + + // If matches this config, simply redirect and strip + if (uri.startsWith(prefix)) { + response.sendRedirect("/" + requestUrl); + return true; + } + } + + //If archival url with any *different* config, force a selection + //if (ReplayRequestParser.WB_REQUEST_REGEX.matcher(requestUrl).matches()) { + // return configSelector.selectConfigHandler(request, response, this); + //} + } + + Status status = handleRequest(realAccessPoint, request, response); + + 
switch (status) { + case ConfigHandled: + return true; + + case ConfigNotHandled: + return false; + + case ConfigNotFound: + break; + } + } + + return configSelector.selectConfigHandler(request, response, this); + } + + protected boolean baseHandleRequest(HttpServletRequest request, + HttpServletResponse response) throws ServletException, IOException { + + if (nonProxyAccessPoint != null) { + return nonProxyAccessPoint.handleRequest(request, response); + } else { + return super.handleRequest(request, response); + } + } + + protected void writeProxyPac(HttpServletRequest httpRequest, + HttpServletResponse httpResponse) throws ServletException, + IOException { + + configSelector.handleProxyPac(httpRequest, httpResponse); + + String hostName = httpRequest.getServerName(); + int port = httpRequest.getServerPort(); + + httpResponse.setContentType("application/x-ns-proxy-autoconfig"); + + LOGGER.fine("updating proxy .pac"); + + PrintWriter writer = httpResponse.getWriter(); + writer.println("function FindProxyForURL (url, host) {"); + + for (String host : directHosts) { + writer.println(" if (shExpMatch(host, \"" + host + "\")) { return \"DIRECT\"; }"); + } + + writer.println(" return \"PROXY " + hostName + ":" + port + "\";\n}"); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyAccessPoint.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyConfigSelector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyConfigSelector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyConfigSelector.java 2012-02-16 19:07:40 UTC 
(rev 3613) @@ -0,0 +1,36 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.wayback.accesspoint.proxy; + +import java.io.IOException; + +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +public interface ProxyConfigSelector { + String resolveConfig(HttpServletRequest request); + + boolean selectConfigHandler(HttpServletRequest request, HttpServletResponse response, ProxyAccessPoint proxy) throws IOException; + + void handleSwitch(HttpServletRequest request, HttpServletResponse response, ProxyAccessPoint proxy) throws IOException; + + void handleProxyPac(HttpServletRequest httpRequest, HttpServletResponse httpResponse); +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesspoint/proxy/ProxyConfigSelector.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverterFactory.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverterFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverterFactory.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -0,0 +1,35 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.wayback.archivalurl; + +import org.archive.wayback.ResultURIConverter; +import org.archive.wayback.replay.html.ContextResultURIConverterFactory; + +public class ArchivalUrlResultURIConverterFactory implements + ContextResultURIConverterFactory { + + public ResultURIConverter getContextConverter(String flags) { + ArchivalUrlResultURIConverter converter = new ArchivalUrlResultURIConverter(); + converter.setReplayURIPrefix(flags); + return converter; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlResultURIConverterFactory.java ___________________________________________________________________ Added: svn:mime-type + text/plain Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2012-02-07 19:36:59 UTC (rev 3612) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/FastArchivalUrlReplayParseEventHandler.java 2012-02-16 19:07:40 UTC (rev 3613) @@ -23,11 +23,11 @@ import java.io.OutputStream; import java.net.MalformedURLException; import java.net.URL; -import java.nio.charset.Charset; import java.util.HashMap; import javax.servlet.ServletException; +import org.archive.wayback.accesscontrol.oracleclient.CustomPolicyOracleFilter; import org.archive.wayback.replay.html.ReplayParseContext; import org.archive.wayback.replay.html.StringTransformer; import org.archive.wayback.replay.html.transformer.BlockCSSStringTransformer; @@ -39,6 +39,7 @@ import org.archive.wayback.util.htmllex.ParseContext; import org.archive.wayback.util.htmllex.ParseEventHandler; import org.htmlparser.Node; +import 
org.htmlparser.nodes.RemarkNode; import org.htmlparser.nodes.TagNode; import org.htmlparser.nodes.TextNode; @@ -55,6 +56,8 @@ public final static String FERRET_DONE_KEY = FastArchivalUrlReplayParseEventHandler.class.toString(); + + protected final static String FERRET_IN_HEAD = "FERRET_IN_HEAD"; private String jspInsertPath = "/WEB-INF/replay/DisclaimChooser.jsp"; private String endJsp = "/WEB-INF/replay/ArchiveComment.jsp"; @@ -67,6 +70,8 @@ private final static String FRAMESET_TAG = "FRAMESET"; private final static String BODY_TAG = "BODY"; + protected static final String FERRET_HEAD_INSERTED = "FERRET_HEAD_INSERTED"; + private BlockCSSStringTransformer cssBlockTrans = new BlockCSSStringTransformer(); private InlineCSSStringTransformer cssInlineTrans = @@ -76,6 +81,9 @@ private MetaRefreshUrlStringTransformer metaRefreshTrans = new MetaRefreshUrlStringTransformer(); private URLStringTransformer anchorUrlTrans = new URLStringTransformer(); + + protected String headInsertJsp = null; + // static { // anchorUrlTrans = new URLStringTransformer(); // anchorUrlTrans.setJsTransformer(jsBlockTrans); @@ -105,8 +113,8 @@ throws IOException { ReplayParseContext context = (ReplayParseContext) pContext; if(NodeUtils.isRemarkNode(node)) { -// RemarkNode remarkNode = (RemarkNode) node; -// handleRemarkTextNode(context,remarkNode); + RemarkNode remarkNode = (RemarkNode) node; + remarkNode.setText(jsBlockTrans.transform(context, remarkNode.getText())); emit(context,null,node,null); } else if(NodeUtils.isTextNode(node)) { @@ -122,8 +130,19 @@ } } else if(NodeUtils.isTagNode(node)) { TagNode tagNode = (TagNode) node; - if(tagNode.isEndTag()) { - emit(context,null,tagNode,null); + + if (NodeUtils.isOpenTagNodeNamed(tagNode, NodeUtils.SCRIPT_TAG_NAME)) { + handleJSIncludeNode(context, tagNode); + } else if(tagNode.isEndTag()) { + + if (tagNode.getTagName().equals("HEAD")) { + context.putData(FERRET_IN_HEAD, null); + } + + if (checkAllowTag(pContext, tagNode)) { + 
emit(context,null,tagNode,null); + } + // handleCloseTagNode(context,tagNode); } else { // assume start, possibly empty: @@ -152,17 +171,64 @@ textNode.setText(jsBlockTrans.transform(context, textNode.getText())); emit(context,null,textNode,null); } + + private void handleJSIncludeNode(ReplayParseContext context, TagNode tagNode) throws IOException { + String file = tagNode.getAttribute("SRC"); + if (file != null) { + //TODO: This is hacky.. fix it + // This is used to check if the file should be skipped... + //from a custom rule.. + String result = jsBlockTrans.transform(context, file); + //The rewriting is done by the js_ rewriter + if ((result != null) && !result.isEmpty()) { + tagNode.setAttribute("SRC", jsUrlTrans.transform(context, file)); + } else { + file = ""; + tagNode.setAttribute("SRC", jsUrlTrans.transform(context, file)); + } + } + + emit(context,null,tagNode,null); + } private void handleOpenTagNode(ReplayParseContext context, TagNode tagNode) throws IOException { boolean insertedJsp = context.getData(FERRET_DONE_KEY) != null; + String preEmit = null; String postEmit = null; String tagName = tagNode.getTagName(); + + boolean alreadyInsertedHead = (context.getData(FERRET_HEAD_INSERTED) != null); + + if (!alreadyInsertedHead) { + // If we're at the beginning of a <head> tag, and haven't inserted yet, + // insert right AFTER head tag + if (tagName.equals("HEAD")) { + emitHeadInsert(context, tagNode, true); + context.putData(FERRET_IN_HEAD, FERRET_IN_HEAD); + return; + } + + + // If we're at the beginning of any tag, other than <html>, + // (including <body>) and haven't inserted yet, + // insert right BEFORE the next tag, also continue other default processing + // of the tag + if (!tagName.equals("HTML") && !tagName.equals("!DOCTYPE")) { + emitHeadInsert(context, null, false); + // Don't return continue to further processing + } + } + + + boolean inHead = (context.getData(FERRET_IN_HEAD) != null); + // Time to insert the JSP header? 
- if(!insertedJsp) { + //IK added check to avoid inserting inside css or script + if(!insertedJsp && !context.isInCSS() && !context.isInScriptText() && !inHead) { if(!okHeadTagMap.containsKey(tagName)) { if(tagName.equals(FRAMESET_TAG)) { // don't put the insert in framsets: @@ -189,6 +255,7 @@ context.putData(FERRET_DONE_KEY,""); } } + // now do all the usual attribute rewriting: // this could be slightly optimized by moving tags more likely to occur // to the front of the if/else if/else if routing... @@ -255,6 +322,10 @@ } else if(tagName.equals("SCRIPT")) { transformAttr(context, tagNode, "SRC", jsUrlTrans); + } else { + if (!checkAllowTag(context, tagNode)) { + return; + } } // now, for *all* tags... transformAttr(context,tagNode,"BACKGROUND", imageUrlTrans); @@ -263,8 +334,31 @@ emit(context,preEmit,tagNode,postEmit); } - - private void emit(ReplayParseContext context, String pre, Node node, + + protected boolean checkAllowTag(ParseContext context, TagNode tagNode) + { + String tagName = tagNode.getTagName(); + + // Check the NOSCRIPT tag, if force-noscript is set, + // then skip the NOSCRIPT tags and include contents explicitly + if (tagName.equals("NOSCRIPT")) { + String allPolic... [truncated message content] |
From: <aa...@us...> - 2012-02-07 19:37:05
|
Revision: 3612 http://archive-access.svn.sourceforge.net/archive-access/?rev=3612&view=rev Author: aalsum Date: 2012-02-07 19:36:59 +0000 (Tue, 07 Feb 2012) Log Message: ----------- Add WATExtractor Job to extract WAT files on Hadoop. Modified Paths: -------------- trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/JobDriver.java Added Paths: ----------- trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/WATExtractorJob.java trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/WATExtractorMapper.java Modified: trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/JobDriver.java =================================================================== --- trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/JobDriver.java 2012-02-02 17:45:12 UTC (rev 3611) +++ trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/JobDriver.java 2012-02-07 19:36:59 UTC (rev 3612) @@ -99,6 +99,14 @@ GZRangeClientTool.class, GZRangeClientTool.TOOL_DESCRIPTION); + pgd.addClass(GZRangeClientTool.TOOL_NAME, + GZRangeClientTool.class, + GZRangeClientTool.TOOL_DESCRIPTION); + + pgd.addClass(WATExtractorJob.TOOL_NAME, + WATExtractorJob.class, + WATExtractorJob.TOOL_DESCRIPTION); + pgd.driver(args); exitCode = 0; Added: trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/WATExtractorJob.java =================================================================== --- trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/WATExtractorJob.java (rev 0) +++ trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/WATExtractorJob.java 2012-02-07 19:36:59 UTC (rev 3612) @@ -0,0 +1,117 @@ +package org.archive.hadoop.jobs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; 
+import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.archive.hadoop.mapreduce.WATExtractorMapper; + +public class WATExtractorJob extends Configured implements Tool { + Configuration conf = null; + + public final static String TOOL_NAME = "WATExtractor"; + public final static String TOOL_DESCRIPTION = "A map/reduce program that extract a bunch of WARC files into WAT files into HDFS."; + + public final static String WAT_EXTRACT_TARGET = "wat-extractor.target"; + public final static String WAT_EXTRACTOR_OVERRIDE = "wat-extractor.override"; + + public Configuration getConf() { + + return conf; + } + + + static int printUsage() { + System.out.println("WATExtractor [OPTIONS] <input> <outputdir> <importTarget>"); + System.out.println("\tOPTIONS can be:"); + System.out.println("\t\t-m NUM - try to run with approximately NUM map tasks"); + System.out.println("\t\t--override - to override existent WAT files with the same name, the default is to skip the extracted files."); + System.out.println("\tThe input file contains lines of the form:"); + System.out.println("\t\t\tFilePath"); + System.out.println("\tOR"); + System.out.println("\t\t\tBASENAME<SPACE>FilePath"); + System.out.println("\tif only FilePath is specified, then the target will be <importTarget>/<BASENAME of FilePath>"); + System.out.println("\totherwise the target will be <importTarget>/<BASENAME>"); + System.err.println("\tFilePath is HTTP or HDFS URL to an arc, warc, arc.gz, or warc.gz."); + System.out.println(); + return -1; + } + public void setConf(Configuration conf) { + this.conf = conf; + + } + + @Override 
+ public int run(String[] args) throws Exception { + + Job job = new Job(getConf(), "wat-extractor"); + Configuration conf = job.getConfiguration(); + job.setJarByClass(WATExtractorJob.class); + job.setInputFormatClass(TextInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Text.class); + job.setMapperClass(WATExtractorMapper.class); + + int i = 0; + int numMaps = 10; + while(i < args.length -1) { + if(args[i].equals("-m")) { + i++; + numMaps = Integer.parseInt(args[i]); + i++; + } else if(args[i].equals("--override")) { + WATExtractorMapper.setOverride(conf, true); + i++; + + } else { + break; + } + } + if(args.length - 3 != i) { + printUsage(); +// throw new IllegalArgumentException("wrong number of args..."); + } + Path inputPath = new Path(args[i]); + Path outputPath = new Path(args[i+1]); + Path targetPath = new Path(args[i+2]); + + TextInputFormat.addInputPath(job, inputPath); + FileOutputFormat.setOutputPath(job, outputPath); + WATExtractorMapper.setTargetDir(conf, targetPath.toString()); + + conf.setBoolean("mapred.map.tasks.speculative.execution", false); + + FileSystem fs = inputPath.getFileSystem(conf); + FileStatus inputStatus = fs.getFileStatus(inputPath); + long inputLen = inputStatus.getLen(); + long bytesPerMap = (int) inputLen / numMaps; + + FileInputFormat.setMaxInputSplitSize(job, bytesPerMap); + + + return (job.waitForCompletion(true) ? 
0 : 1); + + + } + + /** + * @param args + * @throws Exception + */ + public static void main(String[] args) throws Exception { + int res = ToolRunner.run(new Configuration(), new WATExtractorJob(), args); + System.exit(res); + + } + +} Property changes on: trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/jobs/WATExtractorJob.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/WATExtractorMapper.java =================================================================== --- trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/WATExtractorMapper.java (rev 0) +++ trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/WATExtractorMapper.java 2012-02-07 19:36:59 UTC (rev 3612) @@ -0,0 +1,149 @@ +package org.archive.hadoop.mapreduce; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URL; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Mapper; +import org.archive.extract.ExtractingResourceFactoryMapper; +import org.archive.extract.ExtractingResourceProducer; +import org.archive.extract.ExtractorOutput; +import org.archive.extract.ProducerUtils; +import org.archive.extract.ResourceFactoryMapper; +import org.archive.extract.WATExtractorOutput; +import org.archive.hadoop.jobs.WATExtractorJob; +import org.archive.resource.Resource; +import org.archive.resource.ResourceProducer; +import org.archive.util.StringFieldExtractor; +import org.archive.util.StringFieldExtractor.StringTuple; + +public class WATExtractorMapper extends + 
Mapper<Object, Text, Text, Text> { + + + public final static String WAT_EXTRACTOR_TARGET = "wat-extractor.target"; + public final static String WAT_EXTRACTOR_OVERRIDE = "wat-extractor.override"; + Path target = null; + FileSystem filesystem = null; + boolean overrideExistentFile = false; + StringFieldExtractor sfe = new StringFieldExtractor(' ', 1); + + public static void setTargetDir(Configuration conf, String path) { + conf.set(WAT_EXTRACTOR_TARGET, path); + } + + @Override + protected void setup(Context context) throws IOException, + InterruptedException { + super.setup(context); + Configuration conf = context.getConfiguration(); + String targetString = conf.get(WATExtractorJob.WAT_EXTRACT_TARGET); + + overrideExistentFile = conf.getBoolean(WAT_EXTRACTOR_OVERRIDE, false); + target = new Path(targetString); + filesystem = target.getFileSystem(conf); + } + + public void map(Object y, Text value, Context context) + throws IOException, InterruptedException { + + //PArse the URL files + String valueS = value.toString(); + String name; + String url = valueS; + int idx = valueS.indexOf(' '); + if(idx == -1) { + URL tmpUrl = new URL(valueS); + name = tmpUrl.getPath(); + if(name.contains("/")) { + name = name.substring(name.lastIndexOf('/')+1); + } + } else { + StringTuple t = sfe.split(valueS); + if((t.first == null) || (t.second == null)) { + throw new IOException("Bad input line:" + valueS); + } + name = t.first; + url = t.second; + } + + Path thisTarget = new Path(target,name); + Path thisTargetTmp = new Path(target,name+".wat.gz"); + doExtract(url, thisTarget,thisTargetTmp); + + } + + private void doExtract(String url, Path target, Path targetTmp) throws IOException { + // Check if the target exists (from previous map) + long targetLen = getPathLength(target); + + int max = Integer.MAX_VALUE; + + if(targetLen > -1) { + // there's a file in the filesystem already, + + if(overrideExistentFile){ + + if(!filesystem.delete(target, false)) { + throw new 
IOException("Failed to delete old copy"); + } + } else { + return; + } + } + + FSDataOutputStream fsdOut = filesystem.create(targetTmp, false); + ExtractorOutput out; + out = new WATExtractorOutput(fsdOut); + + ResourceProducer producer = ProducerUtils.getProducer(url); + + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer exProducer = + new ExtractingResourceProducer(producer, mapper); + + Logger.getLogger("org.archive").setLevel(Level.WARNING); + + int count = 0; + int incr = 1; + while(count < max) { + try { + Resource r = exProducer.getNext(); + if(r == null) { + break; + } + count += incr; + + out.output(r); + } catch(Exception e){ + e.printStackTrace(); + + } + } + } + + + private long getPathLength(Path path) throws IOException { + FileStatus stat = null; + try { + stat = filesystem.getFileStatus(path); + // present.. check by size: + } catch (FileNotFoundException e) { + return -1; + } + return stat.getLen(); + } + + public static void setOverride(Configuration conf, boolean isOverride) { + conf.setBoolean(WAT_EXTRACTOR_OVERRIDE, isOverride); + + } +} Property changes on: trunk/archive-access/projects/ia-tools/src/main/java/org/archive/hadoop/mapreduce/WATExtractorMapper.java ___________________________________________________________________ Added: svn:mime-type + text/plain This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-02-02 17:53:09
|
Wayback-1 - Build # 113 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/113/ to view the results. |
From: <bi...@us...> - 2012-02-02 17:45:22
|
Revision: 3611 http://archive-access.svn.sourceforge.net/archive-access/?rev=3611&view=rev Author: binzino Date: 2012-02-02 17:45:12 +0000 (Thu, 02 Feb 2012) Log Message: ----------- Initial revision. Added Paths: ----------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/PerMapOutputFormat.java Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/PerMapOutputFormat.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/PerMapOutputFormat.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/PerMapOutputFormat.java 2012-02-02 17:45:12 UTC (rev 3611) @@ -0,0 +1,126 @@ +/** + * Copyright 2012 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +package org.archive.hadoop; + +import java.io.*; +import java.util.*; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileAlreadyExistsException; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.InvalidJobConfException; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputFormat; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.util.ReflectionUtils; + +/** + * OutputFormat that directs the output to a file named according to + * the input file. For instance, if the input file is "foo", then the + * output file is also named "foo". A suffix can be easily added, or + * a regex+replace applied to the input filename to produce an output + * filename. + * + * This class can be used in conjunction with FilenameInputFormat in a + * map-reduce job that only does a map() function, no reduce. By + * combining these input and output formats, it's easy to read from a + * large set of input files, process each one in a separate map task, + * and write the output to a file with a name based on the input. + * + * For example, suppose you had 1000 WARC files and your map() task + * just reads a single WARC file and outputs the number of records in + * it. Use the FilenameInputFormat and the PerMapOutputFormat, + * setting the "permap.suffix" property to ".count" and for each WARC + * input file (e.g. "foo.warc.gz") you'll get a corresponding ".count" + * file in the output (e.g. "foo.warc.gz.count"). + * + * The nice thing about using this class as the OutputFormat is that + * Hadoop will manage the temporary file for you. This means that if + * the map task fails (suppose the task node kernel panics), Hadoop + * will automatically delete the temp file from the failed task and + * re-schedule it. 
+ * + * This class assumes the actual OutputFormat is a SequenceFile. If + * not -- suppose you want to output a MapFile or plain text -- then + * specify the output format in the "permap.output.format.class" + * property. + * + * This class was inspired by Hadoop's + * <pre>org.apache.hadoop.mapred.lib.MultipleOutputFormat</pre> + */ +public class PerMapOutputFormat<K,V> extends FileOutputFormat<K,V> +{ + private String getOutputFilename( JobConf job ) + throws IOException + { + String regex = job.get( "permap.regex" , null ); + String replace = job.get( "permap.replace", null ); + String suffix = job.get( "permap.suffix" , null ); + + String inputFilename = job.get("map.input.file"); + + if ( inputFilename == null ) throw new IOException( "map.input.file is null, not running in map task?" ); + + String outputFilename = (new Path(inputFilename)).getName(); + + if ( regex != null && replace != null ) + { + outputFilename = outputFilename.replaceAll( regex, replace ); + } + else if ( suffix != null ) + { + outputFilename += suffix; + } + + if ( outputFilename == null ) throw new IOException( "outputFilename is null" ); + + return outputFilename; + } + + private OutputFormat<K,V> getOutputFormat( JobConf job ) + { + return ReflectionUtils.newInstance( job.getClass( "permap.output.format.class", + SequenceFileOutputFormat.class, + OutputFormat.class ), + job ); + } + + + public RecordWriter<K, V> getRecordWriter( FileSystem fs, JobConf job, String name, Progressable progress ) + throws IOException + { + String outputFilename = getOutputFilename( job ); + + OutputFormat<K,V> of = getOutputFormat( job ); + + return of.getRecordWriter( fs, job, outputFilename, progress ); + + } + + /** + * Over-ride the default FileOutputFormat's checkOutputSpecs() to + * allow for the target directory to already exist. 
+ */ + public void checkOutputSpecs( FileSystem ignored, JobConf job ) + throws FileAlreadyExistsException, InvalidJobConfException, IOException + { + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-02-02 17:18:54
|
Wayback-1 - Build # 112 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/112/ to view the results. |
From: <bi...@us...> - 2012-02-02 17:09:31
|
Revision: 3610 http://archive-access.svn.sourceforge.net/archive-access/?rev=3610&view=rev Author: binzino Date: 2012-02-02 17:09:25 +0000 (Thu, 02 Feb 2012) Log Message: ----------- Initial revision. Added Paths: ----------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java Added: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java (rev 0) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/hadoop/FilenameInputFormat.java 2012-02-02 17:09:25 UTC (rev 3610) @@ -0,0 +1,117 @@ +/* + * Copyright 2012 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package org.archive.hadoop; + +import java.io.*; +import java.util.*; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RecordReader; + + +/** + * Handy "input format" which maps the input filename into a "record" + * which just has the filename. 
+ * + * This is very useful for map-reduce jobs where you want to pass the + * filenames into the map() function. Use this as the input format, + * and the input filenames will be passed to the map(). The full + * pathname is given as both the key and the value to the map(). + */ +public class FilenameInputFormat extends FileInputFormat<Text,Text> +{ + /** + * Configure per Hadoop properties + */ + public void configure( JobConf conf ) + { + } + + /** + * By definition, not splitable. + */ + @Override + protected boolean isSplitable(FileSystem fs, Path file) + { + return false; + } + + /** + * Return a RecordReader which returns 1 record: the file path from + * the InputSplit. + */ + public RecordReader<Text, Text> getRecordReader( InputSplit genericSplit, + JobConf job, + Reporter reporter) + throws IOException + { + reporter.setStatus(genericSplit.toString()); + + FileSplit split = (FileSplit) genericSplit; + final Path file = split.getPath(); + + return new RecordReader<Text,Text>() + { + boolean done = false; + + public void close() + { + } + + public Text createKey() + { + return new Text(); + } + + public Text createValue() + { + return new Text(); + } + + public long getPos() + { + return 0; + } + + public float getProgress() + { + return 0.0f; + } + + public boolean next( Text key, Text value) + { + if ( done ) return false; + + key .set( file.toString() ); + value.set( file.toString() ); + + done = true ; + + return true; + } + + }; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3609 http://archive-access.svn.sourceforge.net/archive-access/?rev=3609&view=rev Author: binzino Date: 2012-01-27 01:44:25 +0000 (Fri, 27 Jan 2012) Log Message: ----------- Add lastPage as configurable param for text extraction. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-01-26 21:57:22 UTC (rev 3608) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2012-01-27 01:44:25 UTC (rev 3609) @@ -44,7 +44,8 @@ /** - * + * Nutch plugin which calls 'pdftotext' command-line utility to parse + * PDF documents, as well as extract the title. */ public class PDFParser implements Parser { @@ -80,9 +81,10 @@ fos.write( raw ); fos.close(); - String exepath = this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" ); + String exepath = this.conf.get( "nutchwax.parse.pdf2.pdftotext.path", "/usr/bin/pdftotext" ); + String lastPage = String.valueOf( this.conf.getInt( "nutchwax.parse.pdf2.lastPage", 100 ) ); - // Now create a Process to call 'pdftotext' to extract the metadata. + // Create a Process which calls 'pdftotext' to extract the metadata. Only get the first page. ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); Process p = pb.start(); @@ -98,7 +100,15 @@ p.destroy( ); - pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", tmpfile.toString(), "-" ); + // Extract the title from the HTML-formatted metadata output of the above call to pdftotext. 
+ Matcher m = Pattern.compile( "<title>(.+)</title>", Pattern.DOTALL ).matcher( head ); + if ( m.find( ) ) + { + title = m.group(1); + } + + // Create a Process which calls 'pdftotext' to extract the content. + pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-enc", "UTF-8", "-f", "1", "-l", lastPage, tmpfile.toString(), "-" ); p = pb.start( ); p.getOutputStream( ).close( ); @@ -112,16 +122,7 @@ p.destroy( ); - Matcher m = Pattern.compile( "<html>.*?<title>(.*?)</title>.*?</head>", Pattern.DOTALL ).matcher( head ); - if ( m.find( ) ) - { - title = m.group(1); - } - - //System.out.println( "head = " + head ); - //System.out.println( "title = " + title ); - - // No outlinks. + // No outlinks, sorry :( Outlink[] outlinks = new Outlink[0]; ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3608 http://archive-access.svn.sourceforge.net/archive-access/?rev=3608&view=rev Author: binzino Date: 2012-01-26 21:57:22 +0000 (Thu, 26 Jan 2012) Log Message: ----------- Fix bug to restore title, parsed-text and outlinks. Accidentally removed in previous edits. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java 2012-01-26 20:53:00 UTC (rev 3607) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java 2012-01-26 21:57:22 UTC (rev 3608) @@ -156,33 +156,23 @@ // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); - /* - if (LOG.isTraceEnabled()) { - LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); - } - // check meta directives - if (!metaTags.getNoIndex()) { // okay to index - StringBuffer sb = new StringBuffer(); - if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } - utils.getText(sb, root); // extract text - text = sb.toString(); - sb.setLength(0); - if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); } - utils.getTitle(sb, root); // extract title - title = sb.toString().trim(); - } - - if (!metaTags.getNoFollow()) { // okay to follow links - ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks - URL baseTag = utils.getBase(root); - if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } - utils.getOutlinks(baseTag!=null?baseTag:base, l, root); - outlinks = l.toArray(new Outlink[l.size()]); - if (LOG.isTraceEnabled()) { - LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl()); - } - } - */ 
+ + // Extract body text + StringBuffer sb = new StringBuffer(); + utils.getText(sb, root); // extract text + text = sb.toString(); + sb.setLength(0); + + // Extract title + utils.getTitle(sb, root); + title = sb.toString().trim(); + + // Extract outlinks + ArrayList<Outlink> l = new ArrayList<Outlink>(); + URL baseTag = utils.getBase(root); + utils.getOutlinks(baseTag!=null?baseTag:base, l, root); + outlinks = l.toArray(new Outlink[l.size()]); + ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); /* if (metaTags.getRefresh()) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2012-01-26 20:53:09
|
Revision: 3607 http://archive-access.svn.sourceforge.net/archive-access/?rev=3607&view=rev Author: binzino Date: 2012-01-26 20:53:00 +0000 (Thu, 26 Jan 2012) Log Message: ----------- Initial revision of NutchWAX custom version of parse-html plugin. Main diffs are not enforcing robots meta tag nor trying to process redirects. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java 
tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/XMLCharacterRecognizer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2012-01-26 20:51:04 UTC (rev 3606) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -92,6 +92,7 @@ <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> <ant dir="parse-pdf2" target="deploy" /> + <ant dir="parse-html2" target="deploy" /> <ant dir="html-decorator" target="deploy" /> </target> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-html2" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="parse-html2" + name="NutchWAX Html Parse Plug-in" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="parse-html2.jar"> + <export name="*"/> + </library> + <library name="tagsoup-1.2.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.parse.html" + name="NutchWAX HTML Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.archive.nutchwax.parse.html.HtmlParser" + class="org.archive.nutchwax.parse.html.HtmlParser"> + <parameter name="contentType" value="text/html"/> + <parameter name="pathSuffix" value=""/> + </implementation> + + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,740 @@ +/* + * XXX ab...@ap...: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to + * avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $ + */ +package org.archive.nutchwax.parse.html; + +import java.util.Stack; + +import org.w3c.dom.Comment; +import org.w3c.dom.Document; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.Text; +import org.w3c.dom.CDATASection; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.ext.LexicalHandler; +/** + * This class takes SAX events (in addition to some extra events + * that SAX doesn't handle yet) and adds the result to a document + * or document fragment. + */ +public class DOMBuilder + implements ContentHandler, LexicalHandler +{ + + /** Root document */ + public Document m_doc; + + /** Current node */ + protected Node m_currentNode = null; + + /** First node of document fragment or null if not a DocumentFragment */ + public DocumentFragment m_docFrag = null; + + /** Vector of element nodes */ + protected Stack m_elemStack = new Stack(); + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document fragment. + * + * @param doc Root document + * @param node Current node + */ + public DOMBuilder(Document doc, Node node) + { + m_doc = doc; + m_currentNode = node; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document fragment. 
+ * + * @param doc Root document + * @param docFrag Document fragment + */ + public DOMBuilder(Document doc, DocumentFragment docFrag) + { + m_doc = doc; + m_docFrag = docFrag; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document. + * + * @param doc Root document + */ + public DOMBuilder(Document doc) + { + m_doc = doc; + } + + /** + * Get the root node of the DOM being created. This + * is either a Document or a DocumentFragment. + * + * @return The root document or document fragment if not null + */ + public Node getRootNode() + { + return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; + } + + /** + * Get the node currently being processed. + * + * @return the current node being processed + */ + public Node getCurrentNode() + { + return m_currentNode; + } + + /** + * Return null since there is no Writer for this class. + * + * @return null + */ + public java.io.Writer getWriter() + { + return null; + } + + /** + * Append a node to the current container. + * + * @param newNode New node to append + */ + protected void append(Node newNode) throws org.xml.sax.SAXException + { + + Node currentNode = m_currentNode; + + if (null != currentNode) + { + currentNode.appendChild(newNode); + + // System.out.println(newNode.getNodeName()); + } + else if (null != m_docFrag) + { + m_docFrag.appendChild(newNode); + } + else + { + boolean ok = true; + short type = newNode.getNodeType(); + + if (type == Node.TEXT_NODE) + { + String data = newNode.getNodeValue(); + + if ((null != data) && (data.trim().length() > 0)) + { + throw new org.xml.sax.SAXException("Warning: can't output text before document element! 
Ignoring..."); + } + + ok = false; + } + else if (type == Node.ELEMENT_NODE) + { + if (m_doc.getDocumentElement() != null) + { + throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!"); + } + } + + if (ok) + m_doc.appendChild(newNode); + } + } + + /** + * Receive an object for locating the origin of SAX document events. + * + * <p>SAX parsers are strongly encouraged (though not absolutely + * required) to supply a locator: if it does so, it must supply + * the locator to the application by invoking this method before + * invoking any of the other methods in the ContentHandler + * interface.</p> + * + * <p>The locator allows the application to determine the end + * position of any document-related event, even if the parser is + * not reporting an error. Typically, the application will + * use this information for reporting its own errors (such as + * character content that does not match an application's + * business rules). The information returned by the locator + * is probably not sufficient for use with a search engine.</p> + * + * <p>Note that the locator will return correct information only + * during the invocation of the events in this interface. The + * application should not attempt to use it at any other time.</p> + * + * @param locator An object that can return the location of + * any SAX document event. + * @see org.xml.sax.Locator + */ + public void setDocumentLocator(Locator locator) + { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of a document. + * + * <p>The SAX parser will invoke this method only once, before any + * other methods in this interface or in DTDHandler (except for + * setDocumentLocator).</p> + */ + public void startDocument() throws org.xml.sax.SAXException + { + + // No action for the moment. + } + + /** + * Receive notification of the end of a document. 
+ * + * <p>The SAX parser will invoke this method only once, and it will + * be the last method invoked during the parse. The parser shall + * not invoke this method until it has either abandoned parsing + * (because of an unrecoverable error) or reached the end of + * input.</p> + */ + public void endDocument() throws org.xml.sax.SAXException + { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of an element. + * + * <p>The Parser will invoke this method at the beginning of every + * element in the XML document; there will be a corresponding + * endElement() event for every startElement() event (even when the + * element is empty). All of the element's content will be + * reported, in order, before the corresponding endElement() + * event.</p> + * + * <p>If the element name has a namespace prefix, the prefix will + * still be attached. Note that the attribute list provided will + * contain only attributes with explicit values (specified or + * defaulted): #IMPLIED attributes will be omitted.</p> + * + * + * @param ns The namespace of the node + * @param localName The local part of the qualified name + * @param name The element name. + * @param atts The attributes attached to the element, if any. + * @see #endElement + * @see org.xml.sax.Attributes + */ + public void startElement( + String ns, String localName, String name, Attributes atts) + throws org.xml.sax.SAXException + { + + Element elem; + + // Note that the namespace-aware call must be used to correctly + // construct a Level 2 DOM, even for non-namespaced nodes. 
+ if ((null == ns) || (ns.length() == 0)) + elem = m_doc.createElementNS(null,name); + else + elem = m_doc.createElementNS(ns, name); + + append(elem); + + try + { + int nAtts = atts.getLength(); + + if (0 != nAtts) + { + for (int i = 0; i < nAtts; i++) + { + + //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) ); + // First handle a possible ID attribute + if (atts.getType(i).equalsIgnoreCase("ID")) + setIDAttribute(atts.getValue(i), elem); + + String attrNS = atts.getURI(i); + + if("".equals(attrNS)) + attrNS = null; // DOM represents no-namespace as null + + // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) + // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); + // Crimson won't let us set an xmlns: attribute on the DOM. + String attrQName = atts.getQName(i); + + // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace + if (attrQName.startsWith("xmlns:")) + attrNS = "http://www.w3.org/2000/xmlns/"; + + // ALWAYS use the DOM Level 2 call! + elem.setAttributeNS(attrNS,attrQName, atts.getValue(i)); + } + } + + // append(elem); + + m_elemStack.push(elem); + + m_currentNode = elem; + + // append(elem); + } + catch(java.lang.Exception de) + { + // de.printStackTrace(); + throw new org.xml.sax.SAXException(de); + } + + } + + /** + + + + * Receive notification of the end of an element. 
+ * + * <p>The SAX parser will invoke this method at the end of every + * element in the XML document; there will be a corresponding + * startElement() event for every endElement() event (even when the + * element is empty).</p> + * + * <p>If the element name has a namespace prefix, the prefix will + * still be attached to the name.</p> + * + * + * @param ns the namespace of the element + * @param localName The local part of the qualified name of the element + * @param name The element name + */ + public void endElement(String ns, String localName, String name) + throws org.xml.sax.SAXException + { + m_elemStack.pop(); + m_currentNode = m_elemStack.isEmpty() ? null : (Node)m_elemStack.peek(); + } + + /** + * Set an ID string to node association in the ID table. + * + * @param id The ID string. + * @param elem The associated ID. + */ + public void setIDAttribute(String id, Element elem) + { + + // Do nothing. This method is meant to be overiden. + } + + /** + * Receive notification of character data. + * + * <p>The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity, so that the Locator + * provides useful information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * <p>Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating + * parsers must do so).</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. 
+ * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + if (m_inCData) + { + cdata(ch, start, length); + + return; + } + + String s = new String(ch, start, length); + Node childNode; + childNode = m_currentNode != null ? m_currentNode.getLastChild(): null; + if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){ + ((Text)childNode).appendData(s); + } + else{ + Text text = m_doc.createTextNode(s); + append(text); + } + } + + /** + * If available, when the disable-output-escaping attribute is used, + * output raw text without escaping. A PI will be inserted in front + * of the node with the name "lotusxsl-next-is-raw" and a value of + * "formatter-to-dom". + * + * @param ch Array containing the characters + * @param start Index to start of characters in the array + * @param length Number of characters in the array + */ + public void charactersRaw(char ch[], int start, int length) + throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + + String s = new String(ch, start, length); + + append(m_doc.createProcessingInstruction("xslt-next-is-raw", + "formatter-to-dom")); + append(m_doc.createTextNode(s)); + } + + /** + * Report the beginning of an entity. + * + * The start and end of the document entity are not reported. + * The start and end of the external DTD subset are reported + * using the pseudo-name "[dtd]". All other events must be + * properly nested within start/end entity events. + * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%'. 
+ * @see #endEntity + * @see org.xml.sax.ext.DeclHandler#internalEntityDecl + * @see org.xml.sax.ext.DeclHandler#externalEntityDecl + */ + public void startEntity(String name) throws org.xml.sax.SAXException + { + + // Almost certainly the wrong behavior... + // entityReference(name); + } + + /** + * Report the end of an entity. + * + * @param name The name of the entity that is ending. + * @see #startEntity + */ + public void endEntity(String name) throws org.xml.sax.SAXException{} + + /** + * Receive notivication of a entityReference. + * + * @param name name of the entity reference + */ + public void entityReference(String name) throws org.xml.sax.SAXException + { + append(m_doc.createEntityReference(name)); + } + + /** + * Receive notification of ignorable whitespace in element content. + * + * <p>Validating Parsers must use this method to report each chunk + * of ignorable whitespace (see the W3C XML 1.0 recommendation, + * section 2.10): non-validating parsers may also use this method + * if they are capable of parsing and using content models.</p> + * + * <p>SAX parsers may return all contiguous whitespace in a single + * chunk, or they may split it into several chunks; however, all of + * the characters in any single event must come from the same + * external entity, so that the Locator provides useful + * information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #characters + */ + public void ignorableWhitespace(char ch[], int start, int length) + throws org.xml.sax.SAXException + { + if(isOutsideDocElem()) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + append(m_doc.createTextNode(s)); + } + + /** + * Tell if the current node is outside the document element. 
+ * + * @return true if the current node is outside the document element. + */ + private boolean isOutsideDocElem() + { + return (null == m_docFrag) && m_elemStack.size() == 0 && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); + } + + /** + * Receive notification of a processing instruction. + * + * <p>The Parser will invoke this method once for each processing + * instruction found: note that processing instructions may occur + * before or after the main document element.</p> + * + * <p>A SAX parser should never report an XML declaration (XML 1.0, + * section 2.8) or a text declaration (XML 1.0, section 4.3.1) + * using this method.</p> + * + * @param target The processing instruction target. + * @param data The processing instruction data, or null if + * none was supplied. + */ + public void processingInstruction(String target, String data) + throws org.xml.sax.SAXException + { + append(m_doc.createProcessingInstruction(target, data)); + } + + /** + * Report an XML comment anywhere in the document. + * + * This callback will be used for comments inside or outside the + * document element, including comments in the external DTD + * subset (if read). + * + * @param ch An array holding the characters in the comment. + * @param start The starting position in the array. + * @param length The number of characters to use from the array. + */ + public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException + { + // tagsoup sometimes submits invalid values here + if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) return; + append(m_doc.createComment(new String(ch, start, length))); + } + + /** Flag indicating that we are processing a CData section */ + protected boolean m_inCData = false; + + /** + * Report the start of a CDATA section. 
+ * + * @see #endCDATA + */ + public void startCDATA() throws org.xml.sax.SAXException + { + m_inCData = true; + append(m_doc.createCDATASection("")); + } + + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + public void endCDATA() throws org.xml.sax.SAXException + { + m_inCData = false; + } + + /** + * Receive notification of cdata. + * + * <p>The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity, so that the Locator + * provides useful information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * <p>Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating + * parsers must do so).</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + // XXX ab...@ap...: modified from the original, to accomodate TagSoup. + Node n = m_currentNode.getLastChild(); + if (n instanceof CDATASection) + ((CDATASection)n).appendData(s); + else if (n instanceof Comment) + ((Comment)n).appendData(s); + } + + /** + * Report the start of DTD declarations, if any. + * + * Any declarations are assumed to be in the internal subset + * unless otherwise indicated. + * + * @param name The document type name. 
+ * @param publicId The declared public identifier for the + * external DTD subset, or null if none was declared. + * @param systemId The declared system identifier for the + * external DTD subset, or null if none was declared. + * @see #endDTD + * @see #startEntity + */ + public void startDTD(String name, String publicId, String systemId) + throws org.xml.sax.SAXException + { + + // Do nothing for now. + } + + /** + * Report the end of DTD declarations. + * + * @see #startDTD + */ + public void endDTD() throws org.xml.sax.SAXException + { + + // Do nothing for now. + } + + /** + * Begin the scope of a prefix-URI Namespace mapping. + * + * <p>The information from this event is not necessary for + * normal Namespace processing: the SAX XML reader will + * automatically replace prefixes for element and attribute + * names when the http://xml.org/sax/features/namespaces + * feature is true (the default).</p> + * + * <p>There are cases, however, when applications need to + * use prefixes in character data or in attribute values, + * where they cannot safely be expanded automatically; the + * start/endPrefixMapping event supplies the information + * to the application to expand prefixes in those contexts + * itself, if necessary.</p> + * + * <p>Note that start/endPrefixMapping events are not + * guaranteed to be properly nested relative to each-other: + * all startPrefixMapping events will occur before the + * corresponding startElement event, and all endPrefixMapping + * events will occur after the corresponding endElement event, + * but their order is not guaranteed.</p> + * + * @param prefix The Namespace prefix being declared. + * @param uri The Namespace URI the prefix is mapped to. + * @see #endPrefixMapping + * @see #startElement + */ + public void startPrefixMapping(String prefix, String uri) + throws org.xml.sax.SAXException + { + + /* + // Not sure if this is needed or wanted + // Also, it fails in the stree. 
+ if((null != m_currentNode) + && (m_currentNode.getNodeType() == Node.ELEMENT_NODE)) + { + String qname; + if(((null != prefix) && (prefix.length() == 0)) + || (null == prefix)) + qname = "xmlns"; + else + qname = "xmlns:"+prefix; + + Element elem = (Element)m_currentNode; + String val = elem.getAttribute(qname); // Obsolete, should be DOM2...? + if(val == null) + { + elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", + qname, uri); + } + } + */ + } + + /** + * End the scope of a prefix-URI mapping. + * + * <p>See startPrefixMapping for details. This event will + * always occur after the corresponding endElement event, + * but the order of endPrefixMapping events is not otherwise + * guaranteed.</p> + * + * @param prefix The prefix that was being mapping. + * @see #startPrefixMapping + * @see #endElement + */ + public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{} + + /** + * Receive notification of a skipped entity. + * + * <p>The Parser will invoke this method once for each entity + * skipped. Non-validating processors may skip entities if they + * have not seen the declarations (because, for example, the + * entity was declared in an external DTD subset). All processors + * may skip external entities, depending on the values of the + * http://xml.org/sax/features/external-general-entities and the + * http://xml.org/sax/features/external-parameter-entities + * properties.</p> + * + * @param name The name of the skipped entity. If it is a + * parameter entity, the name will begin with '%'. 
+ */ + public void skippedEntity(String name) throws org.xml.sax.SAXException{} +} Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,419 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.parse.html; + +import java.net.URL; +import java.net.MalformedURLException; +import java.util.Collection; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Stack; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.util.NodeWalker; +import org.apache.hadoop.conf.Configuration; + +import org.w3c.dom.*; + +/** + * A collection of methods for extracting content from DOM trees. + * + * This class holds a few utility methods for pulling content out of + * DOM nodes, such as getOutlinks, getText, etc. 
+ * + */ +public class DOMContentUtils { + + public static class LinkParams { + public String elName; + public String attrName; + public int childLen; + + public LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap linkParams = new HashMap(); + private Configuration conf; + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection<String> forceTags = new ArrayList<String>(1); + + this.conf = conf; + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", true)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) { + if ( ! forceTags.contains(ignoreTags[i]) ) + linkParams.remove(ignoreTags[i]); + } + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, + * and will append all the content text found beneath the DOM node to + * the <code>StringBuffer</code>. 
+ * + * <p> + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will + * be aborted and the <code>StringBuffer</code> will not contain + * any text encountered after a nested anchor is found. + * + * <p> + * + * @return true if nested anchors were found + */ + public boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + + /** + * This is a convinience method, equivalent to {@link + * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * + */ + public void getText(StringBuffer sb, Node node) { + getText(sb, node, false); + } + + // returns true if abortOnNestedAnchors is true and we find nested + // anchors + private boolean getTextHelper(StringBuffer sb, Node node, + boolean abortOnNestedAnchors, + int anchorDepth) { + boolean abort = false; + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("script".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if ("style".equalsIgnoreCase(nodeName)) { + walker.skipChildren(); + } + if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) { + anchorDepth++; + if (anchorDepth > 1) { + abort = true; + break; + } + } + if (nodeType == Node.COMMENT_NODE) { + walker.skipChildren(); + } + if (nodeType == Node.TEXT_NODE) { + // cleanup and trim the value + String text = currentNode.getNodeValue(); + text = text.replaceAll("\\s+", " "); + text = text.trim(); + if (text.length() > 0) { + if (sb.length() > 0) sb.append(' '); + sb.append(text); + } + } + } + + return abort; + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, + * and will append the content text found beneath the first + * <code>title</code> node to the <code>StringBuffer</code>. 
+ * + * @return true if a title node was found, false otherwise + */ + public boolean getTitle(StringBuffer sb, Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return false; + } + + if (nodeType == Node.ELEMENT_NODE) { + if ("title".equalsIgnoreCase(nodeName)) { + getText(sb, currentNode); + return true; + } + } + } + + return false; + } + + /** If Node contains a BASE tag then it's HREF is returned. */ + public URL getBase(Node node) { + + NodeWalker walker = new NodeWalker(node); + + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + // is this node a BASE tag? + if (nodeType == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD + return null; + } + + if ("base".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + for (int i= 0; i < attrs.getLength(); i++ ) { + Node attr = attrs.item(i); + if ("href".equalsIgnoreCase(attr.getNodeName())) { + try { + return new URL(attr.getNodeValue()); + } catch (MalformedURLException e) {} + } + } + } + } + } + + // no. + return null; + } + + + private boolean hasOnlyWhiteSpace(Node node) { + String val= node.getNodeValue(); + for (int i= 0; i < val.length(); i++) { + if (!Character.isWhitespace(val.charAt(i))) + return false; + } + return true; + } + + // this only covers a few cases of empty links that are symptomatic + // of nekohtml's DOM-fixup process... 
+ private boolean shouldThrowAwayLink(Node node, NodeList children, + int childLen, LinkParams params) { + if (childLen == 0) { + // this has no inner structure + if (params.childLen == 0) return false; + else return true; + } else if ((childLen == 1) + && (children.item(0).getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { + // single nested link + return true; + + } else if (childLen == 2) { + + Node c0= children.item(0); + Node c1= children.item(1); + + if ((c0.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c0.getNodeName())) + && (c1.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c1) ) { + // single link followed by whitespace node + return true; + } + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c0) ) { + // whitespace node followed by single link + return true; + } + + } else if (childLen == 3) { + Node c0= children.item(0); + Node c1= children.item(1); + Node c2= children.item(2); + + if ((c1.getNodeType() == Node.ELEMENT_NODE) + && (params.elName.equalsIgnoreCase(c1.getNodeName())) + && (c0.getNodeType() == Node.TEXT_NODE) + && (c2.getNodeType() == Node.TEXT_NODE) + && hasOnlyWhiteSpace(c0) + && hasOnlyWhiteSpace(c2) ) { + // single link surrounded by whitespace nodes + return true; + } + } + + return false; + } + + /** + * Handles cases where the url param information is encoded into the base + * url as opposed to the target. + * <p> + * If the taget contains params (i.e. ';xxxx') information then the target + * params information is assumed to be correct and any base params information + * is ignored. If the base contains params information but the tareget does + * not, then the params information is moved to the target allowing it to be + * correctly determined by the java.net.URL class. + * + * @param base The base URL. 
+ * @param target The target path from the base URL. + * + * @return URL A URL with the params information correctly encoded. + * + * @throws MalformedURLException If the url is not a well formed URL. + */ + private URL fixEmbeddedParams(URL base, String target) + throws MalformedURLException{ + + // the target contains params information or the base doesn't then no + // conversion necessary, return regular URL + if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { + return new URL(base, target); + } + + // get the base url and it params information + String baseURL = base.toString(); + int startParams = baseURL.indexOf(';'); + String params = baseURL.substring(startParams); + + // if the target has a query string then put the params information after + // any path but before the query string, otherwise just append to the path + int startQS = target.indexOf('?'); + if (startQS >= 0) { + target = target.substring(0, startQS) + params + + target.substring(startQS); + } + else { + target += params; + } + + return new URL(base, target); + } + + /** + * This method finds all anchors below the supplied DOM + * <code>node</code>, and creates appropriate {@link Outlink} + * records for each (relative to the supplied <code>base</code> + * URL), and adds them to the <code>outlinks</code> {@link + * ArrayList}. + * + * <p> + * + * Links without inner structure (tags, text, etc) are discarded, as + * are links which contain only single nested links and empty text + * nodes (this is a common DOM-fixup artifact, at least with + * nekohtml). + */ + public void getOutlinks(URL base, ArrayList outlinks, + Node node) { + + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + NodeList children = currentNode.getChildNodes(); + int childLen = (children != null) ? 
children.getLength() : 0; + + if (nodeType == Node.ELEMENT_NODE) { + + nodeName = nodeName.toLowerCase(); + LinkParams params = (LinkParams)linkParams.get(nodeName); + if (params != null) { + if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { + + StringBuffer linkText = new StringBuffer(); + getText(linkText, currentNode, true); + + NamedNodeMap attrs = currentNode.getAttributes(); + String target = null; + boolean noFollow = false; + boolean post = false; + for (int i= 0; i < attrs.getLength(); i++ ) { + Node attr = attrs.item(i); + String attrName = attr.getNodeName(); + if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); + } else if ("rel".equalsIgnoreCase(attrName) && + "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; + } else if ("method".equalsIgnoreCase(attrName) && + "post".equalsIgnoreCase(attr.getNodeValue())) { + post = true; + } + } + if (target != null && !noFollow && !post) + try { + + URL url = (base.toString().indexOf(';') > 0) ? + fixEmbeddedParams(base, target) : new URL(base, target); + outlinks.add(new Outlink(url.toString(), + linkText.toString().trim())); + } catch (MalformedURLException e) { + // don't care + } + } + // this should not have any children, skip them + if (params.childLen == 0) continue; + } + } + } + } + +} + Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,213 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.parse.html; + +import java.net.URL; + +import org.apache.nutch.parse.HTMLMetaTags; +import org.w3c.dom.*; + +/** + * Class for parsing META Directives from DOM trees. This class + * handles specifically Robots META directives (all, none, nofollow, + * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache + * instructions. All meta directives are stored in a HTMLMetaTags instance. + */ +public class HTMLMetaProcessor { + + /** + * Utility class with indicators for the robots directives "noindex" + * and "nofollow", and HTTP-EQUIV/no-cache + */ + + /** + * Sets the indicators in <code>robotsMeta</code> to appropriate + * values, based on any META tags found under the given + * <code>node</code>. 
+ */ + public static final void getMetaTags ( + HTMLMetaTags metaTags, Node node, URL currURL) { + + metaTags.reset(); + getMetaTagsHelper(metaTags, node, currURL); + } + + private static final void getMetaTagsHelper( + HTMLMetaTags metaTags, Node node, URL currURL) { + + if (node.getNodeType() == Node.ELEMENT_NODE) { + + if ("body".equalsIgnoreCase(node.getNodeName())) { + // META tags should not be under body + return; + } + + if ("meta".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node nameNode = null; + Node equivNode = null; + Node contentNode = null; + // Retrieves name, http-equiv and content attribues + for (int i=0; i<attrs.g... [truncated message content] |
Revision: 3606 http://archive-access.svn.sourceforge.net/archive-access/?rev=3606&view=rev Author: binzino Date: 2012-01-26 20:51:04 +0000 (Thu, 26 Jan 2012) Log Message: ----------- Add robots to list of meta tags. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java 2012-01-26 02:45:25 UTC (rev 3605) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/html-decorator/src/java/org/archive/nutchwax/html/HtmlDecorator.java 2012-01-26 20:51:04 UTC (rev 3606) @@ -1,3 +1,19 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.archive.nutchwax.html; @@ -15,7 +31,11 @@ import org.w3c.dom.DocumentFragment; - +/** + * Simple HtmlParseFilter which copies the important meta tags to the + * Content metadata. At the moment the important ones are: robots, + * description, keywords. 
+ */ public class HtmlDecorator implements HtmlParseFilter { private Configuration conf; @@ -34,7 +54,7 @@ Metadata contentMeta = parseResult.get( content.getUrl( ) ).getData().getContentMeta(); - for ( String key : new String[] { "description", "keywords" } ) + for ( String key : new String[] { "description", "keywords", "robots" } ) { String value = metaTags.getGeneralTags().getProperty( key, "" ); @@ -44,11 +64,13 @@ return parseResult; } - public void setConf(Configuration conf) { + public void setConf(Configuration conf) + { this.conf = conf; } - public Configuration getConf() { + public Configuration getConf() + { return this.conf; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-26 02:52:49
|
Wayback-1 - Build # 111 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/111/ to view the results. |
From: <vin...@us...> - 2012-01-26 02:45:32
|
Revision: 3605 http://archive-access.svn.sourceforge.net/archive-access/?rev=3605&view=rev Author: vinaygoel Date: 2012-01-26 02:45:25 +0000 (Thu, 26 Jan 2012) Log Message: ----------- BUGFIX: Wrapped OutOfMemoryError as ResourceParseException. Modified Paths: -------------- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLResourceFactory.java Modified: trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLResourceFactory.java =================================================================== --- trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLResourceFactory.java 2012-01-25 23:47:49 UTC (rev 3604) +++ trunk/archive-access/projects/archive-commons/src/main/java/org/archive/resource/html/HTMLResourceFactory.java 2012-01-26 02:45:25 UTC (rev 3605) @@ -35,6 +35,8 @@ } catch (ParserException e) { e.printStackTrace(); throw new ResourceParseException(e); + } catch(OutOfMemoryError e) { + throw new ResourceParseException(null); } return new HTMLResource(hmd,container); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nl...@ar...> - 2012-01-25 23:56:38
|
Wayback-1 - Build # 110 - Successful: Check console output at https://builds.archive.org:1443/job/Wayback-1/110/ to view the results. |
Revision: 3604 http://archive-access.svn.sourceforge.net/archive-access/?rev=3604&view=rev Author: ikreymer Date: 2012-01-25 23:47:49 +0000 (Wed, 25 Jan 2012) Log Message: ----------- FIX: Add ability to specify the partitioner to be used with old-style ResultsPartitioner partitions Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/resultspartitioner/ResultsPartitionsFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/resultspartitioner/ResultsPartitionsFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/resultspartitioner/ResultsPartitionsFactory.java 2012-01-24 18:58:08 UTC (rev 3603) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/query/resultspartitioner/ResultsPartitionsFactory.java 2012-01-25 23:47:49 UTC (rev 3604) @@ -53,6 +53,11 @@ */ public static ArrayList<ResultsPartition> get(CaptureSearchResults results, WaybackRequest wbRequest) { + return get(results, wbRequest, null); + } + + public static ArrayList<ResultsPartition> get(CaptureSearchResults results, + WaybackRequest wbRequest, ResultsPartitioner defaultPartitioner) { Timestamp startTS = Timestamp.parseBefore(results.getFilter( WaybackRequest.REQUEST_START_DATE)); Timestamp endTS = Timestamp.parseAfter(results.getFilter( @@ -67,14 +72,17 @@ long msSpanned = endDate.getTime() - startDate.getTime(); int secsSpanned = (int) Math.ceil(msSpanned / 1000); - ResultsPartitioner partitioner = null; - for(int i = 0; i < partitioners.length; i++) { - partitioner = partitioners[i]; - if(partitioner.maxSecondsSpanned() >= secsSpanned) { - break; + ResultsPartitioner partitioner = defaultPartitioner; + + if (partitioner == null) { + for(int i = 0; i < partitioners.length; i++) { + partitioner = partitioners[i]; + 
if(partitioner.maxSecondsSpanned() >= secsSpanned) { + break; + } } } - + // now use the partitioner to initialize and populate the // ResultPartition objects: ArrayList<ResultsPartition> partitions = This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2012-01-24 18:58:18
|
Revision: 3603 http://archive-access.svn.sourceforge.net/archive-access/?rev=3603&view=rev Author: binzino Date: 2012-01-24 18:58:08 +0000 (Tue, 24 Jan 2012) Log Message: ----------- Employ -s option. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:28:14 UTC (rev 3602) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2012-01-24 18:58:08 UTC (rev 3603) @@ -737,7 +737,7 @@ if ( ! fs.getFileStatus( outputDir ).isDir() ) { - System.err.println( "ERROR: Output directory is not a directory: " + outputDir ); + LOG.fatal( "Output is not a directory: " + outputDir ); return 2; } @@ -754,11 +754,13 @@ if ( fs.exists( outputPath ) ) { - System.err.println( "ERROR: Output path already exists: " + outputPath ); - if ( ! 
skipExisting ) + if ( skipExisting ) { - return 3; + LOG.warn( "Skipping output path which already exists: " + outputPath ); + continue ; } + LOG.fatal( "Output path already exists: " + outputPath ); + return 3; } job.setJobName( "Importer " + inputPath ); @@ -790,9 +792,7 @@ catch ( Exception e ) { LOG.fatal( "Importer: ", e ); - System.out.println( "Fatal error: " + e ); - e.printStackTrace( System.out ); - return -1; + return 4; } } @@ -805,7 +805,9 @@ "Usage: Importer [opts] <input> <output_dir>]\n" + "Options:\n" + " -e filename Exclusions file, over-rides configuration property.\n" - + " -m Inputs are manifest files\n" + + " -m Inputs are manifest files\n" + + " -s Skip inputs where corresponding output directory exists.\n" + + " Without -s, processing reports error and stops.\n" + "\n" ; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |