You can subscribe to this list here.
2005 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(1) |
Aug
(10) |
Sep
(36) |
Oct
(339) |
Nov
(103) |
Dec
(152) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2006 |
Jan
(141) |
Feb
(102) |
Mar
(125) |
Apr
(203) |
May
(57) |
Jun
(30) |
Jul
(139) |
Aug
(46) |
Sep
(64) |
Oct
(105) |
Nov
(34) |
Dec
(162) |
2007 |
Jan
(81) |
Feb
(57) |
Mar
(141) |
Apr
(72) |
May
(9) |
Jun
(1) |
Jul
(144) |
Aug
(88) |
Sep
(40) |
Oct
(43) |
Nov
(34) |
Dec
(20) |
2008 |
Jan
(44) |
Feb
(45) |
Mar
(16) |
Apr
(36) |
May
(8) |
Jun
(77) |
Jul
(177) |
Aug
(66) |
Sep
(8) |
Oct
(33) |
Nov
(13) |
Dec
(37) |
2009 |
Jan
(2) |
Feb
(5) |
Mar
(8) |
Apr
|
May
(36) |
Jun
(19) |
Jul
(46) |
Aug
(8) |
Sep
(1) |
Oct
(66) |
Nov
(61) |
Dec
(10) |
2010 |
Jan
(13) |
Feb
(16) |
Mar
(38) |
Apr
(76) |
May
(47) |
Jun
(32) |
Jul
(35) |
Aug
(45) |
Sep
(20) |
Oct
(61) |
Nov
(24) |
Dec
(16) |
2011 |
Jan
(22) |
Feb
(34) |
Mar
(11) |
Apr
(8) |
May
(24) |
Jun
(23) |
Jul
(11) |
Aug
(42) |
Sep
(81) |
Oct
(48) |
Nov
(21) |
Dec
(20) |
2012 |
Jan
(30) |
Feb
(25) |
Mar
(4) |
Apr
(6) |
May
(1) |
Jun
(5) |
Jul
(5) |
Aug
(8) |
Sep
(6) |
Oct
(6) |
Nov
|
Dec
|
From: <bra...@us...> - 2010-12-30 19:54:39
|
Revision: 3355 http://archive-access.svn.sourceforge.net/archive-access/?rev=3355&view=rev Author: bradtofel Date: 2010-12-30 19:54:31 +0000 (Thu, 30 Dec 2010) Log Message: ----------- checkpoint prior to major version refactoring Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/pom.xml trunk/archive-access/projects/wayback/pom.xml trunk/archive-access/projects/wayback/wayback-core/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml trunk/archive-access/projects/wayback/wayback-webapp/pom.xml Modified: trunk/archive-access/projects/wayback/dist/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/pom.xml 2010-12-09 22:58:06 UTC (rev 3354) +++ trunk/archive-access/projects/wayback/dist/pom.xml 2010-12-30 19:54:31 UTC (rev 3355) @@ -1,63 +1,25 @@ -<?xml version="1.0"?> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> - <groupId>org.archive</groupId> - <artifactId>wayback</artifactId> + <artifactId>parent</artifactId> + <groupId>org.archive.wayback</groupId> <version>1.6.0</version> </parent> - <modelVersion>4.0.0</modelVersion> - <groupId>org.archive.wayback</groupId> - <artifactId>wayback</artifactId> + <artifactId>dist</artifactId> <name>Wayback tar.gz Distribution</name> <packaging>pom</packaging> - <pluginRepositories> - <pluginRepository> - <releases> - <enabled>true</enabled> - <updatePolicy>daily</updatePolicy> - <checksumPolicy>warn</checksumPolicy> - </releases> - <snapshots> - 
<enabled>true</enabled> - <updatePolicy>never</updatePolicy> - <checksumPolicy>fail</checksumPolicy> - </snapshots> - <id>agilejava</id> - <name>agilejava.com</name> - <url>http://www.agilejava.com/maven</url> - <layout>default</layout> - </pluginRepository> - </pluginRepositories> - - <repositories> - <repository> - <releases> - <enabled>true</enabled> - <updatePolicy>always</updatePolicy> - <checksumPolicy>warn</checksumPolicy> - </releases> - <snapshots> - <enabled>true</enabled> - <updatePolicy>never</updatePolicy> - <checksumPolicy>fail</checksumPolicy> - </snapshots> - <id>internetarchive</id> - <name>Internet Archive Maven Repository</name> - <url>http://builds.archive.org:8080/maven2</url> - <layout>default</layout> - </repository> - </repositories> - <dependencies> <dependency> - <groupId>org.archive.wayback</groupId> + <groupId>${project.groupId}</groupId> <artifactId>wayback-core</artifactId> - <version>1.6.0</version> </dependency> </dependencies> - <build> <plugins> Modified: trunk/archive-access/projects/wayback/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/pom.xml 2010-12-09 22:58:06 UTC (rev 3354) +++ trunk/archive-access/projects/wayback/pom.xml 2010-12-30 19:54:31 UTC (rev 3355) @@ -1,28 +1,27 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - POM reference: http://maven.apache.org/pom.html +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> - List of the better articles on maven: + <groupId>org.archive.wayback</groupId> + <artifactId>parent</artifactId> + <packaging>pom</packaging> + <version>1.6.0</version> + <name>Wayback</name> - http://www.javaworld.com/javaworld/jw-05-2006/jw-0529-maven.html - http://www.javaworld.com/javaworld/jw-02-2006/jw-0227-maven_p.html + <modules> + 
<module>wayback-core</module> + <module>wayback-webapp</module> + <module>wayback-hadoop-java</module> + <module>wayback-hadoop</module> + <module>dist</module> + </modules> - URLs on converting from 1.0 to 2.0 maven (not much good generally): - - http://wiki.osafoundation.org/bin/view/Journal/Maven2Upgrade - http://maven.apache.org/guides/mini/guide-m1-m2.html - --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - <modelVersion>4.0.0</modelVersion> - <groupId>org.archive</groupId> - <artifactId>wayback</artifactId> <properties> - <globalVersion>1.6.0</globalVersion> - <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <website.url>http://archive-access.sourceforge.net/</website.url> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> </properties> - <version>1.6.0</version> - <packaging>pom</packaging> - <name>Wayback</name> <description> The wayback project is an open source implementation of the @@ -34,8 +33,8 @@ <licenses> <license> - <name>GNU LESSER GENERAL PUBLIC LICENSE</name> - <url>http://www.gnu.org/licenses/lgpl.txt</url> + <name>Apache License, Version 2.0</name> + <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> <distribution>repo</distribution> </license> </licenses> @@ -86,24 +85,12 @@ <repositories> <repository> - <releases> - <enabled>true</enabled> - <updatePolicy>daily</updatePolicy> - <checksumPolicy>warn</checksumPolicy> - </releases> - <snapshots> - <enabled>true</enabled> - <updatePolicy>daily</updatePolicy> - <checksumPolicy>fail</checksumPolicy> - </snapshots> + <id>internetarchive</id> <name>Internet Archive Maven Repository</name> <url>http://builds.archive.org:8080/maven2</url> <layout>default</layout> - </repository> -<!-- - <repository> <releases> <enabled>true</enabled> <updatePolicy>daily</updatePolicy> @@ -114,18 +101,17 @@ 
<updatePolicy>daily</updatePolicy> <checksumPolicy>fail</checksumPolicy> </snapshots> - <id>dspace</id> - <name>DSpace Maven Repository</name> - <url>http://maven.dspace.org/</url> - <layout>default</layout> </repository> - --> - - </repositories> <pluginRepositories> <pluginRepository> + + <id>archive</id> + <name>archive.org</name> + <url>http://builds.archive.org:8080/maven2</url> + <layout>default</layout> + <releases> <enabled>true</enabled> <updatePolicy>daily</updatePolicy> @@ -136,10 +122,6 @@ <updatePolicy>never</updatePolicy> <checksumPolicy>fail</checksumPolicy> </snapshots> - <id>archive</id> - <name>archive.org</name> - <url>http://builds.archive.org:8080/maven2</url> - <layout>default</layout> </pluginRepository> </pluginRepositories> @@ -149,47 +131,34 @@ </connection> <tag>HEAD</tag> <url> - https://archive-access.svn.sourceforge.net/svnroot/archive-access/trunk/archive-access/projects/ + http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/ </url> </scm> <prerequisites> - <maven>2.0.9</maven> + <maven>2.1</maven> </prerequisites> - <dependencyManagement> - <!--Dependeny management is not same as dependencies (ugh)--> - <dependencies> - <dependency> - <groupId>org.archive.wayback</groupId> - <artifactId>wayback-core</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.archive.wayback</groupId> - <artifactId>wayback-hadoop-java</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.archive.wayback</groupId> - <artifactId>wayback-hadoop</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.archive.wayback</groupId> - <artifactId>wayback-webapp</artifactId> - <version>${project.version}</version> - </dependency> - </dependencies> - </dependencyManagement> - <modules> - <module>wayback-core</module> - <module>wayback-webapp</module> - <module>wayback-hadoop-java</module> - 
<module>wayback-hadoop</module> - <module>dist</module> - </modules> + <build> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <source>1.5</source> + <target>1.5</target> + </configuration> + </plugin> + <plugin> + <groupId>org.mortbay.jetty</groupId> + <artifactId>maven-jetty-plugin</artifactId> + <version>6.1.22</version> + </plugin> + </plugins> + </pluginManagement> + </build> <distributionManagement> <repository> @@ -223,12 +192,14 @@ <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-javadoc-plugin</artifactId> <configuration> - <aggregate>true</aggregate> + <code>javadoc:aggregate</code> + <code>javadoc:test-aggregate</code> </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-project-info-reports-plugin</artifactId> + <version>2.1</version> <reportSets> <reportSet> <reports> @@ -246,4 +217,92 @@ </plugins> </reporting> +<!-- + Finally, the dependencyManagement - all version coordinates for all + dependencies should be specified here, allowing child modules to specify + only groupId and artifactId coordinates. +--> + + <dependencyManagement> + <dependencies> + + <!-- + Link all Wayback child module version dependencies to the parent POM + version. 
+ --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>wayback-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>wayback-hadoop-java</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>wayback-hadoop</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>wayback-webapp</artifactId> + <version>${project.version}</version> + </dependency> + + + + <dependency> + <groupId>org.apache.geronimo.specs</groupId> + <artifactId>geronimo-servlet_2.5_spec</artifactId> + <version>1.2</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.archive.heritrix</groupId> + <artifactId>heritrix-commons</artifactId> + <version>3.1.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.archive.access-control</groupId> + <artifactId>access-control</artifactId> + <version>0.0.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.mozilla</groupId> + <artifactId>juniversalchardet</artifactId> + <version>1.0.3</version> + </dependency> + <dependency> + <groupId>org.springframework</groupId> + <artifactId>spring-core</artifactId> + <version>2.5.1</version> + </dependency> + <dependency> + <groupId>org.springframework</groupId> + <artifactId>spring-beans</artifactId> + <version>2.5.1</version> + </dependency> + <dependency> + <groupId>org.beanshell</groupId> + <artifactId>bsh</artifactId> + <version>2.0b4</version> + </dependency> + <dependency> + <groupId>org.htmlparser</groupId> + <artifactId>htmlparser</artifactId> + <version>1.6</version> + </dependency> + <dependency> + <groupId>com.flagstone</groupId> + <artifactId>transform</artifactId> + <version>3.0.1-SNAPSHOT</version> + </dependency> + + </dependencies> + </dependencyManagement> + + + </project> \ No 
newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-12-09 22:58:06 UTC (rev 3354) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-12-30 19:54:31 UTC (rev 3355) @@ -1,40 +1,24 @@ -<?xml version="1.0"?> -<!-- - POM reference: http://maven.apache.org/pom.html +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/maven-v4_0_0.xsd"> - List of the better articles on maven: - - http://www.javaworld.com/javaworld/jw-05-2006/jw-0529-maven.html - http://www.javaworld.com/javaworld/jw-02-2006/jw-0227-maven_p.html - - URLs on converting from 1.0 to 2.0 maven (not much good generally): - - http://wiki.osafoundation.org/bin/view/Journal/Maven2Upgrade - http://maven.apache.org/guides/mini/guide-m1-m2.html - --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> + <parent> - <groupId>org.archive</groupId> - <artifactId>wayback</artifactId> + <artifactId>parent</artifactId> + <groupId>org.archive.wayback</groupId> <version>1.6.0</version> </parent> - <groupId>org.archive.wayback</groupId> + <artifactId>wayback-core</artifactId> - <name>Wayback Core Classes</name> + <name>Wayback Core Java Classes</name> <packaging>jar</packaging> +<!-- <build> <plugins> <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <configuration> - <source>1.5</source> - <target>1.5</target> - </configuration> - </plugin> - <plugin> <artifactId>maven-jar-plugin</artifactId> <configuration> <archive> @@ -47,13 +31,13 @@ </plugin> </plugins> 
</build> +--> <dependencies> - <dependency> - <groupId>javax.servlet</groupId> - <artifactId>servlet-api</artifactId> - <version>2.4</version> + <groupId>org.apache.geronimo.specs</groupId> + <artifactId>geronimo-servlet_2.5_spec</artifactId> + <version>1.2</version> <scope>provided</scope> </dependency> <dependency> Modified: trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2010-12-09 22:58:06 UTC (rev 3354) +++ trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2010-12-30 19:54:31 UTC (rev 3355) @@ -1,15 +1,18 @@ -<?xml version="1.0" encoding="UTF-8"?><project> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/maven-v4_0_0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <parent> - <artifactId>wayback</artifactId> - <groupId>org.archive</groupId> + <artifactId>parent</artifactId> + <groupId>org.archive.wayback</groupId> <version>1.6.0</version> </parent> - <modelVersion>4.0.0</modelVersion> - <groupId>org.archive.wayback</groupId> + <artifactId>wayback-hadoop</artifactId> <name>Wayback Hadoop Jar Packaging</name> - <version>1.6.0</version> - <url>http://maven.apache.org</url> <packaging>pom</packaging> <dependencies> Modified: trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2010-12-09 22:58:06 UTC (rev 3354) +++ trunk/archive-access/projects/wayback/wayback-hadoop-java/pom.xml 2010-12-30 19:54:31 UTC (rev 3355) @@ -1,16 +1,18 @@ -<?xml version="1.0" encoding="UTF-8"?> -<project> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/maven-v4_0_0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <parent> - <artifactId>wayback</artifactId> - <groupId>org.archive</groupId> + <artifactId>parent</artifactId> + <groupId>org.archive.wayback</groupId> <version>1.6.0</version> </parent> - <modelVersion>4.0.0</modelVersion> - <groupId>org.archive.wayback</groupId> + <artifactId>wayback-hadoop-java</artifactId> <name>Wayback Hadoop Java Code</name> - <version>1.6.0</version> - <url>http://maven.apache.org</url> <packaging>jar</packaging> <dependencies> @@ -40,6 +42,7 @@ <version>1.6.0</version> </dependency> </dependencies> + <!-- <build> <plugins> <plugin> @@ -52,4 +55,5 @@ </plugin> </plugins> </build> + --> </project> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-webapp/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2010-12-09 22:58:06 UTC (rev 3354) +++ trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2010-12-30 19:54:31 UTC (rev 3355) @@ -1,15 +1,18 @@ -<?xml version="1.0" encoding="UTF-8"?> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> - <artifactId>wayback</artifactId> - <groupId>org.archive</groupId> + <artifactId>parent</artifactId> + <groupId>org.archive.wayback</groupId> <version>1.6.0</version> </parent> - <modelVersion>4.0.0</modelVersion> - <groupId>org.archive.wayback</groupId> + <artifactId>wayback-webapp</artifactId> + <name>Wayback Web Application</name> 
<packaging>war</packaging> - <name>Wayback Web Application</name> <build> <finalName>wayback-${project.version}</finalName> @@ -26,7 +29,9 @@ <plugin> <groupId>org.mortbay.jetty</groupId> <artifactId>maven-jetty-plugin</artifactId> + <!-- <version>6.1.22</version> + --> </plugin> </plugins> </build> @@ -34,7 +39,7 @@ <dependencies> <dependency> - <groupId>org.archive.wayback</groupId> + <groupId>${project.groupId}</groupId> <artifactId>wayback-core</artifactId> <scope>compile</scope> </dependency> @@ -44,6 +49,7 @@ <version>5.5.15</version> <scope>provided</scope> </dependency> + <!-- <dependency> <groupId>javax.servlet</groupId> <artifactId>jstl</artifactId> @@ -55,6 +61,7 @@ <artifactId>standard</artifactId> <version>1.1.2</version> </dependency> + --> <dependency> <groupId>org.dspace</groupId> <artifactId>foresite</artifactId> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-12-09 22:58:12
|
Revision: 3354 http://archive-access.svn.sourceforge.net/archive-access/?rev=3354&view=rev Author: bradtofel Date: 2010-12-09 22:58:06 +0000 (Thu, 09 Dec 2010) Log Message: ----------- TWEAK: comment Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml 2010-12-02 05:18:47 UTC (rev 3353) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/LiveWeb.xml 2010-12-09 22:58:06 UTC (rev 3354) @@ -33,7 +33,7 @@ class="org.archive.wayback.liveweb.RemoteLiveWebCache"> <property name="proxyHostPort" value="localhost:8099" /> <!-- - If you've set up a local squid to cache requests to the above + If you've set up a local squid/varnish to cache requests to the above ARCRecordingProxy, you should use the port for that, instead of 8099: <property name="proxyHostPort" value="localhost:3128" /> --> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-12-02 05:18:53
|
Revision: 3353 http://archive-access.svn.sourceforge.net/archive-access/?rev=3353&view=rev Author: bradtofel Date: 2010-12-02 05:18:47 +0000 (Thu, 02 Dec 2010) Log Message: ----------- Upped maven-site-plugin to version 2.1 COMMENT + WHITESPACE Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/pom.xml Modified: trunk/archive-access/projects/wayback/dist/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/pom.xml 2010-12-02 05:13:40 UTC (rev 3352) +++ trunk/archive-access/projects/wayback/dist/pom.xml 2010-12-02 05:18:47 UTC (rev 3353) @@ -1,8 +1,8 @@ <?xml version="1.0"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <parent> - <groupId>org.archive</groupId> - <artifactId>wayback</artifactId> + <groupId>org.archive</groupId> + <artifactId>wayback</artifactId> <version>1.6.0</version> </parent> <modelVersion>4.0.0</modelVersion> @@ -49,6 +49,7 @@ <layout>default</layout> </repository> </repositories> + <dependencies> <dependency> <groupId>org.archive.wayback</groupId> @@ -56,6 +57,7 @@ <version>1.6.0</version> </dependency> </dependencies> + <build> <plugins> @@ -78,18 +80,17 @@ <goal>attached</goal> </goals> </execution> - </executions> + </executions> </plugin> -<plugin> - <artifactId>maven-site-plugin</artifactId> -<!-- <version>2.0-SNAPSHOT</version> --> - <configuration> -<!-- <locales>en</locales> --> - <inputencoding>utf-8</inputencoding> - <outputencoding>utf-8</outputencoding> - </configuration> -</plugin> + <plugin> + <artifactId>maven-site-plugin</artifactId> + <version>2.1</version> + <configuration> + <inputencoding>utf-8</inputencoding> + <outputencoding>utf-8</outputencoding> + </configuration> + </plugin> </plugins> </build> This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site. |
From: <bra...@us...> - 2010-12-02 05:13:46
|
Revision: 3352 http://archive-access.svn.sourceforge.net/archive-access/?rev=3352&view=rev Author: bradtofel Date: 2010-12-02 05:13:40 +0000 (Thu, 02 Dec 2010) Log Message: ----------- DOC tweaks Modified Paths: -------------- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/downloads.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/hadoop.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2010-12-02 05:12:57 UTC (rev 3351) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/administrator_manual.xml 2010-12-02 05:13:40 UTC (rev 3352) @@ -177,7 +177,7 @@ </li> <li> <b>livewebPrefix</b> - a String URL prefix indicating the host, - port, and path to the correct Replay AccessPoint. + port, and path to an AccessPoint configured with Live Web fetching. </li> <li><b>locale</b> - A specific Locale to use for all requests within this AccessPoint, overriding the users preferred Locale Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/downloads.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/downloads.xml 2010-12-02 05:12:57 UTC (rev 3351) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/downloads.xml 2010-12-02 05:13:40 UTC (rev 3352) @@ -11,7 +11,7 @@ <subsection name="Releases"> <p>All releases are available off the - <a href="http://sourceforge.net/project/showfiles.php?group_id=118427">Sourceforge Downloads</a> page. 
Release notes can be found here, + <a href="http://sourceforge.net/project/showfiles.php?group_id=118427">Sourceforge Downloads</a> page. Full <a href="release_notes.html">Release notes</a> are available for releases beyond 1.2.0. </p> </subsection> Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/hadoop.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/hadoop.xml 2010-12-02 05:12:57 UTC (rev 3351) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/hadoop.xml 2010-12-02 05:13:40 UTC (rev 3352) @@ -10,7 +10,7 @@ <body> <section name="Overview"> <p> - Wayback is distributed with an .jar file that + Wayback is distributed with a .jar file that simplifies creation of large-scale CDX files using hadoop. This code is experimental, and will primarily be useful only if your CDX files are very large - more than a few hundred GB (or more, depending on your Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml 2010-12-02 05:12:57 UTC (rev 3351) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/index.xml 2010-12-02 05:13:40 UTC (rev 3352) @@ -68,8 +68,8 @@ In the local, standalone mode, this software includes the capability to scan for new archived content in a specified location, and to automatically index and serve the new content as it appears. Directing - the Wayback to look for ARC files in the directory where an instance of - the Heritrix web crawler is writing ARC output should provide the + the Wayback to look for W/ARC files in the directory where an instance of + the Heritrix web crawler is writing W/ARC output should provide the capability to browse content archived by Heritrix as it is crawled. 
</p> </section> Modified: trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml =================================================================== --- trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml 2010-12-02 05:12:57 UTC (rev 3351) +++ trunk/archive-access/projects/wayback/dist/src/site/xdoc/release_notes.xml 2010-12-02 05:13:40 UTC (rev 3352) @@ -43,8 +43,8 @@ Improved hadoop CDX generation capabilities for large scale indexes. </li> <li> - SWF (Flash) rewriting, to contextualize URLs embedded within flash - content. + SWF (Flash) rewriting, to contextualize absolute URLs embedded + within flash content. </li> <li> ArchivalUrl mode now accepts identity ("id_") flag to indicate This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-12-02 05:13:03
|
Revision: 3351 http://archive-access.svn.sourceforge.net/archive-access/?rev=3351&view=rev Author: bradtofel Date: 2010-12-02 05:12:57 +0000 (Thu, 02 Dec 2010) Log Message: ----------- WHITESPACE + COMMENT Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/pom.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2010-12-02 05:06:57 UTC (rev 3350) +++ trunk/archive-access/projects/wayback/wayback-webapp/pom.xml 2010-12-02 05:12:57 UTC (rev 3351) @@ -10,6 +10,7 @@ <artifactId>wayback-webapp</artifactId> <packaging>war</packaging> <name>Wayback Web Application</name> + <build> <finalName>wayback-${project.version}</finalName> <plugins> @@ -29,19 +30,13 @@ </plugin> </plugins> </build> + <dependencies> + <dependency> <groupId>org.archive.wayback</groupId> <artifactId>wayback-core</artifactId> - <scope>compile</scope> - <!-- - <exclusions> - <exclusion> - <groupId>log4j</groupId> - <artifactId>log4j</artifactId> - </exclusion> - </exclusions> - --> + <scope>compile</scope> </dependency> <dependency> <groupId>tomcat</groupId> @@ -50,21 +45,22 @@ <scope>provided</scope> </dependency> <dependency> - <groupId>javax.servlet</groupId> - <artifactId>jstl</artifactId> - <version>1.0</version> - <scope>compile</scope> + <groupId>javax.servlet</groupId> + <artifactId>jstl</artifactId> + <version>1.0</version> + <scope>compile</scope> </dependency> <dependency> - <groupId>taglibs</groupId> - <artifactId>standard</artifactId> - <version>1.1.2</version> + <groupId>taglibs</groupId> + <artifactId>standard</artifactId> + <version>1.1.2</version> </dependency> <dependency> - <groupId>org.dspace</groupId> - <artifactId>foresite</artifactId> - <version>SNAPSHOT</version> + <groupId>org.dspace</groupId> + <artifactId>foresite</artifactId> + <version>SNAPSHOT</version> </dependency> + </dependencies> 
-</project> +</project> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-12-02 05:07:05
|
Revision: 3350 http://archive-access.svn.sourceforge.net/archive-access/?rev=3350&view=rev Author: bradtofel Date: 2010-12-02 05:06:57 +0000 (Thu, 02 Dec 2010) Log Message: ----------- WHITESPACE + COMMENT Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/pom.xml Modified: trunk/archive-access/projects/wayback/wayback-core/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-12-02 05:03:46 UTC (rev 3349) +++ trunk/archive-access/projects/wayback/wayback-core/pom.xml 2010-12-02 05:06:57 UTC (rev 3350) @@ -15,14 +15,15 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>org.archive</groupId> - <artifactId>wayback</artifactId> - <version>1.6.0</version> + <groupId>org.archive</groupId> + <artifactId>wayback</artifactId> + <version>1.6.0</version> </parent> <groupId>org.archive.wayback</groupId> <artifactId>wayback-core</artifactId> <name>Wayback Core Classes</name> <packaging>jar</packaging> + <build> <plugins> <plugin> @@ -36,16 +37,17 @@ <plugin> <artifactId>maven-jar-plugin</artifactId> <configuration> - <archive> - <manifest> - <mainClass>org.archive.wayback.Wayback</mainClass> - <addClasspath>true</addClasspath> - </manifest> - </archive> + <archive> + <manifest> + <mainClass>org.archive.wayback.Wayback</mainClass> + <addClasspath>true</addClasspath> + </manifest> + </archive> </configuration> </plugin> </plugins> </build> + <dependencies> <dependency> @@ -72,7 +74,7 @@ <groupId>org.archive.heritrix</groupId> <artifactId>commons</artifactId> </exclusion> - </exclusions> + </exclusions> </dependency> <dependency> <groupId>org.mozilla</groupId> @@ -95,24 +97,16 @@ <version>2.0b4</version> </dependency> <dependency> - 
<groupId>org.htmlparser</groupId> - <artifactId>htmlparser</artifactId> - <version>1.6</version> + <groupId>org.htmlparser</groupId> + <artifactId>htmlparser</artifactId> + <version>1.6</version> </dependency> <dependency> - <groupId>com.flagstone</groupId> - <artifactId>transform</artifactId> - <version>3.0.1-SNAPSHOT</version> + <groupId>com.flagstone</groupId> + <artifactId>transform</artifactId> + <version>3.0.1-SNAPSHOT</version> </dependency> - - <!-- - Doh... I'm not sure what package is configuring org.apache.commons-logging - to use log4j, but it's breaking some command line tools. - <dependency> - <groupId>log4j</groupId> - <artifactId>log4j</artifactId> - <version>1.2.14</version> - </dependency> - --> + </dependencies> + </project> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-12-02 05:03:52
|
Revision: 3349 http://archive-access.svn.sourceforge.net/archive-access/?rev=3349&view=rev Author: bradtofel Date: 2010-12-02 05:03:46 +0000 (Thu, 02 Dec 2010) Log Message: ----------- WHITESPACE Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml Modified: trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2010-11-28 06:34:54 UTC (rev 3348) +++ trunk/archive-access/projects/wayback/wayback-hadoop/pom.xml 2010-12-02 05:03:46 UTC (rev 3349) @@ -11,6 +11,7 @@ <version>1.6.0</version> <url>http://maven.apache.org</url> <packaging>pom</packaging> + <dependencies> <dependency> <groupId>junit</groupId> @@ -25,6 +26,7 @@ <scope>compile</scope> </dependency> </dependencies> + <build> <plugins> <plugin> @@ -36,7 +38,7 @@ </descriptorRefs> <finalName>wayback-hadoop</finalName> <archive> - <manifestFile>src/main/archive/MANIFEST.MF</manifestFile> + <manifestFile>src/main/archive/MANIFEST.MF</manifestFile> </archive> </configuration> <executions> @@ -50,4 +52,5 @@ </plugin> </plugins> </build> + </project> \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-11-28 06:35:01
|
Revision: 3348 http://archive-access.svn.sourceforge.net/archive-access/?rev=3348&view=rev Author: bradtofel Date: 2010-11-28 06:34:54 +0000 (Sun, 28 Nov 2010) Log Message: ----------- TWEAK: removed duplicate entry Modified Paths: -------------- trunk/archive-access/projects/wayback/.classpath Modified: trunk/archive-access/projects/wayback/.classpath =================================================================== --- trunk/archive-access/projects/wayback/.classpath 2010-11-28 06:08:47 UTC (rev 3347) +++ trunk/archive-access/projects/wayback/.classpath 2010-11-28 06:34:54 UTC (rev 3348) @@ -2,12 +2,11 @@ <classpath> <classpathentry kind="src" output="wayback-core/target/classes" path="wayback-core/src/main/java"/> <classpathentry kind="src" output="wayback-core/target/test-classes" path="wayback-core/src/test/java"/> - <classpathentry kind="src" output="wayback-mapreduce-prereq/target/classes" path="wayback-mapreduce-prereq/src/main/java"/> + <classpathentry kind="src" output="wayback-hadoop-java/target/classes" path="wayback-hadoop-java/src/main/java"/> <classpathentry kind="src" output="wayback-webapp/target/classes" path="wayback-webapp/src/main/java"/> <classpathentry kind="src" output="wayback-webapp/target/test-classes" path="wayback-webapp/src/test/java"/> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="con" path="org.eclipse.jst.server.core.container/org.eclipse.jst.server.tomcat.runtimeTarget/Apache Tomcat v5.5"/> - <classpathentry kind="con" path="org.eclipse.jst.j2ee.internal.module.container"/> <classpathentry kind="con" path="org.maven.ide.eclipse.MAVEN2_CLASSPATH_CONTAINER"> <attributes> <attribute name="org.eclipse.jst.component.dependency" value="/WEB-INF/lib"/> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3347 http://archive-access.svn.sourceforge.net/archive-access/?rev=3347&view=rev Author: bradtofel Date: 2010-11-28 06:08:47 +0000 (Sun, 28 Nov 2010) Log Message: ----------- TWEAK: now checks for aggregationPrefix first, then Prefix when determining redirect URL. It's a hack, but should work fine until the whole prefix problem is cleared up. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java 2010-11-23 01:22:45 UTC (rev 3346) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/TimeBundleRequestParser.java 2010-11-28 06:08:47 UTC (rev 3347) @@ -72,7 +72,12 @@ wbRequest.setCaptureQueryRequest(); wbRequest.setRequestUrl(urlStr); - String uriPrefix = accessPoint.getConfigs().getProperty("Prefix"); + String uriPrefix = accessPoint.getConfigs().getProperty("aggregationPrefix"); + if(uriPrefix == null) { + // TODO: this is a hack... need to clean up the whole prefix + // configuration setup... + uriPrefix = accessPoint.getConfigs().getProperty("Prefix"); + } String betterUrl = uriPrefix + "timemap/rdf/" + urlStr; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3346 http://archive-access.svn.sourceforge.net/archive-access/?rev=3346&view=rev Author: binzino Date: 2010-11-23 01:22:45 +0000 (Tue, 23 Nov 2010) Log Message: ----------- Add params to pdftotext to inhibit error messages and to omit page breaks. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2010-11-23 00:31:27 UTC (rev 3345) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2010-11-23 01:22:45 UTC (rev 3346) @@ -80,8 +80,10 @@ fos.write( raw ); fos.close(); + String exepath = this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" ); + // Now create a Process to call 'pdftotext' to extract the metadata. - ProcessBuilder pb = new ProcessBuilder( this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" ), "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); + ProcessBuilder pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); Process p = pb.start(); @@ -96,7 +98,7 @@ p.destroy( ); - pb = new ProcessBuilder( "/usr/bin/pdftotext", tmpfile.toString(), "-" ); + pb = new ProcessBuilder( exepath, "-q", "-nopgbrk", tmpfile.toString(), "-" ); p = pb.start( ); p.getOutputStream( ).close( ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3345 http://archive-access.svn.sourceforge.net/archive-access/?rev=3345&view=rev Author: binzino Date: 2010-11-23 00:31:27 +0000 (Tue, 23 Nov 2010) Log Message: ----------- Remove bogus debug message. Add config for path to pdftotext executable. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2010-11-22 22:44:48 UTC (rev 3344) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2010-11-23 00:31:27 UTC (rev 3345) @@ -64,8 +64,6 @@ public ParseResult getParse( Content content ) { - System.out.println( "PDFParser" ); - Metadata metadata = new Metadata(); String title = ""; String text = ""; @@ -83,7 +81,7 @@ fos.close(); // Now create a Process to call 'pdftotext' to extract the metadata. - ProcessBuilder pb = new ProcessBuilder( "/usr/bin/pdftotext", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); + ProcessBuilder pb = new ProcessBuilder( this.conf.get( "org.archive.nutchwax.parse.pdf.pdftotext.path", "/usr/bin/pdftotext" ), "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); Process p = pb.start(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-11-22 22:44:55
|
Revision: 3344 http://archive-access.svn.sourceforge.net/archive-access/?rev=3344&view=rev Author: binzino Date: 2010-11-22 22:44:48 +0000 (Mon, 22 Nov 2010) Log Message: ----------- Add PDF parser that uses external 'pdftotext' tool. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. 
--> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(pdf2|tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> <!-- Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -31,7 +31,7 @@ </mimeType> <mimeType name="application/pdf"> - <plugin id="parse-tika" /> + <plugin id="parse-pdf2" /> </mimeType> <mimeType name="application/vnd.ms-excel"> @@ -152,6 +152,7 @@ <alias name="parse-ext" extension-id="ExtParser" /> <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" /> <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> + <alias name="parse-pdf2" extension-id="org.archive.nutchwax.parse.pdf.PDFParser" /> <!-- <alias name="parse-js" extension-id="JSParser" /> <alias name="parse-msexceld" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" /> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -91,6 +91,7 @@ <ant dir="query-nutchwax" target="deploy" /> <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> + <ant dir="parse-pdf2" target="deploy" /> 
</target> @@ -202,5 +203,6 @@ <ant dir="query-nutchwax" target="clean" /> <ant dir="scoring-nutchwax" target="clean" /> <ant dir="urlfilter-nutchwax" target="clean" /> + <ant dir="parse-pdf2" target="clean" /> </target> </project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-pdf2" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,49 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Copyright (C) 2010 Internet Archive. 
+ + This file is part of the archive-access tools project + (http://sourceforge.net/projects/archive-access). + + The archive-access tools are free software; you can redistribute them and/or + modify them under the terms of the GNU Lesser Public License as published by + the Free Software Foundation; either version 2.1 of the License, or any + later version. + + The archive-access tools are distributed in the hope that they will be + useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + Public License for more details. + + You should have received a copy of the GNU Lesser Public License along with + the archive-access tools; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +--> +<plugin + id="parse-pdf2" + name="External PDF Parser" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="parse-pdf2.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.parse.pdf" + name="NutchWAX External PDF Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.archive.nutchwax.parse.pdf.PDFParser" + class="org.archive.nutchwax.parse.pdf.PDFParser"> + <parameter name="contentType" value="application/pdf" /> + <parameter name="pathSuffix" value="" /> + </implementation> + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,180 
@@ +/* + * Copyright (C) 2010 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.parse.pdf; + +import java.io.*; +import java.util.*; +import java.util.regex.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; + +import org.apache.nutch.protocol.Content; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.util.LogUtil; + + +/** + * + */ +public class PDFParser implements Parser +{ + public static final Log LOG = LogFactory.getLog( PDFParser.class ); + + private Configuration conf; + + public void setConf( Configuration conf ) + { + this.conf = conf; + } + + public Configuration getConf( ) 
+ { + return this.conf; + } + + public ParseResult getParse( Content content ) + { + System.out.println( "PDFParser" ); + + Metadata metadata = new Metadata(); + String title = ""; + String text = ""; + + byte[] raw = content.getContent( ); + + File tmpfile = null; + try + { + tmpfile = File.createTempFile( "pdf2-", ".pdf" ); + + // Write the PDF document to the tmp file. + FileOutputStream fos = new FileOutputStream( tmpfile ); + fos.write( raw ); + fos.close(); + + // Now create a Process to call 'pdftotext' to extract the metadata. + ProcessBuilder pb = new ProcessBuilder( "/usr/bin/pdftotext", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); + + Process p = pb.start(); + + p.getOutputStream( ).close(); + String head = suck( new InputStreamReader( p.getInputStream( ) ) ); + byte[] err = suck( p.getErrorStream( ) ); + + if ( err.length > 0 ) + { + LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + } + + p.destroy( ); + + pb = new ProcessBuilder( "/usr/bin/pdftotext", tmpfile.toString(), "-" ); + p = pb.start( ); + + p.getOutputStream( ).close( ); + text = suck( new InputStreamReader( p.getInputStream( ) ) ); + err = suck( p.getErrorStream( ) ); + + if ( err.length > 0 ) + { + LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + } + + p.destroy( ); + + Matcher m = Pattern.compile( "<html>.*?<title>(.*?)</title>.*?</head>", Pattern.DOTALL ).matcher( head ); + if ( m.find( ) ) + { + title = m.group(1); + } + + //System.out.println( "head = " + head ); + //System.out.println( "title = " + title ); + + // No outlinks. + Outlink[] outlinks = new Outlink[0]; + + ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, + title, + outlinks, + content.getMetadata(), + metadata ); + + return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) ); + } + catch ( Exception e ) + { + LOG.error( e ); + } + finally + { + if ( tmpfile != null ) + { + tmpfile.delete(); + } + } + + // TODO! 
+ return null; + } + + private byte[] suck( InputStream is ) + throws IOException + { + ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 ); + byte[] buf = new byte[1024*4]; + int c = -1; + while ( (c = is.read( buf )) != -1 ) + { + baos.write( buf, 0, c ); + } + + return baos.toByteArray(); + } + + private String suck( InputStreamReader reader ) + throws IOException + { + StringBuilder sb = new StringBuilder( 1024 * 4 ); + char[] buf = new char[1024*4]; + int c = -1; + + while ( (c = reader.read( buf )) != -1 ) + { + sb.append( buf, 0, c ); + } + + return sb.toString(); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-11-19 02:52:03
|
Revision: 3343 http://archive-access.svn.sourceforge.net/archive-access/?rev=3343&view=rev Author: bradtofel Date: 2010-11-19 02:51:57 +0000 (Fri, 19 Nov 2010) Log Message: ----------- TWEAK: closing GetMethod's inputstream after using, moved, GetMethod.releaseConnections() to finally{} block. All attempts to make sure we don't leave filehandles open Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2010-11-19 02:47:32 UTC (rev 3342) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2010-11-19 02:51:57 UTC (rev 3343) @@ -132,8 +132,9 @@ getMethod.setRequestHeader("User-Agent", userAgent); int code = client.executeMethod(getMethod); LOGGER.info("URL(" + url + ") HTTP:" + code); - ByteOp.discardStream(getMethod.getResponseBodyAsStream()); - getMethod.releaseConnection(); + InputStream responseIS = getMethod.getResponseBodyAsStream(); + ByteOp.discardStream(responseIS); + responseIS.close(); gotUrl = true; } catch (URIException e) { @@ -156,6 +157,9 @@ } finally { recorder.closeRecorders(); Recorder.setHttpRecorder(null); + if(getMethod != null) { + getMethod.releaseConnection(); + } } // now write the content, or a fake record: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3342 http://archive-access.svn.sourceforge.net/archive-access/?rev=3342&view=rev Author: bradtofel Date: 2010-11-19 02:47:32 +0000 (Fri, 19 Nov 2010) Log Message: ----------- BUGFIX: ensuring empty strings get translated into '-' in output CDX records Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java 2010-11-19 02:41:25 UTC (rev 3341) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/CDXFormat.java 2010-11-19 02:47:32 UTC (rev 3342) @@ -86,7 +86,7 @@ protected CDXField[] fields = null; protected char delimiter = ' '; protected String delimiterS = null; - + public static String CDX_MAGIC = " CDX"; public static char URL_KEY = 'A'; @@ -185,7 +185,12 @@ public String serializeResult(CaptureSearchResult result) { StringBuilder sb = new StringBuilder(100); for(int i = 0; i < fields.length; i++) { - sb.append(fields[i].serialize(result)); + String value = fields[i].serialize(result); + if((value == null) || (value.length() == 0)) { + sb.append(CDXField.DEFAULT_VALUE); + } else { + sb.append(value); + } if(i < fields.length - 1) { sb.append(delimiter); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3341 http://archive-access.svn.sourceforge.net/archive-access/?rev=3341&view=rev Author: bradtofel Date: 2010-11-19 02:41:25 +0000 (Fri, 19 Nov 2010) Log Message: ----------- BUGFIX(unreported) empty string Content-Type header was causing empty string value in CDX Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java 2010-11-19 02:40:22 UTC (rev 3340) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/cdx/format/MIMETypeCDXField.java 2010-11-19 02:41:25 UTC (rev 3341) @@ -30,6 +30,6 @@ public String serialize(CaptureSearchResult result) { String r = result.getMimeType(); - return r == null ? DEFAULT_VALUE : r; + return (r == null) || (r.length() == 0) ? DEFAULT_VALUE : r; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3340 http://archive-access.svn.sourceforge.net/archive-access/?rev=3340&view=rev Author: bradtofel Date: 2010-11-19 02:40:22 +0000 (Fri, 19 Nov 2010) Log Message: ----------- TWEAK: preparation, still in comments for adding "sha1:" prefix to ARC record digests. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2010-11-19 02:38:01 UTC (rev 3339) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2010-11-19 02:40:22 UTC (rev 3340) @@ -80,6 +80,7 @@ result.setHttpCode("-"); result.setRedirectUrl("-"); +// result.setDigest("sha1:"+rec.getDigestStr()); result.setDigest(rec.getDigestStr()); result.setCaptureTimestamp(meta.getDate()); String uriStr = meta.getUrl(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3339 http://archive-access.svn.sourceforge.net/archive-access/?rev=3339&view=rev Author: bradtofel Date: 2010-11-19 02:38:01 +0000 (Fri, 19 Nov 2010) Log Message: ----------- BUGFIX(unreported) produced unneeded warning and no URL for warcinfo record, now includes filedesc:// scheme with filename Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2010-11-19 02:36:02 UTC (rev 3338) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2010-11-19 02:38:01 UTC (rev 3339) @@ -83,10 +83,10 @@ ArchiveRecordHeader header = rec.getHeader(); String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); - if(type.equals(WARCConstants.WARCINFO)) { - LOGGER.info("Skipping record type : " + type); - return null; - } +// if(type.equals(WARCConstants.WARCINFO)) { +// LOGGER.info("Skipping record type : " + type); +// return null; +// } CaptureSearchResult result = genericResult(rec); @@ -121,6 +121,10 @@ } else { result = null; } + } else if(type.equals(WARCConstants.WARCINFO)) { + + result.setMimeType("warc/warcinfo"); + } else { LOGGER.info("Skipping record type : " + type); } @@ -156,14 +160,29 @@ WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); String origUrl = header.getUrl(); - result.setOriginalUrl(origUrl); - try { - String urlKey = canonicalizer.urlStringToKey(origUrl); - result.setUrlKey(urlKey); - } catch (URIException e) { - LOGGER.warning("FAILED canonicalize(" + origUrl 
+ "):" + - file + " " + offset); - result.setUrlKey(origUrl); + if(origUrl == null) { + String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); + if(type.equals(WARCConstants.WARCINFO)) { + String filename = header.getHeaderValue( + WARCConstants.HEADER_KEY_FILENAME).toString(); + result.setOriginalUrl("filedesc:"+filename); + result.setUrlKey("filedesc:"+filename); + } else { + result.setOriginalUrl(DEFAULT_VALUE); + result.setUrlKey(DEFAULT_VALUE); + } + + + } else { + result.setOriginalUrl(origUrl); + try { + String urlKey = canonicalizer.urlStringToKey(origUrl); + result.setUrlKey(urlKey); + } catch (URIException e) { + LOGGER.warning("FAILED canonicalize(" + origUrl + "):" + + file + " " + offset); + result.setUrlKey(origUrl); + } } return result; } @@ -204,6 +223,7 @@ return orig.substring(5); } return orig; +// return (o == null) ? DEFAULT_VALUE : o.toString(); } /* This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3338 http://archive-access.svn.sourceforge.net/archive-access/?rev=3338&view=rev Author: bradtofel Date: 2010-11-19 02:36:02 +0000 (Fri, 19 Nov 2010) Log Message: ----------- BUGFIX(unreported) zero arguments now shows usage Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2010-11-16 23:17:48 UTC (rev 3337) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2010-11-19 02:36:02 UTC (rev 3338) @@ -160,6 +160,9 @@ boolean setFormat = false; boolean isIdentity = false; String path = null; + if(args.length == 0) { + USAGE(); + } for(int idx = 0; idx < args.length; idx++) { if(args[idx].equals("-identity")) { canonicalizer = new IdentityUrlCanonicalizer(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 3337 http://archive-access.svn.sourceforge.net/archive-access/?rev=3337&view=rev Author: binzino Date: 2010-11-16 23:17:48 +0000 (Tue, 16 Nov 2010) Log Message: ----------- Added config property to control size of body to be indexed. Default 100k. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2010-11-16 23:17:04 UTC (rev 3336) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2010-11-16 23:17:48 UTC (rev 3337) @@ -47,6 +47,7 @@ private List<FieldSpecification> fieldSpecs; private int MAX_TITLE_LENGTH; + private int MAX_BODY_LENGTH; private TypeNormalizer typenormalizer; private TypeFilter typefilter; private URLFilter urlfilter; @@ -56,8 +57,8 @@ this.conf = conf; this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); + this.MAX_BODY_LENGTH = conf.getInt("indexer.max.body.length", (100 * 1024)); - // this.allowedTypes = new HashSet<String>( conf.get( "indexer.mimetypes.allowed", "" ).split( "\\s+" ) ); this.typenormalizer = new TypeNormalizer( ); this.typenormalizer.setAliases( typenormalizer.getDefaultAliases( ) ); @@ -185,6 +186,11 @@ else if ( "content".equals( spec.srcKey ) ) { value = parse.getText( ); + + if ( value != null && value.length() > MAX_BODY_LENGTH ) + { + value = value.substring( 0, MAX_BODY_LENGTH ); + } } else if ( "title".equals( spec.srcKey ) ) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-11-16 23:17:11
|
Revision: 3336 http://archive-access.svn.sourceforge.net/archive-access/?rev=3336&view=rev Author: binzino Date: 2010-11-16 23:17:04 +0000 (Tue, 16 Nov 2010) Log Message: ----------- Use Nutch parsers for text and html, Tika for the rest. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-16 23:16:35 UTC (rev 3335) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-16 23:17:04 UTC (rev 3336) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-tika|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> <!-- Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-16 23:16:35 UTC (rev 3335) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-16 23:17:04 UTC (rev 3336) @@ -107,15 +107,15 @@ </mimeType> <mimeType name="text/html"> - <plugin id="parse-tika" /> + <plugin id="parse-html" /> </mimeType> <mimeType name="application/xhtml+xml"> - <plugin id="parse-tika" /> + <plugin id="parse-html" /> </mimeType> <mimeType name="text/plain"> - <plugin id="parse-tika" /> + <plugin id="parse-text" /> </mimeType> <mimeType name="text/richtext"> @@ 
-150,8 +150,9 @@ <aliases> <alias name="parse-tika" extension-id="org.apache.nutch.parse.tika.Parser" /> <alias name="parse-ext" extension-id="ExtParser" /> + <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" /> + <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> <!-- - <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> <alias name="parse-js" extension-id="JSParser" /> <alias name="parse-msexceld" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" /> <alias name="parse-mspowerpoint" extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" /> @@ -161,7 +162,6 @@ <alias name="parse-rss" extension-id="org.apache.nutch.parse.rss.RSSParser" /> <alias name="feed" extension-id="org.apache.nutch.parse.feed.FeedParser" /> <alias name="parse-swf" extension-id="org.apache.nutch.parse.swf.SWFParser" /> - <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" /> <alias name="parse-zip" extension-id="org.apache.nutch.parse.zip.ZipParser" /> --> </aliases> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-11-16 23:16:42
|
Revision: 3335 http://archive-access.svn.sourceforge.net/archive-access/?rev=3335&view=rev Author: binzino Date: 2010-11-16 23:16:35 +0000 (Tue, 16 Nov 2010) Log Message: ----------- Added config controls to trim input docs for text/plain and text/html to avoid performance problems with large (50+MB) input docs. Also added try/catch around boilerpipe. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-15 20:32:34 UTC (rev 3334) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-16 23:16:35 UTC (rev 3335) @@ -16,11 +16,10 @@ */ package org.archive.nutchwax; -import java.io.IOException; -import java.net.MalformedURLException; +import java.io.*; +import java.net.*; import java.util.Map.Entry; -import java.util.List; -import java.util.ArrayList; +import java.util.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -359,20 +358,43 @@ "application/xhtml+xml".equals( content.getContentType( ) ) || "application/xhtml" .equals( content.getContentType( ) ) ) { - long size = jobConf.getLong( "nutchwax.import.content.limit.html", -1 ); + int size = jobConf.getInt( "nutchwax.import.content.limit.html", -1 ); if ( size > 0 && size < length ) { - LOG.warn( "HTML file size exceeds threshold [" + size + "], skipping: " + meta.getUrl( ) + " [" + length + "]" ); - return false; + LOG.warn( "HTML file size exceeds threshold [" + size + "]: " + meta.getUrl( ) + " [" + length + "]" ); + + bytes = Arrays.copyOf( bytes, size ); + + content.setContent( bytes ); } - if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) ) + try { - // BoilerPipe! 
- contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) ); + if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) ) + { + // BoilerPipe! + contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) ); + } } + catch ( Exception e ) + { + LOG.warn( "Error boilerpiping: " + meta.getUrl( ) ); + } } + if ( "text/plain".equals( content.getContentType( ) ) ) + { + int size = jobConf.getInt( "nutchwax.import.content.limit.text", -1 ); + if ( size > 0 && size < length ) + { + LOG.warn( "Text file size exceeds threshold [" + size + "]: " + meta.getUrl( ) + " [" + length + "]" ); + + bytes = Arrays.copyOf( bytes, size ); + + content.setContent( bytes ); + } + } + output( output, new Text( key ), content ); return true; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-11-15 20:32:40
|
Revision: 3334 http://archive-access.svn.sourceforge.net/archive-access/?rev=3334&view=rev Author: binzino Date: 2010-11-15 20:32:34 +0000 (Mon, 15 Nov 2010) Log Message: ----------- Added nutchwax.import.content.limit.html property. If html file is larger than this value, it is skipped. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-12 23:54:34 UTC (rev 3333) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-15 20:32:34 UTC (rev 3334) @@ -247,7 +247,7 @@ { ARCRecordMetaData meta = record.getMetaData(); - if ( LOG.isDebugEnabled() ) LOG.debug( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); + if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) ) { @@ -266,7 +266,8 @@ // We use record.available() rather than meta.getLength() // because the latter includes the size of the HTTP header, // which we just skipped. - byte[] bytes = readBytes( record, record.available( ) ); + long length = record.available(); + byte[] bytes = readBytes( record, length ); // If there is no digest, then we assume we're reading an // ARCRecord not a WARCRecord. 
In that case, we close the @@ -358,6 +359,13 @@ "application/xhtml+xml".equals( content.getContentType( ) ) || "application/xhtml" .equals( content.getContentType( ) ) ) { + long size = jobConf.getLong( "nutchwax.import.content.limit.html", -1 ); + if ( size > 0 && size < length ) + { + LOG.warn( "HTML file size exceeds threshold [" + size + "], skipping: " + meta.getUrl( ) + " [" + length + "]" ); + return false; + } + if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) ) { // BoilerPipe! @@ -365,7 +373,7 @@ } } - output( output, new Text( key ), content ); + output( output, new Text( key ), content ); return true; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2010-11-12 23:54:43
|
Revision: 3333 http://archive-access.svn.sourceforge.net/archive-access/?rev=3333&view=rev Author: binzino Date: 2010-11-12 23:54:34 +0000 (Fri, 12 Nov 2010) Log Message: ----------- Added config property to enable/disable BoilerPipe. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-11 05:49:07 UTC (rev 3332) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-11-12 23:54:34 UTC (rev 3333) @@ -354,15 +354,16 @@ contentMetadata.set( NutchWax.ORIGINAL_TYPE_KEY, meta.getMimetype( ) ); contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, content.getContentType( ) ); - // BoilerPipe! - /* - if ( "text/html".equals( content.getContentType( ) ) ) + if ( "text/html" .equals( content.getContentType( ) ) || + "application/xhtml+xml".equals( content.getContentType( ) ) || + "application/xhtml" .equals( content.getContentType( ) ) ) { - String boiledHTML = de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ); - - contentMetadata.set( "boiledHTML", boiledHTML ); + if ( jobConf.getBoolean( "nutchwax.import.boilerpipe", false ) ) + { + // BoilerPipe! + contentMetadata.set( "boiled", de.l3s.boilerpipe.extractors.DefaultExtractor.INSTANCE.getText( new org.xml.sax.InputSource( new java.io.ByteArrayInputStream( bytes ) ) ) ); + } } - */ output( output, new Text( key ), content ); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-11-11 05:49:13
|
Revision: 3332 http://archive-access.svn.sourceforge.net/archive-access/?rev=3332&view=rev Author: bradtofel Date: 2010-11-11 05:49:07 +0000 (Thu, 11 Nov 2010) Log Message: ----------- TWEAK: changed rel Link headers per fixes for RFC, removed TCN headers Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/MementoValidity.jsp Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java 2010-11-11 05:47:25 UTC (rev 3331) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/memento/MementoReplayRendererDecorator.java 2010-11-11 05:49:07 UTC (rev 3332) @@ -87,7 +87,7 @@ String timemap = " , <" + apProps.getProperty("aggregationPrefix") + "timemap/link/" + u - + ">;rel=\"timemap\"; type=\"text/csv\""; + + ">;rel=\"timemap\"; type=\"application/link-format\""; String timegate = ",<" + uriPrefix + "timegate/" + u + ">;rel=\"timegate\""; @@ -101,7 +101,7 @@ httpformatterl.format(result.getCaptureDate())); String memento = ",<" + uriPrefix + formatterk.format(closestDate) - + "/" + u + ">;rel=\"memento\";datetime=\"" + + "/" + u + ">;rel=\"memento\"; datetime=\"" + httpformatterl.format(closestDate) + "\""; String mfl = null; if ((closestDate.equals(f)) && closestDate.equals(l)) { @@ -110,31 +110,31 @@ + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento memento last-memento\"; 
datetime=\"" + + ">;rel=\"first last memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } else if (closestDate.equals(f)) { mfl = ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; } else if (closestDate.equals(l)) { mfl = ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } else { mfl = memento; mfl = mfl + ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } @@ -184,12 +184,12 @@ + formatterk.format(closestleft.getCaptureDate()) + "/" + u - + ">;rel=\"prev-memento\"; datetime=\"" + + ">;rel=\"prev memento\"; datetime=\"" + httpformatterl.format(closestleft .getCaptureDate()) + "\""); } else { - int m_index = sb.lastIndexOf("\"first-memento\""); - sb.insert(m_index + 1, "prev-memento "); + int m_index = sb.lastIndexOf("\"first memento\""); + sb.insert(m_index + 1, "prev "); } } @@ -200,12 +200,12 @@ + formatterk.format(closestright.getCaptureDate()) + "/" + u - + ">;rel=\"next-memento\"; datetime=\"" + + ">;rel=\"next memento\"; datetime=\"" + httpformatterl.format(closestright .getCaptureDate()) + "\""); } else { - int m_index = 
sb.lastIndexOf("\"last-memento\""); - sb.insert(m_index + 1, "next-memento "); + int m_index = sb.lastIndexOf("\"last memento\""); + sb.insert(m_index + 1, "next "); } Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp 2010-11-11 05:47:25 UTC (rev 3331) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/Memento.jsp 2010-11-11 05:49:07 UTC (rev 3332) @@ -51,7 +51,7 @@ String timemap = " , <" + results.getContextConfig("aggregationPrefix") + "timemap/link/" + u - + ">;rel=\"timemap\"; type=\"text/csv\""; + + ">;rel=\"timemap\"; type=\"application/link-format\""; String origlink = ", <" + u + ">;rel=\"original\""; String uriPrefix = wbRequest.getAccessPoint().getReplayPrefix(); @@ -74,31 +74,31 @@ + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento memento last-memento\"; datetime=\"" + + ">;rel=\"first last memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } else if (closestDate.equals(f)) { mfl = ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; } else if (closestDate.equals(l)) { mfl = ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } else { mfl = memento; mfl = mfl + ", <" + 
uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } @@ -116,18 +116,18 @@ + formatterk.format(f) + "/" + u - + ">;rel=\"last-memento first-memento\"; datetime=\"" + + ">;rel=\"last first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } else { fl = ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; fl = fl + ", <" + uriPrefix + formatterk.format(f) + "/" - + u + ">;rel=\"first-memento\"; datetime=\"" + + u + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } - response.setHeader("TCN", "list"); + response.setStatus(400); response.setHeader("Link", "<" + agguri + ">;rel=\"timebundle\"" + origlink + fl + timemap); @@ -203,12 +203,12 @@ + formatterk.format(closestleft.getCaptureDate()) + "/" + u - + ">;rel=\"prev-memento\"; datetime=\"" + + ">;rel=\"prev memento\"; datetime=\"" + httpformatterl.format(closestleft .getCaptureDate()) + "\""); } else { - int m_index = sb.lastIndexOf("\"first-memento\""); - sb.insert(m_index + 1, "prev-memento "); + int m_index = sb.lastIndexOf("\"first memento\""); + sb.insert(m_index + 1, "prev "); } } if (closestright != null) { @@ -218,12 +218,12 @@ + formatterk.format(closestright.getCaptureDate()) + "/" + u - + ">;rel=\"next-memento\"; datetime=\"" + + ">;rel=\"next \"; datetime=\"" + httpformatterl.format(closestright .getCaptureDate()) + "\""); } else { - int m_index = sb.lastIndexOf("\"last-memento\""); - sb.insert(m_index + 1, "next-memento "); + int m_index = sb.lastIndexOf("\"last memento\""); + sb.insert(m_index + 1, "next "); } } @@ -231,7 +231,6 @@ response.setHeader("Link", "<" + 
agguri + ">;rel=\"timebundle\"" + origlink + sb.toString() + timemap); //added timemap - response.setHeader("TCN", "choice"); response.setHeader("Location", replayUrl); response.sendError(302, "Found"); %> \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-11-11 05:47:25 UTC (rev 3331) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/query/ORE.jsp 2010-11-11 05:49:07 UTC (rev 3332) @@ -108,7 +108,7 @@ linkbf.append(",<" + replayPrefix + "timegate/" + u + ">;rel=\"timegate\"\n"); linkbf.append(",<" + queryPrefix + "timemap/" + format + "/" + u - + ">;rel=\"timemap\";type=\"text/csv\"\n"); + + ">;rel=\"timemap\";type=\"application/link-format\"\n"); String firstmemento = null; int count = 0; @@ -142,7 +142,7 @@ if (startdate == null) { if (firstmemento == null) { linkbf.append(",<" + resurl - + ">;rel=\"first-memento\";datetime=\"" + + ">;rel=\"first memento\";datetime=\"" + httpformatterl.format(enddate) + "\"\n"); firstmemento = "firstmemento"; @@ -226,7 +226,7 @@ if (count > 0) { int m_index = linkbf.lastIndexOf("\"memento\""); - linkbf.insert(m_index + 1, "last-"); + linkbf.insert(m_index + 1, "last "); } ORESerialiser serial = null; @@ -249,7 +249,7 @@ else if (format.equals("link")) { PrintWriter pw = response.getWriter(); - response.setContentType("text/csv"); + response.setContentType("application/link-format"); pw.print(linkbf.toString()); pw.flush(); Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/MementoValidity.jsp =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/MementoValidity.jsp 2010-11-11 05:47:25 UTC (rev 3331) +++ 
trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/replay/MementoValidity.jsp 2010-11-11 05:49:07 UTC (rev 3332) @@ -29,7 +29,7 @@ String timemap = " , <" + results.getContextConfig("aggregationPrefix") + "timemap/link/" + u - + ">;rel=\"timemap\"; type=\"text/csv\""; + + ">;rel=\"timemap\"; type=\"application/link-format\""; String timegate = ",<" + uriPrefix + "timegate/" + u + ">;rel=\"timegate\""; @@ -52,31 +52,31 @@ + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento memento last-memento\"; datetime=\"" + + ">;rel=\"first last memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } else if (closestDate.equals(f)) { mfl = ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; } else if (closestDate.equals(l)) { mfl = ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } else { mfl = memento; mfl = mfl + ", <" + uriPrefix + formatterk.format(l) + "/" + u - + ">;rel=\"last-memento\"; datetime=\"" + + ">;rel=\"last memento\"; datetime=\"" + httpformatterl.format(l) + "\""; mfl = mfl + ", <" + uriPrefix + formatterk.format(f) + "/" + u - + ">;rel=\"first-memento\"; datetime=\"" + + ">;rel=\"first memento\"; datetime=\"" + httpformatterl.format(f) + "\""; } @@ -128,12 +128,12 @@ + formatterk.format(closestleft.getCaptureDate()) + "/" + u - + ">;rel=\"prev-memento\"; datetime=\"" + + ">;rel=\"prev memento\"; datetime=\"" + 
httpformatterl.format(closestleft .getCaptureDate()) + "\""); } else { - int m_index = sb.lastIndexOf("\"first-memento\""); - sb.insert(m_index + 1, "prev-memento "); + int m_index = sb.lastIndexOf("\"first memento\""); + sb.insert(m_index + 1, "prev "); } } @@ -144,12 +144,12 @@ + formatterk.format(closestright.getCaptureDate()) + "/" + u - + ">;rel=\"next-memento\"; datetime=\"" + + ">;rel=\"next memento\"; datetime=\"" + httpformatterl.format(closestright .getCaptureDate()) + "\""); } else { - int m_index = sb.lastIndexOf("\"last-memento\""); - sb.insert(m_index + 1, "next-memento "); + int m_index = sb.lastIndexOf("\"last memento\""); + sb.insert(m_index + 1, "next "); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-11-11 05:47:31
|
Revision: 3331 http://archive-access.svn.sourceforge.net/archive-access/?rev=3331&view=rev Author: bradtofel Date: 2010-11-11 05:47:25 +0000 (Thu, 11 Nov 2010) Log Message: ----------- BUGFIX: was not sending correct URL prefixes for timemaps. Split out replayPrefix and queryPrefix for timegate and timemaps, respectively Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml =================================================================== --- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2010-11-11 05:29:16 UTC (rev 3330) +++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/wayback.xml 2010-11-11 05:47:25 UTC (rev 3331) @@ -227,6 +227,8 @@ <!-- <import resource="MementoReplay.xml"/> <bean name="8080:memento" parent="8080:wayback"> + <property name="replayPrefix" value="http://localhost.archive.org:8080/memento/" /> + <property name="queryPrefix" value="http://localhost.archive.org:8080/list/" /> <property name="configs"> <props> <prop key="aggregationPrefix">http://localhost.archive.org:8080/list/</prop> @@ -260,6 +262,8 @@ <bean name="8080:list" parent="8080:memento"> + <property name="replayPrefix" value="http://localhost.archive.org:8080/memento/" /> + <property name="queryPrefix" value="http://localhost.archive.org:8080/list/" /> <property name="staticPrefix" value="http://localhost.archive.org:8080/list/" /> <property name="configs"> <props> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |