You can subscribe to this list here.
| 2005 | Jan | Feb | Mar | Apr | May | Jun | Jul (1) | Aug (10) | Sep (36) | Oct (339) | Nov (103) | Dec (152) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2006 | Jan (141) | Feb (102) | Mar (125) | Apr (203) | May (57) | Jun (30) | Jul (139) | Aug (46) | Sep (64) | Oct (105) | Nov (34) | Dec (162) |
| 2007 | Jan (81) | Feb (57) | Mar (141) | Apr (72) | May (9) | Jun (1) | Jul (144) | Aug (88) | Sep (40) | Oct (43) | Nov (34) | Dec (20) |
| 2008 | Jan (44) | Feb (45) | Mar (16) | Apr (36) | May (8) | Jun (77) | Jul (177) | Aug (66) | Sep (8) | Oct (33) | Nov (13) | Dec (37) |
| 2009 | Jan (2) | Feb (5) | Mar (8) | Apr | May (36) | Jun (19) | Jul (46) | Aug (8) | Sep (1) | Oct (66) | Nov (61) | Dec (10) |
| 2010 | Jan (13) | Feb (16) | Mar (38) | Apr (76) | May (47) | Jun (32) | Jul (35) | Aug (45) | Sep (20) | Oct (61) | Nov (24) | Dec (16) |
| 2011 | Jan (22) | Feb (34) | Mar (11) | Apr (8) | May (24) | Jun (23) | Jul (11) | Aug (42) | Sep (81) | Oct (48) | Nov (21) | Dec (20) |
| 2012 | Jan (30) | Feb (25) | Mar (4) | Apr (6) | May (1) | Jun (5) | Jul (5) | Aug (8) | Sep (6) | Oct (6) | Nov | Dec |
From: Doug C. <cu...@us...> - 2005-10-21 20:54:09
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/plugin/parse-ext In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv23098/src/plugin/parse-ext Modified Files: Tag: mapred plugin.xml Log Message: Fix so that parse-pdf.sh is included in job jar, and is found relative to job jar. Also fix so that parser plugin config overrides default. Index: plugin.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/plugin/parse-ext/plugin.xml,v retrieving revision 1.3 retrieving revision 1.3.2.1 diff -C2 -d -r1.3 -r1.3.2.1 *** plugin.xml 4 Jun 2005 01:01:32 -0000 1.3 --- plugin.xml 21 Oct 2005 20:53:58 -0000 1.3.2.1 *************** *** 24,28 **** contentType="application/pdf" pathSuffix="pdf" ! command="@PWD@/bin/parse-pdf.sh" timeout="30"/> --- 24,28 ---- contentType="application/pdf" pathSuffix="pdf" ! command="bin/parse-pdf.sh" timeout="30"/> |
From: Doug C. <cu...@us...> - 2005-10-21 20:54:06
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/conf In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv23098/conf Modified Files: Tag: mapred nutch-site.xml Added Files: Tag: mapred ia-parse-plugins.xml Removed Files: Tag: mapred parse-plugins.xml Log Message: Fix so that parse-pdf.sh is included in job jar, and is found relative to job jar. Also fix so that parser plugin config overrides default. --- parse-plugins.xml DELETED --- --- NEW FILE: ia-parse-plugins.xml --- <?xml version="1.0" encoding="UTF-8"?> <!-- Copyright 2005 The Apache Software Foundation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Author : mattmann Description: This xml file represents a natural ordering for which parsing plugin should get called for a particular mimeType. 
--> <parse-plugins> <!-- by default if the mimeType is set to *, or can't be determined, use parse-text --> <mimeType name="*"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/java"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/msword"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/pdf"> <plugin id="parse-ext" /> </mimeType> <mimeType name="application/rss+xml"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/vnd.ms-excel"> <plugin id="parse-msexcel" /> </mimeType> <mimeType name="application/vnd.ms-powerpoint"> <plugin id="parse-mspowerpoint" /> </mimeType> <mimeType name="application/vnd.wap.wbxml"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/vnd.wap.wmlc"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/vnd.wap.wmlscriptc"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/xhtml+xml"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-bzip2"> <!-- try and parse it with the zip parser --> <plugin id="parse-zip" /> </mimeType> <mimeType name="application/x-csh"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-gzip"> <!-- try and parse it with the zip parser --> <plugin id="parse-zip" /> </mimeType> <mimeType name="application/x-javascript"> <plugin id="parse-js" /> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-kword"> <!-- try and parse it with the word parser --> <plugin id="parse-msword" /> </mimeType> <mimeType name="application/x-kspread"> <!-- try and parse it with the msexcel parser --> <plugin id="parse-msexcel" /> </mimeType> <mimeType name="application/x-latex"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-netcdf"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-sh"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-tcl"> <plugin id="parse-text" /> </mimeType> <mimeType 
name="application/x-tex"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-texinfo"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-troff"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-troff-man"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-troff-me"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/x-troff-ms"> <plugin id="parse-text" /> </mimeType> <mimeType name="application/zip"> <plugin id="parse-zip" /> </mimeType> <mimeType name="message/news"> <plugin id="parse-text" /> </mimeType> <mimeType name="message/rfc822"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/css"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/html"> <plugin id="parse-html" /> </mimeType> <mimeType name="text/plain"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/richtext"> <plugin id="parse-rtf" /> <plugin id="parse-msword" /> </mimeType> <mimeType name="text/rtf"> <plugin id="parse-rtf" /> <plugin id="parse-msword" /> </mimeType> <mimeType name="text/sgml"> <plugin id="parse-html" /> <plugin id="parse-text" /> </mimeType> <mimeType name="text/tab-separated-values"> <plugin id="parse-msexcel" /> <plugin id="parse-text" /> </mimeType> <mimeType name="text/vnd.wap.wml"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/vnd.wap.wmlscript"> <plugin id="parse-text" /> </mimeType> <mimeType name="text/xml"> <plugin id="parse-text" /> <plugin id="parse-html" /> <plugin id="parse-rss" /> </mimeType> <mimeType name="text/x-setext"> <plugin id="parse-text" /> </mimeType> <!-- Types for parse-ext plugin: required for unit tests to pass. 
--> <mimeType name="application/vnd.nutch.example.cat"> <plugin id="parse-ext" /> </mimeType> <mimeType name="application/vnd.nutch.example.md5sum"> <plugin id="parse-ext" /> </mimeType> </parse-plugins> Index: nutch-site.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/conf/nutch-site.xml,v retrieving revision 1.24.2.4 retrieving revision 1.24.2.5 diff -C2 -d -r1.24.2.4 -r1.24.2.5 *** nutch-site.xml 20 Oct 2005 23:30:48 -0000 1.24.2.4 --- nutch-site.xml 21 Oct 2005 20:53:57 -0000 1.24.2.5 *************** *** 7,21 **** <!-- NDFS --> ! <!-- <property> --> ! <!-- <name>fs.default.name</name> --> ! <!-- <value>ia109102.archive.org:8009</value> --> ! <!-- </property> --> <!-- MapReduce --> ! <!-- <property> --> ! <!-- <name>mapred.job.tracker</name> --> ! <!-- <value>ia109102.archive.org:8010</value> --> ! <!-- </property> --> <!-- Override a few Nutch defaults --> --- 7,21 ---- <!-- NDFS --> ! <property> ! <name>fs.default.name</name> ! <value>ia109102.archive.org:8009</value> ! </property> <!-- MapReduce --> ! <property> ! <name>mapred.job.tracker</name> ! <value>ia109102.archive.org:8010</value> ! </property> <!-- Override a few Nutch defaults --> *************** *** 37,40 **** --- 37,47 ---- </property> + <property> + <name>parse.plugin.file</name> + <value>ia-parse-plugins.xml</value> + <description>The name of the file that defines the associations between + content-types and parsers.</description> + </property> + <!-- don't truncate documents as much as by default --> <property> |
From: Doug C. <cu...@us...> - 2005-10-21 20:54:05
|
Update of /cvsroot/archive-access/archive-access/projects/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv23098 Modified Files: Tag: mapred build.xml Log Message: Fix so that parse-pdf.sh is included in job jar, and is found relative to job jar. Also fix so that parser plugin config overrides default. Index: build.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/build.xml,v retrieving revision 1.10.2.2 retrieving revision 1.10.2.3 diff -C2 -d -r1.10.2.2 -r1.10.2.3 *** build.xml 20 Oct 2005 23:30:48 -0000 1.10.2.2 --- build.xml 21 Oct 2005 20:53:57 -0000 1.10.2.3 *************** *** 66,74 **** <copy file="${root}/src/plugin/parse-ext/plugin.xml" todir="${nutch.root}/build/plugins/parse-ext/" ! overwrite="true" ! filtering="true" > ! <filterset> ! <filter token="PWD" value="${root}"/> ! </filterset> </copy> --- 66,70 ---- <copy file="${root}/src/plugin/parse-ext/plugin.xml" todir="${nutch.root}/build/plugins/parse-ext/" ! overwrite="true"> </copy> *************** *** 116,124 **** <!-- ================================================================== --> <target name="job" depends="compile"> ! <jar jarfile="${build.dir}/${name}.job.jar"> ! <zipfileset prefix="classes" file="${conf.dir}/parse-plugins.xml"/> <zipfileset prefix="classes" dir="${build.classes}"/> <zipfileset refid="lib.jars"/> ! </jar> </target> --- 112,121 ---- <!-- ================================================================== --> <target name="job" depends="compile"> ! <zip destfile="${build.dir}/${name}.job.jar"> ! <zipfileset prefix="classes" file="${conf.dir}/ia-parse-plugins.xml"/> ! <zipfileset prefix="bin" file="bin/parse-pdf.sh" filemode="755"/> <zipfileset prefix="classes" dir="${build.classes}"/> <zipfileset refid="lib.jars"/> ! </zip> </target> |
From: Michael S. <sta...@us...> - 2005-10-21 17:32:26
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/articles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10785/src/articles Modified Files: releasenotes.xml Log Message: * src/articles/releasenotes.xml Formatting. Index: releasenotes.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/articles/releasenotes.xml,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** releasenotes.xml 21 Oct 2005 17:26:36 -0000 1.6 --- releasenotes.xml 21 Oct 2005 17:32:07 -0000 1.7 *************** *** 115,139 **** <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1333042">1333042</ulink></entry><entry>Fix</entry><entry>Search result list - Bad handling of dedup result list</entry><entry>2005-10-20 03:08</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322601">1322601</ulink></entry><entry><entry>Fix</entry>search ui - time param not set</entry><entry>2005-10-10 05:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324757">1324757</ulink></entry><entry><entry>Fix</entry>debug on messes up displayed web page</entry><entry>2005-10-12 04:37</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1249970">1249970</ulink></entry><entry><entry>Fix</entry>Installer requires X though claimed not needing it</entry><entry>2005-08-01 21:23</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> ! 
<row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324161">1324161</ulink></entry><entry><entry>Fix</entry>euc-jp page not displayed properly in wera</entry><entry>2005-10-11 12:34</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322668">1322668</ulink></entry><entry><entry>Fix</entry>wera help need update</entry><entry>2005-10-10 05:59</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324755">1324755</ulink></entry><entry><entry>Fix</entry>Header sent from wera documentdispatcher of wrong format</entry><entry>2005-10-12 04:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322554">1322554</ulink></entry><entry><entry>Fix</entry>exacturl query returnns 0 of X versions in result list</entry><entry>2005-10-10 05:13</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322594">1322594</ulink></entry><entry><entry>Fix</entry>When time param not set url is not found</entry><entry>2005-10-10 05:29</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312442">1312442</ulink></entry><entry><entry>Fix</entry>Date range missing in querystring</entry><entry>2005-10-03 17:26</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! 
<row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314403">1314403</ulink></entry><entry><entry>Fix</entry>Use newly added 'encoding' in search results</entry><entry>2005-10-05 19:34</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314098">1314098</ulink></entry><entry><entry>Fix</entry>Encoding issue, wera displaying archived web page</entry><entry>2005-10-05 10:53</entry><entry>stack-sf</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1244894">1244894</ulink></entry><entry><entry>Fix</entry>Cannot query for non-ISO8859 characters</entry><entry>2005-07-25 18:38</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312208">1312208</ulink></entry><entry><entry>Fix</entry>Query time encoding issues</entry><entry>2005-10-03 12:11</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314360">1314360</ulink></entry><entry><entry>Fix</entry>Remove all, any or phrase selection in search ui</entry><entry>2005-10-05 18:14</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312479">1312479</ulink></entry><entry><entry>Fix</entry>indexSearch.inc need cleanup</entry><entry>2005-10-03 18:40</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! 
<row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1313251">1313251</ulink></entry><entry><entry>Fix</entry>Wera search, ugly and/or not useful error messages</entry><entry>2005-10-04 13:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1282042">1282042</ulink></entry><entry><entry>Fix</entry>WERA - Timeline - Warning when URL not found</entry><entry>2005-09-05 03:28</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312484">1312484</ulink></entry><entry><entry>Fix</entry>[wera] Ugly complaint about invalid argument</entry><entry>2005-10-03 18:47</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312299">1312299</ulink></entry><entry><entry>Fix</entry>WERA - Exacturl search not always working</entry><entry>2005-10-03 13:51</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1281697">1281697</ulink></entry><entry><entry>Fix</entry>searching czech words not working</entry><entry>2005-09-04 10:36</entry><entry>stack-sf</entry><entry>kranach</entry></row> ! 
<row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1277376">1277376</ulink></entry><entry><entry>Fix</entry>WERA - Duplicate hits in result list</entry><entry>2005-08-31 05:45</entry><entry>sverreb</entry><entry>sverreb</entry></row> </tbody> --- 115,139 ---- <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1333042">1333042</ulink></entry><entry>Fix</entry><entry>Search result list - Bad handling of dedup result list</entry><entry>2005-10-20 03:08</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322601">1322601</ulink></entry><entry>Fix</entry><entry>search ui - time param not set</entry><entry>2005-10-10 05:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324757">1324757</ulink></entry><entry>Fix</entry><entry>debug on messes up displayed web page</entry><entry>2005-10-12 04:37</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1249970">1249970</ulink></entry><entry>Fix</entry><entry>Installer requires X though claimed not needing it</entry><entry>2005-08-01 21:23</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324161">1324161</ulink></entry><entry>Fix</entry><entry>euc-jp page not displayed properly in wera</entry><entry>2005-10-11 12:34</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! 
<row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322668">1322668</ulink></entry><entry>Fix</entry><entry>wera help need update</entry><entry>2005-10-10 05:59</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324755">1324755</ulink></entry><entry>Fix</entry><entry>Header sent from wera documentdispatcher of wrong format</entry><entry>2005-10-12 04:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322554">1322554</ulink></entry><entry>Fix</entry><entry>exacturl query returnns 0 of X versions in result list</entry><entry>2005-10-10 05:13</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322594">1322594</ulink></entry><entry>Fix</entry><entry>When time param not set url is not found</entry><entry>2005-10-10 05:29</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312442">1312442</ulink></entry><entry>Fix</entry><entry>Date range missing in querystring</entry><entry>2005-10-03 17:26</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314403">1314403</ulink></entry><entry>Fix</entry><entry>Use newly added 'encoding' in search results</entry><entry>2005-10-05 19:34</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! 
<row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314098">1314098</ulink></entry><entry>Fix</entry><entry>Encoding issue, wera displaying archived web page</entry><entry>2005-10-05 10:53</entry><entry>stack-sf</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1244894">1244894</ulink></entry><entry>Fix</entry><entry>Cannot query for non-ISO8859 characters</entry><entry>2005-07-25 18:38</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312208">1312208</ulink></entry><entry>Fix</entry><entry>Query time encoding issues</entry><entry>2005-10-03 12:11</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314360">1314360</ulink></entry><entry>Fix</entry><entry>Remove all, any or phrase selection in search ui</entry><entry>2005-10-05 18:14</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312479">1312479</ulink></entry><entry>Fix</entry><entry>indexSearch.inc need cleanup</entry><entry>2005-10-03 18:40</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1313251">1313251</ulink></entry><entry>Fix</entry><entry>Wera search, ugly and/or not useful error messages</entry><entry>2005-10-04 13:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! 
<row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1282042">1282042</ulink></entry><entry>Fix</entry><entry>WERA - Timeline - Warning when URL not found</entry><entry>2005-09-05 03:28</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312484">1312484</ulink></entry><entry>Fix</entry><entry>[wera] Ugly complaint about invalid argument</entry><entry>2005-10-03 18:47</entry><entry>sverreb</entry><entry>stack-sf</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312299">1312299</ulink></entry><entry>Fix</entry><entry>WERA - Exacturl search not always working</entry><entry>2005-10-03 13:51</entry><entry>sverreb</entry><entry>sverreb</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1281697">1281697</ulink></entry><entry>Fix</entry><entry>searching czech words not working</entry><entry>2005-09-04 10:36</entry><entry>stack-sf</entry><entry>kranach</entry></row> ! <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1277376">1277376</ulink></entry><entry>Fix</entry><entry>WERA - Duplicate hits in result list</entry><entry>2005-08-31 05:45</entry><entry>sverreb</entry><entry>sverreb</entry></row> </tbody> |
From: Michael S. <sta...@us...> - 2005-10-21 17:26:44
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/articles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9639/src/articles Modified Files: releasenotes.xml Log Message: * src/articles/releasenotes.xml Add in rfes. Index: releasenotes.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/articles/releasenotes.xml,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** releasenotes.xml 21 Oct 2005 17:10:14 -0000 1.5 --- releasenotes.xml 21 Oct 2005 17:26:36 -0000 1.6 *************** *** 109,112 **** --- 109,117 ---- <tbody> + + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681140&aid=1312159">1312159</ulink></entry><entry>Add</entry><entry>wera overview doc based on dokuwiki text</entry><entry>2005-10-03 11:29</entry><entry>sverreb</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681140&aid=1246834">1246834</ulink></entry><entry>Add</entry><entry>Move arc path to retreiver (WAS Path...lib/seal/nutch.inc)</entry><entry>2005-07-28 08:06</entry><entry>sverreb</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681140&aid=1244879">1244879</ulink></entry><entry>Add</entry><entry>Add display of text snippets to wera search results page</entry><entry>2005-07-25 17:32</entry><entry>sverreb</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1333042">1333042</ulink></entry><entry>Fix</entry><entry>Search result list - Bad handling of dedup result list</entry><entry>2005-10-20 03:08</entry><entry>sverreb</entry><entry>stack-sf</entry></row> <row><entry><ulink 
url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322601">1322601</ulink></entry><entry><entry>Fix</entry>search ui - time param not set</entry><entry>2005-10-10 05:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> |
From: Michael S. <sta...@us...> - 2005-10-21 17:10:27
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/articles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5093/src/articles Modified Files: releasenotes.xml Log Message: * src/articles/releasenotes.xml List of fixes. Index: releasenotes.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/articles/releasenotes.xml,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** releasenotes.xml 21 Oct 2005 15:44:17 -0000 1.4 --- releasenotes.xml 21 Oct 2005 17:10:14 -0000 1.5 *************** *** 90,93 **** --- 90,138 ---- the encoding detected by NutchWax at index time.</para> </sect3> + + <para + ><table> + <title>Changes</title> + + <tgroup cols="5"> + <thead> + <row> + <entry>ID</entry> + <entry>Type</entry> + <entry>Summary</entry> + <entry>Open Date</entry> + <entry>By</entry> + <entry>Filer</entry> + </row> + </thead> + + <tbody> + + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1333042">1333042</ulink></entry><entry>Fix</entry><entry>Search result list - Bad handling of dedup result list</entry><entry>2005-10-20 03:08</entry><entry>sverreb</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322601">1322601</ulink></entry><entry><entry>Fix</entry>search ui - time param not set</entry><entry>2005-10-10 05:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324757">1324757</ulink></entry><entry><entry>Fix</entry>debug on messes up displayed web page</entry><entry>2005-10-12 04:37</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink 
url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1249970">1249970</ulink></entry><entry><entry>Fix</entry>Installer requires X though claimed not needing it</entry><entry>2005-08-01 21:23</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324161">1324161</ulink></entry><entry><entry>Fix</entry>euc-jp page not displayed properly in wera</entry><entry>2005-10-11 12:34</entry><entry>sverreb</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322668">1322668</ulink></entry><entry><entry>Fix</entry>wera help need update</entry><entry>2005-10-10 05:59</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1324755">1324755</ulink></entry><entry><entry>Fix</entry>Header sent from wera documentdispatcher of wrong format</entry><entry>2005-10-12 04:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322554">1322554</ulink></entry><entry><entry>Fix</entry>exacturl query returnns 0 of X versions in result list</entry><entry>2005-10-10 05:13</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1322594">1322594</ulink></entry><entry><entry>Fix</entry>When time param not set url is not found</entry><entry>2005-10-10 05:29</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312442">1312442</ulink></entry><entry><entry>Fix</entry>Date range missing in 
querystring</entry><entry>2005-10-03 17:26</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314403">1314403</ulink></entry><entry><entry>Fix</entry>Use newly added 'encoding' in search results</entry><entry>2005-10-05 19:34</entry><entry>sverreb</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314098">1314098</ulink></entry><entry><entry>Fix</entry>Encoding issue, wera displaying archived web page</entry><entry>2005-10-05 10:53</entry><entry>stack-sf</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1244894">1244894</ulink></entry><entry><entry>Fix</entry>Cannot query for non-ISO8859 characters</entry><entry>2005-07-25 18:38</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312208">1312208</ulink></entry><entry><entry>Fix</entry>Query time encoding issues</entry><entry>2005-10-03 12:11</entry><entry>stack-sf</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1314360">1314360</ulink></entry><entry><entry>Fix</entry>Remove all, any or phrase selection in search ui</entry><entry>2005-10-05 18:14</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312479">1312479</ulink></entry><entry><entry>Fix</entry>indexSearch.inc need cleanup</entry><entry>2005-10-03 18:40</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink 
url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1313251">1313251</ulink></entry><entry><entry>Fix</entry>Wera search, ugly and/or not useful error messages</entry><entry>2005-10-04 13:32</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1282042">1282042</ulink></entry><entry><entry>Fix</entry>WERA - Timeline - Warning when URL not found</entry><entry>2005-09-05 03:28</entry><entry>sverreb</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312484">1312484</ulink></entry><entry><entry>Fix</entry>[wera] Ugly complaint about invalid argument</entry><entry>2005-10-03 18:47</entry><entry>sverreb</entry><entry>stack-sf</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1312299">1312299</ulink></entry><entry><entry>Fix</entry>WERA - Exacturl search not always working</entry><entry>2005-10-03 13:51</entry><entry>sverreb</entry><entry>sverreb</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1281697">1281697</ulink></entry><entry><entry>Fix</entry>searching czech words not working</entry><entry>2005-09-04 10:36</entry><entry>stack-sf</entry><entry>kranach</entry></row> + <row><entry><ulink url="http://sourceforge.net/tracker/index.php?func=detail&group_id=118427&atid=681137&aid=1277376">1277376</ulink></entry><entry><entry>Fix</entry>WERA - Duplicate hits in result list</entry><entry>2005-08-31 05:45</entry><entry>sverreb</entry><entry>sverreb</entry></row> + + </tbody> + </tgroup> + </table></para> </sect2> </sect1> |
From: Michael S. <sta...@us...> - 2005-10-21 16:27:12
|
Update of /cvsroot/archive-access/archive-access/projects/wera/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27168/xdocs Modified Files: index.xml Log Message: * xdocs/index.xml Fix formatting error. Index: index.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/xdocs/index.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** index.xml 21 Oct 2005 15:30:07 -0000 1.1 --- index.xml 21 Oct 2005 16:27:04 -0000 1.2 *************** *** 42,45 **** --- 42,46 ---- <p>WERA is now a subproject of the <a href="http://archive-access.sourceforge.net">archive-access</a>.</p> + </subsection> </section> </body> |
From: Michael S. <sta...@us...> - 2005-10-21 15:44:26
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/articles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14942/src/articles Modified Files: releasenotes.xml Log Message: * src/articles/releasenotes.xml An edit. Index: releasenotes.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/articles/releasenotes.xml,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** releasenotes.xml 21 Oct 2005 13:33:04 -0000 1.3 --- releasenotes.xml 21 Oct 2005 15:44:17 -0000 1.4 *************** *** 13,18 **** <abstract> ! <para>Improved exacturl handling, error handling and encoding issues. ! Bug fixes.</para> </abstract> --- 13,18 ---- <abstract> ! <para>Improved <emphasis>exacturl</emphasis> handling, error handling and ! encoding issues. Bug fixes.</para> </abstract> *************** *** 21,33 **** <sect3> ! <title>No canonization of URLS</title> ! <para>Currently, .no canonization of url is done in Wera. If a link ! points to an url that is indexed with a different form (e.g ! http://www.nb.no instead of http://nb.no), Wera will not find this in the index and therefore will report: <emphasis>Sorry, no documents with the given uri were found.</emphasis> See <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312202&group_id=118427&atid=681137">bugtracker ! issue</ulink>.</para> </sect3> --- 21,33 ---- <sect3> ! <title>No canonicalization of URLS</title> ! <para>Currently, no canonization of URLs is done in WERA. If a link ! points to an URL that is indexed with a different form (e.g ! http://www.nb.no instead of http://nb.no), WERA will not find this in the index and therefore will report: <emphasis>Sorry, no documents with the given uri were found.</emphasis> See <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312202&group_id=118427&atid=681137">[1312202] exacturl needs to ! 
canonicalize</ulink>.</para> </sect3> *************** *** 35,46 **** <title>Redirects not handled</title> ! <para>Wera does nothing to handle redirects. The result, depending on the nature of the redirect, will be either that the actual resource is ! not displayedat all in Wera or that a redirect to live web is executed ! within the Wera view without any information to the user See bugtracker issues <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312200&group_id=118427&atid=681137">1312200</ulink> and <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312214&group_id=118427&atid=681137">1312214</ulink>.</para> </sect3> --- 35,46 ---- <title>Redirects not handled</title> ! <para>WERA does nothing to handle redirects. The result, depending on the nature of the redirect, will be either that the actual resource is ! not displayed at all in WERA or that a redirect to live web is executed ! within the WERA view without any information to the user See bugtracker issues <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312200&group_id=118427&atid=681137">[1312200] Pages at end of redirects not found</ulink> and <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312214&group_id=118427&atid=681137">[1312214] More redirects to live web</ulink>.</para> </sect3> *************** *** 59,66 **** <title>Exacturl handling</title> ! <para>The handling of exacturl searches has been improved considerably ! on both Wera and NutchWax side. Wera uses the exacturl search ! functionality extensively both for counting versions of a given url ! and to determine the mapping between a given url/timestamp and its Arc name and offset.</para> </sect3> --- 59,67 ---- <title>Exacturl handling</title> ! <para>The handling of <emphasis>exacturl</emphasis> searches has been ! improved considerably ! on both WERA and NutchWax side. WERA uses the exacturl search ! 
functionality extensively both for counting versions of a given URL ! and to determine the mapping between a given URL/timestamp and its Arc name and offset.</para> </sect3> *************** *** 69,73 **** <title>Error handling</title> ! <para>Wera's error messages has been imporved. Instead of printing cryptical PHP warnings and errors it prints more meaningful error messages enabling to user to understand what is wrong.</para> --- 70,74 ---- <title>Error handling</title> ! <para>WERA's error messages has been imporved. Instead of printing cryptical PHP warnings and errors it prints more meaningful error messages enabling to user to understand what is wrong.</para> *************** *** 78,82 **** <para>There were major problems with querying with non-ISO8859 ! characters. To solve this issue changes were made to both Wera and NutchWax.</para> </sect3> --- 79,83 ---- <para>There were major problems with querying with non-ISO8859 ! characters. To solve this issue changes were made to both WERA and NutchWax.</para> </sect3> *************** *** 85,89 **** <title>Encoding issues when vieving archived pages</title> ! <para>Wera now sets the encoding in the header of a given web page prior to sending the page to the users browser. The encoding sent is the encoding detected by NutchWax at index time.</para> --- 86,90 ---- <title>Encoding issues when vieving archived pages</title> ! <para>WERA now sets the encoding in the header of a given web page prior to sending the page to the users browser. The encoding sent is the encoding detected by NutchWax at index time.</para> *************** *** 121,125 **** <para>When no X installed the Java based installer should fall back to ! console mode. Some reports of problems with this. If so, install wera manually. See manual.</para> --- 122,126 ---- <para>When no X installed the Java based installer should fall back to ! console mode. Some reports of problems with this. If so, install WERA manually. 
See manual.</para> *************** *** 157,159 **** </sect2> </sect1> ! </article> \ No newline at end of file --- 158,160 ---- </sect2> </sect1> ! </article> |
From: Michael S. <sta...@us...> - 2005-10-21 15:31:59
|
Update of /cvsroot/archive-access/archive-access/projects/wera/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12448/xdocs Modified Files: navigation.xml Log Message: * xdocs/navigation.xml Remove link to release notes. It's in the body of the page. Index: navigation.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/xdocs/navigation.xml,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** navigation.xml 20 Oct 2005 16:35:57 -0000 1.5 --- navigation.xml 21 Oct 2005 15:31:48 -0000 1.6 *************** *** 17,21 **** <item name="What is WERA" href="/articles/what-is-wera.html"/> <item name="Wera Manual" href="/articles/manual.html"/> - <item name="Release Notes" href="/articles/releasenotes.html"/> <item name="FAQ" href="faq.html"/> </item> --- 17,20 ---- |
From: Michael S. <sta...@us...> - 2005-10-21 15:30:17
|
Update of /cvsroot/archive-access/archive-access/projects/wera/xdocs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11986/xdocs Added Files: index.xml Log Message: * xdocs/index.xml Adding home page. --- NEW FILE: index.xml --- <?xml version="1.0" encoding="ISO-8859-1"?> <document> <properties> <title>Home Page</title> <author email="stack at archive dot org">St.Ack</author> <revision>$Id: index.xml,v 1.1 2005/10/21 15:30:07 stack-sf Exp $</revision> </properties> <body> <section name="Introduction"> <p>WERA (Web ARchive Access) is a freely available solution for searching and navigating archived web document collections. It works like the Internet Archive's <a href="http://www.archive.org/web">Wayback Machine</a> except it also allows for full-text search of the web archive. Wera is a php application based on pieces from -- and now, with <a href="/projects/nutch">Nutchwax</a> replaces -- <a href="http://nwa.nb.no/">NwaToolset</a>. The wera component includes an <i>arcretriever</i> webapp for the fetching of records from directories of Internet Archive ARC files. See the <a href="articles/manual">wera Manual</a> for more on how wera works, requirements, and installation. For a demo of wera+nutchwax, see <a href="http://nwa.nb.no/wera/">nwa.nb.no/wera</a>. Wera development has been sponsored by the <a href="http://www.netpreserve.net">International Internet Preservation Consortium (IIPC)</a>.</p> <p>See the <a href="downloads.html">downloads page</a> for the latest release of WERA. See the <a href="articles/manual.html">manual</a> for installation instructions and <a href="articles/what-is-wera.html">What Is WERA</a> for an overview on its workings the basic architecture. </p> </section> <section name="News"> <subsection name="Release 0.4.0 10/21/2005"> <p>Improved error and encoding handling. Lots of bug fixes. See <a href="articles/releasenotes.html">Release Notes</a>. 
</p> </subsection> <subsection name="WERA migrated from NWA to archive-access 09/2005"> <p>WERA is now a subproject of the <a href="http://archive-access.sourceforge.net">archive-access</a>.</p> </section> </body> </document> |
From: Sverre B. <sv...@us...> - 2005-10-21 13:33:15
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/articles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10563/src/articles Modified Files: releasenotes.xml Log Message: Index: releasenotes.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/articles/releasenotes.xml,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** releasenotes.xml 21 Oct 2005 13:19:57 -0000 1.2 --- releasenotes.xml 21 Oct 2005 13:33:04 -0000 1.3 *************** *** 10,14 **** <sect1 id="0_4_0"> ! <title>Release 0.4.0 - NOT YET RELEASED</title> <abstract> --- 10,14 ---- <sect1 id="0_4_0"> ! <title>Release 0.4.0</title> <abstract> |
From: Sverre B. <sv...@us...> - 2005-10-21 13:25:41
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/installer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv8580/src/installer Modified Files: install_info.txt Log Message: Index: install_info.txt =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/installer/install_info.txt,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** install_info.txt 4 Oct 2005 22:59:27 -0000 1.1 --- install_info.txt 21 Oct 2005 13:25:15 -0000 1.2 *************** *** 4,23 **** * A JVM. ! * Apache http server w. PHP 4.3 or 4.4. (make sure that XML support is enabled, see end for details). WERA will NOT work properly ! with PHP 5, because of the new Object Model. If PHP not installed, ! the quickest solution may be to install XAMPP, http://www.apachefriends.org/en/xampp.html ! ! * Tomcat servlet container (http://jakarta.apache.org/tomcat/index.html). ! The ArcRetriever web app has been ! tested on v.5.0.27 and 5.0.28 - * NutchWAX. A bundling of Nutch and extensions for - searching Web Archive Collections (WACs) - http://archive-access.sourceforge.net/projects/nutch/ - - THE NEXT STEPS -------------- --- 4,23 ---- * A JVM. ! * Apache http server w. PHP 4.3 or 4.4 (make sure that XML support is enabled, see end for details). WERA will NOT work properly ! with PHP 5, because of the new Object Model in PHP5. If PHP not ! installed, the quickest solution may be to install XAMPP, http://www.apachefriends.org/en/xampp.html ! ! * Tomcat servlet container (http://jakarta.apache.org/tomcat/index.html). ! The arcretriever web app has been tested on v.5.0.27 ! and 5.0.28 as well as in 5.5.9. ! ! * NutchWAX. A bundling of Nutch and extensions for searching ! Web Archive Collections (WACs) ! http://archive-access.sourceforge.net/projects/nutch/ ! THE NEXT STEPS -------------- *************** *** 30,34 **** different components. ! 
For the installtion of the ARC retriever you need to enter the webapp directory of your Tomcat installation. You will also be asked where yuor ARC files recide. --- 30,34 ---- different components. ! For the installtion of the arcretriever you need to enter the webapp directory of your Tomcat installation. You will also be asked where yuor ARC files recide. |
From: Sverre B. <sv...@us...> - 2005-10-21 13:20:21
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/articles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv6238 Modified Files: releasenotes.xml Log Message: Added info on upcoming release 1.4 Index: releasenotes.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/articles/releasenotes.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** releasenotes.xml 17 Oct 2005 21:51:35 -0000 1.1 --- releasenotes.xml 21 Oct 2005 13:19:57 -0000 1.2 *************** *** 11,85 **** <sect1 id="0_4_0"> <title>Release 0.4.0 - NOT YET RELEASED</title> <abstract> ! <para>TODO</para> </abstract> <sect2 id="0_4_0_limitations"> <title>Known Limitations/Issues</title> ! <sect3><title>?</title> ! <para> ! ? ! </para> ! </sect3> </sect2> <sect2 id="0_4_0_changes"> ! <title>Changes</title> ! <sect3 > ! <title>?</title> ! <para /> ! </sect3> ! </sect2> ! </sect1> <sect1 id="0_2_2"> <title>Release 0.2.2</title> <abstract> <para>Bug fixes</para> </abstract> <sect2 id="0_2_2_changes"> ! <title>Changes</title> ! ! <para>Fixed ! <ulink url="http://sourceforge.net/tracker/index.php?func=detail&aid=1277376&group_id=118427&atid=681137">1277376 duplicate hits in result list</ulink>. WERA now uses NutchWAX's dedup functionality to supress duplicate ! hits in result list. Gives improved performance.</para> ! </sect2> ! </sect1> <sect1 id="0_2_1"> <title>Release 0.2.1</title> <abstract> <para>First release of WERA</para> </abstract> <sect2> <title>Known Limitations/Issues</title> ! <para>When no X installed the Java based installer should fall back to ! console mode. Some reports of problems with this. If so, install wera ! manually. See manual. ! </para> ! <para>WERA does not work properly with PHP5. Has to do with PHP5's new ! Object Model. When using the 'NEAR' mode of the documentLocator it will ! return a resultset concatenated by the resultsets for 'BEFORE' ! 
and 'AFTER' instead of returning the one closest in time. Results in ! wrong aid to the documentRetriever when presenting inline objects. ! </para> </sect2> <sect2 id="0_2_1_changes"> ! <title>Changes</title> ! <para> ! <orderedlist> ! <listitem>Support for nutchwax search engine added</listitem> ! <listitem>Support for nwalucene search removed (replaced by the above). ! </listitem> ! <listitem>Support for Fast Search Engine currently not working (will be added in later version).</listitem> ! <listitem>Advanced search removed (may be added in later ! version).</listitem> ! <listitem>Server side link rewriting replaced by javascript client side ! link rewriting.</listitem> ! </orderedlist> ! </para> ! </sect2> ! </sect1> ! </article> --- 11,159 ---- <sect1 id="0_4_0"> <title>Release 0.4.0 - NOT YET RELEASED</title> + <abstract> ! <para>Improved exacturl handling, error handling and encoding issues. ! Bug fixes.</para> </abstract> <sect2 id="0_4_0_limitations"> <title>Known Limitations/Issues</title> ! ! <sect3> ! <title>No canonization of URLS</title> ! ! <para>Currently, .no canonization of url is done in Wera. If a link ! points to an url that is indexed with a different form (e.g ! http://www.nb.no instead of http://nb.no), Wera will not find this in ! the index and therefore will report: <emphasis>Sorry, no documents ! with the given uri were found.</emphasis> See <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312202&group_id=118427&atid=681137">bugtracker ! issue</ulink>.</para> ! </sect3> ! ! <sect3> ! <title>Redirects not handled</title> ! ! <para>Wera does nothing to handle redirects. The result, depending on ! the nature of the redirect, will be either that the actual resource is ! not displayedat all in Wera or that a redirect to live web is executed ! within the Wera view without any information to the user See ! bugtracker issues <ulink ! 
url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312200&group_id=118427&atid=681137">1312200</ulink> ! and <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1312214&group_id=118427&atid=681137">1312214</ulink>.</para> ! </sect3> ! ! <sect3> ! <title>Advanced search not ready</title> ! ! <para>The advanced search has not been prioritized for this release. ! Functionality needed not specified.</para> ! </sect3> </sect2> + <sect2 id="0_4_0_changes"> ! <title>Changes</title> ! ! <sect3> ! <title>Exacturl handling</title> ! ! <para>The handling of exacturl searches has been improved considerably ! on both Wera and NutchWax side. Wera uses the exacturl search ! functionality extensively both for counting versions of a given url ! and to determine the mapping between a given url/timestamp and its Arc ! name and offset.</para> ! </sect3> ! ! <sect3> ! <title>Error handling</title> ! ! <para>Wera's error messages has been imporved. Instead of printing ! cryptical PHP warnings and errors it prints more meaningful error ! messages enabling to user to understand what is wrong.</para> ! </sect3> ! ! <sect3> ! <title>Query encoding issues</title> ! ! <para>There were major problems with querying with non-ISO8859 ! characters. To solve this issue changes were made to both Wera and ! NutchWax.</para> ! </sect3> ! ! <sect3> ! <title>Encoding issues when vieving archived pages</title> ! ! <para>Wera now sets the encoding in the header of a given web page ! prior to sending the page to the users browser. The encoding sent is ! the encoding detected by NutchWax at index time.</para> ! </sect3> ! </sect2> ! </sect1> <sect1 id="0_2_2"> <title>Release 0.2.2</title> + <abstract> <para>Bug fixes</para> </abstract> + <sect2 id="0_2_2_changes"> ! <title>Changes</title> ! ! <para>Fixed <ulink ! url="http://sourceforge.net/tracker/index.php?func=detail&aid=1277376&group_id=118427&atid=681137">1277376 ! duplicate hits in result list</ulink>. 
WERA now uses NutchWAX's dedup ! functionality to supress duplicate hits in result list. Gives improved ! performance.</para> ! </sect2> ! </sect1> <sect1 id="0_2_1"> <title>Release 0.2.1</title> + <abstract> <para>First release of WERA</para> </abstract> + <sect2> <title>Known Limitations/Issues</title> ! ! <para>When no X installed the Java based installer should fall back to ! console mode. Some reports of problems with this. If so, install wera ! manually. See manual.</para> ! ! <para>WERA does not work properly with PHP5. Has to do with PHP5's new ! Object Model. When using the 'NEAR' mode of the documentLocator it will ! return a resultset concatenated by the resultsets for 'BEFORE' and ! 'AFTER' instead of returning the one closest in time. Results in wrong ! aid to the documentRetriever when presenting inline objects.</para> </sect2> + <sect2 id="0_2_1_changes"> ! <title>Changes</title> ! <para><orderedlist> ! <listitem> ! Support for nutchwax search engine added ! </listitem> + <listitem> + Support for nwalucene search removed (replaced by the above). + </listitem> ! <listitem> ! Support for Fast Search Engine currently not working (will be added in later version). ! </listitem> ! ! <listitem> ! Advanced search removed (may be added in later version). ! </listitem> ! ! <listitem> ! Server side link rewriting replaced by javascript client side link rewriting. ! </listitem> ! </orderedlist></para> ! </sect2> ! </sect1> ! </article> \ No newline at end of file |
From: Sverre B. <sv...@us...> - 2005-10-21 11:12:19
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/installer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv6051/src/installer Modified Files: antinstall-config.xml build.xml build-wera.xml Added Files: build-retriever.xml Log Message: praparations for release 4 Index: build-wera.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/installer/build-wera.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** build-wera.xml 4 Oct 2005 22:59:27 -0000 1.1 --- build-wera.xml 21 Oct 2005 11:12:07 -0000 1.2 *************** *** 17,23 **** <!-- Properties for generating documentation --> ! <property name="input.file" value="${cvsexport}/wera/doc/manual.xml"/> ! <property name="output.html" value="${cvsexport}/manual/manual.html"/> ! <property name="output.pdf" value="${cvsexport}/manual/manual.pdf"/> <property name="docbook.dir" value="/usr/share/xml/docbook" /> <property name="saxon.dir" value="/usr/share/saxon" /> --- 17,26 ---- <!-- Properties for generating documentation --> ! <property name="releasenotes.xml" value="${cvsexport}/archive-access/projects/wera/src/articles/releasenotes.xml"/> ! <property name="releasenotes.html" value="${cvsexport}/releasenotes.html"/> ! ! <property name="input.file" value="${cvsexport}/archive-access/projects/wera/src/articles/manual.xml"/> ! <property name="output.html" value="${cvsexport}/articles/manual.html"/> ! <property name="output.pdf" value="${cvsexport}/articles/manual.pdf"/> <property name="docbook.dir" value="/usr/share/xml/docbook" /> <property name="saxon.dir" value="/usr/share/saxon" /> *************** *** 27,39 **** <echo message="Fetching source code ... "/> <tstamp> ! <format property="werabuildtime" pattern="yyyyMMddHHmm"/> </tstamp> <if> <equals arg1="${wera.cvstag}" arg2="head" /> <then> ! 
<property name="werapack" value="wera-${werabuildtime}"/> <echo message="Package name: ${werapack}"/> <echo message="Retrieving most recent files from cvs"/> ! <cvs command="export" cvsRoot=":ext:${nwa.sfuser}@cvs.sf.net:/cvsroot/nwatoolset" package="retriever wera" date="tomorrow" dest="${cvsexport}" failonerror="true"/> </then> <elseif> --- 30,42 ---- <echo message="Fetching source code ... "/> <tstamp> ! <format property="weraversion" pattern="yyyyMMddHHmm"/> </tstamp> <if> <equals arg1="${wera.cvstag}" arg2="head" /> <then> ! <property name="werapack" value="wera-${weraversion}"/> <echo message="Package name: ${werapack}"/> <echo message="Retrieving most recent files from cvs"/> ! <cvs command="export" cvsRoot=":ext:${nwa.sfuser}@cvs.sf.net:/cvsroot/archive-access" package="archive-access/projects/wera/src archive-access/projects/wera/lib" date="tomorrow" dest="${cvsexport}" failonerror="true"/> </then> <elseif> *************** *** 44,67 **** <fileset dir="cvslocal"/> </copy> ! <property name="werapack" value="wera-test-${werabuildtime}"/> </then> </elseif> <else> <echo message="Retrieving files with tag ${wera.cvstag} from cvs"/> ! <cvs command="export" cvsRoot=":ext:${nwa.sfuser}@cvs.sf.net:/cvsroot/nwatoolset" package="retriever wera" tag="${wera.cvstag}" dest="${cvsexport}" failonerror="true"/> <property name="werapack" value="${wera.cvstag}"/> <echo message="Package name: ${werapack}"/> </else> </if> ! <delete file="${cvsexport}/wera/gui/lib/config.inc"/> <echo message="Generating documentation"/> ! <mkdir dir="${cvsexport}/manual"/> ! <copy todir="${cvsexport}/manual"> ! <fileset dir="${cvsexport}/wera/doc"> <include name="images/*"/> - <include name="RELEASE-NOTES"/> </fileset> </copy> --- 47,70 ---- <fileset dir="cvslocal"/> </copy> ! <property name="werapack" value="wera-test-${weraversion}"/> </then> </elseif> <else> <echo message="Retrieving files with tag ${wera.cvstag} from cvs"/> ! 
<cvs command="export" cvsRoot=":ext:${nwa.sfuser}@cvs.sf.net:/cvsroot/archive-access" package="archive-access/projects/wera/src archive-access/projects/wera/lib" tag="${wera.cvstag}" dest="${cvsexport}" failonerror="true"/> <property name="werapack" value="${wera.cvstag}"/> + <property name="weraversion" value="${wera.cvstag}"/> <echo message="Package name: ${werapack}"/> </else> </if> ! <delete file="${cvsexport}/archive-access/projects/wera/src/webapps/wera/lib/config.inc"/> <echo message="Generating documentation"/> ! <mkdir dir="${cvsexport}/articles"/> ! <copy todir="${cvsexport}/articles"> ! <fileset dir="${cvsexport}/archive-access/projects/wera/src/articles"> <include name="images/*"/> </fileset> </copy> *************** *** 81,85 **** <arg value="${docbook.dir}/stylesheet/nwalsh/current/html/docbook.xsl"/> </java> ! <echo message="Creating manual.pdf"/> <java classname="org.apache.fop.apps.Fop" fork="true"> <classpath refid="fop.classpath"/> --- 84,93 ---- <arg value="${docbook.dir}/stylesheet/nwalsh/current/html/docbook.xsl"/> </java> ! <echo message="Creating releasenotes.html"/> ! <java jar="${saxon.dir}/saxon.jar" fork="true" output="${releasenotes.html}"> ! <arg value="${releasenotes.xml}"/> ! <arg value="${docbook.dir}/stylesheet/nwalsh/current/html/docbook.xsl"/> ! </java> ! <!-- <echo message="Creating manual.pdf"/> <java classname="org.apache.fop.apps.Fop" fork="true"> <classpath refid="fop.classpath"/> *************** *** 87,109 **** <arg line="-xsl ${docbook.dir}/stylesheet/nwalsh/current/fo/docbook.xsl"/> <arg value="${output.pdf}"/> ! </java> ! <echo message="Building ARC Retriever"/> ! <ant dir="${cvsexport}/retriever/arcretriever"/> <zip file="./wera.zip"> <zipfileset dir="${cvsexport}"> ! <include name="manual/**/*"/> ! <include name="wera/gui/**/*"/> </zipfileset> ! <zipfileset dir="${cvsexport}/retriever/arcretriever"> ! 
<include name="ArcRetriever.war"/> </zipfileset> - <!--<zipfileset dir="${cvsexport}/nutchwax"> - <include name="archive-access-nutch.war"/> - </zipfileset>--> </zip> ! <echo message="Building manual install package"/> <property name="manualinstalldir" location="${basedir}/manual-install"/> <delete dir="${manualinstalldir}"/> --- 95,123 ---- <arg line="-xsl ${docbook.dir}/stylesheet/nwalsh/current/fo/docbook.xsl"/> <arg value="${output.pdf}"/> ! </java>--> ! <delete dir="arcretriever"/> ! ! <echo message="Building ARC Retriever"/> ! <ant antfile="build-retriever.xml"/> ! ! <replace file="${cvsexport}/archive-access/projects/wera/src/webapps/wera/lib/header.inc"> ! <replacefilter token="@VERSION@" value="${weraversion}"/> ! </replace> <zip file="./wera.zip"> <zipfileset dir="${cvsexport}"> ! <include name="articles/**/*"/> ! <include name="**/releasenotes.html"/> </zipfileset> ! <zipfileset dir="${cvsexport}/archive-access/projects/wera/src/webapps/wera"> ! <include name="**/*"/> ! </zipfileset> ! <zipfileset dir="."> ! <include name="arcretriever/**/*"/> </zipfileset> </zip> ! <!-- <echo message="Building manual install package"/> <property name="manualinstalldir" location="${basedir}/manual-install"/> <delete dir="${manualinstalldir}"/> *************** *** 132,136 **** <replacefilter token="@retrieverUrl@" value="http://localhost:8080/ArcRetriever/ArcRetriever"/> <replacefilter token="@guiUrl@" value="http://localhost/wera"/> - <!--<replacefilter token="@guiCollection@" value="${guiCollection}"/>--> <replacefilter token="@collection@" value="test"/> <replacefilter token="@searchEngine@" value="nutch"/> --- 146,149 ---- *************** *** 142,146 **** <tar tarfile="${werapack}-manual-install.tar" basedir="${manualinstalldir}"/> <gzip zipfile="${werapack}-manual-install.tar.gz" src="${werapack}-manual-install.tar"/> ! 
<delete file="${werapack}-manual-install.tar"/> <echo message="Building Installer package"/> --- 155,159 ---- <tar tarfile="${werapack}-manual-install.tar" basedir="${manualinstalldir}"/> <gzip zipfile="${werapack}-manual-install.tar.gz" src="${werapack}-manual-install.tar"/> ! <delete file="${werapack}-manual-install.tar"/> --> <echo message="Building Installer package"/> *************** *** 168,177 **** <copy todir="./selfextract"> ! <fileset dir="${cvsexport}/wera/installer"> <include name="install_info.txt"/> </fileset> </copy> <copy todir="./selfextract"> ! <fileset dir="${cvsexport}/wera/doc/images"> <include name="iipc.png"/> </fileset> --- 181,190 ---- <copy todir="./selfextract"> ! <fileset dir="${cvsexport}/archive-access/projects/wera/src/installer"> <include name="install_info.txt"/> </fileset> </copy> <copy todir="./selfextract"> ! <fileset dir="${cvsexport}/archive-access/projects/wera/src/articles/images"> <include name="iipc.png"/> </fileset> --- NEW FILE: build-retriever.xml --- <project name="webmodulebuilder" default="compile" basedir="."> <description> Build file WERA arcretriever </description> <property name="extralib" value="extralib"/> <!-- set global properties for this build --> <property name="webappsourcedir" value="cvsexport/archive-access/projects/wera/src/webapps/arcretriever" /> <property name="module" value="arcretriever" /> <property name="src" value="cvsexport/archive-access/projects/wera/src/java" /> <property name="lib" value="cvsexport/archive-access/projects/wera/lib" /> <copy todir="${lib}"> <fileset dir="${extralib}"> <!-- additional lib needed to build the arcretriever (servlet-2.3.jar). Don't know why this is not part of wera lib. What you say stack ? 
--> <include name="servlet-2.3.jar" /> </fileset> </copy> <property name="dist" value="arcretriever" /> <property name="warfile" value="${module}.war" /> <target name="clean"> <delete file="${warfile}" failonerror="false" /> <delete dir="${dist}" failonerror="false" /> </target> <target name="init"> <!-- Create the dist directory structure used by compile and copy the deployment descriptors into it--> <mkdir dir="${dist}" /> <mkdir dir="${dist}/images" /> <mkdir dir="${dist}/WEB-INF" /> <mkdir dir="${dist}/WEB-INF/classes" /> <mkdir dir="${dist}/WEB-INF/lib" /> <copy todir="${dist}"> <fileset dir="${webappsourcedir}"> <include name="**/*.jsp" /> <include name="**/*.css" /> <include name="LICENSE.TXT" /> </fileset> </copy> <copy todir="${dist}/WEB-INF"> <fileset dir="${webappsourcedir}/WEB-INF"> <include name="web.xml" /> <include name="arcretriever.properties" /> </fileset> </copy> <copy todir="${dist}/WEB-INF/lib"> <fileset dir="${lib}"> <include name="*.jar" /> </fileset> <fileset dir="${extralib}"> <include name="servlet-2.3.jar" /> </fileset> </copy> <copy todir="${dist}/images"> <fileset dir="${webappsourcedir}/images"> <include name="*.jpg" /> <include name="*.png" /> <include name="*.gif" /> </fileset> </copy> </target> <target name="compile" depends="init"> <javac destdir="${dist}/WEB-INF/classes" srcdir="${src}"> <classpath> <fileset dir="${lib}"> <include name="**/*.jar" /> </fileset> </classpath> </javac> </target> <!-- Move this to installer, because installer should update web.xml <target name="war" depends="compile"> <jar jarfile="${warfile}" basedir="${dist}" /> </target> --> </project> Index: build.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/installer/build.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** build.xml 4 Oct 2005 22:59:27 -0000 1.1 --- build.xml 21 Oct 2005 11:12:07 -0000 1.2 *************** *** 35,39 **** 
<mkdir dir="${apacheWebRootDir}/wera"/> <copy todir="${apacheWebRootDir}/wera"> ! <fileset dir="${basedir}/temp/wera/gui"/> </copy> --- 35,39 ---- <mkdir dir="${apacheWebRootDir}/wera"/> <copy todir="${apacheWebRootDir}/wera"> ! <fileset dir="${basedir}/temp"/> </copy> *************** *** 41,71 **** <mkdir dir="${apacheWebRootDir}/wera/manual/images"/> <copy file="${basedir}/temp/manual/manual.html" tofile="${apacheWebRootDir}/wera/manual/manual.html"/> ! <copy file="${basedir}/temp/manual/manual.pdf" tofile="${apacheWebRootDir}/wera/manual/manual.pdf"/> ! <copy file="${basedir}/temp/manual/RELEASE-NOTES" tofile="${apacheWebRootDir}/wera/RELEASE-NOTES"/> <copy todir="${apacheWebRootDir}/wera/manual/images"> <fileset dir="${basedir}/temp/manual/images"/> </copy> ! <copy file="${basedir}/temp/wera/gui/lib/config.inc.template" tofile="${apacheWebRootDir}/wera/lib/config.inc"/> <replace file="${apacheWebRootDir}/wera/lib/config.inc"> - <replacefilter token="@aidPrefix@" value="${arcDirectory}"/> - <replacefilter token="@aidSuffix@" value=".arc.gz"/> <replacefilter token="@guiInstallDir@" value="${apacheWebRootDir}/wera"/> <replacefilter token="@hostName@" value="${hostName}"/> ! 
<replacefilter token="@retrieverUrl@" value="http://${hostName}:${tomcatPort}/ArcRetriever/ArcRetriever"/> <replacefilter token="@guiUrl@" value="http://${apacheHostPort}/wera"/> - <!--<replacefilter token="@guiCollection@" value="${guiCollection}"/>--> <replacefilter token="@collection@" value="test"/> <replacefilter token="@searchEngine@" value="nutch"/> <replacefilter token="@searchEngineUrl@" value="http://${hostName}:${tomcatPort}/nutchwax/opensearch"/> </replace> - <echo message="Installing wera ARC Retriever"/> - <copy file="${basedir}/temp/ArcRetriever.war" tofile="${tomcatWebAppDir}/ArcRetriever.war"/> - - <!--<echo message="Installing Nutch ARC-indexer"/> - - <echo message="Installing Nutch Search Web Application"/> - <copy file="${basedir}/temp/archive-access-nutch.war" tofile="${tomcatWebAppDir}/archive-access-nutch.war"/>--> </target> --- 41,72 ---- <mkdir dir="${apacheWebRootDir}/wera/manual/images"/> <copy file="${basedir}/temp/manual/manual.html" tofile="${apacheWebRootDir}/wera/manual/manual.html"/> ! <!--<copy file="${basedir}/temp/manual/manual.pdf" tofile="${apacheWebRootDir}/wera/manual/manual.pdf"/>--> <copy todir="${apacheWebRootDir}/wera/manual/images"> <fileset dir="${basedir}/temp/manual/images"/> </copy> ! <copy file="${basedir}/temp/lib/config.inc.template" tofile="${apacheWebRootDir}/wera/lib/config.inc"/> <replace file="${apacheWebRootDir}/wera/lib/config.inc"> <replacefilter token="@guiInstallDir@" value="${apacheWebRootDir}/wera"/> <replacefilter token="@hostName@" value="${hostName}"/> ! 
<replacefilter token="@retrieverUrl@" value="http://${hostName}:${tomcatPort}/arcretriever/arcretriever"/> <replacefilter token="@guiUrl@" value="http://${apacheHostPort}/wera"/> <replacefilter token="@collection@" value="test"/> <replacefilter token="@searchEngine@" value="nutch"/> <replacefilter token="@searchEngineUrl@" value="http://${hostName}:${tomcatPort}/nutchwax/opensearch"/> </replace> + + <echo message="Installing ARC Retriever"/> + <replace file="${basedir}/temp/arcretriever/WEB-INF/web.xml"> + <replacefilter token="@arcdir@" value="${arcDirectory}"/> + </replace> + <jar jarfile="${tomcatWebAppDir}/arcretriever.war" basedir="${basedir}/temp/arcretriever" /> + + <!-- + <mkdir dir="${tomcatWebAppDir}/arcretriever"/> + <copy todir="${tomcatWebAppDir}/arcretriever"> + <fileset dir="${basedir}/temp/arcretriever"/> + </copy>--> </target> Index: antinstall-config.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/installer/antinstall-config.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** antinstall-config.xml 4 Oct 2005 22:59:27 -0000 1.1 --- antinstall-config.xml 21 Oct 2005 11:12:07 -0000 1.2 *************** *** 8,13 **** displayText="" explanatoryText="Copyright (C) 2001-2005 Royal Library in Stockholm, Royal Library in Copenhagen, Helsinki University Library of Finland, National Library of Norway, National and University Library of Iceland."/> ! <comment displayText="http://nwatoolset.sourceforge.net/"/> ! <comment displayText="http://netpreserve.org/"/> </page> --- 8,12 ---- displayText="" explanatoryText="Copyright (C) 2001-2005 Royal Library in Stockholm, Royal Library in Copenhagen, Helsinki University Library of Finland, National Library of Norway, National and University Library of Iceland."/> ! 
<comment displayText="http://archive-access.sourceforge.net/projects/wera/"/> </page> *************** *** 21,26 **** <comment displayText=""/> <directory property="arcDirectory" defaultValue="/var/arcs" displayText="ARC file directory:" explanatoryText="Please enter the directory where the ARC files you have indexed /plan to index with nutchwax. The ARC retriever in this release has no knowledge of where ARC files recide. When WERA retrieves a specific archived document from the retriever the request has to include the full path to the ARC file in question. This will change in later releases."/> - <!--<directory property="nutchwaxDir" defaultValue="/usr/local/nutchwax" displayText="NutchWax install dir" create="true"/>--> - <!--<text property="guiCollection" defaultValue="test" displayText="Collection name:"/>--> </page> --- 20,23 ---- *************** *** 30,38 **** <text property="apachePort" defaultValue="80" displayText="Apache port number:"/> </page> ! <!-- ! <page type="input" name="searchengine" displayText="Nutch Search Engine" target="tginstall"> ! <text property="searchEngineUrl" defaultValue="http://${hostName}:${tomcatPort}/nutchwax/opensearch" displayText="Nutch A9 opensearch url:"/> ! </page> ! --> <page type="progress" name="progress" displayText="Installation progress" target="cleanuptarget"></page> --- 27,31 ---- <text property="apachePort" defaultValue="80" displayText="Apache port number:"/> </page> ! <page type="progress" name="progress" displayText="Installation progress" target="cleanuptarget"></page> |
From: Sverre B. <sv...@us...> - 2005-10-21 10:59:33
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/webapps/wera In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32690 Modified Files: index.php Log Message: typo Index: index.php =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/webapps/wera/index.php,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** index.php 21 Oct 2005 07:02:54 -0000 1.10 --- index.php 21 Oct 2005 10:59:23 -0000 1.11 *************** *** 386,390 **** </td> <td align="right" class="norm"> ! <a href="./articles//manual.html">Manual</a> | <a href="./articles/releasenotes.html">Release Notes</a> | <a href="http://sourceforge.net/tracker/?group_id=118427&atid=681137">Report bugs</a> </td> </tr> --- 386,390 ---- </td> <td align="right" class="norm"> ! <a href="./articles/manual.html">Manual</a> | <a href="./articles/releasenotes.html">Release Notes</a> | <a href="http://sourceforge.net/tracker/?group_id=118427&atid=681137">Report bugs</a> </td> </tr> |
From: Sverre B. <sv...@us...> - 2005-10-21 09:14:47
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/webapps/arcretriever/WEB-INF In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12550/src/webapps/arcretriever/WEB-INF Modified Files: web.xml Log Message: Index: web.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/webapps/arcretriever/WEB-INF/web.xml,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** web.xml 19 Oct 2005 23:59:22 -0000 1.3 --- web.xml 21 Oct 2005 09:14:35 -0000 1.4 *************** *** 12,16 **** <init-param> <param-name>arcdir</param-name> ! <param-value>arcs</param-value> <description>Full path to directory of ARC files. Be aware that changing this value in the web.xml of an --- 12,16 ---- <init-param> <param-name>arcdir</param-name> ! <param-value>@arcdir@</param-value> <description>Full path to directory of ARC files. Be aware that changing this value in the web.xml of an |
From: Sverre B. <sv...@us...> - 2005-10-21 07:33:50
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/articles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22634 Modified Files: what-is-wera.xml Log Message: minor change Index: what-is-wera.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/articles/what-is-wera.xml,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** what-is-wera.xml 20 Oct 2005 16:34:11 -0000 1.3 --- what-is-wera.xml 21 Oct 2005 07:33:39 -0000 1.4 *************** *** 70,74 **** </figure> ! <para>Explanation of figure 1:</para> <itemizedlist> --- 70,74 ---- </figure> ! <para>Explanation of above figure:</para> <itemizedlist> *************** *** 113,140 **** to the timeline view script (1,2). For that particular version Wera constructs a request to the <emphasis>arcretriever</emphasis> ! containing the name ! of the ARC file where the version resides as well as the offset ! within that file where the version is stored (the ARC name and ! offset are stored in the index). Wera now requests, and receives ! an archived resource (3, 4) from the ! <emphasis>arcretriever</emphasis> (request ! example: <literal>http://localhost:8082/arcretriever/arcretriever?reqtype=getfile&aid=5902508/IAH-20051004171809-00000-test</literal>). ! If the resource is of type ! <literal>text/html</literal> (information in result set ! from NutchWax), a javascript link rewriter is inserted in the ! resource to ensure that links point to Wera rather than out to the ! internet. Before Wera delivers the resource to the users browser, ! header information on content type and encoding is set according ! to values received in the NutchWax result set. This is done to ! ensure that the users browser renders the resource ! correctly.</para> <note> <para>A resource of type <literal>text/html</literal> will often ! contain inline ! references to images etc. Provided the javascript link rewriter ! 
does its job on these, the step above will be repeated for each ! of these.</para> </note> </listitem> --- 113,136 ---- to the timeline view script (1,2). For that particular version Wera constructs a request to the <emphasis>arcretriever</emphasis> ! containing the name of the ARC file where the version resides as ! well as the offset within that file where the version is stored ! (the ARC name and offset are stored in the index). Wera now ! requests, and receives an archived resource (3, 4) from the ! <emphasis>arcretriever</emphasis> (request example: <literal>http://localhost:8082/arcretriever/arcretriever?reqtype=getfile&aid=5902508/IAH-20051004171809-00000-test</literal>). ! If the resource is of type <literal>text/html</literal> ! (information in result set from NutchWax), a javascript link ! rewriter is inserted in the resource to ensure that links point to ! Wera rather than out to the internet. Before Wera delivers the ! resource to the users browser, header information on content type ! and encoding is set according to values received in the NutchWax ! result set. This is done to ensure that the users browser renders ! the resource correctly.</para> <note> <para>A resource of type <literal>text/html</literal> will often ! contain inline references to images etc. Provided the javascript ! link rewriter does its job on these, the step above will be ! repeated for each of these.</para> </note> </listitem> *************** *** 146,173 **** <title>Practical use</title> ! <para>The original vision for the ! <ulink url="http://nwa.nb.no">NwaToolset</ulink> (the predecessor of Wera) ! was to enable search across the different Nordic Web Archives and ! provide seamless navigation within the different archives. The ability ! to search across the different indexes was solved by the using <ulink url="http://fastsearch.com/">Fast Search & Transfer</ulink>'s multi node architecture. 
To enable Wera to retrieve a particular document with a given <literal>aid</literal> (Archive ID) from the right archive the ! collection field was introduced ! in the index (also present in the NutchWax index). The Wera config file ! holds the mapping from collection to archive (or rather Wera ! installation).</para> <para>Another reason to include the collection field was to ensure that the actual link rewriting was done by the owner of the document. Each archive holder would have to set up their own Wera installation. When ! one Wera was requesting a document from a remote archive, the remote Wera ! should make the necessary changes to the document before delivering it ! to the calling Wera. The reason for this was to make sure that the owner ! had full control over what was delivered to the calling site, thus being ! able to threat the document in accordance with local policies rather ! than the policies of the caller site. The figure below illustrates the ! currently supported use of mapping between collection and archive ! nodes.</para> <figure> --- 142,168 ---- <title>Practical use</title> ! <para>The original vision for the <ulink ! url="http://nwa.nb.no">NwaToolset</ulink> (the predecessor of Wera) was ! to enable search across the different Nordic Web Archives and provide ! seamless navigation within the different archives. The ability to search ! across the different indexes was solved by the using <ulink url="http://fastsearch.com/">Fast Search & Transfer</ulink>'s multi node architecture. To enable Wera to retrieve a particular document with a given <literal>aid</literal> (Archive ID) from the right archive the ! collection field was introduced in the index (also present in the ! NutchWax index). The Wera config file holds the mapping from collection ! to archive (or rather Wera installation).</para> <para>Another reason to include the collection field was to ensure that the actual link rewriting was done by the owner of the document. 
Each archive holder would have to set up their own Wera installation. When ! one Wera was requesting a document from a remote archive, the remote ! Wera should make the necessary changes to the document before delivering ! it to the calling Wera. The reason for this was to make sure that the ! owner had full control over what was delivered to the calling site, thus ! being able to threat the document in accordance with local policies ! rather than the policies of the caller site. The figure below ! illustrates the currently supported use of mapping between collection ! and archive nodes.</para> <figure> *************** *** 181,195 **** </figure> ! <para>In the Wera installation of ! <emphasis>W1</emphasis> the different collections indexed ! in NutchWax are mapped to corresponding Wera installations of ! <emphasis>W2- Wn</emphasis>. ! When the timeline view on W1 encounters a resource located on a ! different node (e.g. the collection mapping points to the Wera ! installation of <emphasis>W2</emphasis>) it requests that resource from ! the Wera installation at <literal>W2</literal>. Wera at ! <literal>W2</literal> fetches the resource from its Retriever and does ! the necessary changes to the file before delivering it to Wera at ! <literal>W1</literal> (e.g. inserts javascript link rewriter or rewrites it server side). When Wera at <literal>W1</literal> receives this file it does an additional --- 176,188 ---- </figure> ! <para>In the Wera installation of <emphasis>W1</emphasis> the different ! collections indexed in NutchWax are mapped to corresponding Wera ! installations of <emphasis>W2- Wn</emphasis>. When the timeline view on ! W1 encounters a resource located on a different node (e.g. the ! collection mapping points to the Wera installation of ! <emphasis>W2</emphasis>) it requests that resource from the Wera ! installation at <literal>W2</literal>. Wera at <literal>W2</literal> ! fetches the resource from its Retriever and does the necessary changes ! 
to the file before delivering it to Wera at <literal>W1</literal> (e.g. inserts javascript link rewriter or rewrites it server side). When Wera at <literal>W1</literal> receives this file it does an additional *************** *** 205,207 **** </section> </section> ! </article> --- 198,200 ---- </section> </section> ! </article> \ No newline at end of file |
From: Michael S. <sta...@us...> - 2005-10-21 07:03:04
|
Update of /cvsroot/archive-access/archive-access/projects/wera In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15837 Modified Files: maven.xml project.properties Log Message: Fix for '[ 1322448 ] Automatic build - improvements needed'. * maven.xml Explicit copy of images. Copy docbook stuff under wera. * project.properties Don't automatically copy jpgs, pngs, etc. Do the images dir copy expliclity in maven.xml. * src/webapps/wera/index.php Point at new subdirectory location for manual and release notes. Index: maven.xml =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/maven.xml,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** maven.xml 6 Oct 2005 22:32:45 -0000 1.4 --- maven.xml 21 Oct 2005 07:02:54 -0000 1.5 *************** *** 39,42 **** --- 39,43 ---- <!--Make up the arcretriever war file.--> <attainGoal name="war" /> + <attainGoal name="docbook" /> <!--Set filter token used in a few instances below.--> *************** *** 66,69 **** --- 67,71 ---- <fileset file="${maven.build.dir}/${maven.war.final.name}" /> </copy> + <attainGoal name="copy_docbook" /> </postGoal> *************** *** 77,80 **** --- 79,87 ---- <j:if test="${sDocbookSupportPresent == 'true'}"> <attainGoal name="sdocbook:generate-html"/> + <!--Copy over the images directory--> + <mkdir dir="${maven.build.dir}/docs/articles/images/" /> + <copy todir="${maven.build.dir}/docs/articles/images/" verbose="true"> + <fileset dir="${maven.src.dir}/articles/images/" /> + </copy> <!--<attainGoal name="sdocbook:generate-pdf"/> --> *************** *** 86,89 **** --- 93,97 ---- </j:if> </goal> + <goal name="copy_docbook"> <!--Copies docbooks under dist docs and under webapps. This goal runs *************** *** 91,95 **** --> <property name="docbook.assembled" ! 
value="${maven.dist.bin.assembly.dir}/docs/articles/" /> <j:set var="docbookPresent" value="false" /> <util:available file="${docbook.assembled}" > --- 99,103 ---- --> <property name="docbook.assembled" ! value="${maven.build.dir}/docs/articles/" /> <j:set var="docbookPresent" value="false" /> <util:available file="${docbook.assembled}" > *************** *** 97,116 **** </util:available> <j:if test="${docbookPresent == 'true'}"> ! <!--Copy under docs in dist--> ! <mkdir dir="${docbook.assembled}" /> ! <copy todir="${docbook.assembled}"> <fileset dir="${maven.build.dir}/docs/articles/" /> </copy> ! <!--Copy into webapps dir--> ! <mkdir dir="${maven.build.dir}/webapps/admin/docs/articles/" /> ! <copy todir="${maven.build.dir}/webapps/admin/docs/articles/"> <fileset dir="${maven.build.dir}/docs/articles/" /> </copy> </j:if> - <!-- else --> - <j:if test="${sDocbookSupportPresent != 'true'}"> - <echo - message="Docbook copy skipped (Nothing to copy -- not built?)." /> - </j:if> </goal> --- 105,122 ---- </util:available> <j:if test="${docbookPresent == 'true'}"> ! <echo message="Copying over docbook" /> ! <!--Copy under docs in binary build--> ! <mkdir dir="${maven.dist.bin.assembly.dir}/docs/articles/" /> ! <copy todir="${maven.dist.bin.assembly.dir}/docs/articles/" ! verbose="true"> <fileset dir="${maven.build.dir}/docs/articles/" /> </copy> ! <!--Copy under wera webapp--> ! <mkdir dir="${maven.dist.bin.assembly.dir}/webapps/wera/articles/" /> ! <copy todir="${maven.dist.bin.assembly.dir}/webapps/wera/articles/" ! 
verbose="true"> <fileset dir="${maven.build.dir}/docs/articles/" /> </copy> </j:if> </goal> Index: project.properties =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/project.properties,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** project.properties 6 Oct 2005 22:32:45 -0000 1.4 --- project.properties 21 Oct 2005 07:02:54 -0000 1.5 *************** *** 19,23 **** maven.sdocbook.src.dir = ${basedir}/src/articles maven.sdocbook.target.dir = ${maven.build.dir}/docs/articles ! maven.sdocbook.resources.include = **/*.gif,**/*.png maven.sdocbook.html.params = -PARAM generate.id.attributes 1 -PARAM section.autolabel 1 -PARAM part.autolabel 1 -PARAM chapter.autolabel 1 -PARAM generate.meta.abstract 1 maven.sdocbook.fo.params = -PARAM generate.id.attributes 1 -PARAM section.autolabel 1 -PARAM part.autolabel 1 -PARAM chapter.autolabel 1 -PARAM generate.meta.abstract 1 --- 19,23 ---- maven.sdocbook.src.dir = ${basedir}/src/articles maven.sdocbook.target.dir = ${maven.build.dir}/docs/articles ! maven.sdocbook.resources.include = maven.sdocbook.html.params = -PARAM generate.id.attributes 1 -PARAM section.autolabel 1 -PARAM part.autolabel 1 -PARAM chapter.autolabel 1 -PARAM generate.meta.abstract 1 maven.sdocbook.fo.params = -PARAM generate.id.attributes 1 -PARAM section.autolabel 1 -PARAM part.autolabel 1 -PARAM chapter.autolabel 1 -PARAM generate.meta.abstract 1 |
From: Michael S. <sta...@us...> - 2005-10-21 07:03:04
|
Update of /cvsroot/archive-access/archive-access/projects/wera/src/webapps/wera In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15837/src/webapps/wera Modified Files: index.php Log Message: Fix for '[ 1322448 ] Automatic build - improvements needed'. * maven.xml Explicit copy of images. Copy docbook stuff under wera. * project.properties Don't automatically copy jpgs, pngs, etc. Do the images dir copy explicitly in maven.xml. * src/webapps/wera/index.php Point at new subdirectory location for manual and release notes. Index: index.php =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wera/src/webapps/wera/index.php,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** index.php 20 Oct 2005 18:53:01 -0000 1.9 --- index.php 21 Oct 2005 07:02:54 -0000 1.10 *************** *** 386,390 **** </td> <td align="right" class="norm"> ! <a href="./manual/manual.html">Manual</a> | <a href="./releasenotes.html">Release Notes</a> | <a href="http://sourceforge.net/tracker/?group_id=118427&atid=681137">Report bugs</a> </td> </tr> --- 386,390 ---- </td> <td align="right" class="norm"> ! <a href="./articles//manual.html">Manual</a> | <a href="./articles/releasenotes.html">Release Notes</a> | <a href="http://sourceforge.net/tracker/?group_id=118427&atid=681137">Report bugs</a> </td> </tr> |
From: Michael S. <sta...@us...> - 2005-10-21 04:16:23
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/bin In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21415/bin Modified Files: nutch Log Message: Implement '[ 1309781 ] Add in skipping certain types if > size' for Dan. * bin/nutch Add new col-dedup command. * conf/nutch-site.xml.nutchwax Remove dedup collection parameter. Not used. * src/java/org/archive/access/nutch/CollectionDeleteDuplicates.java A copy of nutch DeleteDuplicates that adds in hash of collection to url and content md5. Have to make copy rather than subclass because the original is not subclassable -- it's all private in awkward places. Index: nutch =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/bin/nutch,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** nutch 15 Sep 2005 18:22:53 -0000 1.2 --- nutch 21 Oct 2005 04:16:12 -0000 1.3 *************** *** 39,42 **** --- 39,43 ---- echo " merge merge several segment indexes" echo " dedup remove duplicates from a set of segment indexes" + echo " col-dedup remove collection duplicates from segment indexes" echo " updatedb update db from segments after fetching" echo " updatesegs update segments with link data from the db" *************** *** 151,154 **** --- 152,159 ---- elif [ "$COMMAND" = "dedup" ] ; then CLASS=org.apache.nutch.indexer.DeleteDuplicates + elif [ "$COMMAND" = "col-dedup" ] ; then + # Do a dedup that counts collection into url and content md5. Will + # ensure dedup done only within a collection. + CLASS=org.archive.access.nutch.CollectionDeleteDuplicates elif [ "$COMMAND" = "updatedb" ] ; then CLASS=org.apache.nutch.tools.UpdateDatabaseTool |
From: Michael S. <sta...@us...> - 2005-10-21 04:16:20
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21415/src/java/org/archive/access/nutch Added Files: CollectionDeleteDuplicates.java Log Message: Implement '[ 1309781 ] Add in skipping certain types if > size' for Dan. * bin/nutch Add new col-dedup command. * conf/nutch-site.xml.nutchwax Remove dedup collection parameter. Not used. * src/java/org/archive/access/nutch/CollectionDeleteDuplicates.java A copy of nutch DeleteDuplicates that adds in hash of collection to url and content md5. Have to make copy rather than subclass because the original is not subclassable -- its all private in awkward places. --- NEW FILE: CollectionDeleteDuplicates.java --- /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.access.nutch; import java.io.*; import java.security.*; import java.text.*; import java.util.*; import java.util.logging.*; import org.apache.nutch.io.*; import org.apache.nutch.fs.*; import org.apache.nutch.util.*; import org.apache.lucene.index.IndexReader; import org.apache.lucene.document.Document; import org.apache.nutch.indexer.IndexSegment; /****************************************************************** * Deletes duplicate documents in a set of Lucene indexes. * Duplicates have either the same collection and contents (via MD5 hash), or * the same collection and same URL. 
Based on nutch DeleteDuplicates class. * We can't subclass DeleteDuplicates because its all private in the wrong * places. Adds collection content to the URL hash to compare and to the * content MD5 to compare. Done by St.Ack. * * @author Doug Cutting * @author Mike Cafarella ******************************************************************/ public class CollectionDeleteDuplicates { private static final Logger LOG = LogFormatter.getLogger(CollectionDeleteDuplicates.class.getName()); /******************************************************** * The key used in sorting for duplicates. *******************************************************/ public static class IndexedDoc implements WritableComparable { private MD5Hash hash = new MD5Hash(); private float score; private int index; // the segment index private int doc; // within the index private int urlLen; public void write(DataOutput out) throws IOException { hash.write(out); out.writeFloat(score); out.writeInt(index); out.writeInt(doc); out.writeInt(urlLen); } public void readFields(DataInput in) throws IOException { hash.readFields(in); this.score = in.readFloat(); this.index = in.readInt(); this.doc = in.readInt(); this.urlLen = in.readInt(); } public int compareTo(Object o) { throw new RuntimeException("this is never used"); } /** * Order equal hashes by decreasing score and increasing urlLen. 
*/ public static class ByHashScore extends WritableComparator { public ByHashScore() { super(IndexedDoc.class); } public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){ int c = compareBytes(b1, s1, MD5Hash.MD5_LEN, b2, s2, MD5Hash.MD5_LEN); if (c != 0) return c; float thisScore = readFloat(b1, s1+MD5Hash.MD5_LEN); float thatScore = readFloat(b2, s2+MD5Hash.MD5_LEN); if (thisScore < thatScore) return 1; else if (thisScore > thatScore) return -1; int thisUrlLen = readInt(b1, s1+MD5Hash.MD5_LEN+12); int thatUrlLen = readInt(b2, s2+MD5Hash.MD5_LEN+12); return thisUrlLen - thatUrlLen; } } /** * Order equal hashes by decreasing index and document. */ public static class ByHashDoc extends WritableComparator { public ByHashDoc() { super(IndexedDoc.class); } public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){ int c = compareBytes(b1, s1, MD5Hash.MD5_LEN, b2, s2, MD5Hash.MD5_LEN); if (c != 0) return c; int thisIndex = readInt(b1, s1+MD5Hash.MD5_LEN+4); int thatIndex = readInt(b2, s2+MD5Hash.MD5_LEN+4); if (thisIndex != thatIndex) return thatIndex - thisIndex; int thisDoc = readInt(b1, s1+MD5Hash.MD5_LEN+8); int thatDoc = readInt(b2, s2+MD5Hash.MD5_LEN+8); return thatDoc - thisDoc; } } } /***************************************************** ****************************************************/ private interface Hasher { void updateHash(MD5Hash hash, Document doc); } ////////////////////////////////////////////////////// // CollectionDeleteDuplicates class ////////////////////////////////////////////////////// private IndexReader[] readers; private File tempFile; /** * Constructs a duplicate detector for the provided indexes. */ public CollectionDeleteDuplicates(IndexReader[] readers, File workingDir) throws IOException { this.readers = readers; this.tempFile = new File(workingDir, "ddup-" + new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System.currentTimeMillis()))); } /** * Closes the indexes, saving changes. 
*/ public void close() throws IOException { for (int i = 0; i < readers.length; i++) { readers[i].close(); } tempFile.delete(); } /** * Delete pages with duplicate content hashes. Of those with the same * content hash, keep the page with the highest score. */ public void deleteContentDuplicates() throws IOException { final MessageDigest digest; try { digest = MessageDigest.getInstance("MD5"); } catch (Exception e) { throw new RuntimeException(e.toString()); } LOG.info("Reading content hashes..."); computeHashes(new Hasher() { public void updateHash(MD5Hash hash, Document doc) { try { // Add collection name to the digest. digest.update(UTF8.getBytes(doc.get("collection"))); // Now add the doc. content digest. digest.update(UTF8.getBytes(doc.get("digest"))); // Set the digest into hash. digest.digest(hash.getDigest(), 0, MD5Hash.MD5_LEN); // This is what happens in CollectionDeleteDuplicates. // hash.setDigest(doc.get("digest")); } catch (Exception e) { throw new RuntimeException(e.toString()); } } }); LOG.info("Sorting content hashes..."); SequenceFile.Sorter byHashScoreSorter = new SequenceFile.Sorter(new LocalFileSystem(), new IndexedDoc.ByHashScore(),NullWritable.class); byHashScoreSorter.sort(tempFile.getPath(), tempFile.getPath() + ".sorted"); LOG.info("Deleting content duplicates..."); int duplicateCount = deleteDuplicates(); LOG.info("Deleted " + duplicateCount + " content duplicates."); } /** * Delete pages with duplicate URLs. Of those with the same * URL, keep the most recently fetched page. */ public void deleteUrlDuplicates() throws IOException { final MessageDigest digest; try { digest = MessageDigest.getInstance("MD5"); } catch (Exception e) { throw new RuntimeException(e.toString()); } LOG.info("Reading url hashes..."); computeHashes(new Hasher() { public void updateHash(MD5Hash hash, Document doc) { try { digest.update(UTF8.getBytes(doc.get("url"))); // Add collection name to the digest. 
digest.update(UTF8.getBytes(doc.get("collection"))); // Set the digest into hash. digest.digest(hash.getDigest(), 0, MD5Hash.MD5_LEN); } catch (Exception e) { throw new RuntimeException(e.toString()); } } }); LOG.info("Sorting url hashes..."); SequenceFile.Sorter byHashDocSorter = new SequenceFile.Sorter(new LocalFileSystem(), new IndexedDoc.ByHashDoc(), NullWritable.class); byHashDocSorter.sort(tempFile.getPath(), tempFile.getPath() + ".sorted"); LOG.info("Deleting url duplicates..."); int duplicateCount = deleteDuplicates(); LOG.info("Deleted " + duplicateCount + " url duplicates."); } /** * Compute hashes over all the input indices */ private void computeHashes(Hasher hasher) throws IOException { IndexedDoc indexedDoc = new IndexedDoc(); SequenceFile.Writer writer = new SequenceFile.Writer(new LocalFileSystem(), tempFile.getPath(), IndexedDoc.class, NullWritable.class); try { for (int index = 0; index < readers.length; index++) { IndexReader reader = readers[index]; int readerMax = reader.maxDoc(); indexedDoc.index = index; for (int doc = 0; doc < readerMax; doc++) { if (!reader.isDeleted(doc)) { Document document = reader.document(doc); hasher.updateHash(indexedDoc.hash, document); indexedDoc.score = Float.parseFloat(document.get("boost")); indexedDoc.doc = doc; indexedDoc.urlLen = document.get("url").length(); writer.append(indexedDoc, NullWritable.get()); } } } } finally { writer.close(); } } /** * Actually remove the duplicates from the indices */ private int deleteDuplicates() throws IOException { if (tempFile.exists()) { tempFile.delete(); } if (!new File(tempFile.getPath() + ".sorted").renameTo(tempFile)) { throw new IOException("Couldn't rename!"); } IndexedDoc indexedDoc = new IndexedDoc(); SequenceFile.Reader reader = new SequenceFile.Reader(new LocalFileSystem(), tempFile.getPath()); try { int duplicateCount = 0; MD5Hash prevHash = null; // previous hash while (reader.next(indexedDoc, NullWritable.get())) { if (prevHash == null) { // initialize 
prevHash prevHash = new MD5Hash(); prevHash.set(indexedDoc.hash); continue; } if (indexedDoc.hash.equals(prevHash)) { // found a duplicate readers[indexedDoc.index].delete(indexedDoc.doc); // delete it duplicateCount++; } else { prevHash.set(indexedDoc.hash); // reset prevHash } } return duplicateCount; } finally { reader.close(); tempFile.delete(); } } /** * Delete duplicates in the indexes in the named directory. */ public static void main(String[] args) throws Exception { // // Usage, arg checking // String usage = "DeleteDuplicates (-local | -ndfs <namenode:port>) [-workingdir <workingdir>] <segmentsDir>"; if (args.length < 2) { System.err.println("Usage: " + usage); return; } NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0); File workingDir = new File(new File("").getCanonicalPath()); try { // // Build an array of IndexReaders for all the segments we want to process // int j = 0; if ("-workingdir".equals(args[j])) { j++; workingDir = new File(new File(args[j++]).getCanonicalPath()); } workingDir = new File(workingDir, "ddup-workingdir"); String segmentsDir = args[j++]; File[] directories = nfs.listFiles(new File(segmentsDir)); Vector vReaders = new Vector(); Vector putbackList = new Vector(); int maxDoc = 0; for (int i = 0; i < directories.length; i++) { // // Make sure the index has been completed // File indexDone = new File(directories[i], IndexSegment.DONE_NAME); if (nfs.exists(indexDone) && nfs.isFile(indexDone)) { // // Make sure the specified segment can be processed locally // File indexDir = new File(directories[i], "index"); File tmpDir = new File(workingDir, "ddup-" + new SimpleDateFormat("yyyMMddHHmmss").format(new Date(System.currentTimeMillis()))); File localIndexDir = nfs.startLocalOutput(indexDir, tmpDir); putbackList.add(indexDir); putbackList.add(tmpDir); // // Construct the reader // IndexReader reader = IndexReader.open(localIndexDir); if (reader.hasDeletions()) { LOG.info("Clearing old deletions in " + indexDir + "(" + 
localIndexDir + ")"); reader.undeleteAll(); } maxDoc += reader.maxDoc(); vReaders.add(reader); } } // // Now build the CollectionDeleteDuplicates object, and complete // IndexReader[] readers = new IndexReader[vReaders.size()]; for(int i = 0; vReaders.size()>0; i++) { readers[i] = (IndexReader)vReaders.remove(0); } if (workingDir.exists()) { FileUtil.fullyDelete(workingDir); } workingDir.mkdirs(); CollectionDeleteDuplicates dd = new CollectionDeleteDuplicates(readers, workingDir); dd.deleteUrlDuplicates(); dd.deleteContentDuplicates(); dd.close(); // // Dups have been deleted. Now make sure they are placed back to NFS // LOG.info("Duplicate deletion complete locally. Now returning to NFS..."); for (Iterator it = putbackList.iterator(); it.hasNext(); ) { File indexDir = (File) it.next(); File tmpDir = (File) it.next(); nfs.completeLocalOutput(indexDir, tmpDir); } LOG.info("CollectionDeleteDuplicates complete"); FileUtil.fullyDelete(workingDir); } finally { nfs.close(); } } } |
From: Michael S. <sta...@us...> - 2005-10-21 04:16:20
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/conf In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21415/conf Modified Files: nutch-site.xml.nutchwax Log Message: Implement '[ 1309781 ] Add in skipping certain types if > size' for Dan. * bin/nutch Add new col-dedup command. * conf/nutch-site.xml.nutchwax Remove dedup collection parameter. Not used. * src/java/org/archive/access/nutch/CollectionDeleteDuplicates.java A copy of nutch DeleteDuplicates that adds in hash of collection to url and content md5. Have to make a copy rather than subclass because the original is not subclassable -- it's all private in awkward places. Index: nutch-site.xml.nutchwax =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/conf/nutch-site.xml.nutchwax,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** nutch-site.xml.nutchwax 21 Oct 2005 00:42:14 -0000 1.1 --- nutch-site.xml.nutchwax 21 Oct 2005 04:16:12 -0000 1.2 *************** *** 142,152 **** value is -1 which says don't skip text/html docs.</description> </property> - <property> - <name>archive.dedup.count.collection</name> - <value>false</value> - <description>If true, when deduping, compare collection names - as well as URL and content-md5 deduping. - </description> - </property> - </nutch-conf> --- 142,144 ---- |
From: Brad <bra...@us...> - 2005-10-21 03:24:49
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/localresourcestore In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14129/src/java/org/archive/wayback/localresourcestore Modified Files: LocalARCResourceStore.java Log Message: Heavy modification of configuration to be Context-level, instead of Servlet-level, which dramatically reduces configuration redundancy. Cleaned up IndexPipeline, moved a few classes around, added a really simple JSP to view the Index and Merge queue sizes, and a filter, which both allows the index thread to start with the context, and allows access to the jsp. Index: LocalARCResourceStore.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/localresourcestore/LocalARCResourceStore.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** LocalARCResourceStore.java 19 Oct 2005 01:22:37 -0000 1.2 --- LocalARCResourceStore.java 21 Oct 2005 03:24:40 -0000 1.3 *************** *** 41,45 **** */ public class LocalARCResourceStore implements ResourceStore { ! private static final String RESOURCE_PATH = "resourcestore.arcpath"; private static final String ARCTAIL = ".arc.gz"; --- 41,45 ---- */ public class LocalARCResourceStore implements ResourceStore { ! private static final String RESOURCE_PATH = "arcpath"; private static final String ARCTAIL = ".arc.gz"; |
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/arcindexer In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14129/src/java/org/archive/wayback/arcindexer Modified Files: IndexPipeline.java ArcIndexer.java Added Files: PipelineFilter.java PipelineStatus.java BDBResourceIndexWriter.java Log Message: Heavy modification of configuration to be Context-level, instead of Servlet-level, which dramatically reduces configuration redundancy. Cleaned up IndexPipeline, moved a few classes around, added a really simple JSP to view the Index and Merge queue sizes, and a filter, which both allows the index thread to start with the context, and allows access to the jsp. --- NEW FILE: BDBResourceIndexWriter.java --- /* BDBResourceIndexWriter * * Created on 2005/10/18 14:00:00 * * Copyright (C) 2005 Internet Archive. * * This file is part of the Wayback Machine (crawler.archive.org). * * Wayback Machine is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Wayback Machine is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. 
* * You should have received a copy of the GNU Lesser Public License * along with Wayback Machine; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.arcindexer; import java.io.File; import java.io.RandomAccessFile; import org.archive.wayback.core.ResourceResult; import org.archive.wayback.core.ResourceResults; import org.archive.wayback.localbdbresourceindex.BDBResourceIndex; import com.sleepycat.je.DatabaseException; /** * Implements updates to a BDBResourceIndex * * @author Brad Tofel * @version $Date: 2005/10/21 03:24:40 $, $Revision: 1.1 $ */ public class BDBResourceIndexWriter { private BDBResourceIndex db = null; /** * Constructor */ public BDBResourceIndexWriter() { super(); } protected void init(final String thePath, final String theDbName) throws Exception { db = new BDBResourceIndex(thePath, theDbName); } protected void init(BDBResourceIndex db) { this.db = db; } protected void shutdown() throws DatabaseException { db.shutdownDB(); } /** * reads all ResourceResult objects from CDX at filePath, and merges them * into the BDBResourceIndex. 
* * @param indexFile * to CDX file * @throws Exception */ public void importFile(File indexFile) throws Exception { ResourceResults results = readFile(indexFile); db.addResults(results); } private ResourceResults readFile(File indexFile) throws Exception { RandomAccessFile raFile = new RandomAccessFile(indexFile, "r"); ResourceResults results = new ResourceResults(); int lineNumber = 0; while (true) { String line = raFile.readLine(); if (line == null) { break; } lineNumber++; if ((lineNumber == 1) && (line.contains(" CDX "))) { continue; } ResourceResult result = new ResourceResult(); result.parseLine(line, lineNumber); results.addResourceResult(result); } return results; } /** * @param args */ public static void main(String[] args) { try { BDBResourceIndexWriter idx = new BDBResourceIndexWriter(); idx.init(args[0], args[1]); idx.importFile(new File(args[2])); idx.shutdown(); } catch (Exception e) { e.printStackTrace(); } } } Index: ArcIndexer.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/arcindexer/ArcIndexer.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** ArcIndexer.java 19 Oct 2005 01:22:36 -0000 1.2 --- ArcIndexer.java 21 Oct 2005 03:24:40 -0000 1.3 *************** *** 60,70 **** * Create a ResourceResults representing the records in ARC file at arcPath. * ! * @param arcPath * @return ResourceResults in arcPath. * @throws IOException */ ! public ResourceResults indexArc(final String arcPath) throws IOException { ResourceResults results = new ResourceResults(); - File arc = new File(arcPath); ARCReader arcReader = ARCReaderFactory.get(arc); arcReader.setParseHttpHeaders(true); --- 60,69 ---- * Create a ResourceResults representing the records in ARC file at arcPath. * ! * @param arc * @return ResourceResults in arcPath. * @throws IOException */ ! 
public ResourceResults indexArc(File arc) throws IOException { ResourceResults results = new ResourceResults(); ARCReader arcReader = ARCReaderFactory.get(arc); arcReader.setParseHttpHeaders(true); *************** *** 133,145 **** * * @param results ! * @param cdxPath * @throws IOException */ public void serializeResults(final ResourceResults results, ! final String cdxPath) throws IOException { ! Iterator itr = results.iterator(); ! File cdx = new File(cdxPath); ! FileOutputStream output = new FileOutputStream(cdx); output.write((ResourceResult.getCDXHeaderString() + "\n").getBytes()); while (itr.hasNext()) { ResourceResult result = (ResourceResult) itr.next(); --- 132,146 ---- * * @param results ! * @param target * @throws IOException */ public void serializeResults(final ResourceResults results, ! File target) throws IOException { ! ! // TODO will this automatically close when it falls out of scope? ! FileOutputStream output = new FileOutputStream(target); output.write((ResourceResult.getCDXHeaderString() + "\n").getBytes()); + + Iterator itr = results.iterator(); while (itr.hasNext()) { ResourceResult result = (ResourceResult) itr.next(); *************** *** 153,158 **** public static void main(String[] args) { ArcIndexer indexer = new ArcIndexer(); ! String arc = args[0]; ! String cdx = args[1]; try { ResourceResults results = indexer.indexArc(arc); --- 154,159 ---- public static void main(String[] args) { ArcIndexer indexer = new ArcIndexer(); ! File arc = new File(args[0]); ! File cdx = new File(args[1]); try { ResourceResults results = indexer.indexArc(arc); --- NEW FILE: PipelineStatus.java --- /* PipelineStatus * * Created on Oct 20, 2005 * * Copyright (C) 2005 Internet Archive. * * This file is part of the wayback (crawler.archive.org). 
* * wayback is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * wayback is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with wayback; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.arcindexer; /** * Data bag for handing off status of Pipeline to PipelineStatus.jsp. * * @author brad * @version $Date: 2005/10/21 03:24:40 $, $Revision: 1.1 $ */ public class PipelineStatus { private String numQueuedForIndex; private String numQueuedForMerge; /** * Constructor */ public PipelineStatus() { super(); // TODO Auto-generated constructor stub } /** * @return Returns the numQueuedForIndex. */ public String getNumQueuedForIndex() { return numQueuedForIndex; } /** * @param numQueuedForIndex * The numQueuedForIndex to set. */ public void setNumQueuedForIndex(String numQueuedForIndex) { this.numQueuedForIndex = numQueuedForIndex; } /** * @return Returns the numQueuedForMerge. */ public String getNumQueuedForMerge() { return numQueuedForMerge; } /** * @param numQueuedForMerge * The numQueuedForMerge to set. 
*/ public void setNumQueuedForMerge(String numQueuedForMerge) { this.numQueuedForMerge = numQueuedForMerge; } /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub } } Index: IndexPipeline.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/arcindexer/IndexPipeline.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** IndexPipeline.java 19 Oct 2005 01:22:36 -0000 1.2 --- IndexPipeline.java 21 Oct 2005 03:24:40 -0000 1.3 *************** *** 29,40 **** import java.util.ArrayList; import java.util.Iterator; import org.archive.wayback.core.ResourceResults; import com.sun.org.apache.xml.internal.utils.StringToStringTable; /** ! * Implements updating of a BDBResourceIndex using several directories with data ! * files or flag files. * * @author Brad Tofel --- 29,49 ---- import java.util.ArrayList; import java.util.Iterator; + import java.util.Properties; import org.archive.wayback.core.ResourceResults; + import org.archive.wayback.localbdbresourceindex.BDBResourceIndex; import com.sun.org.apache.xml.internal.utils.StringToStringTable; /** ! * Implements indexing of new ARC files, and merging with a BDBResourceIndex. ! * Assumes LocalBDBResourceIndex and LocalARCResourceStore for now. ! * Maintains state using directories and files for now. ! * ! * There are 3 primary components, each could be a thread, but the steps are ! * run in serial for the moment: ! * 1) watch for new ARC files, and queue them for indexing ! * 2) index queued ARC files into CDX format, queue the CDX files for merging. ! * 3) merge queued CDX files with the ResourceIndex. * * @author Brad Tofel *************** *** 42,48 **** */ public class IndexPipeline { ! private File arcDir = null; ! private File mergeDir = null; private File queuedDir = null; --- 51,76 ---- */ public class IndexPipeline { ! 
private final static String RUN_PIPELINE = "indexpipeline.runpipeline"; ! private final static String INDEX_PATH = "resourceindex.indexpath"; ! ! private final static String DB_NAME = "resourceindex.dbname"; ! ! private final static String ARC_PATH = "arcpath"; ! ! private final static String WORK_PATH = "indexpipeline.workpath"; ! ! private final static String QUEUED_DIR = "queued"; ! ! private final static String TO_BE_INDEXED_DIR = "toBeIndexed"; ! ! private final static String INDEXING_DIR = "indexing"; ! ! private final static String TO_BE_MERGED_DIR = "toBeMerged"; ! ! ! private File arcDir = null; ! ! private File workDir = null; private File queuedDir = null; *************** *** 52,56 **** private File indexingDir = null; ! private ArcIndexer indexer = null; /** --- 80,89 ---- private File indexingDir = null; ! private File toBeMergedDir = null; ! ! private BDBResourceIndex db = null; ! ! private static Thread indexUpdateThread = null; ! /** *************** *** 68,95 **** /** ! * Initialize this object from several path arguments. * ! * @param arcDir ! * @param mergeDir ! * @param workDir * @throws IOException */ ! public void init(final String arcDir, final String mergeDir, ! final String workDir) throws IOException { ! this.arcDir = new File(arcDir); ! this.mergeDir = new File(mergeDir); ! this.queuedDir = new File(workDir + "/queued"); ! this.toBeIndexedDir = new File(workDir + "/to-be-indexed"); ! this.indexingDir = new File(workDir + "/indexing"); ! ensureDir(new File(workDir)); ! ensureDir(this.queuedDir); ! ensureDir(this.toBeIndexedDir); ! ensureDir(this.indexingDir); ! indexer = new ArcIndexer(); } ! private StringToStringTable dirToSTST(File dir) { StringToStringTable hash = new StringToStringTable(); ! String entries[] = dir.list(); for (int i = 0; i < entries.length; i++) { hash.put(entries[i], "i"); --- 101,178 ---- /** ! * Initialize this object, creating directories if needed, and starting ! * thread if configured. * ! 
* @param p configuration * @throws IOException */ ! ! public void init(Properties p) throws IOException { ! ! // where do we find ARC files? ! String arcPath = (String) p.get(ARC_PATH); ! if (arcPath == null || (arcPath.length() <= 0)) { ! throw new IllegalArgumentException("Failed to find " + ARC_PATH); ! } ! ! // where is the BDB? (and what is it named?) ! String dbPath = (String) p.get(INDEX_PATH); ! if (dbPath == null || (dbPath.length() <= 0)) { ! throw new IllegalArgumentException("Failed to find " + INDEX_PATH); ! } ! ! String dbName = (String) p.get(DB_NAME); ! if (dbName == null || (dbName.length() <= 0)) { ! throw new IllegalArgumentException("Failed to find " + DB_NAME); ! } ! ! // where do we keep working files? ! String workPath = (String) p.get(WORK_PATH); ! if (workPath == null || (workPath.length() <= 0)) { ! throw new IllegalArgumentException("Failed to find " + WORK_PATH); ! } ! ! String runPipeline = (String) p.get(RUN_PIPELINE); ! try { ! db = new BDBResourceIndex(dbPath, dbName); ! } catch (Exception e) { ! // TODO is this the right choice? was already obfuscated from BDBException... ! throw new IOException(e.getMessage()); ! } ! arcDir = new File(arcPath); ! workDir = new File(workPath); ! queuedDir = new File(workDir,QUEUED_DIR); ! toBeIndexedDir = new File(workDir,TO_BE_INDEXED_DIR); ! indexingDir = new File(workDir,INDEXING_DIR); ! toBeMergedDir = new File(workDir,TO_BE_MERGED_DIR); ! ! ensureDir(workDir); ! ensureDir(queuedDir); ! ensureDir(toBeIndexedDir); ! ensureDir(indexingDir); ! ensureDir(toBeMergedDir); ! ! if ((runPipeline != null) && (runPipeline.equals("1"))) { ! ! System.out ! .println("LocalDBDResourceIndex starting pipeline thread..."); ! if (indexUpdateThread == null) { ! startIndexPipelineThread(db); ! } ! } ! } ! ! private synchronized void startIndexPipelineThread( ! final BDBResourceIndex bdb) { ! if (indexUpdateThread != null) { ! return; ! } ! indexUpdateThread = new IndexPipelineThread(bdb, this); ! 
indexUpdateThread.start(); } ! private StringToStringTable getQueuedFiles() { StringToStringTable hash = new StringToStringTable(); ! String entries[] = queuedDir.list(); for (int i = 0; i < entries.length; i++) { hash.put(entries[i], "i"); *************** *** 98,124 **** } ! private StringToStringTable getQueuedFiles() { ! return dirToSTST(this.queuedDir); } ! private ArrayList getNewArcs() { StringToStringTable queued = getQueuedFiles(); ArrayList newArcs = new ArrayList(); ! String arcs[] = this.arcDir.list(); ! for (int i = 0; i < arcs.length; i++) { ! if (!queued.contains(arcs[i])) { ! newArcs.add(arcs[i]); } } ! return newArcs; } ! private void queueArc(final String newArc) throws IOException { ! File newQueuedFile = new File(this.queuedDir.getAbsolutePath() + "/" ! + newArc); ! File newToBeIndexedFile = new File(this.toBeIndexedDir ! .getAbsolutePath() ! + "/" + newArc); newToBeIndexedFile.createNewFile(); newQueuedFile.createNewFile(); --- 181,220 ---- } ! private Iterator getDirFilesIterator(File dir) { ! String files[] = dir.list(); ! ArrayList list = new ArrayList(); ! if (files != null) { ! for (int i = 0; i < files.length; i++) { ! File file = new File(dir, files[i]); ! if (file.isFile()) { ! list.add(files[i]); ! } ! } ! } ! return list.iterator(); } ! // this should be a method call into ResourceStore... ! private Iterator getNewArcs() { StringToStringTable queued = getQueuedFiles(); ArrayList newArcs = new ArrayList(); ! String arcs[] = arcDir.list(); ! if (arcs != null) { ! for (int i = 0; i < arcs.length; i++) { ! File arc = new File(arcDir,arcs[i]); ! if(arc.isFile() && arcs[i].endsWith(".arc.gz")) { ! if (!queued.contains(arcs[i])) { ! newArcs.add(arcs[i]); ! } ! } } } ! return newArcs.iterator(); } ! private void queueArcForIndex(final String newArc) throws IOException { ! File newQueuedFile = new File(queuedDir,newArc); ! 
File newToBeIndexedFile = new File(toBeIndexedDir,newArc); newToBeIndexedFile.createNewFile(); newQueuedFile.createNewFile(); *************** *** 126,182 **** /** ! * Find all new ARC files, and queue them for indexing. ! * * @throws IOException */ ! public void queueNewArcs() throws IOException { ! ArrayList newArcs = getNewArcs(); ! if (!newArcs.isEmpty()) { ! Iterator itr = newArcs.iterator(); ! while (itr.hasNext()) { ! String newArc = (String) itr.next(); ! queueArc(newArc); ! } } } ! /** * Index any ARC files queued for indexing, queueing the resulting CDX files * for merging with the BDBResourceIndex. * * @throws MalformedURLException * @throws IOException */ ! public void indexArcs() throws MalformedURLException, IOException { ! queueNewArcs(); ! String toBeIndexed[] = this.toBeIndexedDir.list(); ! for (int i = 0; i < toBeIndexed.length; i++) { ! ! String base = toBeIndexed[i]; ! File arcFile = new File(this.arcDir.getAbsolutePath().concat( ! "/" + base)); ! File tmpFile = new File(this.indexingDir.getAbsolutePath().concat( ! "/" + base)); ! File flagFile = new File(this.toBeIndexedDir.getAbsolutePath() ! .concat("/" + base)); ! File finalFile = new File(this.mergeDir.getAbsolutePath().concat( ! "/" + base)); ! ResourceResults res = indexer.indexArc(arcFile.getAbsolutePath()); ! indexer.serializeResults(res, tmpFile.getAbsolutePath()); ! if (!tmpFile.renameTo(finalFile)) { throw new IOException("Unable to move " ! + tmpFile.getAbsolutePath() + " to " ! + finalFile.getAbsolutePath()); } ! if (!flagFile.delete()) { throw new IOException("Unable to delete " ! + flagFile.getAbsolutePath()); } } } /** * @param args */ --- 222,312 ---- /** ! * Find any new ARC files and queue them for indexing. * @throws IOException */ ! public void queueNewArcsForIndex() throws IOException { ! Iterator newArcs = getNewArcs(); ! while(newArcs.hasNext()) { ! String newArc = (String) newArcs.next(); ! queueArcForIndex(newArc); } } ! 
/** * Index any ARC files queued for indexing, queueing the resulting CDX files * for merging with the BDBResourceIndex. * + * @param indexer * @throws MalformedURLException * @throws IOException */ ! public void indexArcs(ArcIndexer indexer) throws MalformedURLException, IOException { ! Iterator toBeIndexed = getDirFilesIterator(toBeIndexedDir); ! while(toBeIndexed.hasNext()) { ! String base = (String) toBeIndexed.next(); ! File arcFile = new File(arcDir,base); ! File toBeIndexedFlagFile = new File(toBeIndexedDir,base); ! File indexFile = new File(indexingDir,base); ! File toBeMergedFile = new File(toBeMergedDir,base); ! ResourceResults res = indexer.indexArc(arcFile); ! indexer.serializeResults(res, indexFile); ! if (!indexFile.renameTo(toBeMergedFile)) { throw new IOException("Unable to move " ! + indexFile.getAbsolutePath() + " to " ! + toBeMergedFile.getAbsolutePath()); } ! if (!toBeIndexedFlagFile.delete()) { throw new IOException("Unable to delete " ! + toBeIndexedFlagFile.getAbsolutePath()); ! } ! } ! } ! ! /** ! * Add any new CDX files in toBeMergedDir to the BDB, deleting the CDX ! * files as they are merged ! * @param dbWriter ! */ ! public void mergeIndex(BDBResourceIndexWriter dbWriter) { ! int numMerged = 0; ! Iterator toBeMerged = getDirFilesIterator(toBeMergedDir); ! while(toBeMerged.hasNext()) { ! ! File indexFile = new File(toBeMergedDir,(String) toBeMerged.next()); ! ! try { ! dbWriter.importFile(indexFile); ! if (!indexFile.delete()) { ! throw new IOException("Unable to unlink " ! + indexFile.getAbsolutePath()); ! } ! numMerged++; ! } catch (Exception e) { ! e.printStackTrace(); } } + if (numMerged > 0) { + System.out.println("Merged " + numMerged + " files."); + } } /** + * Gather a snapshot of the pipeline in a PipelineStatus object. 
+ * @return PipelineStatus + */ + public PipelineStatus getStatus() { + PipelineStatus status = new PipelineStatus(); + String index[] = toBeIndexedDir.list(); + String merge[] = toBeMergedDir.list(); + String numQueuedForIndex = (index == null) ? "0" : "" + index.length; + String numQueuedForMerge = (merge == null) ? "0" : "" + merge.length; + status.setNumQueuedForIndex(numQueuedForIndex); + status.setNumQueuedForMerge(numQueuedForMerge); + return status; + } + + /** * @param args */ *************** *** 186,194 **** /** ! * @return Returns the mergeDir. */ ! public File getMergeDir() { ! return mergeDir; ! } } --- 316,369 ---- /** ! * Thread that repeatedly runs processing of an IndexPipeline and merges new ! * data into a BDBResourceIndex ! * ! * @author Brad Tofel ! * @version $Date$, $Revision$ */ ! private class IndexPipelineThread extends Thread { ! private final static int SLEEP_MILLISECONDS = 10000; ! ! private BDBResourceIndexWriter merger = null; ! private ArcIndexer indexer = new ArcIndexer(); ! IndexPipeline pipeline = null; ! ! /** ! * Constructor ! * ! * @param bdb ! * initialized BDBResourceIndex ! * @param pipeline ! * initialized IndexPipeline ! */ ! public IndexPipelineThread(final BDBResourceIndex bdb, ! IndexPipeline pipeline) { ! super("IndexPipelineThread"); ! super.setDaemon(true); ! merger = new BDBResourceIndexWriter(); ! merger.init(bdb); ! this.pipeline = pipeline; ! System.out.print("Pipeline Thread is ALIVE!"); ! 
} + public void run() { + + while (true) { + try { + pipeline.queueNewArcsForIndex(); + pipeline.indexArcs(indexer); + pipeline.mergeIndex(merger); + sleep(SLEEP_MILLISECONDS); + } catch (InterruptedException e) { + e.printStackTrace(); + // System.out.println("I'm running!"); catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + } } --- NEW FILE: PipelineFilter.java --- /* PipeLineServletFilter * * Created on Oct 20, 2005 * * Copyright (C) 2005 Internet Archive. * * This file is part of the wayback (crawler.archive.org). * * wayback is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * wayback is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. 
* * You should have received a copy of the GNU Lesser Public License * along with wayback; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.arcindexer; import java.io.IOException; import java.util.Enumeration; import java.util.Properties; import javax.servlet.Filter; import javax.servlet.FilterChain; import javax.servlet.FilterConfig; import javax.servlet.RequestDispatcher; import javax.servlet.ServletContext; import javax.servlet.ServletException; import javax.servlet.ServletRequest; import javax.servlet.ServletResponse; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; /** * @author brad * */ public class PipelineFilter implements Filter { private final String PIPELINE_STATUS_JSP = "pipeline.statusjsp"; private IndexPipeline pipeline = null; private String pipelineStatusJsp = null; /** * Constructor */ public PipelineFilter() { super(); } public void init(FilterConfig c) throws ServletException { Properties p = new Properties(); pipelineStatusJsp = c.getInitParameter(PIPELINE_STATUS_JSP); if ((pipelineStatusJsp == null) || (pipelineStatusJsp.length() <= 0)) { throw new ServletException("No config (" + PIPELINE_STATUS_JSP + ")"); } ServletContext sc = c.getServletContext(); for (Enumeration e = sc.getInitParameterNames(); e.hasMoreElements();) { String key = (String) e.nextElement(); p.put(key, sc.getInitParameter(key)); } pipeline = new IndexPipeline(); try { pipeline.init(p); } catch (IOException e) { throw new ServletException(e.getMessage()); } } /* * (non-Javadoc) * * @see javax.servlet.Filter#doFilter(javax.servlet.ServletRequest, * javax.servlet.ServletResponse, javax.servlet.FilterChain) */ public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, ServletException { if (!handle(request, response)) { chain.doFilter(request, response); } } protected boolean handle(final 
ServletRequest request, final ServletResponse response) throws IOException, ServletException { if (!(request instanceof HttpServletRequest)) { return false; } if (!(response instanceof HttpServletResponse)) { return false; } HttpServletRequest httpRequest = (HttpServletRequest) request; PipelineStatus status = pipeline.getStatus(); request.setAttribute("pipelinestatus", status); RequestDispatcher dispatcher = httpRequest .getRequestDispatcher(pipelineStatusJsp); dispatcher.forward(request, response); return true; } /* * (non-Javadoc) * * @see javax.servlet.Filter#destroy() */ public void destroy() { } } |
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/localbdbresourceindex In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14129/src/java/org/archive/wayback/localbdbresourceindex Modified Files: LocalBDBResourceIndex.java BDBResourceIndex.java Removed Files: BDBResourceIndexWriter.java Log Message: Heavy modification of configuration to be Context-level, instead of Servlet-level, which dramatically reduces configuration redundancy. Cleaned up IndexPipeline, moved a few classes around, added a really simple JSP to view the Index and Merge queue sizes, and a filter, which both allows the index thread to start with the context, and allows access to the jsp. Index: BDBResourceIndex.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/localbdbresourceindex/BDBResourceIndex.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** BDBResourceIndex.java 19 Oct 2005 01:22:37 -0000 1.2 --- BDBResourceIndex.java 21 Oct 2005 03:24:40 -0000 1.3 *************** *** 95,99 **** } ! protected void shutdownDB() throws DatabaseException { if (db != null) { --- 95,104 ---- } ! /** ! * shut down the BDB. ! * ! * @throws DatabaseException ! */ ! public void shutdownDB() throws DatabaseException { if (db != null) { *************** *** 194,198 **** } ! protected void addResults(ResourceResults results) throws Exception { Iterator itr = results.iterator(); DatabaseEntry key = new DatabaseEntry(); --- 199,208 ---- } ! /** ! * Add all ResourceResult in results to BDB index ! * @param results ! * @throws Exception ! */ ! 
public void addResults(ResourceResults results) throws Exception { Iterator itr = results.iterator(); DatabaseEntry key = new DatabaseEntry(); --- BDBResourceIndexWriter.java DELETED --- Index: LocalBDBResourceIndex.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/localbdbresourceindex/LocalBDBResourceIndex.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** LocalBDBResourceIndex.java 19 Oct 2005 01:22:37 -0000 1.2 --- LocalBDBResourceIndex.java 21 Oct 2005 03:24:40 -0000 1.3 *************** *** 24,28 **** package org.archive.wayback.localbdbresourceindex; - import java.io.File; import java.io.IOException; import java.util.Properties; --- 24,27 ---- *************** *** 42,60 **** */ public class LocalBDBResourceIndex implements ResourceIndex { - private static Thread indexUpdateThread = null; ! private final static String INDEX_PATH = "resourceindex.indexPath"; ! ! private final static String DB_NAME = "resourceindex.dbName"; ! ! private final static String ARC_PATH = "resourceindex.arcPath"; ! ! private final static String WORK_PATH = "resourceindex.workPath"; ! private final static String RUN_PIPELINE = "resourceindex.runPipeline"; private final static int MAX_RECORDS = 1000; private BDBResourceIndex db = null; /** --- 41,54 ---- */ public class LocalBDBResourceIndex implements ResourceIndex { ! private final static String INDEX_PATH = "resourceindex.indexpath"; ! 
private final static String DB_NAME = "resourceindex.dbname"; private final static int MAX_RECORDS = 1000; private BDBResourceIndex db = null; + + private IndexPipeline pipeline = null; /** *************** *** 71,100 **** throw new IllegalArgumentException("Failed to find " + INDEX_PATH); } - String arcPath = (String) p.get(ARC_PATH); - if (arcPath == null || (arcPath.length() <= 0)) { - throw new IllegalArgumentException("Failed to find " + ARC_PATH); - } - - String workPath = (String) p.get(WORK_PATH); - if (workPath == null || (workPath.length() <= 0)) { - throw new IllegalArgumentException("Failed to find " + WORK_PATH); - } String dbName = (String) p.get(DB_NAME); if (dbName == null || (dbName.length() <= 0)) { throw new IllegalArgumentException("Failed to find " + DB_NAME); } ! String runPipeline = (String) p.get(RUN_PIPELINE); ! db = new BDBResourceIndex(dbPath, dbName); ! if (runPipeline != null) { ! ! System.out ! .println("LocalDBDResourceIndex starting pipeline thread..."); ! if (indexUpdateThread == null) { ! IndexPipeline pipeline = new IndexPipeline(); ! String mergeDir = workPath + "/mergey"; ! pipeline.init(arcPath, mergeDir, workPath); ! startIndexUpdateThead(db, pipeline); ! } ! } } --- 65,75 ---- throw new IllegalArgumentException("Failed to find " + INDEX_PATH); } String dbName = (String) p.get(DB_NAME); if (dbName == null || (dbName.length() <= 0)) { throw new IllegalArgumentException("Failed to find " + DB_NAME); } ! db = new BDBResourceIndex(dbPath,dbName); ! pipeline = new IndexPipeline(); ! 
pipeline.init(p); } *************** *** 122,215 **** } } - - protected synchronized void startIndexUpdateThead( - final BDBResourceIndex bdb, IndexPipeline pipeline) { - if (indexUpdateThread != null) { - return; - } - indexUpdateThread = new IndexUpdateThread(bdb, pipeline); - indexUpdateThread.start(); - } - - /** - * Thread that repeatedly runs processing of an IndexPipeline and merges new - * data into a BDBResourceIndex - * - * @author Brad Tofel - * @version $Date$, $Revision$ - */ - private class IndexUpdateThread extends Thread { - private final static int SLEEP_MILLISECONDS = 10000; - - BDBResourceIndexWriter importer = null; - - IndexPipeline pipeline = null; - - /** - * Constructor - * - * @param bdb - * initialized BDBResourceIndex - * @param pipeline - * initialized IndexPipeline - */ - public IndexUpdateThread(final BDBResourceIndex bdb, - IndexPipeline pipeline) { - super("IndexUpdateThread"); - super.setDaemon(true); - this.importer = new BDBResourceIndexWriter(); - importer.init(bdb); - this.pipeline = pipeline; - } - - public void run() { - - while (true) { - try { - indexArcs(); - mergeIndex(); - sleep(SLEEP_MILLISECONDS); - } catch (InterruptedException e) { - e.printStackTrace(); - } - // System.out.println("I'm running!"); - } - } - - private void indexArcs() { - try { - pipeline.indexArcs(); - // System.out.println("Indexed..."); - } catch (IOException e) { - e.printStackTrace(); - } - } - - private void mergeIndex() { - int numMerged = 0; - String newFiles[] = pipeline.getMergeDir().list(); - for (int i = 0; i < newFiles.length; i++) { - // TODO: Special handling of encoding and date. 
- File newFile = new File(pipeline.getMergeDir() - .getAbsolutePath() - + "/" + newFiles[i]); - - if (newFile.isFile()) { - try { - importer.importFile(newFile.getAbsolutePath()); - if (!newFile.delete()) { - throw new IOException("Unable to unlink " - + newFile.getAbsolutePath()); - } - numMerged++; - } catch (Exception e) { - e.printStackTrace(); - } - } - } - if (numMerged > 0) { - System.out.println("Merged " + numMerged + " files."); - } - } - } } --- 97,99 ---- |