From: <bi...@us...> - 2008-09-22 18:08:11
Revision: 2591
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2591&view=rev
Author:   binzino
Date:     2008-09-22 18:07:59 +0000 (Mon, 22 Sep 2008)

Log Message:
-----------
WAX-22: Various code clean-ups based on code review using PMD tool.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java
    trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
    trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java	2008-09-04 21:36:09 UTC (rev 2590)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java	2008-09-22 18:07:59 UTC (rev 2591)
@@ -16,10 +16,12 @@
  */
 package org.archive.nutchwax;
 
-import java.io.*;
+//import java.io.*;
 import java.util.*;
 import java.lang.reflect.Field;
-import javax.servlet.*;
+import javax.servlet.ServletContext;
+import javax.servlet.ServletContextEvent;
+import javax.servlet.ServletContextListener;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -34,7 +36,6 @@
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Closeable;
 import org.apache.hadoop.conf.Configuration;
 
 import org.apache.lucene.index.IndexReader;

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java	2008-09-04 21:36:09 UTC (rev 2590)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/XSLTFilter.java	2008-09-22 18:07:59 UTC (rev 2591)
@@ -25,7 +25,6 @@
 import java.io.PrintWriter;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
-import java.io.CharArrayWriter;
 
 import javax.servlet.Filter;
 import javax.servlet.FilterChain;
@@ -35,15 +34,14 @@
 
 import javax.servlet.ServletOutputStream;
 import javax.servlet.ServletRequest;
 import javax.servlet.ServletResponse;
-import javax.servlet.ServletResponseWrapper;
-import javax.servlet.http.*;
+import javax.servlet.http.HttpServletResponse;
+import javax.servlet.http.HttpServletResponseWrapper;
 
 import javax.xml.transform.Source;
 import javax.xml.transform.stream.StreamSource;
 import javax.xml.transform.Templates;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.Transformer;
-import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
@@ -55,8 +53,6 @@
   public void init( FilterConfig config )
     throws ServletException
   {
-    ServletContext app = config.getServletContext( );
-
     this.xsltUrl = config.getInitParameter( "xsltUrl" );
 
     if ( this.xsltUrl != null )
@@ -116,9 +112,11 @@
       }
     catch ( javax.xml.transform.TransformerConfigurationException tce )
       {
+        // TODO: Re-throw, or log it and eat it?
       }
     catch( javax.xml.transform.TransformerException te )
      {
+        // TODO: Re-throw, or log it and eat it?
      }
   }
 else

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2008-09-04 21:36:09 UTC (rev 2590)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2008-09-22 18:07:59 UTC (rev 2591)
@@ -20,17 +20,15 @@
  */
 package org.archive.nutchwax.tools;
 
-import java.io.File;
 import java.io.BufferedReader;
-import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.util.Map;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
-import java.util.Collections;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
@@ -132,7 +130,7 @@
 
         String dates[] = sourceDoc.getValues( NutchWax.DATE_KEY );
 
-        java.util.Collections.addAll( uniqueDates, dates );
+        Collections.addAll( uniqueDates, dates );
       }
 
     for ( String date : uniqueDates )
       {

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java	2008-09-04 21:36:09 UTC (rev 2590)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java	2008-09-22 18:07:59 UTC (rev 2591)
@@ -31,9 +31,6 @@
 {
   public static void main( String[] args ) throws Exception
   {
-    String option = "";
-    String indexDir = "";
-
     if ( args.length < 1 )
       {
         usageAndExit( );

Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2008-09-04 21:36:09 UTC (rev 2590)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2008-09-22 18:07:59 UTC (rev 2591)
@@ -73,6 +73,7 @@
       String destKey = srcKey;
       switch ( spec.length )
         {
+        default:
         case 6:
           destKey = spec[5];
         case 5:
@@ -83,6 +84,9 @@
           store = Boolean.parseBoolean( spec[2] );
         case 2:
           lowerCase = Boolean.parseBoolean( spec[1] );
+        case 1:
+          // Nothing to do
+          ;
         }
 
       LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + exclusive + ":" + destKey );

Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java	2008-09-04 21:36:09 UTC (rev 2590)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java	2008-09-22 18:07:59 UTC (rev 2591)
@@ -24,14 +24,10 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.commons.httpclient.URIException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
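[Editor's note: the ConfigurableIndexingFilter hunk above relies on deliberate switch fall-through. The field spec is split on ':' and each case initializes one more setting before falling into the shorter prefixes, which is presumably why the PMD-driven cleanup adds an explicit default: and an empty case 1: rather than break statements. A minimal, self-contained sketch of the pattern follows; since the hunk between case 5 and case 3 is not shown, the indices for the middle fields (tokenize, exclusive) are inferred from the LOG.info line and should be treated as illustrative.]

// Sketch of the fall-through spec parsing used by ConfigurableIndexingFilter.
// A spec like "title:true:true:true:false:dc_title" is read as
// src:lowerCase:store:tokenize:exclusive:dest; shorter specs keep defaults.
public class FieldSpecDemo
{
  public static void main( String[] args )
  {
    String[] spec = "title:true:true:true:false:dc_title".split( ":" );

    String  srcKey    = spec[0];
    String  destKey   = srcKey;
    boolean lowerCase = false, store = false, tokenize = false, exclusive = false;

    switch ( spec.length )
      {
      default:                                       // 7+ fields: extras ignored
      case 6:
        destKey   = spec[5];                         // fall through
      case 5:
        exclusive = Boolean.parseBoolean( spec[4] ); // fall through (inferred index)
      case 4:
        tokenize  = Boolean.parseBoolean( spec[3] ); // fall through (inferred index)
      case 3:
        store     = Boolean.parseBoolean( spec[2] ); // fall through
      case 2:
        lowerCase = Boolean.parseBoolean( spec[1] ); // fall through
      case 1:
        break;                                       // source key only
      }

    System.out.println( srcKey + ":" + lowerCase + ":" + store + ":"
                        + tokenize + ":" + exclusive + ":" + destKey );
  }
}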
From: <bi...@us...> - 2008-10-03 23:08:35
Revision: 2597
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2597&view=rev
Author:   binzino
Date:     2008-10-03 23:08:29 +0000 (Fri, 03 Oct 2008)

Log Message:
-----------
Initial revision.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/web/
    trunk/archive-access/projects/nutchwax/archive/src/web/style/
    trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl
    trunk/archive-access/projects/nutchwax/archive/src/web/web.xml

Added: trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl	2008-10-03 23:08:29 UTC (rev 2597)
@@ -0,0 +1,153 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<xsl:stylesheet
+  version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:nutch="http://www.nutch.org/opensearchrss/1.0/"
+  xmlns:opensearch="http://a9.com/-/spec/opensearchrss/1.0/"
+>
+<xsl:output method="xml" />
+
+<xsl:template match="rss/channel">
+  <html xmlns="http://www.w3.org/1999/xhtml">
+    <head>
+      <title><xsl:value-of select="title" /></title>
+      <style media="all" lang="en" type="text/css">
+        body
+        {
+          padding : 20px;
+          margin : 0;
+          font-family : Verdana, sans-serif;
+          font-size : 9pt;
+          color : #000000;
+          background-color: #ffffff;
+        }
+        .pageTitle
+        {
+          font-size : 125% ;
+          font-weight : bold ;
+          text-align : center ;
+          padding-bottom : 2em ;
+        }
+        .searchForm
+        {
+          margin : 20px 0 5px 0;
+          padding-bottom : 0px;
+          border-bottom : 1px solid black;
+        }
+        .searchResult
+        {
+          margin : 0;
+          padding : 0;
+        }
+        .searchResult h1
+        {
+          margin : 0 0 5px 0 ;
+          padding : 0 ;
+          font-size : 120%;
+        }
+        .searchResult .details
+        {
+          font-size: 80%;
+          color: green;
+        }
+        .searchResult .dates
+        {
+          font-size: 80%;
+        }
+        .searchResult .dates a
+        {
+          color: #3366cc;
+        }
+        form#searchForm
+        {
+          margin : 0; padding: 0 0 10px 0;
+        }
+        .searchFields
+        {
+          padding : 3px 0;
+        }
+        .searchFields input
+        {
+          margin : 0 0 0 15px;
+          padding : 0;
+        }
+        input#query
+        {
+          margin : 0;
+        }
+        ol
+        {
+          margin : 5px 0 0 0;
+          padding : 0 0 0 2em;
+        }
+        ol li
+        {
+          margin : 0 0 15px 0;
+        }
+      </style>
+    </head>
+    <body>
+      <!-- Page header: title and search form -->
+      <div class="pageTitle" >
+        NutchWAX Sample XSLT
+      </div>
+      <div>
+        This simple XSLT demonstrates the transformation of OpenSearch XML results into a fully-functional, human-friendly HTML search page.  No JSP needed.
+      </div>
+      <div class="searchForm">
+        <form id="searchForm" name="searchForm" method="get" action="search" >
+          <span class="searchFields">
+            Search for
+            <input id="query" name="query" type="text" size="40" value="{nutch:query}" />
+            <input type="submit" value="Search"/>
+          </span>
+        </form>
+      </div>
+      <div style="font-size: 8pt; margin:0; padding:0 0 0.5em 0;">Results <xsl:value-of select="opensearch:startIndex + 1" />-<xsl:value-of select="opensearch:startIndex + opensearch:itemsPerPage" /> of about <xsl:value-of select="opensearch:totalResults" /> <span style="margin-left: 1em;"><a href="{nutch:nextPage}">Next</a></span></div>
+      <!-- Search results -->
+      <ol start="{opensearch:startIndex + 1}">
+        <xsl:apply-templates select="item" />
+      </ol>
+      <a href="{nutch:nextPage}">Next</a>
+    </body>
+  </html>
+</xsl:template>

+<xsl:template match="item">
+  <li>
+    <div class="searchResult">
+      <h1><a href="{concat('http://wayback.archive-it.org/',nutch:collection,'/',nutch:date,'/',link)}"><xsl:value-of select="title" /></a></h1>
+      <div>
+        <xsl:value-of select="description" />
+      </div>
+      <div class="details">
+        <xsl:value-of select="link" /> - <xsl:value-of select="round( nutch:length div 1024 )"/>k - <xsl:value-of select="nutch:type" />
+      </div>
+      <div class="dates">
+        <a href="{concat('http://wayback.archive-it.org/',nutch:collection,'/*/',link)}">All versions</a> - <a href="?query={../nutch:query} site:{nutch:site}&amp;hitsPerSite=0">More from <xsl:value-of select="nutch:site" /></a>
+      </div>
+    </div>
+  </li>
+</xsl:template>
+
+<xsl:template match="nutch:date" >
+  <xsl:value-of select="substring(.,1,4)" /><xsl:text>-</xsl:text><xsl:value-of select="substring(.,5,2)" /><xsl:text>-</xsl:text><xsl:value-of select="substring(.,7,2)" /><xsl:text> </xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>

Added: trunk/archive-access/projects/nutchwax/archive/src/web/web.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/web/web.xml	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/web/web.xml	2008-10-03 23:08:29 UTC (rev 2597)
@@ -0,0 +1,80 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!DOCTYPE web-app
+    PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
+    "http://java.sun.com/dtd/web-app_2_3.dtd">
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<web-app>
+
+<!-- order is very important here -->
+
+<listener>
+  <listener-class>org.apache.nutch.searcher.NutchBean$NutchBeanConstructor</listener-class>
+  <listener-class>org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor</listener-class>
+</listener>
+
+<servlet>
+  <servlet-name>Cached</servlet-name>
+  <servlet-class>org.apache.nutch.servlet.Cached</servlet-class>
+</servlet>
+
+<servlet>
+  <servlet-name>OpenSearch</servlet-name>
+  <servlet-class>org.apache.nutch.searcher.OpenSearchServlet</servlet-class>
+</servlet>
+
+<servlet-mapping>
+  <servlet-name>Cached</servlet-name>
+  <url-pattern>/servlet/cached</url-pattern>
+</servlet-mapping>
+
+<servlet-mapping>
+  <servlet-name>OpenSearch</servlet-name>
+  <url-pattern>/opensearch</url-pattern>
+</servlet-mapping>
+
+<servlet-mapping>
+  <servlet-name>OpenSearch</servlet-name>
+  <url-pattern>/search</url-pattern>
+</servlet-mapping>
+
+<filter>
+  <filter-name>XSLT Filter</filter-name>
+  <filter-class>org.archive.nutchwax.XSLTFilter</filter-class>
+  <init-param>
+    <param-name>xsltUrl</param-name>
+    <param-value>style/search.xsl</param-value>
+  </init-param>
+</filter>
+
+<filter-mapping>
+  <filter-name>XSLT Filter</filter-name>
+  <url-pattern>/search</url-pattern>
+</filter-mapping>
+
+<welcome-file-list>
+  <welcome-file>search.html</welcome-file>
+  <welcome-file>index.html</welcome-file>
+  <welcome-file>index.jsp</welcome-file>
+</welcome-file-list>
+
+<taglib>
+  <taglib-uri>http://jakarta.apache.org/taglibs/i18n</taglib-uri>
+  <taglib-location>/WEB-INF/taglibs-i18n.tld</taglib-location>
+</taglib>
+
+</web-app>
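[Editor's note: in the web.xml above, both the OpenSearch servlet and the XSLT Filter are mapped to /search, so the servlet's OpenSearch XML is transformed to HTML on the server. A convenient way to iterate on search.xsl without redeploying the webapp is to run the same transform standalone. This is a minimal sketch using only the JAXP classes that XSLTFilter itself imports; results.xml, a saved OpenSearch response, is a hypothetical input file.]

// Offline smoke test for style/search.xsl, outside the servlet container.
import java.io.File;

import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

public class XsltSmokeTest
{
  public static void main( String[] args ) throws Exception
  {
    // Compile the stylesheet once; a Templates object is thread-safe and
    // presumably what XSLTFilter caches across requests.
    Templates templates = TransformerFactory.newInstance( )
      .newTemplates( new StreamSource( new File( "style/search.xsl" ) ) );

    // Transformer instances are not thread-safe, so create one per run.
    Transformer transformer = templates.newTransformer( );
    transformer.transform( new StreamSource( new File( "results.xml" ) ),
                           new StreamResult( new File( "results.html" ) ) );
  }
}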
From: <bi...@us...> - 2008-12-10 05:02:22
Revision: 2658
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2658&view=rev
Author:   binzino
Date:     2008-12-10 05:02:19 +0000 (Wed, 10 Dec 2008)

Log Message:
-----------
Initial revision.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/etc/
    trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/
    trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/searcher-slave
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/PageRanker.java

Added: trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/searcher-slave
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/searcher-slave	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/searcher-slave	2008-12-10 05:02:19 UTC (rev 2658)
@@ -0,0 +1,63 @@
+#! /bin/sh
+#
+# -----------------------------------
+# Initscript for NutchWAX searcher slave
+# -----------------------------------
+
+set -e
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+DESC="NutchWAX searcher slave"
+NAME="searcher-slave"
+
+DAEMON="/3/search/nutchwax-0.12.2/bin/nutch org.archive.nutchwax.DistributedSearch\$Server 9000 /3/search/deploy"
+NUTCH_HOME=/3/search/nutchwax-0.12.2
+JAVA_HOME=/usr
+export NUTCH_HEAPSIZE=2500
+PIDFILE=/var/run/$NAME.pid
+SCRIPTNAME=/etc/init.d/$NAME
+
+# Gracefully exit if the package has been removed.
+test -x /usr/bin/java || exit 0
+
+# ---------------------------------------
+# Function that starts the daemon/service
+# ---------------------------------------
+d_start()
+{
+start-stop-daemon --start -b -m -c webcrawl:webcrawl --pidfile $PIDFILE --exec $DAEMON
+}
+
+# --------------------------------------
+# Function that stops the daemon/service
+# --------------------------------------
+d_stop()
+{
+start-stop-daemon --stop --pidfile $PIDFILE
+}
+
+case "$1" in
+start)
+echo -n "Starting $DESC: $NAME"
+d_start
+echo "."
+;;
+stop)
+echo -n "Stopping $DESC: $NAME"
+d_stop
+echo "."
+;;
+restart|force-reload)
+echo -n "Restarting $DESC: $NAME"
+d_stop
+sleep 1
+d_start
+echo "."
+;;
+*)
+echo "Usage: $SCRIPTNAME {start|stop|restart|force-reload}" >&2
+exit 1
+;;
+esac
+
+exit 0

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/PageRanker.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/PageRanker.java	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/PageRanker.java	2008-12-10 05:02:19 UTC (rev 2658)
@@ -0,0 +1,208 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.tools;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapred.FileAlreadyExistsException;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.LogUtil;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ *
+ */
+public class PageRanker extends Configured implements Tool
+{
+  public static final Log LOG = LogFactory.getLog(PageRanker.class);
+
+  public static final String DONE_NAME = "merge.done";
+
+  public PageRanker() {
+
+  }
+
+  public PageRanker(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Create an index for the input files in the named directory.
+   */
+  public static void main(String[] args)
+    throws Exception
+  {
+    int res = ToolRunner.run(NutchConfiguration.create(), new PageRanker(), args);
+    System.exit(res);
+  }
+
+  /**
+   *
+   */
+  public int run(String[] args)
+    throws Exception
+  {
+    String usage = "Usage: PageRanker [OPTIONS] outputFile <linkdb|paths>\n"
+      + "Emit PageRank values for URLs in linkDb(s).  Suitable for use with\n"
+      + "PageRank scoring filter.\n"
+      + "\n"
+      + "OPTIONS:\n"
+      + "  -p            Use exact path as given, don't assume it's a typical\n"
+      + "                linkdb with \"current/part-nnnnn\" subdirs.\n"
+      + "  -t threshold  Do not emit records with less than this many inlinks.\n"
+      + "                Default value 10."
+      ;
+    if ( args.length < 1 )
+      {
+        System.err.println( "Usage: " + usage );
+        return -1;
+      }
+
+    boolean exactPath = false;
+    int threshold = 10;
+
+    int pos = 0;
+    for ( ; pos < args.length && args[pos].charAt(0) == '-' ; pos++ )
+      {
+        if ( args[pos].equals( "-p" ) )
+          {
+            exactPath = true;
+          }
+        if ( args[pos].equals( "-t" ) )
+          {
+            pos++;
+            if ( args.length - pos < 1 )
+              {
+                System.err.println( "Error: missing argument to -t option" );
+                return -1;
+              }
+            try
+              {
+                threshold = Integer.parseInt( args[pos] );
+              }
+            catch ( NumberFormatException nfe )
+              {
+                System.err.println( "Error: bad value for -t option: " + args[pos] );
+                return -1;
+              }
+          }
+      }
+
+    Configuration conf = getConf( );
+    FileSystem fs = FileSystem.get( conf );
+
+    if ( pos >= args.length )
+      {
+        System.err.println( "Error: missing outputFile" );
+        return -1;
+      }
+
+    Path outputPath = new Path( args[pos++] );
+    if ( fs.exists( outputPath ) )
+      {
+        System.err.println( "Error: outputFile already exists: " + outputPath );
+        return -1;
+      }
+
+    PrintWriter output = new PrintWriter( new OutputStreamWriter( fs.create( outputPath ).getWrappedStream( ), "UTF-8" ) );
+
+    if ( pos >= args.length )
+      {
+        System.err.println( "Error: missing linkdb" );
+        return -1;
+      }
+
+    List<Path> mapfiles = new ArrayList<Path>();
+
+    // If we are using exact paths, add each one to the list.
+    // Otherwise, assume the given path is to a linkdb and look for
+    // <linkdbPath>/current/part-nnnnn sub-dirs.
+    if ( exactPath )
+      {
+        for ( ; pos < args.length ; pos++ )
+          {
+            mapfiles.add( new Path( args[pos] ) );
+          }
+      }
+    else
+      {
+        FileStatus[] fstats = fs.listStatus( new Path(args[pos]+"/current"), HadoopFSUtil.getPassDirectoriesFilter(fs));
+        mapfiles.addAll(Arrays.asList(HadoopFSUtil.getPaths(fstats)));
+      }
+
+    System.out.println( "mapfiles = " + mapfiles );
+
+    try
+      {
+        for ( Path p : mapfiles )
+          {
+            MapFile.Reader reader = new MapFile.Reader( fs, p.toString(), conf );
+
+            WritableComparable key = (WritableComparable) ReflectionUtils.newInstance( reader.getKeyClass() , conf );
+            Writable value = (Writable) ReflectionUtils.newInstance( reader.getValueClass(), conf );
+
+            while ( reader.next( key, value ) )
+              {
+                if ( key instanceof Text && value instanceof Inlinks )
+                  {
+                    Text toUrl = (Text) key;
+                    Inlinks inlinks = (Inlinks) value;
+
+                    if ( inlinks.size( ) < threshold )
+                      {
+                        continue ;
+                      }
+
+                    String toUrlString = toUrl.toString( );
+
+                    // HACK: Should make this into some externally configurable regex.
+                    if ( toUrlString.startsWith( "http" ) )
+                      {
+                        output.println( inlinks.size( ) + " " + toUrl.toString() );
+                      }
+                  }
+              }
+          }
+
+        return 0;
+      }
+    catch ( Exception e )
+      {
+        LOG.fatal( "PageRanker: " + StringUtils.stringifyException( e ) );
+        return -1;
+      }
+    finally
+      {
+        output.flush( );
+        output.close( );
+      }
+  }
+}
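[Editor's note: PageRanker emits one line per qualifying URL in the form "<inlink-count> <url>" (see the output.println call above), and the usage text says the file is meant to feed a PageRank scoring filter. A hypothetical consumer could load it into a map at startup, along these lines; the class name and file layout are illustrative, not part of the commit.]

// Sketch of a consumer for PageRanker's "<inlink-count> <url>" output lines.
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

public class PageRankLoader
{
  public static Map<String,Integer> load( String path ) throws IOException
  {
    Map<String,Integer> ranks = new HashMap<String,Integer>( );

    BufferedReader reader = new BufferedReader(
      new InputStreamReader( new FileInputStream( path ), "UTF-8" ) );
    try
      {
        String line;
        while ( (line = reader.readLine( )) != null )
          {
            // Split on the first run of whitespace: the count, then the URL.
            String[] fields = line.trim( ).split( "\\s+", 2 );
            if ( fields.length == 2 )
              {
                ranks.put( fields[1], Integer.valueOf( fields[0] ) );
              }
          }
      }
    finally
      {
        reader.close( );
      }
    return ranks;
  }
}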
From: <bi...@us...> - 2008-12-11 22:58:33
Revision: 2660 http://archive-access.svn.sourceforge.net/archive-access/?rev=2660&view=rev Author: binzino Date: 2008-12-11 22:58:28 +0000 (Thu, 11 Dec 2008) Log Message: ----------- Initial checkin of Nutch source-files that are over-ridden and copied into the Nutch source tree when compiling. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/nutch/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java 2008-12-11 22:58:28 UTC (rev 2660) @@ -0,0 +1,375 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.IOException; +import java.io.Reader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.BufferedReader; + +import java.util.HashMap; +import java.util.Map; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.io.*; +import org.apache.hadoop.fs.*; +import org.apache.nutch.protocol.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.util.HadoopFSUtil; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapred.*; +import org.apache.hadoop.mapred.lib.*; +import org.apache.nutch.crawl.*; + +/** Implements {@link HitSummarizer} and {@link HitContent} for a set of + * fetched segments. 
*/ +public class FetchedSegments implements HitSummarizer, HitContent +{ + public static final Log LOG = LogFactory.getLog(FetchedSegments.class); + + private static class Segment implements Closeable { + + private static final Partitioner PARTITIONER = new HashPartitioner(); + + private FileSystem fs; + private Path segmentDir; + + private MapFile.Reader[] content; + private MapFile.Reader[] parseText; + private MapFile.Reader[] parseData; + private MapFile.Reader[] crawl; + private Configuration conf; + + public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException { + this.fs = fs; + this.segmentDir = segmentDir; + this.conf = conf; + } + + public CrawlDatum getCrawlDatum(Text url) throws IOException { + synchronized (this) { + if (crawl == null) + crawl = getReaders(CrawlDatum.FETCH_DIR_NAME); + } + return (CrawlDatum)getEntry(crawl, url, new CrawlDatum()); + } + + public byte[] getContent(Text url) throws IOException { + synchronized (this) { + if (content == null) + content = getReaders(Content.DIR_NAME); + } + return ((Content)getEntry(content, url, new Content())).getContent(); + } + + public ParseData getParseData(Text url) throws IOException { + synchronized (this) { + if (parseData == null) + parseData = getReaders(ParseData.DIR_NAME); + } + return (ParseData)getEntry(parseData, url, new ParseData()); + } + + public ParseText getParseText(Text url) throws IOException { + synchronized (this) { + if (parseText == null) + parseText = getReaders(ParseText.DIR_NAME); + } + return (ParseText)getEntry(parseText, url, new ParseText()); + } + + private MapFile.Reader[] getReaders(String subDir) throws IOException { + return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf); + } + + private Writable getEntry(MapFile.Reader[] readers, Text url, + Writable entry) throws IOException { + return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry); + } + + public void close() throws IOException { + if (content != null) { closeReaders(content); } + if (parseText != null) { closeReaders(parseText); } + if (parseData != null) { closeReaders(parseData); } + if (crawl != null) { closeReaders(crawl); } + } + + private void closeReaders(MapFile.Reader[] readers) throws IOException { + for (int i = 0; i < readers.length; i++) { + readers[i].close(); + } + } + + } + + private HashMap segments = new HashMap( ); + private boolean perCollection = false; + private Summarizer summarizer; + + /** Construct given a directory containing fetcher output. */ + public FetchedSegments(FileSystem fs, String segmentsDir, Configuration conf) throws IOException + { + this.summarizer = new SummarizerFactory(conf).getSummarizer(); + + Path[] segmentDirs = HadoopFSUtil.getPaths( fs.listStatus(new Path(segmentsDir), HadoopFSUtil.getPassDirectoriesFilter(fs)) ); + if ( segmentDirs == null ) + { + LOG.warn( "No segment directories: " + segmentsDir ); + return ; + } + + this.perCollection = conf.getBoolean( "nutchwax.FetchedSegments.perCollection", false ); + + LOG.info( "Per-collection segments: " + this.perCollection ); + + for ( int i = 0; i < segmentDirs.length; i++ ) + { + if ( this.perCollection ) + { + // Assume segmentDir is actually a 'collection' dir which + // contains a list of segments, such as: + // crawl/segments/194/segment-foo + // /segment-bar + // /segment-baz + // crawl/segments/366/segment-frotz + // /segment-fizzle + // /segment-bizzle + // The '194' and '366' are collection dirs, which contain the + // actual segment dirs. 
+ Path collectionDir = segmentDirs[i]; + + Map perCollectionSegments = (Map) this.segments.get( collectionDir.getName( ) ); + if ( perCollectionSegments == null ) + { + perCollectionSegments = new HashMap( ); + this.segments.put( collectionDir.getName( ), perCollectionSegments ); + } + + // Now, get a list of all the sub-dirs of the collectionDir, + // and create segments for them, adding them to the + // per-collection map. + Path[] perCollectionSegmentDirs = HadoopFSUtil.getPaths( fs.listStatus( collectionDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ) ); + for ( Path segmentDir : perCollectionSegmentDirs ) + { + perCollectionSegments.put( segmentDir.getName( ), new Segment( fs, segmentDir, conf ) ); + } + + addRemaps( fs, collectionDir, (Map<String,Segment>) perCollectionSegments ); + } + else + { + Path segmentDir = segmentDirs[i]; + segments.put(segmentDir.getName(), new Segment(fs, segmentDir, conf)); + } + } + + // If we not-doing perCollection segments, process a single + // "remap" file for the "segments" dir. + if ( ! this.perCollection ) + { + addRemaps( fs, new Path(segmentsDir), (Map<String,Segment>) segments ); + } + + LOG.info( "segments: " + segments ); + } + + protected void addRemaps( FileSystem fs, Path segmentDir, Map<String,Segment> segments ) + throws IOException + { + Path segmentRemapFile = new Path( segmentDir, "remap" ); + + if ( ! fs.exists( segmentRemapFile ) ) + { + LOG.warn( "Remap file doesn't exist: " + segmentRemapFile ); + + return ; + } + + // InputStream is = segmentRemapFile.getFileSystem( conf ).open( segmentRemapFile ); + InputStream is = fs.open( segmentRemapFile ); + + BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + + String line; + while ( (line = reader.readLine()) != null ) + { + String fields[] = line.trim( ).split( "\\s+" ); + + if ( fields.length < 2 ) + { + LOG.warn( "Malformed remap line, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + // Look for the "to" name in the segments. + Segment toSegment = segments.get( fields[1] ); + if ( toSegment == null ) + { + LOG.warn( "Segment remap destination doesn't exist: " + fields[1] ); + } + else + { + LOG.warn( "Remap: " + fields[0] + " => " + fields[1] ); + segments.put( fields[0], toSegment ); + } + } + } + + + public String[] getSegmentNames() { + return (String[])segments.keySet().toArray(new String[segments.size()]); + } + + public byte[] getContent(HitDetails details) throws IOException { + return getSegment(details).getContent(getUrl(details)); + } + + public ParseData getParseData(HitDetails details) throws IOException { + return getSegment(details).getParseData(getUrl(details)); + } + + public long getFetchDate(HitDetails details) throws IOException { + return getSegment(details).getCrawlDatum(getUrl(details)) + .getFetchTime(); + } + + public ParseText getParseText(HitDetails details) throws IOException { + return getSegment(details).getParseText(getUrl(details)); + } + + public Summary getSummary(HitDetails details, Query query) + throws IOException { + + if (this.summarizer == null) { return new Summary(); } + + Segment segment = getSegment(details); + ParseText parseText = segment.getParseText(getUrl(details)); + String text = (parseText != null) ? 
parseText.getText() : ""; + + return this.summarizer.getSummary(text, query); + } + + private class SummaryThread extends Thread { + private HitDetails details; + private Query query; + + private Summary summary; + private Throwable throwable; + + public SummaryThread(HitDetails details, Query query) { + this.details = details; + this.query = query; + } + + public void run() { + try { + this.summary = getSummary(details, query); + } catch (Throwable throwable) { + this.throwable = throwable; + } + } + + } + + + public Summary[] getSummary(HitDetails[] details, Query query) + throws IOException { + SummaryThread[] threads = new SummaryThread[details.length]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new SummaryThread(details[i], query); + threads[i].start(); + } + + Summary[] results = new Summary[details.length]; + for (int i = 0; i < threads.length; i++) { + try { + threads[i].join(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + if (threads[i].throwable instanceof IOException) { + throw (IOException)threads[i].throwable; + } else if (threads[i].throwable != null) { + throw new RuntimeException(threads[i].throwable); + } + results[i] = threads[i].summary; + } + return results; + } + + + private Segment getSegment(HitDetails details) + { + if ( this.perCollection ) + { + LOG.info( "getSegment: " + details ); + LOG.info( " collection: " + details.getValue("collection") ); + LOG.info( " segment : " + details.getValue("segment") ); + + String collectionId = details.getValue("collection"); + String segmentName = details.getValue("segment"); + + Map perCollectionSegments = (Map) this.segments.get( collectionId ); + + Segment segment = (Segment) perCollectionSegments.get( segmentName ); + + if ( segment == null ) + { + LOG.warn( "Didn't find segment: collection=" + collectionId + " segment=" + segmentName ); + } + + return segment; + } + else + { + LOG.info( "getSegment: " + details ); + LOG.info( " segment : " + details.getValue("segment") ); + + String segmentName = details.getValue( "segment" ); + Segment segment = (Segment) segments.get( segmentName ); + + if ( segment == null ) + { + LOG.warn( "Didn't find segment: " + segmentName ); + } + + return segment; + } + } + + private Text getUrl(HitDetails details) { + String url = details.getValue("orig"); + if (StringUtils.isBlank(url)) { + url = details.getValue("url"); + } + return new Text(url); + } + + public void close() throws IOException { + Iterator iterator = segments.values().iterator(); + while (iterator.hasNext()) { + ((Segment) iterator.next()).close(); + } + } + +} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2008-12-11 22:58:28 UTC (rev 2660) @@ -0,0 +1,179 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.indexer.FsDirectory; +import org.apache.nutch.indexer.NutchSimilarity; + +/** Implements {@link Searcher} and {@link HitDetailer} for either a single + * merged index, or a set of indexes. */ +public class IndexSearcher implements Searcher, HitDetailer { + + private org.apache.lucene.search.Searcher luceneSearcher; + private org.apache.lucene.index.IndexReader reader; + private LuceneQueryOptimizer optimizer; + private FileSystem fs; + private Configuration conf; + private QueryFilters queryFilters; + + /** Construct given a number of indexes. */ + public IndexSearcher(Path[] indexDirs, Configuration conf) throws IOException { + IndexReader[] readers = new IndexReader[indexDirs.length]; + this.conf = conf; + this.fs = FileSystem.get(conf); + for (int i = 0; i < indexDirs.length; i++) { + readers[i] = IndexReader.open(getDirectory(indexDirs[i])); + } + init(new MultiReader(readers), conf); + } + + /** Construct given a single merged index. 
*/ + public IndexSearcher(Path index, Configuration conf) + throws IOException { + this.conf = conf; + this.fs = FileSystem.get(conf); + init(IndexReader.open(getDirectory(index)), conf); + } + + private void init(IndexReader reader, Configuration conf) throws IOException { + this.reader = reader; + this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader); + this.luceneSearcher.setSimilarity(new NutchSimilarity()); + this.optimizer = new LuceneQueryOptimizer(conf); + this.queryFilters = new QueryFilters(conf); + } + + private Directory getDirectory(Path file) throws IOException { + if ("file".equals(this.fs.getUri().getScheme())) { + Path qualified = file.makeQualified(FileSystem.getLocal(conf)); + File fsLocal = new File(qualified.toUri()); + return FSDirectory.getDirectory(fsLocal.getAbsolutePath()); + } else { + return new FsDirectory(this.fs, file, false, this.conf); + } + } + + public Hits search(Query query, int numHits, + String dedupField, String sortField, boolean reverse) + + throws IOException { + org.apache.lucene.search.BooleanQuery luceneQuery = + this.queryFilters.filter(query); + + System.out.println( "Nutch query: " + query ); + System.out.println( "Lucene query: " + luceneQuery ); + + return translateHits + (optimizer.optimize(luceneQuery, luceneSearcher, numHits, + sortField, reverse), + dedupField, sortField); + } + + public String getExplanation(Query query, Hit hit) throws IOException { + return luceneSearcher.explain(this.queryFilters.filter(query), + hit.getIndexDocNo()).toHtml(); + } + + public HitDetails getDetails(Hit hit) throws IOException { + + Document doc = luceneSearcher.doc(hit.getIndexDocNo()); + + List docFields = doc.getFields(); + String[] fields = new String[docFields.size()]; + String[] values = new String[docFields.size()]; + for (int i = 0; i < docFields.size(); i++) { + Field field = (Field)docFields.get(i); + fields[i] = field.name(); + values[i] = field.stringValue(); + } + + return new HitDetails(fields, values); + } + + public HitDetails[] getDetails(Hit[] hits) throws IOException { + HitDetails[] results = new HitDetails[hits.length]; + for (int i = 0; i < hits.length; i++) + results[i] = getDetails(hits[i]); + return results; + } + + private Hits translateHits(TopDocs topDocs, + String dedupField, String sortField) + throws IOException { + + String[] dedupValues = null; + if (dedupField != null) + dedupValues = FieldCache.DEFAULT.getStrings(reader, dedupField); + + ScoreDoc[] scoreDocs = topDocs.scoreDocs; + int length = scoreDocs.length; + Hit[] hits = new Hit[length]; + for (int i = 0; i < length; i++) { + + int doc = scoreDocs[i].doc; + + WritableComparable sortValue; // convert value to writable + if (sortField == null) { + sortValue = new FloatWritable(scoreDocs[i].score); + } else { + Object raw = ((FieldDoc)scoreDocs[i]).fields[0]; + if (raw instanceof Integer) { + sortValue = new IntWritable(((Integer)raw).intValue()); + } else if (raw instanceof Float) { + sortValue = new FloatWritable(((Float)raw).floatValue()); + } else if (raw instanceof String) { + sortValue = new Text((String)raw); + } else { + throw new RuntimeException("Unknown sort value type!"); + } + } + + String dedupValue = dedupValues == null ? 
null : dedupValues[doc]; + + hits[i] = new Hit(doc, sortValue, dedupValue); + } + return new Hits(topDocs.totalHits, hits); + } + + public void close() throws IOException { + if (luceneSearcher != null) { luceneSearcher.close(); } + if (reader != null) { reader.close(); } + } + +} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java 2008-12-11 22:58:28 UTC (rev 2660) @@ -0,0 +1,333 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.IOException; +import java.net.URLEncoder; +import java.util.Map; +import java.util.HashMap; +import java.util.Set; +import java.util.HashSet; + +import javax.servlet.ServletException; +import javax.servlet.ServletConfig; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import javax.xml.parsers.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.w3c.dom.*; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + + +/** Present search results using A9's OpenSearch extensions to RSS, plus a few + * Nutch-specific extensions. 
*/ +public class OpenSearchServlet extends HttpServlet { + private static final Map NS_MAP = new HashMap(); + private int MAX_HITS_PER_PAGE; + + static { + NS_MAP.put("opensearch", "http://a9.com/-/spec/opensearchrss/1.0/"); + NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/"); + } + + private static final Set SKIP_DETAILS = new HashSet(); + static { + SKIP_DETAILS.add("url"); // redundant with RSS link + SKIP_DETAILS.add("title"); // redundant with RSS title + } + + private NutchBean bean; + private Configuration conf; + + public void init(ServletConfig config) throws ServletException { + try { + this.conf = NutchConfiguration.get(config.getServletContext()); + bean = NutchBean.get(config.getServletContext(), this.conf); + } catch (IOException e) { + throw new ServletException(e); + } + MAX_HITS_PER_PAGE = conf.getInt("searcher.max.hits.per.page", -1); + } + + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query request from " + request.getRemoteAddr()); + } + + // get parameters from request + request.setCharacterEncoding("UTF-8"); + String queryString = request.getParameter("query"); + if (queryString == null) + queryString = ""; + String urlQuery = URLEncoder.encode(queryString, "UTF-8"); + + // the query language + String queryLang = request.getParameter("lang"); + + int start = 0; // first hit to display + String startString = request.getParameter("start"); + if (startString != null) + start = Integer.parseInt(startString); + + int hitsPerPage = 10; // number of hits to display + String hitsString = request.getParameter("hitsPerPage"); + if (hitsString != null) + hitsPerPage = Integer.parseInt(hitsString); + if(MAX_HITS_PER_PAGE > 0 && hitsPerPage > MAX_HITS_PER_PAGE) + hitsPerPage = MAX_HITS_PER_PAGE; + + String sort = request.getParameter("sort"); + boolean reverse = + sort!=null && "true".equals(request.getParameter("reverse")); + + // De-Duplicate handling. Look for duplicates field and for how many + // duplicates per results to return. Default duplicates field is 'site' + // and duplicates per results default is '2'. + String dedupField = request.getParameter("dedupField"); + if (dedupField == null || dedupField.length() == 0) { + dedupField = "site"; + } + int hitsPerDup = 2; + String hitsPerDupString = request.getParameter("hitsPerDup"); + if (hitsPerDupString != null && hitsPerDupString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerDupString); + } else { + // If 'hitsPerSite' present, use that value. + String hitsPerSiteString = request.getParameter("hitsPerSite"); + if (hitsPerSiteString != null && hitsPerSiteString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerSiteString); + } + } + + // Make up query string for use later drawing the 'rss' logo. + String params = "&hitsPerPage=" + hitsPerPage + + (queryLang == null ? "" : "&lang=" + queryLang) + + (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") + + (dedupField == null ? 
"" : "&dedupField=" + dedupField)); + + Query query = Query.parse(queryString, queryLang, this.conf); + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query: " + queryString); + NutchBean.LOG.info("lang: " + queryLang); + } + + // execute the query + Hits hits; + try { + hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField, + sort, reverse); + } catch (IOException e) { + if (NutchBean.LOG.isWarnEnabled()) { + NutchBean.LOG.warn("Search Error", e); + } + hits = new Hits(0,new Hit[0]); + } + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("total hits: " + hits.getTotal()); + } + + // generate xml results + int end = (int)Math.min(hits.getLength(), start + hitsPerPage); + int length = end-start; + + Hit[] show = hits.getHits(start, end-start); + HitDetails[] details = bean.getDetails(show); + Summary[] summaries = bean.getSummary(details, query); + + String requestUrl = request.getRequestURL().toString(); + String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); + + + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + Document doc = factory.newDocumentBuilder().newDocument(); + + Element rss = addNode(doc, doc, "rss"); + addAttribute(doc, rss, "version", "2.0"); + addAttribute(doc, rss, "xmlns:opensearch", + (String)NS_MAP.get("opensearch")); + addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch")); + + Element channel = addNode(doc, rss, "channel"); + + addNode(doc, channel, "title", "Nutch: " + queryString); + addNode(doc, channel, "description", "Nutch search results for query: " + + queryString); + addNode(doc, channel, "link", + base+"/search.jsp" + +"?query="+urlQuery + +"&start="+start + +"&hitsPerDup="+hitsPerDup + +params); + + addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal()); + addNode(doc, channel, "opensearch", "startIndex", ""+start); + addNode(doc, channel, "opensearch", "itemsPerPage", ""+hitsPerPage); + + addNode(doc, channel, "nutch", "query", queryString); + + + if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show + || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){ + addNode(doc, channel, "nutch", "nextPage", requestUrl + +"?query="+urlQuery + +"&start="+end + +"&hitsPerDup="+hitsPerDup + +params); + } + + if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { + addNode(doc, channel, "nutch", "showAllHits", requestUrl + +"?query="+urlQuery + +"&hitsPerDup="+0 + +params); + } + + for (int i = 0; i < length; i++) { + Hit hit = show[i]; + HitDetails detail = details[i]; + String title = detail.getValue("title"); + String url = detail.getValue("url"); + String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); + + if (title == null || title.equals("")) { // use url for docs w/o title + title = url; + } + + Element item = addNode(doc, channel, "item"); + + addNode(doc, item, "title", title); + if (summaries[i] != null) { + addNode(doc, item, "description", summaries[i].toString() ); + } + addNode(doc, item, "link", url); + + addNode(doc, item, "nutch", "site", hit.getDedupValue()); + + addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id); + addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id + +"&query="+urlQuery+"&lang="+queryLang); + + if (hit.moreFromDupExcluded()) { + addNode(doc, item, "nutch", "moreFromSite", requestUrl + +"?query=" + +URLEncoder.encode("site:"+hit.getDedupValue() + +" "+queryString, "UTF-8") + +"&hitsPerSite="+0 + 
+params); + } + + for (int j = 0; j < detail.getLength(); j++) { // add all from detail + String field = detail.getField(j); + if (!SKIP_DETAILS.contains(field)) + addNode(doc, item, "nutch", field, detail.getValue(j)); + } + } + + // dump DOM tree + + DOMSource source = new DOMSource(doc); + TransformerFactory transFactory = TransformerFactory.newInstance(); + Transformer transformer = transFactory.newTransformer(); + transformer.setOutputProperty("indent", "yes"); + StreamResult result = new StreamResult(response.getOutputStream()); + response.setContentType("text/xml"); + transformer.transform(source, result); + + } catch (javax.xml.parsers.ParserConfigurationException e) { + throw new ServletException(e); + } catch (javax.xml.transform.TransformerException e) { + throw new ServletException(e); + } + + } + + private static Element addNode(Document doc, Node parent, String name) { + Element child = doc.createElement(name); + parent.appendChild(child); + return child; + } + + private static void addNode(Document doc, Node parent, + String name, String text) { + Element child = doc.createElement(name); + child.appendChild(doc.createTextNode(getLegalXml(text))); + parent.appendChild(child); + } + + private static void addNode(Document doc, Node parent, + String ns, String name, String text) { + Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name); + child.appendChild(doc.createTextNode(getLegalXml(text))); + parent.appendChild(child); + } + + private static void addAttribute(Document doc, Element node, + String name, String value) { + Attr attribute = doc.createAttribute(name); + attribute.setValue(getLegalXml(value)); + node.getAttributes().setNamedItem(attribute); + } + + /* + * Ensure string is legal xml. + * @param text String to verify. + * @return Passed <code>text</code> or a new string with illegal + * characters removed if any found in <code>text</code>. + * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char + */ + protected static String getLegalXml(final String text) { + if (text == null) { + return null; + } + StringBuffer buffer = null; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (!isLegalXml(c)) { + if (buffer == null) { + // Start up a buffer. Copy characters here from now on + // now we've found at least one bad character in original. + buffer = new StringBuffer(text.length()); + buffer.append(text.substring(0, i)); + } + } else { + if (buffer != null) { + buffer.append(c); + } + } + } + return (buffer != null)? buffer.toString(): text; + } + + private static boolean isLegalXml(final char c) { + return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff) + || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-12-15 02:11:18
Revision: 2664
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2664&view=rev
Author:   binzino
Date:     2008-12-15 01:47:48 +0000 (Mon, 15 Dec 2008)

Log Message:
-----------
Added own version of OpenSearch servlet which adds some XML elements and has a few other enhancements. Also revised the sample XSLT to take advantage of these changes in the OpenSearch servlet.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2008-12-15 01:47:48 UTC (rev 2664)
@@ -0,0 +1,372 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax;
+
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.HashSet;
+
+import javax.servlet.ServletException;
+import javax.servlet.ServletConfig;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import javax.xml.parsers.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.w3c.dom.*;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.nutch.searcher.Hit;
+import org.apache.nutch.searcher.HitDetails;
+import org.apache.nutch.searcher.Hits;
+import org.apache.nutch.searcher.NutchBean;
+import org.apache.nutch.searcher.Query;
+import org.apache.nutch.searcher.Summary;
+
+/**
+ * Present search results using A9's OpenSearch extensions to RSS,
+ * plus a few Nutch-specific extensions.
+ */ +public class OpenSearchServlet extends HttpServlet +{ + private static final Map NS_MAP = new HashMap(); + private int MAX_HITS_PER_PAGE; + + static { + NS_MAP.put("opensearch", "http://a9.com/-/spec/opensearchrss/1.0/"); + NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/"); + } + + private static final Set SKIP_DETAILS = new HashSet(); + static { + SKIP_DETAILS.add("url"); // redundant with RSS link + SKIP_DETAILS.add("title"); // redundant with RSS title + } + + private NutchBean bean; + private Configuration conf; + + public void init(ServletConfig config) throws ServletException { + try { + this.conf = NutchConfiguration.get(config.getServletContext()); + bean = NutchBean.get(config.getServletContext(), this.conf); + } catch (IOException e) { + throw new ServletException(e); + } + MAX_HITS_PER_PAGE = conf.getInt("searcher.max.hits.per.page", -1); + } + + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + long responseTime = System.nanoTime( ); + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query request from " + request.getRemoteAddr()); + } + + // get parameters from request + request.setCharacterEncoding("UTF-8"); + String queryString = request.getParameter("query"); + if (queryString == null) + queryString = ""; + String urlQuery = URLEncoder.encode(queryString, "UTF-8"); + + // the query language + String queryLang = request.getParameter("lang"); + + int start = 0; // first hit to display + String startString = request.getParameter("start"); + if (startString != null) + start = Integer.parseInt(startString); + + int hitsPerPage = 10; // number of hits to display + String hitsString = request.getParameter("hitsPerPage"); + if (hitsString != null) + hitsPerPage = Integer.parseInt(hitsString); + if(MAX_HITS_PER_PAGE > 0 && hitsPerPage > MAX_HITS_PER_PAGE) + hitsPerPage = MAX_HITS_PER_PAGE; + + String sort = request.getParameter("sort"); + boolean reverse = sort != null && "true".equals(request.getParameter("reverse")); + + // De-Duplicate handling. Look for duplicates field and for how many + // duplicates per results to return. Default duplicates field is 'site' + // and duplicates per results default is '2'. + String dedupField = request.getParameter("dedupField"); + if (dedupField == null || dedupField.length() == 0) { + dedupField = "site"; + } + int hitsPerDup = 2; + String hitsPerDupString = request.getParameter("hitsPerDup"); + String hitsPerSiteString = request.getParameter("hitsPerSite"); + if (hitsPerDupString != null && hitsPerDupString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerDupString); + } else { + // If 'hitsPerSite' present, use that value. + if (hitsPerSiteString != null && hitsPerSiteString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerSiteString); + } + } + + // Make up query string for use later drawing the 'rss' logo. + String params = "&hitsPerPage=" + hitsPerPage + + (queryLang == null ? "" : "&lang=" + queryLang) + + (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") + + (dedupField == null ? 
"" : "&dedupField=" + dedupField)); + + Query query = Query.parse(queryString, queryLang, this.conf); + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query: " + queryString); + NutchBean.LOG.info("lang: " + queryLang); + } + + // execute the query + Hits hits; + try { + hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField, sort, reverse); + } catch (IOException e) { + if (NutchBean.LOG.isWarnEnabled()) { + NutchBean.LOG.warn("Search Error", e); + } + hits = new Hits(0,new Hit[0]); + } + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("total hits: " + hits.getTotal()); + } + + responseTime = System.nanoTime( ) - responseTime; + + // generate xml results + int end = (int)Math.min(hits.getLength(), start + hitsPerPage); + int length = end-start; + + Hit[] show = hits.getHits(start, end-start); + HitDetails[] details = bean.getDetails(show); + Summary[] summaries = bean.getSummary(details, query); + + String requestUrl = request.getRequestURL().toString(); + String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); + + + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + Document doc = factory.newDocumentBuilder().newDocument(); + + Element rss = addNode(doc, doc, "rss"); + addAttribute(doc, rss, "version", "2.0"); + addAttribute(doc, rss, "xmlns:opensearch", + (String)NS_MAP.get("opensearch")); + addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch")); + + Element channel = addNode(doc, rss, "channel"); + + addNode(doc, channel, "title", "Nutch: " + queryString); + addNode(doc, channel, "description", "Nutch search results for query: " + + queryString); + addNode(doc, channel, "link", + base+"/search.jsp" + +"?query="+urlQuery + +"&start="+start + +"&hitsPerDup="+hitsPerDup + +params); + + addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal()); + addNode(doc, channel, "opensearch", "startIndex", ""+start); + addNode(doc, channel, "opensearch", "itemsPerPage", ""+hitsPerPage); + + addNode(doc, channel, "nutch", "query", queryString); + addNode(doc, channel, "nutch", "responseTime", Double.toString( ((long) responseTime / 1000 / 1000 ) / 1000.0 ) ); + + // Add a <nutch:urlParams> element containing a list of all the URL parameters. + Element urlParams = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:urlParams" ); + channel.appendChild( urlParams ); + + for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) ) + { + String key = e.getKey( ); + for ( String value : e.getValue( ) ) + { + Element urlParam = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:param" ); + addAttribute( doc, urlParam, "name", key ); + addAttribute( doc, urlParam, "value", value ); + urlParams.appendChild(urlParam); + } + } + + // Hmm, we should indicate whether or not the "totalResults" + // number as being exact some other way; perhaps just have a + // <nutch:totalIsExact>true</nutch:totalIsExact> element. + /* + if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show + || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){ + addNode(doc, channel, "nutch", "nextPage", requestUrl + +"?query="+urlQuery + +"&start="+end + +"&hitsPerDup="+hitsPerDup + +params); + } + */ + + // Same here, this seems odd. 
+ /* + if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { + addNode(doc, channel, "nutch", "showAllHits", requestUrl + +"?query="+urlQuery + +"&hitsPerDup="+0 + +params); + } + */ + + for (int i = 0; i < length; i++) { + Hit hit = show[i]; + HitDetails detail = details[i]; + String title = detail.getValue("title"); + String url = detail.getValue("url"); + String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); + + if (title == null || title.equals("")) { // use url for docs w/o title + title = url; + } + + Element item = addNode(doc, channel, "item"); + + addNode(doc, item, "title", title); + if (summaries[i] != null) { + addNode(doc, item, "description", summaries[i].toString() ); + } + addNode(doc, item, "link", url); + + addNode(doc, item, "nutch", "site", hit.getDedupValue()); + + addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id); + addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id + +"&query="+urlQuery+"&lang="+queryLang); + + // Probably don't need this as the XML processor/front-end can + // easily do this themselves. + if (hit.moreFromDupExcluded()) { + addNode(doc, item, "nutch", "moreFromSite", requestUrl + +"?query=" + +URLEncoder.encode("site:"+hit.getDedupValue() + +" "+queryString, "UTF-8") + +"&hitsPerSite="+0 + +params); + } + + for (int j = 0; j < detail.getLength(); j++) { // add all from detail + String field = detail.getField(j); + if (!SKIP_DETAILS.contains(field)) + addNode(doc, item, "nutch", field, detail.getValue(j)); + } + } + + // dump DOM tree + + DOMSource source = new DOMSource(doc); + TransformerFactory transFactory = TransformerFactory.newInstance(); + Transformer transformer = transFactory.newTransformer(); + transformer.setOutputProperty("indent", "yes"); + StreamResult result = new StreamResult(response.getOutputStream()); + response.setContentType("text/xml"); + transformer.transform(source, result); + + } catch (javax.xml.parsers.ParserConfigurationException e) { + throw new ServletException(e); + } catch (javax.xml.transform.TransformerException e) { + throw new ServletException(e); + } + + } + + private static Element addNode(Document doc, Node parent, String name) { + Element child = doc.createElement(name); + parent.appendChild(child); + return child; + } + + private static void addNode(Document doc, Node parent, + String name, String text) { + if ( text == null ) text = ""; + Element child = doc.createElement(name); + child.appendChild(doc.createTextNode(getLegalXml(text))); + parent.appendChild(child); + } + + private static void addNode(Document doc, Node parent, + String ns, String name, String text) { + if ( text == null ) text = ""; + Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name); + child.appendChild(doc.createTextNode(getLegalXml(text))); + parent.appendChild(child); + } + + private static void addAttribute(Document doc, Element node, + String name, String value) { + Attr attribute = doc.createAttribute(name); + attribute.setValue(getLegalXml(value)); + node.getAttributes().setNamedItem(attribute); + } + + /* + * Ensure string is legal xml. + * @param text String to verify. + * @return Passed <code>text</code> or a new string with illegal + * characters removed if any found in <code>text</code>. 
+ * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char + */ + protected static String getLegalXml(final String text) { + if (text == null) { + return null; + } + StringBuffer buffer = null; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (!isLegalXml(c)) { + if (buffer == null) { + // Start up a buffer. Copy characters here from now on + // now we've found at least one bad character in original. + buffer = new StringBuffer(text.length()); + buffer.append(text.substring(0, i)); + } + } else { + if (buffer != null) { + buffer.append(c); + } + } + } + return (buffer != null)? buffer.toString(): text; + } + + private static boolean isLegalXml(final char c) { + return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff) + || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff); + } + +} Modified: trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl 2008-12-14 21:10:33 UTC (rev 2663) +++ trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl 2008-12-15 01:47:48 UTC (rev 2664) @@ -115,42 +115,49 @@ <span class="searchFields"> Search for <input id="query" name="query" type="text" size="40" value="{nutch:query}" /> + + <!-- Create hidden form fields for the rest of the URL parameters --> + <xsl:for-each select="nutch:urlParams/nutch:param[@name!='start' and @name!='query']"> + <xsl:element name="input" namespace="http://www.w3.org/1999/xhtml"> + <xsl:attribute name="type">hidden</xsl:attribute> + <xsl:attribute name="name" ><xsl:value-of select="@name" /></xsl:attribute> + <xsl:attribute name="value"><xsl:value-of select="@value" /></xsl:attribute> + </xsl:element> + </xsl:for-each> + <input type="submit" value="Search"/> </span> </form> </div> - <div style="font-size: 8pt; margin:0; padding:0 0 0.5em 0;">Results <xsl:value-of select="opensearch:startIndex + 1" />-<xsl:value-of select="opensearch:startIndex + opensearch:itemsPerPage" /> of about <xsl:value-of select="opensearch:totalResults" /> <span style="margin-left: 1em;"><a href="{nutch:nextPage}">Next</a></span></div> + <div style="font-size: 8pt; margin:0; padding:0 0 0.5em 0;">Results <xsl:value-of select="opensearch:startIndex + 1" />-<xsl:value-of select="opensearch:startIndex + opensearch:itemsPerPage" /> of about <xsl:value-of select="opensearch:totalResults" /> <span style="margin-left: 1em;"></span></div> <!-- Search results --> <ol start="{opensearch:startIndex + 1}"> <xsl:apply-templates select="item" /> </ol> <!-- Generate list of page links --> <center> - <xsl:if test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) != 1"> - <a href="search?query={nutch:query}&start={(floor(opensearch:startIndex div opensearch:itemsPerPage) - 1) * opensearch:itemsPerPage}">«</a><xsl:text> </xsl:text> - </xsl:if> - <xsl:choose> - <xsl:when test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) < 11"> - <xsl:call-template name="pageLinks" > - <xsl:with-param name="begin" select="1" /> - <xsl:with-param name="end" select="21" /> - <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> - </xsl:call-template> - </xsl:when> - <xsl:otherwise> - <xsl:call-template name="pageLinks" > - <xsl:with-param name="begin" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 - 10" /> - <xsl:with-param name="end" 
select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 + 11" /> - <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> - </xsl:call-template> - </xsl:otherwise> - </xsl:choose> - <a href="{nutch:nextPage}">»</a> + <xsl:call-template name="pageLinks"> + <xsl:with-param name="labelPrevious" select="'«'" /> + <xsl:with-param name="labelNext" select="'»'" /> + </xsl:call-template> </center> </body> </html> </xsl:template> + +<!-- ====================================================================== + NutchWAX XSLT template/fuction library. + + The idea is that the above xhtml code is what most NutchWAX users + will modify to tailor to their own look and feel. The stuff + below implements the core logic for generating results lists, + page links, etc. + + Hopefully NutchWAX web developers will be able to easily edit the + above xhtml and css and won't have to change the below. + ====================================================================== --> + <!-- Template to emit a search result as an HTML list item (<li/>). --> <xsl:template match="item"> @@ -176,32 +183,99 @@ <xsl:value-of select="substring(.,1,4)" /><xsl:text>-</xsl:text><xsl:value-of select="substring(.,5,2)" /><xsl:text>-</xsl:text><xsl:value-of select="substring(.,7,2)" /><xsl:text> </xsl:text> </xsl:template> -<!-- Template to generate a list of numbered links to results pages. +<!-- Template to emit a list of numbered page links, *including* + "previous" and "next" links on either end, using the given labels. Parameters: + labelPrevious Link text for "previous page" link + labelNext Link text for "next page" link + --> +<xsl:template name="pageLinks"> + <xsl:param name="labelPrevious" /> + <xsl:param name="labelNext" /> + <!-- If we are on any page past the first, emit a "previous" link --> + <xsl:if test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) != 1"> + <xsl:call-template name="pageLink"> + <xsl:with-param name="pageNum" select="floor(opensearch:startIndex div opensearch:itemsPerPage)" /> + <xsl:with-param name="linkText" select="$labelPrevious" /> + </xsl:call-template> + <xsl:text> </xsl:text> + </xsl:if> + <!-- Now, emit numbered page links --> + <xsl:choose> + <xsl:when test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) < 11"> + <xsl:call-template name="numberedPageLinks" > + <xsl:with-param name="begin" select="1" /> + <xsl:with-param name="end" select="21" /> + <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> + </xsl:call-template> + </xsl:when> + <xsl:otherwise> + <xsl:call-template name="numberedPageLinks" > + <xsl:with-param name="begin" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 - 10" /> + <xsl:with-param name="end" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 + 11" /> + <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> + </xsl:call-template> + </xsl:otherwise> + </xsl:choose> + <!-- Lastly, emit a "next" link. --> + <xsl:text> </xsl:text> + <xsl:call-template name="pageLink"> + <xsl:with-param name="pageNum" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 2" /> + <xsl:with-param name="linkText" select="$labelNext" /> + </xsl:call-template> +</xsl:template> + +<!-- Template to emit a list of numbered links to results pages. 
+ Parameters: begin starting # inclusive end ending # exclusive current the current page, don't emit a link --> -<xsl:template name="pageLinks"> +<xsl:template name="numberedPageLinks"> <xsl:param name="begin" /> <xsl:param name="end" /> <xsl:param name="current" /> <xsl:if test="$begin < $end"> - <xsl:choose> - <xsl:when test="$begin = $current" > - <xsl:value-of select="$current" /> - </xsl:when> - <xsl:otherwise> - <a href="?query={nutch:query}&start={($begin -1) * opensearch:itemsPerPage}&hitsPerPage={opensearch:itemsPerPage}"><xsl:value-of select="$begin" /></a> - </xsl:otherwise> - </xsl:choose> - <xsl:text> </xsl:text> - <xsl:call-template name="pageLinks"> - <xsl:with-param name="begin" select="$begin + 1" /> - <xsl:with-param name="end" select="$end" /> - <xsl:with-param name="current" select="$current" /> + <xsl:choose> + <xsl:when test="$begin = $current" > + <xsl:value-of select="$current" /> + </xsl:when> + <xsl:otherwise> + <xsl:call-template name="pageLink" > + <xsl:with-param name="pageNum" select="$begin" /> + <xsl:with-param name="linkText" select="$begin" /> </xsl:call-template> + </xsl:otherwise> + </xsl:choose> + <xsl:text> </xsl:text> + <xsl:call-template name="numberedPageLinks"> + <xsl:with-param name="begin" select="$begin + 1" /> + <xsl:with-param name="end" select="$end" /> + <xsl:with-param name="current" select="$current" /> + </xsl:call-template> </xsl:if> </xsl:template> +<!-- Template to emit a single page link. All of the URL parameters + listed in the OpenSearch results are included in the link. + Parmeters: + pageNum page number of the link + linkText text of the link + --> +<xsl:template name="pageLink"> + <xsl:param name="pageNum" /> + <xsl:param name="linkText" /> + <xsl:element name="a" namespace="http://www.w3.org/1999/xhtml"> + <xsl:attribute name="href"> + <xsl:text>?</xsl:text> + <xsl:for-each select="nutch:urlParams/nutch:param[@name!='start']"> + <xsl:value-of select="@name" /><xsl:text>=</xsl:text><xsl:value-of select="@value" /> + <xsl:text>&</xsl:text> + </xsl:for-each> + <xsl:text>start=</xsl:text><xsl:value-of select="($pageNum -1) * opensearch:itemsPerPage" /> + </xsl:attribute> + <xsl:value-of select="$linkText" /> + </xsl:element> +</xsl:template> + </xsl:stylesheet> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
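Because the servlet above emits namespace-qualified elements (opensearch:totalResults, nutch:urlParams, and so on), a consumer has to parse the feed with namespace awareness enabled, just as the servlet builds it. A minimal client sketch under the assumption of a local NutchWAX deployment; the host, port, and query string are hypothetical:

    import java.io.InputStream;
    import java.net.URL;
    import javax.xml.parsers.DocumentBuilderFactory;
    import org.w3c.dom.Document;

    public class OpenSearchClient {
        public static void main(String[] args) throws Exception {
            // Hypothetical deployment; adjust host, port, and query as needed.
            URL feed = new URL("http://localhost:8080/opensearch?query=example&hitsPerPage=10");

            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setNamespaceAware(true);  // required to see opensearch:/nutch: elements
            try (InputStream in = feed.openStream()) {
                Document doc = factory.newDocumentBuilder().parse(in);

                String os = "http://a9.com/-/spec/opensearchrss/1.0/";
                String total = doc.getElementsByTagNameNS(os, "totalResults")
                                  .item(0).getTextContent();
                System.out.println("total results: " + total);

                // Plain (un-namespaced) RSS <item> elements carry the per-hit
                // fields, including the nutch:* extensions this servlet adds.
                int items = doc.getElementsByTagName("item").getLength();
                System.out.println("items on this page: " + items);
            }
        }
    }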
From: <bi...@us...> - 2008-12-16 06:41:48
|
Revision: 2674 http://archive-access.svn.sourceforge.net/archive-access/?rev=2674&view=rev Author: binzino Date: 2008-12-16 06:41:44 +0000 (Tue, 16 Dec 2008) Log Message: ----------- Moved web files into src/nutch sub-tree so they will be copied into Nutch corresponding sources directories for inclusion in Nutch ant build targets. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/jsp/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/jsp/search.xsl trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/web.xml Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/web/ Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/jsp/search.xsl =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/jsp/search.xsl (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/jsp/search.xsl 2008-12-16 06:41:44 UTC (rev 2674) @@ -0,0 +1,281 @@ +<?xml version="1.0" encoding="utf-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<xsl:stylesheet + version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:nutch="http://www.nutch.org/opensearchrss/1.0/" + xmlns:opensearch="http://a9.com/-/spec/opensearchrss/1.0/" +> +<xsl:output method="xml" /> + +<xsl:template match="rss/channel"> + <html xmlns="http://www.w3.org/1999/xhtml"> + <head> + <title><xsl:value-of select="title" /></title> + <style media="all" lang="en" type="text/css"> + body + { + padding : 20px; + margin : 0; + font-family : Verdana; sans-serif; + font-size : 9pt; + color : #000000; + background-color: #ffffff; + } + .pageTitle + { + font-size : 125% ; + font-weight : bold ; + text-align : center ; + padding-bottom : 2em ; + } + .searchForm + { + margin : 20px 0 5px 0; + padding-bottom : 0px; + border-bottom : 1px solid black; + } + .searchResult + { + margin : 0; + padding : 0; + } + .searchResult h1 + { + margin : 0 0 5px 0 ; + padding : 0 ; + font-size : 120%; + } + .searchResult .details + { + font-size: 80%; + color: green; + } + .searchResult .dates + { + font-size: 80%; + } + .searchResult .dates a + { + color: #3366cc; + } + form#searchForm + { + margin : 0; padding: 0 0 10px 0; + } + .searchFields + { + padding : 3px 0; + } + .searchFields input + { + margin : 0 0 0 15px; + padding : 0; + } + input#query + { + margin : 0; + } + ol + { + margin : 5px 0 0 0; + padding : 0 0 0 2em; + } + ol li + { + margin : 0 0 15px 0; + } + </style> + </head> + <body> + <!-- Page header: title and search form --> + <div class="pageTitle" > + NutchWAX Sample XSLT + </div> + <div> + This simple XSLT demonstrates the transformation of OpenSearch XML results into a fully-functional, human-friendly HTML search page. No JSP needed. + </div> + <div class="searchForm"> + <form id="searchForm" name="searchForm" method="get" action="search" > + <span class="searchFields"> + Search for + <input id="query" name="query" type="text" size="40" value="{nutch:query}" /> + + <!-- Create hidden form fields for the rest of the URL parameters --> + <xsl:for-each select="nutch:urlParams/nutch:param[@name!='start' and @name!='query']"> + <xsl:element name="input" namespace="http://www.w3.org/1999/xhtml"> + <xsl:attribute name="type">hidden</xsl:attribute> + <xsl:attribute name="name" ><xsl:value-of select="@name" /></xsl:attribute> + <xsl:attribute name="value"><xsl:value-of select="@value" /></xsl:attribute> + </xsl:element> + </xsl:for-each> + + <input type="submit" value="Search"/> + </span> + </form> + </div> + <div style="font-size: 8pt; margin:0; padding:0 0 0.5em 0;">Results <xsl:value-of select="opensearch:startIndex + 1" />-<xsl:value-of select="opensearch:startIndex + opensearch:itemsPerPage" /> of about <xsl:value-of select="opensearch:totalResults" /> <span style="margin-left: 1em;"></span></div> + <!-- Search results --> + <ol start="{opensearch:startIndex + 1}"> + <xsl:apply-templates select="item" /> + </ol> + <!-- Generate list of page links --> + <center> + <xsl:call-template name="pageLinks"> + <xsl:with-param name="labelPrevious" select="'«'" /> + <xsl:with-param name="labelNext" select="'»'" /> + </xsl:call-template> + </center> + </body> +</html> +</xsl:template> + + +<!-- ====================================================================== + NutchWAX XSLT template/fuction library. + + The idea is that the above xhtml code is what most NutchWAX users + will modify to tailor to their own look and feel. The stuff + below implements the core logic for generating results lists, + page links, etc. 
+ + Hopefully NutchWAX web developers will be able to easily edit the + above xhtml and css and won't have to change the below. + ====================================================================== --> + +<!-- Template to emit a search result as an HTML list item (<li/>). + --> +<xsl:template match="item"> + <li> + <div class="searchResult"> + <h1><a href="{concat('http://wayback.archive-it.org/',nutch:collection,'/',nutch:date,'/',link)}"><xsl:value-of select="title" /></a></h1> + <div> + <xsl:value-of select="description" /> + </div> + <div class="details"> + <xsl:value-of select="link" /> - <xsl:value-of select="round( nutch:length div 1024 )"/>k - <xsl:value-of select="nutch:type" /> + </div> + <div class="dates"> + <a href="{concat('http://wayback.archive-it.org/',nutch:collection,'/*/',link)}">All versions</a> - <a href="?query={../nutch:query} site:{nutch:site}&hitsPerSite=0">More from <xsl:value-of select="nutch:site" /></a> + </div> + </div> + </li> +</xsl:template> + +<!-- Template to emit a date in YYYY/MM/DD format + --> +<xsl:template match="nutch:date" > + <xsl:value-of select="substring(.,1,4)" /><xsl:text>-</xsl:text><xsl:value-of select="substring(.,5,2)" /><xsl:text>-</xsl:text><xsl:value-of select="substring(.,7,2)" /><xsl:text> </xsl:text> +</xsl:template> + +<!-- Template to emit a list of numbered page links, *including* + "previous" and "next" links on either end, using the given labels. + Parameters: + labelPrevious Link text for "previous page" link + labelNext Link text for "next page" link + --> +<xsl:template name="pageLinks"> + <xsl:param name="labelPrevious" /> + <xsl:param name="labelNext" /> + <!-- If we are on any page past the first, emit a "previous" link --> + <xsl:if test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) != 1"> + <xsl:call-template name="pageLink"> + <xsl:with-param name="pageNum" select="floor(opensearch:startIndex div opensearch:itemsPerPage)" /> + <xsl:with-param name="linkText" select="$labelPrevious" /> + </xsl:call-template> + <xsl:text> </xsl:text> + </xsl:if> + <!-- Now, emit numbered page links --> + <xsl:choose> + <xsl:when test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) < 11"> + <xsl:call-template name="numberedPageLinks" > + <xsl:with-param name="begin" select="1" /> + <xsl:with-param name="end" select="21" /> + <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> + </xsl:call-template> + </xsl:when> + <xsl:otherwise> + <xsl:call-template name="numberedPageLinks" > + <xsl:with-param name="begin" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 - 10" /> + <xsl:with-param name="end" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 + 11" /> + <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> + </xsl:call-template> + </xsl:otherwise> + </xsl:choose> + <!-- Lastly, emit a "next" link. --> + <xsl:text> </xsl:text> + <xsl:call-template name="pageLink"> + <xsl:with-param name="pageNum" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 2" /> + <xsl:with-param name="linkText" select="$labelNext" /> + </xsl:call-template> +</xsl:template> + +<!-- Template to emit a list of numbered links to results pages. 
+ Parameters: + begin starting # inclusive + end ending # exclusive + current the current page, don't emit a link + --> +<xsl:template name="numberedPageLinks"> + <xsl:param name="begin" /> + <xsl:param name="end" /> + <xsl:param name="current" /> + <xsl:if test="$begin < $end"> + <xsl:choose> + <xsl:when test="$begin = $current" > + <xsl:value-of select="$current" /> + </xsl:when> + <xsl:otherwise> + <xsl:call-template name="pageLink" > + <xsl:with-param name="pageNum" select="$begin" /> + <xsl:with-param name="linkText" select="$begin" /> + </xsl:call-template> + </xsl:otherwise> + </xsl:choose> + <xsl:text> </xsl:text> + <xsl:call-template name="numberedPageLinks"> + <xsl:with-param name="begin" select="$begin + 1" /> + <xsl:with-param name="end" select="$end" /> + <xsl:with-param name="current" select="$current" /> + </xsl:call-template> + </xsl:if> +</xsl:template> + +<!-- Template to emit a single page link. All of the URL parameters + listed in the OpenSearch results are included in the link. + Parmeters: + pageNum page number of the link + linkText text of the link + --> +<xsl:template name="pageLink"> + <xsl:param name="pageNum" /> + <xsl:param name="linkText" /> + <xsl:element name="a" namespace="http://www.w3.org/1999/xhtml"> + <xsl:attribute name="href"> + <xsl:text>?</xsl:text> + <xsl:for-each select="nutch:urlParams/nutch:param[@name!='start']"> + <xsl:value-of select="@name" /><xsl:text>=</xsl:text><xsl:value-of select="@value" /> + <xsl:text>&</xsl:text> + </xsl:for-each> + <xsl:text>start=</xsl:text><xsl:value-of select="($pageNum -1) * opensearch:itemsPerPage" /> + </xsl:attribute> + <xsl:value-of select="$linkText" /> + </xsl:element> +</xsl:template> + +</xsl:stylesheet> Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/web.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/web.xml (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/web/web.xml 2008-12-16 06:41:44 UTC (rev 2674) @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="ISO-8859-1"?> +<!DOCTYPE web-app + PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN" + "http://java.sun.com/dtd/web-app_2_3.dtd"> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<web-app> + +<!-- order is very important here --> + +<listener> + <listener-class>org.apache.nutch.searcher.NutchBean$NutchBeanConstructor</listener-class> + <listener-class>org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor</listener-class> +</listener> + +<servlet> + <servlet-name>Cached</servlet-name> + <servlet-class>org.apache.nutch.servlet.Cached</servlet-class> +</servlet> + +<servlet> + <servlet-name>OpenSearch</servlet-name> + <servlet-class>org.archive.nutchwax.OpenSearchServlet</servlet-class> +</servlet> + +<servlet-mapping> + <servlet-name>Cached</servlet-name> + <url-pattern>/servlet/cached</url-pattern> +</servlet-mapping> + +<servlet-mapping> + <servlet-name>OpenSearch</servlet-name> + <url-pattern>/opensearch</url-pattern> +</servlet-mapping> + +<servlet-mapping> + <servlet-name>OpenSearch</servlet-name> + <url-pattern>/search</url-pattern> +</servlet-mapping> + +<filter> + <filter-name>XSLT Filter</filter-name> + <filter-class>org.archive.nutchwax.XSLTFilter</filter-class> + <init-param> + <param-name>xsltUrl</param-name> + <param-value>style/search.xsl</param-value> + </init-param> +</filter> + +<filter-mapping> + <filter-name>XSLT Filter</filter-name> + <url-pattern>/search</url-pattern> +</filter-mapping> + +<welcome-file-list> + <welcome-file>search.html</welcome-file> + <welcome-file>index.html</welcome-file> + <welcome-file>index.jsp</welcome-file> +</welcome-file-list> + +<taglib> + <taglib-uri>http://jakarta.apache.org/taglibs/i18n</taglib-uri> + <taglib-location>/WEB-INF/taglibs-i18n.tld</taglib-location> + </taglib> + +</web-app> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
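With these mappings, /opensearch returns the raw feed, while /search runs the very same servlet but has its XML rewritten to HTML by the XSLT filter configured above. The transformation can be reproduced offline with plain JAXP, which is handy when editing the stylesheet. A sketch, where the input and output file names are illustrative:

    import java.io.File;
    import javax.xml.transform.Transformer;
    import javax.xml.transform.TransformerFactory;
    import javax.xml.transform.stream.StreamResult;
    import javax.xml.transform.stream.StreamSource;

    public class PreviewSearchXsl {
        public static void main(String[] args) throws Exception {
            // A saved /opensearch response and the stylesheet under edit.
            StreamSource xml = new StreamSource(new File("opensearch-response.xml"));
            StreamSource xsl = new StreamSource(new File("search.xsl"));

            // Same Templates/Transformer machinery the XSLTFilter applies
            // to each /search response.
            Transformer transformer = TransformerFactory.newInstance().newTransformer(xsl);
            transformer.transform(xml, new StreamResult(new File("search-preview.html")));
            System.out.println("wrote search-preview.html");
        }
    }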
From: <bi...@us...> - 2009-03-04 01:18:45
|
Revision: 2689 http://archive-access.svn.sourceforge.net/archive-access/?rev=2689&view=rev Author: binzino Date: 2009-03-04 01:18:44 +0000 (Wed, 04 Mar 2009) Log Message: ----------- Added boolean configuration property nutchwax.import.store.content to determine whether or not the Importer stores the full content in the segment's "content" directory. Removed a useless debug message from the end of the Import job. Removed searcher.max.hits from nutch-site.xml as it actually causes lots of problems with search-time site-based de-dup. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2009-03-03 20:34:38 UTC (rev 2688) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2009-03-04 01:18:44 UTC (rev 2689) @@ -456,8 +456,12 @@ try { - output.collect( key, new NutchWritable( datum ) ); - output.collect( key, new NutchWritable( content ) ); + output.collect( key, new NutchWritable( datum ) ); + + if ( jobConf.getBoolean( "nutchwax.import.store.content", false ) ) + { + output.collect( key, new NutchWritable( content ) ); + } if ( parseResult != null ) { @@ -649,9 +653,6 @@ RunningJob rj = JobClient.runJob( job ); - // Emit job id and status. - System.out.println( "JOB_STATUS: " + rj.getID( ) + ": " + (rj.isSuccessful( ) ? "SUCCESS" : "FAIL" ) ); - return rj.isSuccessful( ) ? 0 : 1; } catch ( Exception e ) Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-03-03 20:34:38 UTC (rev 2688) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml 2009-03-04 01:18:44 UTC (rev 2689) @@ -137,6 +137,25 @@ <value>1048576</value> </property> +<!-- Whether or not we store the full content in the segment's + "content" directory. Most NutchWAX users are also using Wayback + to serve the archived content, so there's no need for NutchWAX to + keep a "cached" copy as well. + + Setting to 'true' yields the same behavior as in previous + versions of NutchWAX, and as in Nutch. The content is stored in + the segment's "content" directory. + + Setting to 'false' results in an empty "content" directory in the + segment. The content is not stored. + + Default value is 'false'. + --> +<property> + <name>nutchwax.import.store.content</name> + <value>false</value> +</property> + <!-- Enable per-collection segment sub-dirs, e.g. segments/<collectionId>/segment1 /segment2 @@ -156,11 +175,6 @@ </property> <property> - <name>searcher.max.hits</name> - <value>1000</value> -</property> - -<property> <name>searcher.summary.context</name> <value>8</value> </property> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
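Since the Importer consults the flag with jobConf.getBoolean("nutchwax.import.store.content", false), restoring the cached-copy behavior is a one-line configuration override. A sketch of reading and setting the property programmatically; the surrounding job setup is elided, and in practice the value would normally be set in nutch-site.xml as shown above:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.util.NutchConfiguration;

    public class StoreContentFlag {
        public static void main(String[] args) {
            Configuration conf = NutchConfiguration.create();

            // Default is false: the segment gets an empty "content" directory
            // and Wayback is expected to serve the archived pages.
            boolean store = conf.getBoolean("nutchwax.import.store.content", false);
            System.out.println("store content: " + store);

            // Opt back into the pre-2689 (and stock Nutch) behavior of
            // keeping the full content in the segment.
            conf.setBoolean("nutchwax.import.store.content", true);
        }
    }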
From: <bi...@us...> - 2009-10-28 21:24:58
|
Revision: 2860 http://archive-access.svn.sourceforge.net/archive-access/?rev=2860&view=rev Author: binzino Date: 2009-10-28 21:24:26 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Moved to Nutch source overlay so that edits in Nutch sources can access this class. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-28 03:40:14 UTC (rev 2859) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-28 21:24:26 UTC (rev 2860) @@ -1,616 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * ARCHIVE: This must be in the lucene index package because it needs - * to call protected methods on other IndexReader objects. - */ -package org.apache.lucene.index; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.FieldSelector; -import org.apache.lucene.document.FieldSelectorResult; -import org.apache.lucene.document.Fieldable; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermDocs; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.index.TermFreqVector; -import org.apache.lucene.index.TermPositions; -import org.apache.lucene.index.TermVectorMapper; - -import java.io.IOException; -import java.util.*; - - -/** An IndexReader which reads multiple, parallel indexes. Each index added - * must have the same number of documents, but typically each contains - * different fields. Each document contains the union of the fields of all - * documents with the same document number. When searching, matches for a - * query term are from the first index added that has the field. - * - * <p>This is useful, e.g., with collections that have large fields which - * change rarely and small fields that change more frequently. The smaller - * fields may be re-indexed in a new index and both indexes may be searched - * together. 
- * - * <p><strong>Warning:</strong> It is up to you to make sure all indexes - * are created and modified the same way. For example, if you add - * documents to one index, you need to add the same documents in the - * same order to the other indexes. <em>Failure to do so will result in - * undefined behavior</em>. - */ -public class ArchiveParallelReader extends IndexReader { - private List readers = new ArrayList(); - private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close - boolean incRefReaders = false; - private SortedMap fieldToReader = new TreeMap(); - - private int maxDoc; - private int numDocs; - private boolean hasDeletions; - - /** Construct a ArchiveParallelReader. - * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p> - */ - public ArchiveParallelReader() throws IOException { this(true); } - - /** Construct a ArchiveParallelReader. - * @param closeSubReaders indicates whether the subreaders should be closed - * when this ArchiveParallelReader is closed - */ - public ArchiveParallelReader(boolean closeSubReaders) throws IOException { - super(); - this.incRefReaders = !closeSubReaders; - } - - /** Add an IndexReader. - * @throws IOException if there is a low-level IO error - */ - public void add(IndexReader reader) throws IOException - { - ensureOpen(); - if (readers.size() == 0) { - this.maxDoc = reader.maxDoc(); - this.numDocs = reader.numDocs(); - this.hasDeletions = reader.hasDeletions(); - } - - if (reader.maxDoc() != maxDoc) // check compatibility - throw new IllegalArgumentException - ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc()); - if (reader.numDocs() != numDocs) - throw new IllegalArgumentException - ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); - - Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); - Iterator i = fields.iterator(); - while (i.hasNext()) { // update fieldToReader map - String field = (String)i.next(); - if (fieldToReader.get(field) == null) - fieldToReader.put(field, reader); - } - - readers.add(reader); - - if (incRefReaders) { - reader.incRef(); - } - decrefOnClose.add(Boolean.valueOf(incRefReaders)); - } - - /** - * Tries to reopen the subreaders. - * <br> - * If one or more subreaders could be re-opened (i. e. subReader.reopen() - * returned a new instance != subReader), then a new ArchiveParallelReader instance - * is returned, otherwise this instance is returned. - * <p> - * A re-opened instance might share one or more subreaders with the old - * instance. Index modification operations result in undefined behavior - * when performed before the old instance is closed. - * (see {@link IndexReader#reopen()}). - * <p> - * If subreaders are shared, then the reference count of those - * readers is increased to ensure that the subreaders remain open - * until the last referring reader is closed. 
- * - * @throws CorruptIndexException if the index is corrupt - * @throws IOException if there is a low-level IO error - */ - public IndexReader reopen() throws CorruptIndexException, IOException { - ensureOpen(); - - boolean reopened = false; - List newReaders = new ArrayList(); - List newDecrefOnClose = new ArrayList(); - - boolean success = false; - - try { - - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = oldReader.reopen(); - newReaders.add(newReader); - // if at least one of the subreaders was updated we remember that - // and return a new MultiReader - if (newReader != oldReader) { - reopened = true; - } - } - - if (reopened) { - ArchiveParallelReader pr = new ArchiveParallelReader(); - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = (IndexReader) newReaders.get(i); - if (newReader == oldReader) { - newDecrefOnClose.add(Boolean.TRUE); - newReader.incRef(); - } else { - // this is a new subreader instance, so on close() we don't - // decRef but close it - newDecrefOnClose.add(Boolean.FALSE); - } - pr.add(newReader); - } - pr.decrefOnClose = newDecrefOnClose; - pr.incRefReaders = incRefReaders; - success = true; - return pr; - } else { - success = true; - // No subreader was refreshed - return this; - } - } finally { - if (!success && reopened) { - for (int i = 0; i < newReaders.size(); i++) { - IndexReader r = (IndexReader) newReaders.get(i); - if (r != null) { - try { - if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) { - r.decRef(); - } else { - r.close(); - } - } catch (IOException ignore) { - // keep going - we want to clean up as much as possible - } - } - } - } - } - } - - - public int numDocs() { - // Don't call ensureOpen() here (it could affect performance) - return numDocs; - } - - public int maxDoc() { - // Don't call ensureOpen() here (it could affect performance) - return maxDoc; - } - - public boolean hasDeletions() { - // Don't call ensureOpen() here (it could affect performance) - return hasDeletions; - } - - // check first reader - public boolean isDeleted(int n) { - // Don't call ensureOpen() here (it could affect performance) - if (readers.size() > 0) - return ((IndexReader)readers.get(0)).isDeleted(n); - return false; - } - - // delete in all readers - protected void doDelete(int n) throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).deleteDocument(n); - } - hasDeletions = true; - } - - /** - * @see org.apache.lucene.index.ParallelReader.doUndeleteAll - */ - protected void doUndeleteAll() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).undeleteAll(); - } - hasDeletions = false; - } - - /** - * <p><strong>ARCHIVE</strong> modification</p> - * <p>Return a <code>Document</code> with fields merged from parallel - * indices. The values for a given field will <strong>only</strong> - * come from the first index that has the field. 
This matches the - * searching behavior where a field is only searched in the first - * index that has the field.</p> - * <p>This differs from the bundled Lucene <code>ParallelReader</code>, - * which adds all values from every index that has the field.</p> - * <p>The <code>fieldSelector<code> parameter is ignored.</p> - * <h3>Implementation Notes</h3> - * <p>Since getting the document from the reader is the expensive - * operation, we only get it once from each reader. Once we've - * gotten the document from the reader, we iterate through the - * fields and only copy those fields that are mapped to the reader.</p> - * <p>The first implementation iterated through the field names, - * getting the document from the corresponding reader for each - * field name (10 fields => 10 document gets) which was a big - * performance hit.</p> - * <p>In this implementation, there are only as many document gets as - * there are readers.</p> - * @param n ordinal position of document to return - * @param fieldSelector ignored - * @return the document with field values assembled from parallel indicdes - * @throws CorruptIndexException if the index is corrupt - * @throws IOException if there is a low-level IO error - */ - public Document document(int n, FieldSelector fieldSelector) - throws CorruptIndexException, IOException - { - ensureOpen(); - Document result = new Document(); - - for ( IndexReader reader : (List<IndexReader>) readers ) - { - Document d = reader.document( n ); - - for ( Fieldable f : ((List<Fieldable>) d.getFields()) ) - { - if ( fieldToReader.get( f.name( ) ) == reader ) - { - result.add( f ); - } - } - } - - return result; - } - - // get all vectors - public TermFreqVector[] getTermFreqVectors(int n) throws IOException { - ensureOpen(); - ArrayList results = new ArrayList(); - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); - TermFreqVector vector = reader.getTermFreqVector(n, field); - if (vector != null) - results.add(vector); - } - return (TermFreqVector[]) - results.toArray(new TermFreqVector[results.size()]); - } - - public TermFreqVector getTermFreqVector(int n, String field) - throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - return reader==null ? null : reader.getTermFreqVector(n, field); - } - - - public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - if (reader != null) { - reader.getTermFreqVector(docNumber, field, mapper); - } - } - - public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { - ensureOpen(); - ensureOpen(); - - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); - reader.getTermFreqVector(docNumber, field, mapper); - } - - } - - public boolean hasNorms(String field) throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - return reader==null ? false : reader.hasNorms(field); - } - - public byte[] norms(String field) throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - return reader==null ? 
null : reader.norms(field); - } - - public void norms(String field, byte[] result, int offset) - throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - if (reader!=null) - reader.norms(field, result, offset); - } - - protected void doSetNorm(int n, String field, byte value) - throws CorruptIndexException, IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - if (reader!=null) - reader.doSetNorm(n, field, value); - } - - public TermEnum terms() throws IOException { - ensureOpen(); - return new ParallelTermEnum(); - } - - public TermEnum terms(Term term) throws IOException { - ensureOpen(); - return new ParallelTermEnum(term); - } - - public int docFreq(Term term) throws IOException { - ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); - return reader==null ? 0 : reader.docFreq(term); - } - - public TermDocs termDocs(Term term) throws IOException { - ensureOpen(); - return new ParallelTermDocs(term); - } - - public TermDocs termDocs() throws IOException { - ensureOpen(); - return new ParallelTermDocs(); - } - - public TermPositions termPositions(Term term) throws IOException { - ensureOpen(); - return new ParallelTermPositions(term); - } - - public TermPositions termPositions() throws IOException { - ensureOpen(); - return new ParallelTermPositions(); - } - - /** - * Checks recursively if all subreaders are up to date. - */ - public boolean isCurrent() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isCurrent()) { - return false; - } - } - - // all subreaders are up to date - return true; - } - - /** - * Checks recursively if all subindexes are optimized - */ - public boolean isOptimized() { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isOptimized()) { - return false; - } - } - - // all subindexes are optimized - return true; - } - - - /** Not implemented. 
- * @throws UnsupportedOperationException - */ - public long getVersion() { - throw new UnsupportedOperationException("ArchiveParallelReader does not support this method."); - } - - // for testing - IndexReader[] getSubReaders() { - return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); - } - - protected void doCommit() throws IOException { - for (int i = 0; i < readers.size(); i++) - ((IndexReader)readers.get(i)).commit(); - } - - protected synchronized void doClose() throws IOException { - for (int i = 0; i < readers.size(); i++) { - if (((Boolean) decrefOnClose.get(i)).booleanValue()) { - ((IndexReader)readers.get(i)).decRef(); - } else { - ((IndexReader)readers.get(i)).close(); - } - } - } - - public Collection getFieldNames (IndexReader.FieldOption fieldNames) { - ensureOpen(); - Set fieldSet = new HashSet(); - for (int i = 0; i < readers.size(); i++) { - IndexReader reader = ((IndexReader)readers.get(i)); - Collection names = reader.getFieldNames(fieldNames); - fieldSet.addAll(names); - } - return fieldSet; - } - - private class ParallelTermEnum extends TermEnum { - private String field; - private Iterator fieldIterator; - private TermEnum termEnum; - - public ParallelTermEnum() throws IOException { - if ( fieldToReader.isEmpty( ) ) return ; - - field = (String)fieldToReader.firstKey(); - if (field != null) - termEnum = ((IndexReader)fieldToReader.get(field)).terms(); - } - - public ParallelTermEnum(Term term) throws IOException { - field = term.field(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); - if (reader!=null) - termEnum = reader.terms(term); - } - - public boolean next() throws IOException { - if (termEnum==null) - return false; - - // another term in this field? - if (termEnum.next() && termEnum.term().field()==field) - return true; // yes, keep going - - termEnum.close(); // close old termEnum - - // find the next field with terms, if any - if (fieldIterator==null) { - fieldIterator = fieldToReader.tailMap(field).keySet().iterator(); - fieldIterator.next(); // Skip field to get next one - } - while (fieldIterator.hasNext()) { - field = (String) fieldIterator.next(); - termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, "")); - Term term = termEnum.term(); - if (term!=null && term.field()==field) - return true; - else - termEnum.close(); - } - - return false; // no more fields - } - - public Term term() { - if (termEnum==null) - return null; - - return termEnum.term(); - } - - public int docFreq() { - if (termEnum==null) - return 0; - - return termEnum.docFreq(); - } - - public void close() throws IOException { - if (termEnum!=null) - termEnum.close(); - } - - } - - // wrap a TermDocs in order to support seek(Term) - private class ParallelTermDocs implements TermDocs { - protected TermDocs termDocs; - - public ParallelTermDocs() {} - public ParallelTermDocs(Term term) throws IOException { seek(term); } - - public int doc() { return termDocs.doc(); } - public int freq() { return termDocs.freq(); } - - public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); - termDocs = reader!=null ? 
reader.termDocs(term) : null; - } - - public void seek(TermEnum termEnum) throws IOException { - seek(termEnum.term()); - } - - public boolean next() throws IOException { - if (termDocs==null) - return false; - - return termDocs.next(); - } - - public int read(final int[] docs, final int[] freqs) throws IOException { - if (termDocs==null) - return 0; - - return termDocs.read(docs, freqs); - } - - public boolean skipTo(int target) throws IOException { - if (termDocs==null) - return false; - - return termDocs.skipTo(target); - } - - public void close() throws IOException { - if (termDocs!=null) - termDocs.close(); - } - - } - - private class ParallelTermPositions - extends ParallelTermDocs implements TermPositions { - - public ParallelTermPositions() {} - public ParallelTermPositions(Term term) throws IOException { seek(term); } - - public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); - termDocs = reader!=null ? reader.termPositions(term) : null; - } - - public int nextPosition() throws IOException { - // It is an error to call this if there is no next position, e.g. if termDocs==null - return ((TermPositions)termDocs).nextPosition(); - } - - public int getPayloadLength() { - return ((TermPositions)termDocs).getPayloadLength(); - } - - public byte[] getPayload(byte[] data, int offset) throws IOException { - return ((TermPositions)termDocs).getPayload(data, offset); - } - - - // TODO: Remove warning after API has been finalized - public boolean isPayloadAvailable() { - return ((TermPositions) termDocs).isPayloadAvailable(); - } - } - -} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java 2009-10-28 21:24:26 UTC (rev 2860) @@ -0,0 +1,616 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * ARCHIVE: This must be in the lucene index package because it needs + * to call protected methods on other IndexReader objects. 
+ */ +package org.apache.lucene.index; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.FieldSelectorResult; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.TermVectorMapper; + +import java.io.IOException; +import java.util.*; + + +/** An IndexReader which reads multiple, parallel indexes. Each index added + * must have the same number of documents, but typically each contains + * different fields. Each document contains the union of the fields of all + * documents with the same document number. When searching, matches for a + * query term are from the first index added that has the field. + * + * <p>This is useful, e.g., with collections that have large fields which + * change rarely and small fields that change more frequently. The smaller + * fields may be re-indexed in a new index and both indexes may be searched + * together. + * + * <p><strong>Warning:</strong> It is up to you to make sure all indexes + * are created and modified the same way. For example, if you add + * documents to one index, you need to add the same documents in the + * same order to the other indexes. <em>Failure to do so will result in + * undefined behavior</em>. + */ +public class ArchiveParallelReader extends IndexReader { + private List readers = new ArrayList(); + private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close + boolean incRefReaders = false; + private SortedMap fieldToReader = new TreeMap(); + + private int maxDoc; + private int numDocs; + private boolean hasDeletions; + + /** Construct a ArchiveParallelReader. + * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p> + */ + public ArchiveParallelReader() throws IOException { this(true); } + + /** Construct a ArchiveParallelReader. + * @param closeSubReaders indicates whether the subreaders should be closed + * when this ArchiveParallelReader is closed + */ + public ArchiveParallelReader(boolean closeSubReaders) throws IOException { + super(); + this.incRefReaders = !closeSubReaders; + } + + /** Add an IndexReader. + * @throws IOException if there is a low-level IO error + */ + public void add(IndexReader reader) throws IOException + { + ensureOpen(); + if (readers.size() == 0) { + this.maxDoc = reader.maxDoc(); + this.numDocs = reader.numDocs(); + this.hasDeletions = reader.hasDeletions(); + } + + if (reader.maxDoc() != maxDoc) // check compatibility + throw new IllegalArgumentException + ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc()); + if (reader.numDocs() != numDocs) + throw new IllegalArgumentException + ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); + + Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); + Iterator i = fields.iterator(); + while (i.hasNext()) { // update fieldToReader map + String field = (String)i.next(); + if (fieldToReader.get(field) == null) + fieldToReader.put(field, reader); + } + + readers.add(reader); + + if (incRefReaders) { + reader.incRef(); + } + decrefOnClose.add(Boolean.valueOf(incRefReaders)); + } + + /** + * Tries to reopen the subreaders. 
+ * <br> + * If one or more subreaders could be re-opened (i. e. subReader.reopen() + * returned a new instance != subReader), then a new ArchiveParallelReader instance + * is returned, otherwise this instance is returned. + * <p> + * A re-opened instance might share one or more subreaders with the old + * instance. Index modification operations result in undefined behavior + * when performed before the old instance is closed. + * (see {@link IndexReader#reopen()}). + * <p> + * If subreaders are shared, then the reference count of those + * readers is increased to ensure that the subreaders remain open + * until the last referring reader is closed. + * + * @throws CorruptIndexException if the index is corrupt + * @throws IOException if there is a low-level IO error + */ + public IndexReader reopen() throws CorruptIndexException, IOException { + ensureOpen(); + + boolean reopened = false; + List newReaders = new ArrayList(); + List newDecrefOnClose = new ArrayList(); + + boolean success = false; + + try { + + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = (IndexReader) readers.get(i); + IndexReader newReader = oldReader.reopen(); + newReaders.add(newReader); + // if at least one of the subreaders was updated we remember that + // and return a new MultiReader + if (newReader != oldReader) { + reopened = true; + } + } + + if (reopened) { + ArchiveParallelReader pr = new ArchiveParallelReader(); + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = (IndexReader) readers.get(i); + IndexReader newReader = (IndexReader) newReaders.get(i); + if (newReader == oldReader) { + newDecrefOnClose.add(Boolean.TRUE); + newReader.incRef(); + } else { + // this is a new subreader instance, so on close() we don't + // decRef but close it + newDecrefOnClose.add(Boolean.FALSE); + } + pr.add(newReader); + } + pr.decrefOnClose = newDecrefOnClose; + pr.incRefReaders = incRefReaders; + success = true; + return pr; + } else { + success = true; + // No subreader was refreshed + return this; + } + } finally { + if (!success && reopened) { + for (int i = 0; i < newReaders.size(); i++) { + IndexReader r = (IndexReader) newReaders.get(i); + if (r != null) { + try { + if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) { + r.decRef(); + } else { + r.close(); + } + } catch (IOException ignore) { + // keep going - we want to clean up as much as possible + } + } + } + } + } + } + + + public int numDocs() { + // Don't call ensureOpen() here (it could affect performance) + return numDocs; + } + + public int maxDoc() { + // Don't call ensureOpen() here (it could affect performance) + return maxDoc; + } + + public boolean hasDeletions() { + // Don't call ensureOpen() here (it could affect performance) + return hasDeletions; + } + + // check first reader + public boolean isDeleted(int n) { + // Don't call ensureOpen() here (it could affect performance) + if (readers.size() > 0) + return ((IndexReader)readers.get(0)).isDeleted(n); + return false; + } + + // delete in all readers + protected void doDelete(int n) throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + ((IndexReader)readers.get(i)).deleteDocument(n); + } + hasDeletions = true; + } + + /** + * @see org.apache.lucene.index.ParallelReader.doUndeleteAll + */ + protected void doUndeleteAll() throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + ((IndexReader)readers.get(i)).undeleteAll(); + } + hasDeletions = false; + } + + /** + * <p><strong>ARCHIVE</strong> 
modification</p>
+   * <p>Return a <code>Document</code> with fields merged from parallel
+   * indices.  The values for a given field will <strong>only</strong>
+   * come from the first index that has the field.  This matches the
+   * searching behavior where a field is only searched in the first
+   * index that has the field.</p>
+   * <p>This differs from the bundled Lucene <code>ParallelReader</code>,
+   * which adds all values from every index that has the field.</p>
+   * <p>The <code>fieldSelector</code> parameter is ignored.</p>
+   * <h3>Implementation Notes</h3>
+   * <p>Since getting the document from the reader is the expensive
+   * operation, we only get it once from each reader.  Once we've
+   * gotten the document from the reader, we iterate through the
+   * fields and only copy those fields that are mapped to the reader.</p>
+   * <p>The first implementation iterated through the field names,
+   * getting the document from the corresponding reader for each
+   * field name (10 fields => 10 document gets) which was a big
+   * performance hit.</p>
+   * <p>In this implementation, there are only as many document gets as
+   * there are readers.</p>
+   * @param n ordinal position of document to return
+   * @param fieldSelector ignored
+   * @return the document with field values assembled from parallel indices
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+  public Document document(int n, FieldSelector fieldSelector)
+    throws CorruptIndexException, IOException
+  {
+    ensureOpen();
+    Document result = new Document();
+
+    for ( IndexReader reader : (List<IndexReader>) readers )
+      {
+        Document d = reader.document( n );
+
+        for ( Fieldable f : ((List<Fieldable>) d.getFields()) )
+          {
+            if ( fieldToReader.get( f.name( ) ) == reader )
+              {
+                result.add( f );
+              }
+          }
+      }
+
+    return result;
+  }
+
+  // get all vectors
+  public TermFreqVector[] getTermFreqVectors(int n) throws IOException {
+    ensureOpen();
+    ArrayList results = new ArrayList();
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      TermFreqVector vector = reader.getTermFreqVector(n, field);
+      if (vector != null)
+        results.add(vector);
+    }
+    return (TermFreqVector[])
+      results.toArray(new TermFreqVector[results.size()]);
+  }
+
+  public TermFreqVector getTermFreqVector(int n, String field)
+    throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ? null : reader.getTermFreqVector(n, field);
+  }
+
+
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader != null) {
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+  }
+
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+
+  }
+
+  public boolean hasNorms(String field) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ?
false : reader.hasNorms(field); + } + + public byte[] norms(String field) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + return reader==null ? null : reader.norms(field); + } + + public void norms(String field, byte[] result, int offset) + throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + reader.norms(field, result, offset); + } + + protected void doSetNorm(int n, String field, byte value) + throws CorruptIndexException, IOException { + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + reader.doSetNorm(n, field, value); + } + + public TermEnum terms() throws IOException { + ensureOpen(); + return new ParallelTermEnum(); + } + + public TermEnum terms(Term term) throws IOException { + ensureOpen(); + return new ParallelTermEnum(term); + } + + public int docFreq(Term term) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + return reader==null ? 0 : reader.docFreq(term); + } + + public TermDocs termDocs(Term term) throws IOException { + ensureOpen(); + return new ParallelTermDocs(term); + } + + public TermDocs termDocs() throws IOException { + ensureOpen(); + return new ParallelTermDocs(); + } + + public TermPositions termPositions(Term term) throws IOException { + ensureOpen(); + return new ParallelTermPositions(term); + } + + public TermPositions termPositions() throws IOException { + ensureOpen(); + return new ParallelTermPositions(); + } + + /** + * Checks recursively if all subreaders are up to date. + */ + public boolean isCurrent() throws CorruptIndexException, IOException { + for (int i = 0; i < readers.size(); i++) { + if (!((IndexReader)readers.get(i)).isCurrent()) { + return false; + } + } + + // all subreaders are up to date + return true; + } + + /** + * Checks recursively if all subindexes are optimized + */ + public boolean isOptimized() { + for (int i = 0; i < readers.size(); i++) { + if (!((IndexReader)readers.get(i)).isOptimized()) { + return false; + } + } + + // all subindexes are optimized + return true; + } + + + /** Not implemented. 
+ * @throws UnsupportedOperationException + */ + public long getVersion() { + throw new UnsupportedOperationException("ArchiveParallelReader does not support this method."); + } + + // for testing + IndexReader[] getSubReaders() { + return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); + } + + protected void doCommit() throws IOException { + for (int i = 0; i < readers.size(); i++) + ((IndexReader)readers.get(i)).commit(); + } + + protected synchronized void doClose() throws IOException { + for (int i = 0; i < readers.size(); i++) { + if (((Boolean) decrefOnClose.get(i)).booleanValue()) { + ((IndexReader)readers.get(i)).decRef(); + } else { + ((IndexReader)readers.get(i)).close(); + } + } + } + + public Collection getFieldNames (IndexReader.FieldOption fieldNames) { + ensureOpen(); + Set fieldSet = new HashSet(); + for (int i = 0; i < readers.size(); i++) { + IndexReader reader = ((IndexReader)readers.get(i)); + Collection names = reader.getFieldNames(fieldNames); + fieldSet.addAll(names); + } + return fieldSet; + } + + private class ParallelTermEnum extends TermEnum { + private String field; + private Iterator fieldIterator; + private TermEnum termEnum; + + public ParallelTermEnum() throws IOException { + if ( fieldToReader.isEmpty( ) ) return ; + + field = (String)fieldToReader.firstKey(); + if (field != null) + termEnum = ((IndexReader)fieldToReader.get(field)).terms(); + } + + public ParallelTermEnum(Term term) throws IOException { + field = term.field(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader!=null) + termEnum = reader.terms(term); + } + + public boolean next() throws IOException { + if (termEnum==null) + return false; + + // another term in this field? + if (termEnum.next() && termEnum.term().field()==field) + return true; // yes, keep going + + termEnum.close(); // close old termEnum + + // find the next field with terms, if any + if (fieldIterator==null) { + fieldIterator = fieldToReader.tailMap(field).keySet().iterator(); + fieldIterator.next(); // Skip field to get next one + } + while (fieldIterator.hasNext()) { + field = (String) fieldIterator.next(); + termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, "")); + Term term = termEnum.term(); + if (term!=null && term.field()==field) + return true; + else + termEnum.close(); + } + + return false; // no more fields + } + + public Term term() { + if (termEnum==null) + return null; + + return termEnum.term(); + } + + public int docFreq() { + if (termEnum==null) + return 0; + + return termEnum.docFreq(); + } + + public void close() throws IOException { + if (termEnum!=null) + termEnum.close(); + } + + } + + // wrap a TermDocs in order to support seek(Term) + private class ParallelTermDocs implements TermDocs { + protected TermDocs termDocs; + + public ParallelTermDocs() {} + public ParallelTermDocs(Term term) throws IOException { seek(term); } + + public int doc() { return termDocs.doc(); } + public int freq() { return termDocs.freq(); } + + public void seek(Term term) throws IOException { + IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + termDocs = reader!=null ? 
reader.termDocs(term) : null;
+    }
+
+    public void seek(TermEnum termEnum) throws IOException {
+      seek(termEnum.term());
+    }
+
+    public boolean next() throws IOException {
+      if (termDocs==null)
+        return false;
+
+      return termDocs.next();
+    }
+
+    public int read(final int[] docs, final int[] freqs) throws IOException {
+      if (termDocs==null)
+        return 0;
+
+      return termDocs.read(docs, freqs);
+    }
+
+    public boolean skipTo(int target) throws IOException {
+      if (termDocs==null)
+        return false;
+
+      return termDocs.skipTo(target);
+    }
+
+    public void close() throws IOException {
+      if (termDocs!=null)
+        termDocs.close();
+    }
+
+  }
+
+  private class ParallelTermPositions
+    extends ParallelTermDocs implements TermPositions {
+
+    public ParallelTermPositions() {}
+    public ParallelTermPositions(Term term) throws IOException { seek(term); }
+
+    public void seek(Term term) throws IOException {
+      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+      termDocs = reader!=null ? reader.termPositions(term) : null;
+    }
+
+    public int nextPosition() throws IOException {
+      // It is an error to call this if there is no next position, e.g. if termDocs==null
+      return ((TermPositions)termDocs).nextPosition();
+    }
+
+    public int getPayloadLength() {
+      return ((TermPositions)termDocs).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions)termDocs).getPayload(data, offset);
+    }
+
+
+    // TODO: Remove warning after API has been finalized
+    public boolean isPayloadAvailable() {
+      return ((TermPositions) termDocs).isPayloadAvailable();
+    }
+  }
+
+}
|
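For readers following along: the class added above behaves like Lucene's stock ParallelReader, except for the one-index-per-field rule its javadoc describes. Below is a minimal usage sketch, not part of the commit; the index paths and the "date" field are hypothetical, and it assumes the Lucene 2.x API that NutchWAX builds against (IndexReader.open(String), Hits).

    import java.io.IOException;

    import org.apache.lucene.index.ArchiveParallelReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermQuery;

    public class ParallelSearchSketch
    {
      public static void main( String[] args ) throws IOException
      {
        // Both indexes must have been built with identical document numbering.
        ArchiveParallelReader reader = new ArchiveParallelReader();
        reader.add( IndexReader.open( "/tmp/index-main"  ) );  // large, rarely re-built fields
        reader.add( IndexReader.open( "/tmp/index-dates" ) );  // small, frequently re-built fields

        // A field resolves to the first added index that contains it, both
        // for search and for the stored values returned by document().
        IndexSearcher searcher = new IndexSearcher( reader );
        Hits hits = searcher.search( new TermQuery( new Term( "date", "20081231" ) ) );
        System.out.println( "hits: " + hits.length( ) );

        searcher.close( );
        reader.close( );  // the no-arg constructor closes subreaders on close
      }
    }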
From: <bi...@us...> - 2010-03-18 22:40:45
|
Revision: 2979
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2979&view=rev
Author:   binzino
Date:     2010-03-18 22:40:39 +0000 (Thu, 18 Mar 2010)

Log Message:
-----------
WAX-74. Add support for storing field value in compressed form.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
    trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml	2010-03-18 22:11:53 UTC (rev 2978)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/conf/nutch-site.xml	2010-03-18 22:40:39 UTC (rev 2979)
@@ -44,11 +44,10 @@
   <name>nutchwax.filter.index</name>
   <value>
     title:false:true:tokenized
-    content:false:false:tokenized
+    content:false:compress:tokenized
     site:false:false:untokenized
     url:false:true:tokenized
-    digest:false:true:no
     collection:true:true:no_norms
     date:true:true:no_norms

Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2010-03-18 22:11:53 UTC (rev 2978)
+++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java	2010-03-18 22:40:39 UTC (rev 2979)
@@ -36,6 +36,7 @@
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.nutch.indexer.lucene.LuceneWriter.INDEX;
+import org.apache.nutch.indexer.lucene.LuceneWriter.STORE;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
@@ -74,7 +75,7 @@
         String srcKey = spec[0];

         boolean lowerCase = true;
-        boolean store     = true;
+        STORE   store     = STORE.YES;
         INDEX   index     = INDEX.TOKENIZED;
         boolean exclusive = true;
         String  destKey   = srcKey;
@@ -91,7 +92,10 @@
                     "no_norms". equals(spec[3]) ? INDEX.NO_NORMS :
                                                   INDEX.NO;
           case 3:
-            store = Boolean.parseBoolean( spec[2] );
+            //store = Boolean.parseBoolean( spec[2] );
+            store = "true".    equals(spec[2]) ? STORE.YES      :
+                    "compress".equals(spec[2]) ? STORE.COMPRESS :
+                                                 STORE.NO;
           case 2:
             lowerCase = Boolean.parseBoolean( spec[1] );
           case 1:
@@ -109,12 +113,12 @@
   {
     String  srcKey;
     boolean lowerCase;
-    boolean store;
+    STORE   store;
     INDEX   index;
     boolean exclusive;
     String  destKey;

-    public FieldSpecification( String srcKey, boolean lowerCase, boolean store, INDEX index, boolean exclusive, String destKey )
+    public FieldSpecification( String srcKey, boolean lowerCase, STORE store, INDEX index, boolean exclusive, String destKey )
     {
       this.srcKey    = srcKey;
       this.lowerCase = lowerCase;
@@ -147,6 +151,12 @@
           try
             {
               value = (new URL( meta.get( "url" ) ) ).getHost( );
+
+              // Strip off any "www." header.
+              if ( value.startsWith( "www." ) )
+                {
+                  value = value.substring( 4 );
+                }
             }
           catch ( MalformedURLException mue ) { /* Eat it */ }
         }
@@ -171,6 +181,11 @@
           int p = value.indexOf( ';' );
           if ( p >= 0 ) value = value.substring( 0, p );
         }
+      else if ( "collection".equals( spec.srcKey ) )
+        {
+          // Use value given in config first, otherwise what's in the metadata object.
+          value = conf.get( "nutchwax.index.collection", meta.get( spec.srcKey ) );
+        }
       else
         {
           value = meta.get( spec.srcKey );
@@ -188,7 +203,7 @@
           doc.removeField( spec.destKey );
         }

-      if ( spec.store || spec.index != INDEX.NO )
+      if ( spec.store != STORE.NO || spec.index != INDEX.NO )
        {
          doc.add( spec.destKey, value );
        }
@@ -202,13 +217,13 @@
   {
     for ( FieldSpecification spec : this.fieldSpecs )
       {
-        if ( ! spec.store && spec.index == INDEX.NO )
+        if ( spec.store == STORE.NO && spec.index == INDEX.NO )
           {
             continue ;
           }

         LuceneWriter.addFieldOptions( spec.destKey,
-                                      spec.store ? LuceneWriter.STORE.YES : LuceneWriter.STORE.NO,
+                                      spec.store,
                                       spec.index,
                                       conf );
       }
|
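A note on the spec format, from reading the fall-through switch in the parser above: each entry in the nutchwax.filter.index value is src:lowercase:store:index, optionally extended with an exclusive flag and a destination key. With this change the store position accepts "true", "false", or "compress" instead of a plain boolean, mapping to LuceneWriter.STORE.YES, STORE.NO, or STORE.COMPRESS. At the Lucene 2.x level this corresponds to Field.Store.COMPRESS. The sketch below shows the equivalent raw Lucene call; it is illustrative only, not the plugin's actual code path, and pageText is a made-up argument.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    public class CompressedFieldSketch
    {
      public static Document example( String pageText )
      {
        Document doc = new Document();

        // Same effect as "content:false:compress:tokenized" above: the stored
        // copy of the value is kept compressed on disk, while the field is
        // still tokenized and indexed for full-text search.
        doc.add( new Field( "content", pageText,
                            Field.Store.COMPRESS, Field.Index.TOKENIZED ) );
        return doc;
      }
    }

Trading CPU for disk this way makes sense for the "content" field, which is large, stored only so hits can be excerpted, and rarely retrieved relative to how often it is searched.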