From: <bi...@us...> - 2010-03-16 21:28:28
|
Revision: 2971 http://archive-access.svn.sourceforge.net/archive-access/?rev=2971&view=rev Author: binzino Date: 2010-03-16 21:28:15 +0000 (Tue, 16 Mar 2010) Log Message: ----------- Removed from this release. Might make a re-appearance in a future release. Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java 2010-02-23 00:50:11 UTC (rev 2970) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java 2010-03-16 21:28:15 UTC (rev 2971) @@ -1,355 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.archive.nutchwax; - -import java.io.IOException; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.FileInputStream; -import java.util.Comparator; -import java.util.Collections; -import java.util.List; -import java.util.ArrayList; -import java.util.LinkedList; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import org.jdom.Document; -import org.jdom.Element; -import org.jdom.Namespace; -import org.jdom.output.XMLOutputter; - - -/** - * - */ -public class OpenSearchMaster -{ - public static final Log LOG = LogFactory.getLog( OpenSearchMaster.class ); - - List<OpenSearchSlave> slaves = new ArrayList<OpenSearchSlave>( ); - long timeout = 0; - - public OpenSearchMaster( String slavesFile, long timeout ) - throws IOException - { - this( slavesFile ); - this.timeout = timeout; - } - - public OpenSearchMaster( String slavesFile ) - throws IOException - { - BufferedReader r = null; - try - { - r = new BufferedReader( new InputStreamReader( new FileInputStream( slavesFile ), "utf-8" ) ); - - String line; - while ( (line = r.readLine()) != null ) - { - line = line.trim(); - if ( line.length() == 0 || line.charAt( 0 ) == '#' ) - { - // Ignore it. - continue ; - } - - OpenSearchSlave slave = new OpenSearchSlave( line ); - - this.slaves.add( slave ); - } - } - finally - { - try { if ( r != null ) r.close(); } catch ( IOException ioe ) { } - } - - } - - public Document query( String query, int startIndex, int numResults, int hitsPerSite ) - { - long startTime = System.currentTimeMillis( ); - - List<SlaveQueryThread> slaveThreads = new ArrayList<SlaveQueryThread>( this.slaves.size() ); - - for ( OpenSearchSlave slave : this.slaves ) - { - SlaveQueryThread sqt = new SlaveQueryThread( slave, query, 0, (startIndex+numResults), hitsPerSite ); - - sqt.start( ); - - slaveThreads.add( sqt ); - } - - waitForThreads( slaveThreads, this.timeout ); - - LinkedList<Element> items = new LinkedList<Element>( ); - long totalResults = 0; - - for ( SlaveQueryThread sqt : slaveThreads ) - { - if ( sqt.throwable != null ) - { - continue ; - } - - try - { - // Dump all the results ("item" elements) into a single list. - Element channel = sqt.response.getRootElement( ).getChild( "channel" ); - items.addAll( (List<Element>) channel.getChildren( "item" ) ); - channel.removeChildren( "item" ); - - totalResults += Integer.parseInt( channel.getChild( "totalResults", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ).getTextTrim( ) ); - } - catch ( Exception e ) - { - LOG.error( "Error processing response from slave: " + sqt.slave, e ); - } - - } - - if ( items.size( ) > 0 && hitsPerSite > 0 ) - { - Collections.sort( items, new ElementSiteThenScoreComparator( ) ); - - LinkedList<Element> collapsed = new LinkedList<Element>( ); - - collapsed.add( items.removeFirst( ) ); - - int count = 1; - for ( Element item : items ) - { - String lastSite = collapsed.getLast( ).getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim( ); - - if ( lastSite.length( ) == 0 || - !lastSite.equals( item.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim( ) ) ) - { - collapsed.add( item ); - count = 1; - } - else if ( count < hitsPerSite ) - { - collapsed.add( item ); - count++; - } - } - - // Replace the list of items with the collapsed list. - items = collapsed; - } - - Collections.sort( items, new ElementScoreComparator( ) ); - - // Build the final results OpenSearch XML document. - Element channel = new Element( "channel" ); - channel.addContent( new Element( "title" ) ); - channel.addContent( new Element( "description" ) ); - channel.addContent( new Element( "link" ) ); - - Element eTotalResults = new Element( "totalResults", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ); - Element eStartIndex = new Element( "startIndex", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ); - Element eItemsPerPage = new Element( "itemsPerPage", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ); - - eTotalResults.setText( Long.toString( totalResults ) ); - eStartIndex. setText( Long.toString( startIndex ) ); - eItemsPerPage.setText( Long.toString( numResults ) ); - - channel.addContent( eTotalResults ); - channel.addContent( eStartIndex ); - channel.addContent( eItemsPerPage ); - - // Get a sub-list of only the items we want: [startIndex,(startIndex+numResults)] - List<Element> subList = items.subList( Math.min( startIndex, items.size( ) ), - Math.min( (startIndex+numResults), items.size( ) ) ); - channel.addContent( subList ); - - Element rss = new Element( "rss" ); - rss.addContent( channel ); - - return new Document( rss ); - } - - - /** - * Convenience method to wait for a collection of threads to complete, - * or until a timeout after a startTime expires. - */ - private void waitForThreads( List<SlaveQueryThread> threads, long timeout ) - { - for ( Thread t : threads ) - { - try - { - t.join( timeout ); - } - catch ( InterruptedException ie ) - { - break; - } - } - } - - - public static void main( String args[] ) - throws Exception - { - String usage = "OpenSearchMaster [OPTIONS] SLAVES.txt query" - + "\n\t-h <n> Hits per site" - + "\n\t-n <n> Number of results" - + "\n\t-s <n> Start index" - + "\n"; - - if ( args.length < 2 ) - { - System.err.println( usage ); - System.exit( 1 ); - } - - String slavesFile = args[args.length - 2]; - String query = args[args.length - 1]; - - int startIndex = 0; - int hitsPerSite = 0; - int numHits = 10; - for ( int i = 0 ; i < args.length - 2 ; i++ ) - { - try - { - if ( "-h".equals( args[i] ) ) - { - i++; - hitsPerSite = Integer.parseInt( args[i] ); - } - if ( "-n".equals( args[i] ) ) - { - i++; - numHits = Integer.parseInt( args[i] ); - } - if ( "-s".equals( args[i] ) ) - { - i++; - startIndex = Integer.parseInt( args[i] ); - } - } - catch ( NumberFormatException nfe ) - { - System.err.println( "Error: not a numeric value: " + args[i] ); - System.err.println( usage ); - System.exit( 1 ); - } - } - - OpenSearchMaster master = new OpenSearchMaster( slavesFile ); - - Document doc = master.query( query, startIndex, numHits, hitsPerSite ); - - (new XMLOutputter()).output( doc, System.out ); - } - -} - - -class SlaveQueryThread extends Thread -{ - OpenSearchSlave slave; - - String query; - int startIndex; - int numResults; - int hitsPerSite; - - Document response; - Throwable throwable; - - - SlaveQueryThread( OpenSearchSlave slave, String query, int startIndex, int numResults, int hitsPerSite ) - { - this.slave = slave; - this.query = query; - this.startIndex = startIndex; - this.numResults = numResults; - this.hitsPerSite = hitsPerSite; - } - - public void run( ) - { - try - { - this.response = this.slave.query( this.query, this.startIndex, this.numResults, this.hitsPerSite ); - } - catch ( Throwable t ) - { - this.throwable = t; - } - } -} - - -class ElementScoreComparator implements Comparator<Element> -{ - public int compare( Element e1, Element e2 ) - { - if ( e1 == e2 ) return 0; - if ( e1 == null ) return 1; - if ( e2 == null ) return -1; - - Element score1 = e1.getChild( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); - Element score2 = e2.getChild( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); - - if ( score1 == score2 ) return 0; - if ( score1 == null ) return 1; - if ( score2 == null ) return -1; - - String text1 = score1.getText().trim(); - String text2 = score2.getText().trim(); - - float value1 = 0.0f; - float value2 = 0.0f; - - try { value1 = Float.parseFloat( text1 ); } catch ( NumberFormatException nfe ) { } - try { value2 = Float.parseFloat( text2 ); } catch ( NumberFormatException nfe ) { } - - if ( value1 == value2 ) return 0; - - return value1 > value2 ? -1 : 1; - } -} - -class ElementSiteThenScoreComparator extends ElementScoreComparator -{ - public int compare( Element e1, Element e2 ) - { - if ( e1 == e2 ) return 0; - if ( e1 == null ) return 1; - if ( e2 == null ) return -1; - - String site1 = e1.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim(); - String site2 = e2.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim(); - - if ( site1.equals( site2 ) ) - { - // Sites are equal, then compare scores. - return super.compare( e1, e2 ); - } - - return site1.compareTo( site2 ); - } -} \ No newline at end of file Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java 2010-02-23 00:50:11 UTC (rev 2970) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java 2010-03-16 21:28:15 UTC (rev 2971) @@ -1,148 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.archive.nutchwax; - -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import javax.servlet.ServletConfig; -import javax.servlet.ServletException; -import javax.servlet.http.HttpServlet; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpServletResponse; - -import org.jdom.Document; -import org.jdom.Element; -import org.jdom.Namespace; -import org.jdom.output.XMLOutputter; - -/** - * - */ -public class OpenSearchMasterServlet extends HttpServlet -{ - OpenSearchMaster master; - - int hitsPerSite = 0; - - public void init( ServletConfig config ) - throws ServletException - { - String slavesFile = config.getInitParameter( "slaves" ); - - if ( slavesFile == null || slavesFile.trim().length() == 0 ) - { - throw new ServletException( "Required init parameter missing: slaves" ); - } - - int timeout = getInteger( config.getInitParameter( "timeout" ), 0 ); - int hitsPerSite = getInteger( config.getInitParameter( "hitsPerSite" ), 0 ); - - try - { - this.master = new OpenSearchMaster( slavesFile, timeout ); - } - catch ( IOException ioe ) - { - throw new ServletException( ioe ); - } - - } - - public void destroy( ) - { - - } - - public void doGet( HttpServletRequest request, HttpServletResponse response ) - throws ServletException, IOException - { - long responseTime = System.nanoTime( ); - - request.setCharacterEncoding( "UTF-8" ); - - String query = getString ( request.getParameter( "query" ), "" ); - int startIndex = getInteger( request.getParameter( "start" ), 0 ); - int numHits = getInteger( request.getParameter( "hitsPerPage" ), 10 ); - int hitsPerSite = getInteger( request.getParameter( "hitsPerSite" ), this.hitsPerSite ); - - Document doc = this.master.query( query, startIndex, numHits, hitsPerSite ); - - Element eUrlParams = new Element( "urlParams", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); - - for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) ) - { - String key = e.getKey( ); - for ( String value : e.getValue( ) ) - { - Element eParam = new Element( "param", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); - eParam.setAttribute( "name", key ); - eParam.setAttribute( "value", value ); - eUrlParams.addContent( eParam ); - } - } - - doc.getRootElement( ).getChild( "channel" ).addContent( eUrlParams ); - - (new XMLOutputter()).output( doc, response.getOutputStream( ) ); - } - - String getString ( String value, String defaultValue ) - { - if ( value != null ) - { - value = value.trim(); - - if ( value.length( ) != 0 ) - { - return value; - } - } - - return defaultValue; - } - - int getInteger( String value, int defaultValue ) - { - if ( value != null ) - { - value = value.trim(); - - if ( value.length( ) != 0 ) - { - try - { - int i = Integer.parseInt( value ); - - return i; - } - catch ( NumberFormatException nfe ) - { - // TODO: log? - } - } - } - - return defaultValue; - } - -} Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java 2010-02-23 00:50:11 UTC (rev 2970) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java 2010-03-16 21:28:15 UTC (rev 2971) @@ -1,218 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.archive.nutchwax; - -import java.io.IOException; -import java.io.InputStream; -import java.io.UnsupportedEncodingException; -import java.net.HttpURLConnection; -import java.net.MalformedURLException; -import java.net.URL; -import java.net.URLConnection; -import java.net.URLEncoder; -import java.util.List; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import org.jdom.Document; -import org.jdom.Element; -import org.jdom.Namespace; -import org.jdom.input.SAXBuilder; -import org.jdom.output.XMLOutputter; - -/** - * - */ -public class OpenSearchSlave -{ - public static final Log LOG = LogFactory.getLog( OpenSearchSlave.class ); - - private String urlTemplate; - - public OpenSearchSlave( String urlTemplate ) - { - this.urlTemplate = urlTemplate; - } - - public Document query( String query, int startIndex, int requestedNumResults, int hitsPerSite ) - throws Exception - { - URL url = buildRequestUrl( query, startIndex, requestedNumResults, hitsPerSite ); - - InputStream is = null; - try - { - LOG.info( "Querying slave: " + url ); - - is = getInputStream( url ); - - Document doc = (new SAXBuilder()).build( is ); - - doc = validate( doc ); - - return doc; - } - catch ( Exception e ) - { - LOG.error( url.toString(), e ); - throw e; - } - finally - { - // Ensure the InputStream is closed, which should trigger the - // underlying HTTP connection to be cleaned-up. - try { if ( is != null ) is.close( ); } catch ( IOException ioe ) { } // Not much we can do - } - } - - private Document validate( Document doc ) - throws Exception - { - if ( doc.getRootElement( ) == null ) throw new Exception( "Invalid OpenSearch response: missing /rss" ); - Element root = doc.getRootElement( ); - - if ( ! "rss".equals( root.getName( ) ) ) throw new Exception( "Invalid OpenSearch response: missing /rss" ); - Element channel = root.getChild( "channel" ); - - if ( channel == null ) throw new Exception( "Invalid OpenSearch response: missing /rss/channel" ); - - for ( Element item : (List<Element>) channel.getChildren( "item" ) ) - { - Element site = item.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); - if ( site == null ) - { - item.addContent( new Element( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ) ); - } - - Element score = item.getChild( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); - if ( score == null ) - { - item.addContent( new Element( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ) ); - } - } - - return doc; - } - - /** - * - */ - public URL buildRequestUrl( String query, int startIndex, int requestedNumResults, int hitsPerSite ) - throws MalformedURLException, UnsupportedEncodingException - { - String url = this.urlTemplate; - - // Note about replaceAll: In the Java regex library, the replacement string has a few - // special characters: \ and $. Forunately, since we URL-encode the replacement string, - // any occurance of \ or $ is converted to %xy form. So we don't have to worry about it. :) - url = url.replaceAll( "[{]searchTerms[}]", URLEncoder.encode( query, "utf-8" ) ); - url = url.replaceAll( "[{]count[}]" , String.valueOf( requestedNumResults ) ); - url = url.replaceAll( "[{]startIndex[}]" , String.valueOf( startIndex ) ); - url = url.replaceAll( "[{]hitsPerSite[}]", String.valueOf( hitsPerSite ) ); - - // We don't know about any optional parameters, so we remove them (per the OpenSearch spec.) - url = url.replaceAll( "[{][^}]+[?][}]", "" ); - - return new URL( url ); - } - - - public InputStream getInputStream( URL url ) - throws IOException - { - URLConnection connection = url.openConnection( ); - connection.setDoOutput( false ); - connection.setRequestProperty( "User-Agent", "Mozilla/4.0 (compatible; NutchWAX OpenSearchMaster)" ); - connection.connect( ); - - if ( connection instanceof HttpURLConnection ) - { - HttpURLConnection hc = (HttpURLConnection) connection; - - switch ( hc.getResponseCode( ) ) - { - case 200: - // All good. - break; - default: - // Problems! Bail out. - throw new IOException( "HTTP error from " + url + ": " + hc.getResponseMessage( ) ); - } - } - - InputStream is = connection.getInputStream( ); - - return is; - } - - public String toString() - { - return this.urlTemplate; - } - - public static void main( String args[] ) - throws Exception - { - String usage = "OpenSearchSlave [OPTIONS] urlTemplate query" - + "\n\t-h <n> Hits per site" - + "\n\t-n <n> Number of results" - + "\n"; - - if ( args.length < 2 ) - { - System.err.println( usage ); - System.exit( 1 ); - } - - String urlTemplate = args[args.length - 2]; - String query = args[args.length - 1]; - - int hitsPerSite = 0; - int numHits = 10; - for ( int i = 0 ; i < args.length - 2 ; i++ ) - { - try - { - if ( "-h".equals( args[i] ) ) - { - i++; - hitsPerSite = Integer.parseInt( args[i] ); - } - if ( "-n".equals( args[i] ) ) - { - i++; - numHits = Integer.parseInt( args[i] ); - } - } - catch ( NumberFormatException nfe ) - { - System.err.println( "Error: not a numeric value: " + args[i] ); - System.err.println( usage ); - System.exit( 1 ); - } - } - - OpenSearchSlave osl = new OpenSearchSlave( urlTemplate ); - - Document doc = osl.query( query, 0, numHits, hitsPerSite ); - - (new XMLOutputter()).output( doc, System.out ); - } - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |