|
From: <bi...@us...> - 2010-03-16 21:28:28
|
Revision: 2971
http://archive-access.svn.sourceforge.net/archive-access/?rev=2971&view=rev
Author: binzino
Date: 2010-03-16 21:28:15 +0000 (Tue, 16 Mar 2010)
Log Message:
-----------
Removed from this release. Might make a re-appearance in a future release.
Removed Paths:
-------------
trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java
trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java
trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java
Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java 2010-02-23 00:50:11 UTC (rev 2970)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java 2010-03-16 21:28:15 UTC (rev 2971)
@@ -1,355 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.nutchwax;
-
-import java.io.IOException;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.io.FileInputStream;
-import java.util.Comparator;
-import java.util.Collections;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.LinkedList;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.jdom.Document;
-import org.jdom.Element;
-import org.jdom.Namespace;
-import org.jdom.output.XMLOutputter;
-
-
-/**
- *
- */
-public class OpenSearchMaster
-{
- public static final Log LOG = LogFactory.getLog( OpenSearchMaster.class );
-
- List<OpenSearchSlave> slaves = new ArrayList<OpenSearchSlave>( );
- long timeout = 0;
-
- public OpenSearchMaster( String slavesFile, long timeout )
- throws IOException
- {
- this( slavesFile );
- this.timeout = timeout;
- }
-
- public OpenSearchMaster( String slavesFile )
- throws IOException
- {
- BufferedReader r = null;
- try
- {
- r = new BufferedReader( new InputStreamReader( new FileInputStream( slavesFile ), "utf-8" ) );
-
- String line;
- while ( (line = r.readLine()) != null )
- {
- line = line.trim();
- if ( line.length() == 0 || line.charAt( 0 ) == '#' )
- {
- // Ignore it.
- continue ;
- }
-
- OpenSearchSlave slave = new OpenSearchSlave( line );
-
- this.slaves.add( slave );
- }
- }
- finally
- {
- try { if ( r != null ) r.close(); } catch ( IOException ioe ) { }
- }
-
- }
-
- public Document query( String query, int startIndex, int numResults, int hitsPerSite )
- {
- long startTime = System.currentTimeMillis( );
-
- List<SlaveQueryThread> slaveThreads = new ArrayList<SlaveQueryThread>( this.slaves.size() );
-
- for ( OpenSearchSlave slave : this.slaves )
- {
- SlaveQueryThread sqt = new SlaveQueryThread( slave, query, 0, (startIndex+numResults), hitsPerSite );
-
- sqt.start( );
-
- slaveThreads.add( sqt );
- }
-
- waitForThreads( slaveThreads, this.timeout );
-
- LinkedList<Element> items = new LinkedList<Element>( );
- long totalResults = 0;
-
- for ( SlaveQueryThread sqt : slaveThreads )
- {
- if ( sqt.throwable != null )
- {
- continue ;
- }
-
- try
- {
- // Dump all the results ("item" elements) into a single list.
- Element channel = sqt.response.getRootElement( ).getChild( "channel" );
- items.addAll( (List<Element>) channel.getChildren( "item" ) );
- channel.removeChildren( "item" );
-
- totalResults += Integer.parseInt( channel.getChild( "totalResults", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ).getTextTrim( ) );
- }
- catch ( Exception e )
- {
- LOG.error( "Error processing response from slave: " + sqt.slave, e );
- }
-
- }
-
- if ( items.size( ) > 0 && hitsPerSite > 0 )
- {
- Collections.sort( items, new ElementSiteThenScoreComparator( ) );
-
- LinkedList<Element> collapsed = new LinkedList<Element>( );
-
- collapsed.add( items.removeFirst( ) );
-
- int count = 1;
- for ( Element item : items )
- {
- String lastSite = collapsed.getLast( ).getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim( );
-
- if ( lastSite.length( ) == 0 ||
- !lastSite.equals( item.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim( ) ) )
- {
- collapsed.add( item );
- count = 1;
- }
- else if ( count < hitsPerSite )
- {
- collapsed.add( item );
- count++;
- }
- }
-
- // Replace the list of items with the collapsed list.
- items = collapsed;
- }
-
- Collections.sort( items, new ElementScoreComparator( ) );
-
- // Build the final results OpenSearch XML document.
- Element channel = new Element( "channel" );
- channel.addContent( new Element( "title" ) );
- channel.addContent( new Element( "description" ) );
- channel.addContent( new Element( "link" ) );
-
- Element eTotalResults = new Element( "totalResults", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) );
- Element eStartIndex = new Element( "startIndex", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) );
- Element eItemsPerPage = new Element( "itemsPerPage", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) );
-
- eTotalResults.setText( Long.toString( totalResults ) );
- eStartIndex. setText( Long.toString( startIndex ) );
- eItemsPerPage.setText( Long.toString( numResults ) );
-
- channel.addContent( eTotalResults );
- channel.addContent( eStartIndex );
- channel.addContent( eItemsPerPage );
-
- // Get a sub-list of only the items we want: [startIndex,(startIndex+numResults)]
- List<Element> subList = items.subList( Math.min( startIndex, items.size( ) ),
- Math.min( (startIndex+numResults), items.size( ) ) );
- channel.addContent( subList );
-
- Element rss = new Element( "rss" );
- rss.addContent( channel );
-
- return new Document( rss );
- }
-
-
- /**
- * Convenience method to wait for a collection of threads to complete,
- * or until a timeout after a startTime expires.
- */
- private void waitForThreads( List<SlaveQueryThread> threads, long timeout )
- {
- for ( Thread t : threads )
- {
- try
- {
- t.join( timeout );
- }
- catch ( InterruptedException ie )
- {
- break;
- }
- }
- }
-
-
- public static void main( String args[] )
- throws Exception
- {
- String usage = "OpenSearchMaster [OPTIONS] SLAVES.txt query"
- + "\n\t-h <n> Hits per site"
- + "\n\t-n <n> Number of results"
- + "\n\t-s <n> Start index"
- + "\n";
-
- if ( args.length < 2 )
- {
- System.err.println( usage );
- System.exit( 1 );
- }
-
- String slavesFile = args[args.length - 2];
- String query = args[args.length - 1];
-
- int startIndex = 0;
- int hitsPerSite = 0;
- int numHits = 10;
- for ( int i = 0 ; i < args.length - 2 ; i++ )
- {
- try
- {
- if ( "-h".equals( args[i] ) )
- {
- i++;
- hitsPerSite = Integer.parseInt( args[i] );
- }
- if ( "-n".equals( args[i] ) )
- {
- i++;
- numHits = Integer.parseInt( args[i] );
- }
- if ( "-s".equals( args[i] ) )
- {
- i++;
- startIndex = Integer.parseInt( args[i] );
- }
- }
- catch ( NumberFormatException nfe )
- {
- System.err.println( "Error: not a numeric value: " + args[i] );
- System.err.println( usage );
- System.exit( 1 );
- }
- }
-
- OpenSearchMaster master = new OpenSearchMaster( slavesFile );
-
- Document doc = master.query( query, startIndex, numHits, hitsPerSite );
-
- (new XMLOutputter()).output( doc, System.out );
- }
-
-}
-
-
-class SlaveQueryThread extends Thread
-{
- OpenSearchSlave slave;
-
- String query;
- int startIndex;
- int numResults;
- int hitsPerSite;
-
- Document response;
- Throwable throwable;
-
-
- SlaveQueryThread( OpenSearchSlave slave, String query, int startIndex, int numResults, int hitsPerSite )
- {
- this.slave = slave;
- this.query = query;
- this.startIndex = startIndex;
- this.numResults = numResults;
- this.hitsPerSite = hitsPerSite;
- }
-
- public void run( )
- {
- try
- {
- this.response = this.slave.query( this.query, this.startIndex, this.numResults, this.hitsPerSite );
- }
- catch ( Throwable t )
- {
- this.throwable = t;
- }
- }
-}
-
-
-class ElementScoreComparator implements Comparator<Element>
-{
- public int compare( Element e1, Element e2 )
- {
- if ( e1 == e2 ) return 0;
- if ( e1 == null ) return 1;
- if ( e2 == null ) return -1;
-
- Element score1 = e1.getChild( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) );
- Element score2 = e2.getChild( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) );
-
- if ( score1 == score2 ) return 0;
- if ( score1 == null ) return 1;
- if ( score2 == null ) return -1;
-
- String text1 = score1.getText().trim();
- String text2 = score2.getText().trim();
-
- float value1 = 0.0f;
- float value2 = 0.0f;
-
- try { value1 = Float.parseFloat( text1 ); } catch ( NumberFormatException nfe ) { }
- try { value2 = Float.parseFloat( text2 ); } catch ( NumberFormatException nfe ) { }
-
- if ( value1 == value2 ) return 0;
-
- return value1 > value2 ? -1 : 1;
- }
-}
-
-class ElementSiteThenScoreComparator extends ElementScoreComparator
-{
- public int compare( Element e1, Element e2 )
- {
- if ( e1 == e2 ) return 0;
- if ( e1 == null ) return 1;
- if ( e2 == null ) return -1;
-
- String site1 = e1.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim();
- String site2 = e2.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim();
-
- if ( site1.equals( site2 ) )
- {
- // Sites are equal, then compare scores.
- return super.compare( e1, e2 );
- }
-
- return site1.compareTo( site2 );
- }
-}
\ No newline at end of file
Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java 2010-02-23 00:50:11 UTC (rev 2970)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java 2010-03-16 21:28:15 UTC (rev 2971)
@@ -1,148 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.nutchwax;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import javax.servlet.ServletConfig;
-import javax.servlet.ServletException;
-import javax.servlet.http.HttpServlet;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
-
-import org.jdom.Document;
-import org.jdom.Element;
-import org.jdom.Namespace;
-import org.jdom.output.XMLOutputter;
-
-/**
- *
- */
-public class OpenSearchMasterServlet extends HttpServlet
-{
- OpenSearchMaster master;
-
- int hitsPerSite = 0;
-
- public void init( ServletConfig config )
- throws ServletException
- {
- String slavesFile = config.getInitParameter( "slaves" );
-
- if ( slavesFile == null || slavesFile.trim().length() == 0 )
- {
- throw new ServletException( "Required init parameter missing: slaves" );
- }
-
- int timeout = getInteger( config.getInitParameter( "timeout" ), 0 );
- int hitsPerSite = getInteger( config.getInitParameter( "hitsPerSite" ), 0 );
-
- try
- {
- this.master = new OpenSearchMaster( slavesFile, timeout );
- }
- catch ( IOException ioe )
- {
- throw new ServletException( ioe );
- }
-
- }
-
- public void destroy( )
- {
-
- }
-
- public void doGet( HttpServletRequest request, HttpServletResponse response )
- throws ServletException, IOException
- {
- long responseTime = System.nanoTime( );
-
- request.setCharacterEncoding( "UTF-8" );
-
- String query = getString ( request.getParameter( "query" ), "" );
- int startIndex = getInteger( request.getParameter( "start" ), 0 );
- int numHits = getInteger( request.getParameter( "hitsPerPage" ), 10 );
- int hitsPerSite = getInteger( request.getParameter( "hitsPerSite" ), this.hitsPerSite );
-
- Document doc = this.master.query( query, startIndex, numHits, hitsPerSite );
-
- Element eUrlParams = new Element( "urlParams", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) );
-
- for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) )
- {
- String key = e.getKey( );
- for ( String value : e.getValue( ) )
- {
- Element eParam = new Element( "param", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) );
- eParam.setAttribute( "name", key );
- eParam.setAttribute( "value", value );
- eUrlParams.addContent( eParam );
- }
- }
-
- doc.getRootElement( ).getChild( "channel" ).addContent( eUrlParams );
-
- (new XMLOutputter()).output( doc, response.getOutputStream( ) );
- }
-
- String getString ( String value, String defaultValue )
- {
- if ( value != null )
- {
- value = value.trim();
-
- if ( value.length( ) != 0 )
- {
- return value;
- }
- }
-
- return defaultValue;
- }
-
- int getInteger( String value, int defaultValue )
- {
- if ( value != null )
- {
- value = value.trim();
-
- if ( value.length( ) != 0 )
- {
- try
- {
- int i = Integer.parseInt( value );
-
- return i;
- }
- catch ( NumberFormatException nfe )
- {
- // TODO: log?
- }
- }
- }
-
- return defaultValue;
- }
-
-}
Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java 2010-02-23 00:50:11 UTC (rev 2970)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java 2010-03-16 21:28:15 UTC (rev 2971)
@@ -1,218 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.archive.nutchwax;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
-import java.net.HttpURLConnection;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.net.URLConnection;
-import java.net.URLEncoder;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.jdom.Document;
-import org.jdom.Element;
-import org.jdom.Namespace;
-import org.jdom.input.SAXBuilder;
-import org.jdom.output.XMLOutputter;
-
-/**
- *
- */
-public class OpenSearchSlave
-{
- public static final Log LOG = LogFactory.getLog( OpenSearchSlave.class );
-
- private String urlTemplate;
-
- public OpenSearchSlave( String urlTemplate )
- {
- this.urlTemplate = urlTemplate;
- }
-
- public Document query( String query, int startIndex, int requestedNumResults, int hitsPerSite )
- throws Exception
- {
- URL url = buildRequestUrl( query, startIndex, requestedNumResults, hitsPerSite );
-
- InputStream is = null;
- try
- {
- LOG.info( "Querying slave: " + url );
-
- is = getInputStream( url );
-
- Document doc = (new SAXBuilder()).build( is );
-
- doc = validate( doc );
-
- return doc;
- }
- catch ( Exception e )
- {
- LOG.error( url.toString(), e );
- throw e;
- }
- finally
- {
- // Ensure the InputStream is closed, which should trigger the
- // underlying HTTP connection to be cleaned-up.
- try { if ( is != null ) is.close( ); } catch ( IOException ioe ) { } // Not much we can do
- }
- }
-
- private Document validate( Document doc )
- throws Exception
- {
- if ( doc.getRootElement( ) == null ) throw new Exception( "Invalid OpenSearch response: missing /rss" );
- Element root = doc.getRootElement( );
-
- if ( ! "rss".equals( root.getName( ) ) ) throw new Exception( "Invalid OpenSearch response: missing /rss" );
- Element channel = root.getChild( "channel" );
-
- if ( channel == null ) throw new Exception( "Invalid OpenSearch response: missing /rss/channel" );
-
- for ( Element item : (List<Element>) channel.getChildren( "item" ) )
- {
- Element site = item.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) );
- if ( site == null )
- {
- item.addContent( new Element( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ) );
- }
-
- Element score = item.getChild( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) );
- if ( score == null )
- {
- item.addContent( new Element( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ) );
- }
- }
-
- return doc;
- }
-
- /**
- *
- */
- public URL buildRequestUrl( String query, int startIndex, int requestedNumResults, int hitsPerSite )
- throws MalformedURLException, UnsupportedEncodingException
- {
- String url = this.urlTemplate;
-
- // Note about replaceAll: In the Java regex library, the replacement string has a few
- // special characters: \ and $. Forunately, since we URL-encode the replacement string,
- // any occurance of \ or $ is converted to %xy form. So we don't have to worry about it. :)
- url = url.replaceAll( "[{]searchTerms[}]", URLEncoder.encode( query, "utf-8" ) );
- url = url.replaceAll( "[{]count[}]" , String.valueOf( requestedNumResults ) );
- url = url.replaceAll( "[{]startIndex[}]" , String.valueOf( startIndex ) );
- url = url.replaceAll( "[{]hitsPerSite[}]", String.valueOf( hitsPerSite ) );
-
- // We don't know about any optional parameters, so we remove them (per the OpenSearch spec.)
- url = url.replaceAll( "[{][^}]+[?][}]", "" );
-
- return new URL( url );
- }
-
-
- public InputStream getInputStream( URL url )
- throws IOException
- {
- URLConnection connection = url.openConnection( );
- connection.setDoOutput( false );
- connection.setRequestProperty( "User-Agent", "Mozilla/4.0 (compatible; NutchWAX OpenSearchMaster)" );
- connection.connect( );
-
- if ( connection instanceof HttpURLConnection )
- {
- HttpURLConnection hc = (HttpURLConnection) connection;
-
- switch ( hc.getResponseCode( ) )
- {
- case 200:
- // All good.
- break;
- default:
- // Problems! Bail out.
- throw new IOException( "HTTP error from " + url + ": " + hc.getResponseMessage( ) );
- }
- }
-
- InputStream is = connection.getInputStream( );
-
- return is;
- }
-
- public String toString()
- {
- return this.urlTemplate;
- }
-
- public static void main( String args[] )
- throws Exception
- {
- String usage = "OpenSearchSlave [OPTIONS] urlTemplate query"
- + "\n\t-h <n> Hits per site"
- + "\n\t-n <n> Number of results"
- + "\n";
-
- if ( args.length < 2 )
- {
- System.err.println( usage );
- System.exit( 1 );
- }
-
- String urlTemplate = args[args.length - 2];
- String query = args[args.length - 1];
-
- int hitsPerSite = 0;
- int numHits = 10;
- for ( int i = 0 ; i < args.length - 2 ; i++ )
- {
- try
- {
- if ( "-h".equals( args[i] ) )
- {
- i++;
- hitsPerSite = Integer.parseInt( args[i] );
- }
- if ( "-n".equals( args[i] ) )
- {
- i++;
- numHits = Integer.parseInt( args[i] );
- }
- }
- catch ( NumberFormatException nfe )
- {
- System.err.println( "Error: not a numeric value: " + args[i] );
- System.err.println( usage );
- System.exit( 1 );
- }
- }
-
- OpenSearchSlave osl = new OpenSearchSlave( urlTemplate );
-
- Document doc = osl.query( query, 0, numHits, hitsPerSite );
-
- (new XMLOutputter()).output( doc, System.out );
- }
-
-}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|