From: <bi...@us...> - 2010-02-22 05:17:29
|
Revision: 2960 http://archive-access.svn.sourceforge.net/archive-access/?rev=2960&view=rev Author: binzino Date: 2010-02-22 05:17:20 +0000 (Mon, 22 Feb 2010) Log Message: ----------- Initial revision of OpenSearch master/slave system. Work-in-progress. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/lib/jdom.LICENSE trunk/archive-access/projects/nutchwax/archive/lib/jdom.jar trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java Added: trunk/archive-access/projects/nutchwax/archive/lib/jdom.LICENSE =================================================================== --- trunk/archive-access/projects/nutchwax/archive/lib/jdom.LICENSE (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/lib/jdom.LICENSE 2010-02-22 05:17:20 UTC (rev 2960) @@ -0,0 +1,56 @@ +/*-- + + $Id: LICENSE.txt,v 1.11 2004/02/06 09:32:57 jhunter Exp $ + + Copyright (C) 2000-2004 Jason Hunter & Brett McLaughlin. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer that follows + these conditions in the documentation and/or other materials + provided with the distribution. + + 3. The name "JDOM" must not be used to endorse or promote products + derived from this software without prior written permission. For + written permission, please contact <request_AT_jdom_DOT_org>. + + 4. 
Products derived from this software may not be called "JDOM", nor + may "JDOM" appear in their name, without prior written permission + from the JDOM Project Management <request_AT_jdom_DOT_org>. + + In addition, we request (but do not require) that you include in the + end-user documentation provided with the redistribution and/or in the + software itself an acknowledgement equivalent to the following: + "This product includes software developed by the + JDOM Project (http://www.jdom.org/)." + Alternatively, the acknowledgment may be graphical using the logos + available at http://www.jdom.org/images/logos. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + + This software consists of voluntary contributions made by many + individuals on behalf of the JDOM Project and was originally + created by Jason Hunter <jhunter_AT_jdom_DOT_org> and + Brett McLaughlin <brett_AT_jdom_DOT_org>. For more information + on the JDOM Project, please see <http://www.jdom.org/>. 
+ + */ + Added: trunk/archive-access/projects/nutchwax/archive/lib/jdom.jar =================================================================== (Binary files differ) Property changes on: trunk/archive-access/projects/nutchwax/archive/lib/jdom.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMaster.java 2010-02-22 05:17:20 UTC (rev 2960) @@ -0,0 +1,364 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax; + +import java.io.IOException; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.FileInputStream; +import java.util.Comparator; +import java.util.Collections; +import java.util.List; +import java.util.ArrayList; +import java.util.LinkedList; + +import org.jdom.Document; +import org.jdom.Element; +import org.jdom.Namespace; +import org.jdom.output.XMLOutputter; + + +/** + * + */ +public class OpenSearchMaster +{ + List<OpenSearchSlave> slaves = new ArrayList<OpenSearchSlave>( ); + long timeout = 30 * 1000; + + public OpenSearchMaster( String slavesFile, long timeout ) + throws IOException + { + this( slavesFile ); + this.timeout = timeout; + } + + public OpenSearchMaster( String slavesFile ) + throws IOException + { + BufferedReader r = null; + try + { + r = new BufferedReader( new InputStreamReader( new FileInputStream( slavesFile ), "utf-8" ) ); + + String line; + while ( (line = r.readLine()) != null ) + { + line = line.trim(); + if ( line.length() == 0 || line.charAt( 0 ) == '#' ) + { + // Ignore it. 
+ continue ; + } + + OpenSearchSlave slave = new OpenSearchSlave( line ); + + this.slaves.add( slave ); + } + } + finally + { + try { if ( r != null ) r.close(); } catch ( IOException ioe ) { } + } + + } + + public Document query( String query, int startIndex, int numResults, int hitsPerSite ) + { + long startTime = System.currentTimeMillis( ); + + List<SlaveQueryThread> slaveThreads = new ArrayList<SlaveQueryThread>( this.slaves.size() ); + + for ( OpenSearchSlave slave : this.slaves ) + { + SlaveQueryThread sqt = new SlaveQueryThread( slave, query, 0, (startIndex+numResults), hitsPerSite ); + + sqt.start( ); + + slaveThreads.add( sqt ); + } + + waitForThreads( slaveThreads, this.timeout, startTime ); + + LinkedList<Element> items = new LinkedList<Element>( ); + long totalResults = 0; + + for ( SlaveQueryThread sqt : slaveThreads ) + { + if ( sqt.throwable != null ) + { + // TODO: Handle problems with slaves + continue ; + } + + // Dump all the results ("item" elements) into a single list. + Element channel = sqt.response.getRootElement( ).getChild( "channel" ); + items.addAll( (List<Element>) channel.getChildren( "item" ) ); + channel.removeChildren( "item" ); + + try + { + totalResults += Integer.parseInt( channel.getChild( "totalResults", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ).getTextTrim( ) ); + } + catch ( Exception e ) + { + // TODO: Log error getting total. 
+ } + + } + + if ( items.size( ) > 0 && hitsPerSite > 0 ) + { + Collections.sort( items, new ElementSiteThenScoreComparator( ) ); + + LinkedList<Element> collapsed = new LinkedList<Element>( ); + + collapsed.add( items.removeFirst( ) ); + + int count = 1; + for ( Element item : items ) + { + String lastSite = collapsed.getLast( ).getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim( ); + + if ( lastSite.length( ) == 0 || + !lastSite.equals( item.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim( ) ) ) + { + collapsed.add( item ); + count = 1; + } + else if ( count < hitsPerSite ) + { + collapsed.add( item ); + count++; + } + else + { + // TODO: Log collapse of item. + } + } + + // Replace the list of items with the collapsed list. + items = collapsed; + } + + Collections.sort( items, new ElementScoreComparator( ) ); + + // Build the final results OpenSearch XML document. + Element channel = new Element( "channel" ); + channel.addContent( new Element( "title" ) ); + channel.addContent( new Element( "description" ) ); + channel.addContent( new Element( "link" ) ); + + Element eTotalResults = new Element( "totalResults", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ); + Element eStartIndex = new Element( "startIndex", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ); + Element eItemsPerPage = new Element( "itemsPerPage", Namespace.getNamespace( "http://a9.com/-/spec/opensearchrss/1.0/" ) ); + + eTotalResults.setText( Long.toString( totalResults ) ); + eStartIndex. 
setText( Long.toString( startIndex ) ); + eItemsPerPage.setText( Long.toString( numResults ) ); + + channel.addContent( eTotalResults ); + channel.addContent( eStartIndex ); + channel.addContent( eItemsPerPage ); + + // Get a sub-list of only the items we want: [startIndex,(startIndex+numResults)] + List<Element> subList = items.subList( Math.min( startIndex, items.size( ) ), + Math.min( (startIndex+numResults), items.size( ) ) ); + channel.addContent( subList ); + + Element rss = new Element( "rss" ); + rss.addContent( channel ); + + return new Document( rss ); + } + + + /** + * Convenience method to wait for a collection of threads to complete, + * or until a timeout after a startTime expires. + */ + private void waitForThreads( List<SlaveQueryThread> threads, long timeout, long startTime ) + { + for ( Thread t : threads ) + { + long timeRemaining = timeout - (System.currentTimeMillis( ) - startTime); + + // If we are out of time, don't wait for any more threads. + if ( timeRemaining <= 0 ) + { + break; + } + + // Otherwise, wait for the next unfinished thread to finish. 
+ try + { + t.join( timeRemaining ); + } + catch ( InterruptedException ie ) + { + break; + } + } + } + + + public static void main( String args[] ) + throws Exception + { + String usage = "OpenSearchMaster [OPTIONS] SLAVES.txt query" + + "\n\t-h <n> Hits per site" + + "\n\t-n <n> Number of results" + + "\n\t-s <n> Start index" + + "\n"; + + if ( args.length < 2 ) + { + System.err.println( usage ); + System.exit( 1 ); + } + + String slavesFile = args[args.length - 2]; + String query = args[args.length - 1]; + + int startIndex = 0; + int hitsPerSite = 0; + int numHits = 10; + for ( int i = 0 ; i < args.length - 2 ; i++ ) + { + try + { + if ( "-h".equals( args[i] ) ) + { + i++; + hitsPerSite = Integer.parseInt( args[i] ); + } + if ( "-n".equals( args[i] ) ) + { + i++; + numHits = Integer.parseInt( args[i] ); + } + if ( "-s".equals( args[i] ) ) + { + i++; + startIndex = Integer.parseInt( args[i] ); + } + } + catch ( NumberFormatException nfe ) + { + System.err.println( "Error: not a numeric value: " + args[i] ); + System.err.println( usage ); + System.exit( 1 ); + } + } + + OpenSearchMaster master = new OpenSearchMaster( slavesFile ); + + Document doc = master.query( query, startIndex, numHits, hitsPerSite ); + + (new XMLOutputter()).output( doc, System.out ); + } + +} + + +class SlaveQueryThread extends Thread +{ + OpenSearchSlave slave; + + String query; + int startIndex; + int numResults; + int hitsPerSite; + + Document response; + Throwable throwable; + + + SlaveQueryThread( OpenSearchSlave slave, String query, int startIndex, int numResults, int hitsPerSite ) + { + this.slave = slave; + this.query = query; + this.startIndex = startIndex; + this.numResults = numResults; + this.hitsPerSite = hitsPerSite; + } + + public void run( ) + { + try + { + this.response = this.slave.query( this.query, this.startIndex, this.numResults, this.hitsPerSite ); + } + catch ( Throwable t ) + { + this.throwable = t; + } + } +} + + +class ElementScoreComparator implements 
Comparator<Element> +{ + public int compare( Element e1, Element e2 ) + { + if ( e1 == e2 ) return 0; + if ( e1 == null ) return 1; + if ( e2 == null ) return -1; + + Element score1 = e1.getChild( "score" ); + Element score2 = e2.getChild( "score" ); + + if ( score1 == score2 ) return 0; + if ( score1 == null ) return 1; + if ( score2 == null ) return -1; + + String text1 = score1.getText().trim(); + String text2 = score2.getText().trim(); + + float value1 = 0.0f; + float value2 = 0.0f; + + try { value1 = Float.parseFloat( text1 ); } catch ( NumberFormatException nfe ) { } + try { value2 = Float.parseFloat( text2 ); } catch ( NumberFormatException nfe ) { } + + if ( value1 == value2 ) return 0; + + return value1 > value2 ? -1 : 1; + } +} + +class ElementSiteThenScoreComparator extends ElementScoreComparator +{ + public int compare( Element e1, Element e2 ) + { + if ( e1 == e2 ) return 0; + if ( e1 == null ) return 1; + if ( e2 == null ) return -1; + + String site1 = e1.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim(); + String site2 = e2.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ).getTextTrim(); + + if ( site1.equals( site2 ) ) + { + // Sites are equal, then compare scores. + return super.compare( e1, e2 ); + } + + return site1.compareTo( site2 ); + } +} \ No newline at end of file Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchMasterServlet.java 2010-02-22 05:17:20 UTC (rev 2960) @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.IOException; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.FileInputStream; +import java.util.List; +import java.util.ArrayList; +import javax.servlet.ServletException; +import javax.servlet.ServletConfig; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + + +/** + * + */ +public class OpenSearchMasterServlet extends HttpServlet +{ + + public void init( ServletConfig config ) + throws ServletException + { + + + } + + public void doGet( HttpServletRequest request, HttpServletResponse response ) + throws ServletException, IOException + { + + } + +} Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchSlave.java 2010-02-22 05:17:20 UTC (rev 2960) @@ -0,0 +1,209 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLEncoder; +import java.util.List; + +import org.jdom.Document; +import org.jdom.Element; +import org.jdom.Namespace; +import org.jdom.input.SAXBuilder; +import org.jdom.output.XMLOutputter; + +/** + * + */ +public class OpenSearchSlave +{ + private String urlTemplate; + + public OpenSearchSlave( String urlTemplate ) + { + this.urlTemplate = urlTemplate; + } + + public Document query( String query, int startIndex, int requestedNumResults, int hitsPerSite ) + throws Exception + { + URL url = buildRequestUrl( query, startIndex, requestedNumResults, hitsPerSite ); + + InputStream is = null; + try + { + is = getInputStream( url ); + + Document doc = (new SAXBuilder()).build( is ); + + doc = validate( doc ); + + return doc; + } + finally + { + // Ensure the InputStream is closed, which should trigger the + // underlying HTTP connection to be cleaned-up. 
+ try { if ( is != null ) is.close( ); } catch ( IOException ioe ) { } // Not much we can do + } + } + + private Document validate( Document doc ) + throws Exception + { + if ( doc.getRootElement( ) == null ) throw new Exception( "Invalid OpenSearch response: missing /rss" ); + Element root = doc.getRootElement( ); + + if ( ! "rss".equals( root.getName( ) ) ) throw new Exception( "Invalid OpenSearch response: missing /rss" ); + Element channel = root.getChild( "channel" ); + + if ( channel == null ) throw new Exception( "Invalid OpenSearch response: missing /rss/channel" ); + + for ( Element item : (List<Element>) channel.getChildren( "item" ) ) + { + Element site = item.getChild( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); + if ( site == null ) + { + item.addContent( new Element( "site", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ) ); + } + + Element score = item.getChild( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); + if ( score == null ) + { + score = new Element( "score", Namespace.getNamespace( "http://www.nutch.org/opensearchrss/1.0/" ) ); + score.setText( "" ); + + item.addContent( score ); + } + } + + return doc; + } + + /** + * + */ + public URL buildRequestUrl( String query, int startIndex, int requestedNumResults, int hitsPerSite ) + throws MalformedURLException, UnsupportedEncodingException + { + String url = this.urlTemplate; + + // Note about replaceAll: In the Java regex library, the replacement string has a few + // special characters: \ and $. Fortunately, since we URL-encode the replacement string, + // any occurrence of \ or $ is converted to %xy form. So we don't have to worry about it. 
:) + url = url.replaceAll( "[{]searchTerms[}]", URLEncoder.encode( query, "utf-8" ) ); + url = url.replaceAll( "[{]count[}]" , String.valueOf( requestedNumResults ) ); + url = url.replaceAll( "[{]startIndex[}]" , String.valueOf( startIndex ) ); + url = url.replaceAll( "[{]hitsPerSite[}]", String.valueOf( hitsPerSite ) ); + + // We don't know about any optional parameters, so we remove them (per the OpenSearch spec.) + url = url.replaceAll( "[{][^}]+[?][}]", "" ); + + return new URL( url ); + } + + + public InputStream getInputStream( URL url ) + throws IOException + { + URLConnection connection = url.openConnection( ); + connection.setDoOutput( false ); + connection.setRequestProperty( "User-Agent", "Mozilla/4.0 (compatible; NutchWAX OpenSearchMaster)" ); + connection.connect( ); + + if ( connection instanceof HttpURLConnection ) + { + HttpURLConnection hc = (HttpURLConnection) connection; + + switch ( hc.getResponseCode( ) ) + { + case 200: + // All good. + break; + default: + // Problems! Bail out. 
+ throw new IOException( "HTTP error from " + url + ": " + hc.getResponseMessage( ) ); + } + } + + InputStream is = connection.getInputStream( ); + + return is; + } + + public String toString() + { + return this.urlTemplate; + } + + public static void main( String args[] ) + throws Exception + { + String usage = "OpenSearchSlave [OPTIONS] urlTemplate query" + + "\n\t-h <n> Hits per site" + + "\n\t-n <n> Number of results" + + "\n"; + + if ( args.length < 2 ) + { + System.err.println( usage ); + System.exit( 1 ); + } + + String urlTemplate = args[args.length - 2]; + String query = args[args.length - 1]; + + int hitsPerSite = 0; + int numHits = 10; + for ( int i = 0 ; i < args.length - 2 ; i++ ) + { + try + { + if ( "-h".equals( args[i] ) ) + { + i++; + hitsPerSite = Integer.parseInt( args[i] ); + } + if ( "-n".equals( args[i] ) ) + { + i++; + numHits = Integer.parseInt( args[i] ); + } + } + catch ( NumberFormatException nfe ) + { + System.err.println( "Error: not a numeric value: " + args[i] ); + System.err.println( usage ); + System.exit( 1 ); + } + } + + OpenSearchSlave osl = new OpenSearchSlave( urlTemplate ); + + Document doc = osl.query( query, 0, numHits, hitsPerSite ); + + (new XMLOutputter()).output( doc, System.out ); + } + +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |