|
From: <bi...@us...> - 2008-12-10 05:02:22
|
Revision: 2658
http://archive-access.svn.sourceforge.net/archive-access/?rev=2658&view=rev
Author: binzino
Date: 2008-12-10 05:02:19 +0000 (Wed, 10 Dec 2008)
Log Message:
-----------
Initial revision.
Added Paths:
-----------
trunk/archive-access/projects/nutchwax/archive/src/etc/
trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/
trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/searcher-slave
trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/PageRanker.java
Added: trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/searcher-slave
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/searcher-slave (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/etc/init.d/searcher-slave 2008-12-10 05:02:19 UTC (rev 2658)
@@ -0,0 +1,63 @@
+#! /bin/sh
+#
+# -----------------------------------
+# Initscript for NutchWAX searcher slave
+# -----------------------------------
+
+set -e
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+DESC="NutchWAX searcher slave"
+NAME="searcher-slave"
+
+DAEMON="/3/search/nutchwax-0.12.2/bin/nutch org.archive.nutchwax.DistributedSearch\$Server 9000 /3/search/deploy"
+NUTCH_HOME=/3/search/nutchwax-0.12.2
+JAVA_HOME=/usr
+export NUTCH_HEAPSIZE=2500
+PIDFILE=/var/run/$NAME.pid
+SCRIPTNAME=/etc/init.d/$NAME
+
+# Gracefully exit if the package has been removed.
+test -x /usr/bin/java || exit 0
+
+# ---------------------------------------
+# Function that starts the daemon/service
+# ---------------------------------------
+d_start()
+{
+  # Start $DAEMON in the background (-b) as webcrawl:webcrawl and record
+  # its pid in $PIDFILE (-m).
+  #
+  # --startas is used instead of --exec: --exec also makes
+  # start-stop-daemon look for a running process whose executable is the
+  # given path.  The launcher is presumably a wrapper that ends up
+  # running java (TODO confirm), so that match would never succeed and a
+  # repeated "start" would spawn a duplicate and overwrite the pidfile.
+  # With --startas the already-running check relies on the pidfile only.
+  #
+  # --oknodo makes "start" on an already-running daemon a successful
+  # no-op instead of an error.
+  #
+  # $DAEMON is intentionally unquoted: it holds the program followed by
+  # its arguments and must be word-split.
+  start-stop-daemon --start --oknodo -b -m -c webcrawl:webcrawl --pidfile "$PIDFILE" --startas $DAEMON
+}
+
+# --------------------------------------
+# Function that stops the daemon/service
+# --------------------------------------
+d_stop()
+{
+  # Signal the process whose pid is recorded in $PIDFILE.
+  #
+  # --oknodo: return success when there is nothing to stop.  Without it
+  # start-stop-daemon exits non-zero, which under "set -e" aborts the
+  # whole script, so "restart" on a stopped service would never reach
+  # the start step.
+  start-stop-daemon --stop --oknodo --pidfile "$PIDFILE"
+}
+
+# Dispatch on the requested action.  printf '%s' is used for the
+# newline-less progress prefix because the behaviour of "echo -n" is not
+# specified for a plain /bin/sh.
+case "$1" in
+  start)
+    printf '%s' "Starting $DESC: $NAME"
+    d_start
+    echo "."
+    ;;
+  stop)
+    printf '%s' "Stopping $DESC: $NAME"
+    d_stop
+    echo "."
+    ;;
+  restart|force-reload)
+    printf '%s' "Restarting $DESC: $NAME"
+    d_stop
+    # Brief pause between stopping and starting again.
+    sleep 1
+    d_start
+    echo "."
+    ;;
+  *)
+    echo "Usage: $SCRIPTNAME {start|stop|restart|force-reload}" >&2
+    exit 1
+    ;;
+esac
+
+exit 0
Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/PageRanker.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/PageRanker.java (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/PageRanker.java 2008-12-10 05:02:19 UTC (rev 2658)
@@ -0,0 +1,208 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.tools;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapred.FileAlreadyExistsException;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.LogUtil;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ *
+ */
+public class PageRanker extends Configured implements Tool
+{
+ public static final Log LOG = LogFactory.getLog(PageRanker.class);
+
+ public static final String DONE_NAME = "merge.done";
+
+ public PageRanker() {
+
+ }
+
+ public PageRanker(Configuration conf) {
+ setConf(conf);
+ }
+
+ /**
+ * Create an index for the input files in the named directory.
+ */
+ public static void main(String[] args)
+ throws Exception
+ {
+ int res = ToolRunner.run(NutchConfiguration.create(), new PageRanker(), args);
+ System.exit(res);
+ }
+
+ /**
+ *
+ */
+ public int run(String[] args)
+ throws Exception
+ {
+ String usage = "Usage: PageRanker [OPTIONS] outputFile <linkdb|paths>\n"
+ + "Emit PageRank values for URLs in linkDb(s). Suitable for use with\n"
+ + "PageRank scoring filter.\n"
+ + "\n"
+ + "OPTIONS:\n"
+ + " -p Use exact path as given, don't assume it's a typical\n"
+ + " linkdb with \"current/part-nnnnn\" subdirs.\n"
+ + " -t threshold Do not emit records with less than this many inlinks.\n"
+ + " Default value 10."
+ ;
+ if ( args.length < 1 )
+ {
+ System.err.println( "Usage: " + usage );
+ return -1;
+ }
+
+ boolean exactPath = false;
+ int threshold = 10;
+
+ int pos = 0;
+ for ( ; pos < args.length && args[pos].charAt(0) == '-' ; pos++ )
+ {
+ if ( args[pos].equals( "-p" ) )
+ {
+ exactPath = true;
+ }
+ if ( args[pos].equals( "-t" ) )
+ {
+ pos++;
+ if ( args.length - pos < 1 )
+ {
+ System.err.println( "Error: missing argument to -t option" );
+ return -1;
+ }
+ try
+ {
+ threshold = Integer.parseInt( args[pos] );
+ }
+ catch ( NumberFormatException nfe )
+ {
+ System.err.println( "Error: bad value for -t option: " + args[pos] );
+ return -1;
+ }
+ }
+ }
+
+ Configuration conf = getConf( );
+ FileSystem fs = FileSystem.get( conf );
+
+ if ( pos >= args.length )
+ {
+ System.err.println( "Error: missing outputFile" );
+ return -1;
+ }
+
+ Path outputPath = new Path( args[pos++] );
+ if ( fs.exists( outputPath ) )
+ {
+ System.err.println( "Erorr: outputFile already exists: " + outputPath );
+ return -1;
+ }
+
+ PrintWriter output = new PrintWriter( new OutputStreamWriter( fs.create( outputPath ).getWrappedStream( ), "UTF-8" ) );
+
+ if ( pos >= args.length )
+ {
+ System.err.println( "Error: missing linkdb" );
+ return -1;
+ }
+
+ List<Path> mapfiles = new ArrayList<Path>();
+
+ // If we are using exact paths, add each one to the list.
+ // Otherwise, assume the given path is to a linkdb and look for
+ // <linkdbPath>/current/part-nnnnn sub-dirs.
+ if ( exactPath )
+ {
+ for ( ; pos < args.length ; pos++ )
+ {
+ mapfiles.add( new Path( args[pos] ) );
+ }
+ }
+ else
+ {
+ FileStatus[] fstats = fs.listStatus( new Path(args[pos]+"/current"), HadoopFSUtil.getPassDirectoriesFilter(fs));
+ mapfiles.addAll(Arrays.asList(HadoopFSUtil.getPaths(fstats)));
+ }
+
+ System.out.println( "mapfiles = " + mapfiles );
+ try
+ {
+ for ( Path p : mapfiles )
+ {
+ MapFile.Reader reader = new MapFile.Reader( fs, p.toString(), conf );
+
+ WritableComparable key = (WritableComparable) ReflectionUtils.newInstance( reader.getKeyClass() , conf );
+ Writable value = (Writable) ReflectionUtils.newInstance( reader.getValueClass(), conf );
+
+ while ( reader.next( key, value ) )
+ {
+ if ( key instanceof Text && value instanceof Inlinks )
+ {
+ Text toUrl = (Text) key;
+ Inlinks inlinks = (Inlinks) value;
+
+ if ( inlinks.size( ) < threshold )
+ {
+ continue ;
+ }
+
+ String toUrlString = toUrl.toString( );
+
+ // HACK: Should make this into some externally configurable regex.
+ if ( toUrlString.startsWith( "http" ) )
+ {
+ output.println( inlinks.size( ) + " " + toUrl.toString() );
+ }
+ }
+ }
+ }
+
+ return 0;
+ }
+ catch ( Exception e )
+ {
+ LOG.fatal( "PageRanker: " + StringUtils.stringifyException( e ) );
+ return -1;
+ }
+ finally
+ {
+ output.flush( );
+ output.close( );
+ }
+ }
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|