Revision: 347
http://assorted.svn.sourceforge.net/assorted/?rev=347&view=rev
Author: yangzhang
Date: 2008-02-09 19:54:41 -0800 (Sat, 09 Feb 2008)
Log Message:
-----------
added hash-dist
Added Paths:
-----------
hash-dist/
hash-dist/trunk/
hash-dist/trunk/README
hash-dist/trunk/src/
hash-dist/trunk/src/HashDist.scala
hash-dist/trunk/src/Makefile
Added: hash-dist/trunk/README
===================================================================
--- hash-dist/trunk/README (rev 0)
+++ hash-dist/trunk/README 2008-02-10 03:54:41 UTC (rev 347)
@@ -0,0 +1,26 @@
+This is a simple utility for observing the distribution of a hash-function. For
+instance, the arguments
+
+ full djb2 2 stl 1000000
+
+should be structurally interpreted as
+
+ full
+ ("djb2", 2)
+ ("stl", 2)
+
+This constructs 2 top-level buckets, each leading to 1000000 second-level
+buckets, where the first level is hashed by the DJB2 function, and the second
+level is hashed by the STL function.
+
+Valid options are:
+
+- `cdf`: write a CDF of bucket sizes into the file `cdf`.
+- `dump`: write the complete bucket sizes to stdout.
+- `full`: produce both the CDF and the dump.
+
+Valid hash functions are:
+
+- djb2
+- java
+- stl
Added: hash-dist/trunk/src/HashDist.scala
===================================================================
--- hash-dist/trunk/src/HashDist.scala (rev 0)
+++ hash-dist/trunk/src/HashDist.scala 2008-02-10 03:54:41 UTC (rev 347)
@@ -0,0 +1,93 @@
+import commons._
+import Collections._
+import Control._
+import Io._
+import Tree._
+
+import scala.util._
+
+object HashDist {
+
+ /**
+ * From libstdc++ 4.1 __stl_hash_string.
+ */
+ def hashStl(xs: Seq[Int]) = {
+ var h = 0
+ for (x <- xs) h = 5 * h + x
+ h
+ }
+
+ /**
+ * From Sun JDK6 String.hashCode.
+ */
+ def hashJava(xs: Seq[Int]) = {
+ var h = 0
+ for (x <- xs) h = 31 * h + x
+ h
+ }
+
+ /**
+ * From http://www.cse.yorku.ca/~oz/hash.html. Not sure if this is correct,
+ * since Int is signed.
+ */
+ def hashDjb2(xs: Seq[Int]) = {
+ var h = 5381
+ for (x <- xs) h = ((h << 5) + h) + x
+ h
+ }
+
+ /**
+ * Hash function.
+ */
+ type Hasher = Seq[Int] => Int
+
+ /**
+ * Hash function and number of buckets.
+ */
+ type HashLevel = (Hasher, Int)
+
+ /**
+ * Build a hierarchical hash table to observe the hash functions'
+ * distributions, then dump the distribution in gnuplot datafile format.
+ */
+ def main(args: Array[String]) {
+ // Parse the arguments.
+ val hs: Array[HashLevel] = (
+ for ((name, size) <- pairs(args)) yield {
+ val h = name match {
+ case "djb2" => hashDjb2 _
+ case "java" => hashJava _
+ case "stl" => hashStl _
+ }
+ val n = size.toInt
+ (h,n)
+ }
+ ) toArray
+ val fanouts = hs map (_._2)
+
+ // Construct the hierarchical hash table.
+ val tree = treeFromFanouts(new Array[Int](fanouts.last), fanouts.toStream.init)
+
+ // For each line, hash its way down the tree.
+ for (line <- untilNull(Console.readLine)) {
+ val xs = line.toCharArray map (_ toInt)
+ val path = (for ((h,n) <- hs) yield mod(h(xs), n)).toArray
+ val counts: Array[Int] = tree get path.toStream.init
+ counts(path.last) += 1
+ }
+
+ // Display the final counts.
+ for ((xs,i) <- tree.flatten.zipWithIndex; (x,j) <- xs.zipWithIndex)
+ println(i + " " + j + " " + x)
+
+ // Output the CDF in gnuplot datafile format.
+ using (TextWriter("cdf")) { w =>
+ val counts: Array[Int] = tree.flatten flatMap (x=>x) toArray;
+ Sorting quickSort counts
+ for ((count,i) <- counts zipWithIndex) {
+ w.println(i + " " + count)
+ }
+ }
+ }
+
+}
Added: hash-dist/trunk/src/Makefile
===================================================================
--- hash-dist/trunk/src/Makefile (rev 0)
+++ hash-dist/trunk/src/Makefile 2008-02-10 03:54:41 UTC (rev 347)
@@ -0,0 +1,10 @@
+all: out/HashDist.class
+
+out/HashDist.class: HashDist.scala $(COMMONS_SRCS)
+ mkdir -p out
+ fsc -d out $^
+
+clean:
+ rm -rf out
+
+.PHONY: clean
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|