[Assorted-commits] SF.net SVN: assorted: [347] hash-dist
Brought to you by:
yangzhang
From: <yan...@us...> - 2008-02-10 03:54:36
|
Revision: 347 http://assorted.svn.sourceforge.net/assorted/?rev=347&view=rev Author: yangzhang Date: 2008-02-09 19:54:41 -0800 (Sat, 09 Feb 2008) Log Message: ----------- added hash-dist Added Paths: ----------- hash-dist/ hash-dist/trunk/ hash-dist/trunk/README hash-dist/trunk/src/ hash-dist/trunk/src/HashDist.scala hash-dist/trunk/src/Makefile Added: hash-dist/trunk/README =================================================================== --- hash-dist/trunk/README (rev 0) +++ hash-dist/trunk/README 2008-02-10 03:54:41 UTC (rev 347) @@ -0,0 +1,26 @@ +This is a simple utility for observing the distribution of a hash-function. For +instance, the arguments + + full djb2 2 stl 1000000 + +should be structurally interpreted as + + full + ("djb2", 2) + ("stl", 2) + +This constructs 2 top-level buckets, each leading to 1000000 second-level +buckets, where the first level is hashed by the DJB2 function, and the second +level is hashed by the STL function. + +Valid options are: + +- `cdf`: write a CDF of bucket sizes into the file `cdf`. +- `dump`: write the complete bucket sizes to stdout. +- `full`: produce both the CDF and the dump. + +Valid hash functions are: + +- djb2 +- java +- stl Added: hash-dist/trunk/src/HashDist.scala =================================================================== --- hash-dist/trunk/src/HashDist.scala (rev 0) +++ hash-dist/trunk/src/HashDist.scala 2008-02-10 03:54:41 UTC (rev 347) @@ -0,0 +1,93 @@ +import commons._ +import Collections._ +import Control._ +import Io._ +import Tree._ + +import scala.util._ + +object HashDist { + + /** + * From libstdc++ 4.1 __stl_hash_string. + */ + def hashStl(xs: Seq[Int]) = { + var h = 0 + for (x <- xs) h = 5 * h + x + h + } + + /** + * From Sun JDK6 String.hashCode. + */ + def hashJava(xs: Seq[Int]) = { + var h = 0 + for (x <- xs) h = 31 * h + x + h + } + + /** + * From http://www.cse.yorku.ca/~oz/hash.html. Not sure if this is correct, + * since Int is signed. + */ + def hashDjb2(xs: Seq[Int]) = { + var h = 5381 + for (x <- xs) h = ((h << 5) + h) + x + h + } + + /** + * Hash function. + */ + type Hasher = Seq[Int] => Int + + /** + * Hash function and number of buckets. + */ + type HashLevel = (Hasher, Int) + + /** + * Build a hierarchical hash table to observe the hash functions' + * distributions, then dump the distribution in gnuplot datafile format. + */ + def main(args: Array[String]) { + // Parse the arguments. + val hs: Array[HashLevel] = ( + for ((name, size) <- pairs(args)) yield { + val h = name match { + case "djb2" => hashDjb2 _ + case "java" => hashJava _ + case "stl" => hashStl _ + } + val n = size.toInt + (h,n) + } + ) toArray + val fanouts = hs map (_._2) + + // Construct the hierarchical hash table. + val tree = treeFromFanouts(new Array[Int](fanouts.last), fanouts.toStream.init) + + // For each line, hash its way down the tree. + for (line <- untilNull(Console.readLine)) { + val xs = line.toCharArray map (_ toInt) + val path = (for ((h,n) <- hs) yield mod(h(xs), n)).toArray + val counts: Array[Int] = tree get path.toStream.init + counts(path.last) += 1 + } + + // Display the final counts. + for ((xs,i) <- tree.flatten.zipWithIndex; (x,j) <- xs.zipWithIndex) + println(i + " " + j + " " + x) + + // Output the CDF in gnuplot datafile format. + using (TextWriter("cdf")) { w => + val counts: Array[Int] = tree.flatten flatMap (x=>x) toArray; + Sorting quickSort counts + for ((count,i) <- counts zipWithIndex) { + w.println(i + " " + count) + } + } + } + +} Added: hash-dist/trunk/src/Makefile =================================================================== --- hash-dist/trunk/src/Makefile (rev 0) +++ hash-dist/trunk/src/Makefile 2008-02-10 03:54:41 UTC (rev 347) @@ -0,0 +1,10 @@ +all: out/HashDist.class + +out/HashDist.class: HashDist.scala $(COMMONS_SRCS) + mkdir -p out + fsc -d out $^ + +clean: + rm -rf out + +.PHONY: clean This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |