[Assorted-commits] SF.net SVN: assorted: [358] hash-join/trunk/src/hashjoin.cc
Brought to you by:
yangzhang
From: <yan...@us...> - 2008-02-11 04:52:39
|
Revision: 358 http://assorted.svn.sourceforge.net/assorted/?rev=358&view=rev Author: yangzhang Date: 2008-02-10 20:52:44 -0800 (Sun, 10 Feb 2008) Log Message: ----------- fixed bug omitting subsequent buckets in build/probe Modified Paths: -------------- hash-join/trunk/src/hashjoin.cc Modified: hash-join/trunk/src/hashjoin.cc =================================================================== --- hash-join/trunk/src/hashjoin.cc 2008-02-11 04:52:20 UTC (rev 357) +++ hash-join/trunk/src/hashjoin.cc 2008-02-11 04:52:44 UTC (rev 358) @@ -13,11 +13,15 @@ #include <pthread.h> #include <commons/check.h> +#include <commons/deque.h> #include <commons/files.h> +#include <commons/hash.h> #include <commons/strings.h> #include <commons/threads.h> #include <commons/time.h> +// TODO: #include <boost/array.hpp> + // // Hash Join // @@ -25,6 +29,7 @@ using namespace std; using namespace __gnu_cxx; using namespace commons; +// TODO: using namespace boost; // TODO use dependency injection! unsigned int ncpus = 1; @@ -51,9 +56,11 @@ /** * The data that we hold. */ - vector<char *> bufs; + vector<char*> bufs; }; +// TODO: typedef list< array<char, bucket_size> > bucket; + /** * An abstract in-memory database that holds "tuples" in a contiguous buffer. * The format/interpretation of the buffers is up to the subclasses. @@ -222,7 +229,7 @@ unsigned int db::push_bucket(char **heads, bucket *bs, const char *s, const char *p, size_t nbytes) { - size_t h = __stl_hash_string(s); + size_t h = hash_djb2(s); unsigned int bucket = h % (map_size * ncpus) / map_size; size_t bucket_size = max(1000000UL,buflen / ncpus * 3); if (heads[bucket] + nbytes < bs[bucket].bufs.back() + bucket_size) { @@ -357,14 +364,19 @@ hmap &h = *ph; // Visit each bucket that's destined to us (visit each source). for (unsigned int i = 0; i < ncpus; i++) { - char *p = movbucs[i][pid].bufs[0], - *end = movbucs[i][pid].bufs[0] + movbucs[i][pid].sz[0]; - while (p < end) { - char *title = p; - char *release = strchr(p, '\0') + 1; - p = strchr(release, '\0') + 2; - // Insert into hash map. - h[title] = release; + const vector<char*>& bufs = movbucs[i][pid].bufs; + const vector<size_t>& sz = movbucs[i][pid].sz; + // Iterate over the bucket. + for (unsigned int j = 0; j < bufs.size(); j++) { + char *p = bufs[j], *end = bufs[j] + sz[j]; + // Iterate over the chunk. + while (p < end) { + char *title = p; + char *release = strchr(p, '\0') + 1; + p = strchr(release, '\0') + 2; + // Insert into hash map. + h[title] = release; + } } } } @@ -398,28 +410,32 @@ int hits = 0, misses = 0; // For each source bucket. for (unsigned int i = 0; i < ncpus; i++) { - char *p = actbucs[i][pid].bufs[0], - *end = actbucs[i][pid].bufs[0] + actbucs[i][pid].sz[0]; + const vector<char*>& bufs = actbucs[i][pid].bufs; + const vector<size_t>& sz = actbucs[i][pid].sz; // Iterate over the bucket. - while (p < end) { - char *name = p; - p = strchr(p, '\0') + 1; - while (true) { - char *title = p; + for (unsigned int j = 0; j < bufs.size(); j++) { + char *p = bufs[j], *end = bufs[j] + sz[j]; + // Iterate over the chunk. + while (p < end) { + char *name = p; p = strchr(p, '\0') + 1; - // Emit the joined tuple (if a join was possible). - if (h.find(title) != h.end()) { - hits++; - join(title, name); - } else { - misses++; + while (true) { + char *title = p; + p = strchr(p, '\0') + 1; + // Emit the joined tuple (if a join was possible). + if (h.find(title) != h.end()) { + hits++; + join(title, name); + } else { + misses++; + } + // End of a tuple? (Don't actually need this check, since the + // hash-partitioning "normalizes" the tuples from the actresses file.) + if (*p == '\0') { + p++; + break; + } } - // End of a tuple? (Don't actually need this check, since the - // hash-partitioning "normalizes" the tuples from the actresses file.) - if (*p == '\0') { - p++; - break; - } } } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |