Thread: [Assorted-commits] SF.net SVN: assorted: [332] hash-join/trunk
From: <yan...@us...> - 2008-02-07 00:47:03
Revision: 332
          http://assorted.svn.sourceforge.net/assorted/?rev=332&view=rev
Author:   yangzhang
Date:     2008-02-06 16:46:50 -0800 (Wed, 06 Feb 2008)

Log Message:
-----------
added prep and eval tools

Added Paths:
-----------
    hash-join/trunk/tools/
    hash-join/trunk/tools/DbPrep.scala
    hash-join/trunk/tools/LogProc.scala
    hash-join/trunk/tools/Makefile

Added: hash-join/trunk/tools/DbPrep.scala
===================================================================
--- hash-join/trunk/tools/DbPrep.scala          (rev 0)
+++ hash-join/trunk/tools/DbPrep.scala  2008-02-07 00:46:50 UTC (rev 332)
@@ -0,0 +1,76 @@
+import commons.Control._
+import commons.Io._
+import java.util.regex._
+object DbPrep {
+  def extract(p: Pattern, s: String) = {
+    val m = p matcher s
+    m.find
+    (m group 1, m group 2)
+  }
+  def cleanTitle(line: String) = {
+    val t = line indexOf " "
+    if (t > 0) line take t else line
+  }
+  def main(args: Array[String]) {
+    val pMovie = Pattern compile """^([^\t]+)\t+(.*)$"""
+    val pActress = Pattern compile """^([^\t]+)\t+([^\t]+)$"""
+    val (doMovies, doActresses) = (true, true)
+    if (doMovies) {
+      using (TextReader("movies.list")) { r =>
+        using (TextWriter("movies.dat")) { w =>
+          var line = r.readLine
+          try {
+            var body = false
+            while (line != null) {
+              if (body && (line contains "----------------")) {
+                body = false
+              }
+              if (body && line != "") {
+                val (title, release) = extract(pMovie, line)
+                w print (title + "\0" + release + "\0\0")
+              }
+              if (!body && (line contains "=======")) {
+                body = true
+              }
+              line = r.readLine
+            }
+          } catch {
+            case e: Exception => { Console.err.println(line); throw e }
+          }
+        }
+      }
+    }
+    if (doActresses) {
+      using (TextReader("actresses.list")) { r =>
+        using (TextWriter("actresses.dat")) { w =>
+          var line = r.readLine
+          try {
+            var body = false
+            while (line != null) {
+              if (body && (line contains "----------------")) {
+                body = false
+              }
+              if (body && line != "") {
+                val (actress, title) = extract(pActress, line)
+                w print (actress + "\0" + cleanTitle(title) + "\0")
+                while (line != "") {
+                  line = r.readLine.trim
+                  if (line != "") {
+                    w print (cleanTitle(title) + "\0")
+                  }
+                }
+                w print "\0"
+              }
+              if (!body && ((line contains "\t") && (line startsWith "----") && (line endsWith "----"))) {
+                body = true
+              }
+              line = r.readLine
+            }
+          } catch {
+            case e: Exception => { Console.err.println(line); throw e }
+          }
+        }
+      }
+    }
+  }
+}

Added: hash-join/trunk/tools/LogProc.scala
===================================================================
--- hash-join/trunk/tools/LogProc.scala         (rev 0)
+++ hash-join/trunk/tools/LogProc.scala 2008-02-07 00:46:50 UTC (rev 332)
@@ -0,0 +1,98 @@
+import commons.Collections._
+import commons.Control._
+import commons.Io._
+// import commons.Plotting._
+import scala.collection.mutable._
+
+object LogProc {
+  type FieldMap = Map[String,Int]
+  type MutFieldMap = HashMap[String,Int]
+  case class Stats(
+    ncpus: Int,
+    values: FieldMap
+  )
+  val descriptors = Array(
+    ("movieLoading",        "loading movies"              ),
+    ("actressLoading",      "loading actresses"           ),
+    ("moviePartitioning",   "hash-partitioning movies"    ),
+    ("actressPartitioning", "hash-partitioning actresses" ),
+    ("movieBuilding",       "building with movies"        ),
+    ("actressProbing",      "probing with actresses"      ),
+    ("sum",                 "sum"                         )
+  )
+  val fieldNameToLabel = Map(descriptors: _*)
+  def fieldName(k: Int) = descriptors(k)._1
+  def main(args: Array[String]) {
+    val lines = using (TextReader(args(0))) (_.readLines.toArray)
+    val map = new MutFieldMap
+    var ncpus = 0
+    val stats = new ArrayBuffer[Stats]
+    var fieldIndex = Iterator from 0
+
+    // Parse logs into Stats.
+    for (line <- lines) {
+      if (line contains " cpus") {
+        // Include sum.
+        map("sum") = sum(map.values)
+        if (ncpus != 0) stats += Stats(ncpus, map.clone)
+        ncpus = line.split(" ")(1).toInt
+        fieldIndex = Iterator from 0
+        map.clear
+      } else if (line contains "main time: ") {
+        map(fieldName(fieldIndex.next)) = line.split(" ").last.toInt
+      }
+    }
+
+    // Build actual plot data.
+    val plotData = new HashMap[String,ArrayBuffer[Int]] {
+      override def default(k: String) = {
+        val buf = new ArrayBuffer[Int]
+        this(k) = buf
+        buf
+      }
+    }
+    val ncpuList = stats map (_.ncpus)
+    for (Stats(ncpus, map) <- stats) {
+      for (field <- map.keys) {
+        plotData(field) += map(field)
+      }
+    }
+
+    // Produce the time and speedup .dats.
+    for ((field,times) <- plotData) {
+      val baseline = times(0).toDouble
+      println(field + ": " + times)
+      using (TextWriter(camelToHyphen(field) + "-time.dat")) { w =>
+        for ((time,ncpus) <- times zip ncpuList) {
+          w.println(ncpus + " " + time)
+        }
+      }
+      using (TextWriter(camelToHyphen(field) + "-speedup.dat")) { w =>
+        for ((time,ncpus) <- times map (baseline / _) zip ncpuList) {
+          w.println(ncpus + " " + time)
+        }
+      }
+    }
+
+    // Instruct gnuplot.
+    def f(s:String) = {
+      {
+        for ((field,_) <- map) yield (
+          "'" + camelToHyphen(field) + s + ".dat" + "' with linespoints title '" + fieldNameToLabel(field) + "'"
+        )
+      } mkString ", "
+    }
+    run("gnuplot", """
+      set terminal pdf
+      set xlabel 'number of threads'
+
+      set output 'times.pdf'
+      set ylabel 'time (ms)'
+      plot """ + f("-time") + """
+
+      set output 'speedups.pdf'
+      set ylabel 'speedup (relative to 1 thread)'
+      plot """ + f("-speedup")
+    )
+  }
+}

Added: hash-join/trunk/tools/Makefile
===================================================================
--- hash-join/trunk/tools/Makefile              (rev 0)
+++ hash-join/trunk/tools/Makefile      2008-02-07 00:46:50 UTC (rev 332)
@@ -0,0 +1,24 @@
+COMMONS_SRCS := $(wildcard commons/*.scala)
+DBPREP_SRCS := DbPrep.scala $(COMMONS_SRCS)
+LOGPREP_SRCS := LogProc.scala $(COMMONS_SRCS)
+
+all: out/DbPrep.class out/LogProc.class
+
+out/DbPrep.class: $(DBPREP_SRCS)
+	mkdir -p out
+	fsc -deprecation -d out $^
+
+out/LogProc.class: $(LOGPREP_SRCS)
+	mkdir -p out
+	fsc -deprecation -d out $^
+
+run: out/DbPrep.class
+	scala -cp out DbPrep
+
+proc: out/LogProc.class
+	scala -cp out LogProc log
+
+clean:
+	rm -rf out
+
+.PHONY: clean run
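A note on the record format these tools emit: DbPrep writes NUL-delimited variable-length records, each movie as title\0release\0\0 and each actress as her name followed by a NUL-terminated list of titles closed by an empty field. The following is a rough C++ sketch of how a consumer might walk such a buffer once it is read into memory; it is hypothetical, for illustration only, and the project's actual loader in src/hashjoin.cc may parse these files differently.

    #include <cstdio>
    #include <cstring>

    // Walk a buffer holding NUL-delimited movie records of the form
    // title '\0' release '\0' '\0', as written by DbPrep above.
    // Hypothetical reader for illustration; the real loader may differ.
    void walk_movies(const char *p, const char *end) {
      while (p < end) {
        const char *title = p;
        p += strlen(p) + 1;   // past the title and its terminating NUL
        const char *release = p;
        p += strlen(p) + 1;   // past the release year
        p += 1;               // past the empty field that ends the record
        printf("%s (%s)\n", title, release);
      }
    }

The trailing empty field makes record boundaries detectable without any length prefix, at the cost of forbidding empty fields inside a record.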
From: <yan...@us...> - 2008-02-14 20:33:13
Revision: 413
          http://assorted.svn.sourceforge.net/assorted/?rev=413&view=rev
Author:   yangzhang
Date:     2008-02-14 12:33:16 -0800 (Thu, 14 Feb 2008)

Log Message:
-----------
added more doc and web publishing

Added Paths:
-----------
    hash-join/trunk/doc/
    hash-join/trunk/doc/Makefile
    hash-join/trunk/doc/analysis.txt

Added: hash-join/trunk/doc/Makefile
===================================================================
--- hash-join/trunk/doc/Makefile        (rev 0)
+++ hash-join/trunk/doc/Makefile        2008-02-14 20:33:16 UTC (rev 413)
@@ -0,0 +1,23 @@
+PROJECT := hash-join
+WEBDIR := assorted/htdocs/$(PROJECT)
+PANDOC = pandoc -s -S --tab-stop=2 -c ../main.css -o $@ $^
+
+all: index.html analysis.html
+
+index.html: ../README
+	$(PANDOC)
+
+analysis.html: analysis.txt
+	$(PANDOC)
+
+publish: analysis.html index.html
+	ssh shell-sf mkdir -p $(WEBDIR)/
+	scp $^ shell-sf:$(WEBDIR)/
+
+publish-data: times.pdf speedups.pdf
+	scp $^ shell-sf:$(WEBDIR)/
+
+clean:
+	rm -f index.html analysis.html
+
+.PHONY: clean publish

Added: hash-join/trunk/doc/analysis.txt
===================================================================
--- hash-join/trunk/doc/analysis.txt    (rev 0)
+++ hash-join/trunk/doc/analysis.txt    2008-02-14 20:33:16 UTC (rev 413)
@@ -0,0 +1,37 @@
+% Hash-Join Benchmarks
+% Yang Zhang
+
+Here are the graphs from the latest experiments and implementation:
+
+- [times](times.pdf)
+- [speedups](speedups.pdf)
+
+This implementation was originally not scalable in the hashtable-building
+stage, which performed frequent allocations. The hashtable is stock from the
+SGI/libstdc++ implementation. I removed this bottleneck by providing a custom
+allocator that allocated from a non-freeing local memory arena.
+
+Profiling reveals that most of the time is spent in the hash functions and the
+function that performs the memcpy during hash-partitioning. `actdb::partition1`
+is the hash-partitioning function for actresses, and it calls `push_bucket` to
+copy tuples into buckets. `scan` is just a function to touch all the data from
+the file.
+
+      %   cumulative   self              self     total
+     time   seconds   seconds    calls   s/call   s/call  name
+    16.40      0.82     0.82  4547797     0.00     0.00  commons::hash_djb2(char const*)
+    14.80      1.56     0.74  4547797     0.00     0.00  __gnu_cxx::__stl_hash_string(char const*)
+    13.20      2.22     0.66  4547797     0.00     0.00  db::push_bucket(char**, bucket*, char const*, char const*, unsigned long)
+    12.80      2.86     0.64        2     0.32     0.32  commons::scan(void const*, unsigned long)
+    10.80      3.40     0.54        1     0.54     1.78  actdb::partition1(unsigned int, bucket*)
+    ...
+
+Now the hashtable construction phase is the most scalable part of the
+algorithm. The remaining bottlenecks appear to be due to the memory stalls.
+
+The program does not scale much beyond the 16 threads, though performance does
+improve slightly. This is due to the contention for cache capacity among
+multiple hardware threads per core.
+
+This implementation is straightforward, with no fanciness in terms of custom
+scheduling and control over allocation, leaving many things up to the OS.
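The write-up above mentions a custom allocator over a non-freeing local memory arena but does not show it; that code lives in the C++ sources. The sketch below illustrates the general technique only. The class names, chunk size, and interface details are invented here, and the actual commons implementation may differ.

    #include <cstddef>
    #include <new>
    #include <vector>

    // Bump-pointer arena: carve allocations out of big chunks; individual
    // frees are impossible, and everything is released when the arena dies.
    class arena {
      std::vector<char*> chunks;
      char *cur;
      size_t left;
      enum { CHUNK = 1 << 20 };  // 1 MB per chunk (arbitrary choice)
    public:
      arena() : cur(0), left(0) {}
      ~arena() { for (size_t i = 0; i < chunks.size(); i++) delete[] chunks[i]; }
      void *alloc(size_t n) {
        n = (n + 7) & ~size_t(7);  // keep 8-byte alignment
        if (n > left) {
          size_t sz = n > size_t(CHUNK) ? n : size_t(CHUNK);
          cur = new char[sz];
          chunks.push_back(cur);
          left = sz;
        }
        void *p = cur;
        cur += n; left -= n;
        return p;
      }
    };

    // Minimal STL-style allocator over the arena; deallocate is a no-op,
    // so hashtable inserts stop hitting the global heap per element.
    template <class T> struct arena_alloc {
      typedef T value_type;   typedef T *pointer;  typedef const T *const_pointer;
      typedef T &reference;   typedef const T &const_reference;
      typedef size_t size_type;  typedef ptrdiff_t difference_type;
      template <class U> struct rebind { typedef arena_alloc<U> other; };
      arena *a;
      explicit arena_alloc(arena *a_) : a(a_) {}
      template <class U> arena_alloc(const arena_alloc<U> &o) : a(o.a) {}
      pointer allocate(size_type n, const void* = 0) {
        return static_cast<pointer>(a->alloc(n * sizeof(T)));
      }
      void deallocate(pointer, size_type) {}  // non-freeing: a no-op
      void construct(pointer p, const T &v) { new (p) T(v); }
      void destroy(pointer p) { p->~T(); }
      size_type max_size() const { return size_type(-1) / sizeof(T); }
    };

    template <class T, class U>
    bool operator==(const arena_alloc<T> &x, const arena_alloc<U> &y)
    { return x.a == y.a; }
    template <class T, class U>
    bool operator!=(const arena_alloc<T> &x, const arena_alloc<U> &y)
    { return !(x == y); }

Passing such an allocator as the hashtable's allocator parameter turns each insert's allocation into a pointer bump, which matches in spirit the scalability fix the analysis describes.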
From: <yan...@us...> - 2008-02-15 01:40:11
Revision: 415
          http://assorted.svn.sourceforge.net/assorted/?rev=415&view=rev
Author:   yangzhang
Date:     2008-02-14 17:40:15 -0800 (Thu, 14 Feb 2008)

Log Message:
-----------
updated analysis, readme, doc publishing

Modified Paths:
--------------
    hash-join/trunk/README
    hash-join/trunk/doc/Makefile
    hash-join/trunk/doc/analysis.txt

Modified: hash-join/trunk/README
===================================================================
--- hash-join/trunk/README      2008-02-14 20:33:35 UTC (rev 414)
+++ hash-join/trunk/README      2008-02-15 01:40:15 UTC (rev 415)
@@ -29,6 +29,9 @@
 there is a match, then emit the resulting joined tuple (movie title, movie
 release year, actress name).
 
+Results
+-------
+
 Here are some [results].
 
 Requirements
@@ -75,7 +78,7 @@
 this dataset and to observe the resulting distributions.
 
 [C++ Commons]: http://assorted.sf.net/cpp-commons/
-[HashDist]: http://assorted.sf.net/
+[HashDist]: http://assorted.svn.sourceforge.net/viewvc/assorted/hash-dist/trunk/
 [Multiprocessor Hash-Based Join Algorithms]: http://citeseer.ist.psu.edu/50143.html
 [Scala Commons]: http://assorted.sf.net/scala-commons/
 [g++]: http://gcc.gnu.org/

Modified: hash-join/trunk/doc/Makefile
===================================================================
--- hash-join/trunk/doc/Makefile        2008-02-14 20:33:35 UTC (rev 414)
+++ hash-join/trunk/doc/Makefile        2008-02-15 01:40:15 UTC (rev 415)
@@ -1,6 +1,7 @@
-PROJECT := hash-join
-WEBDIR := assorted/htdocs/$(PROJECT)
-PANDOC = pandoc -s -S --tab-stop=2 -c ../main.css -o $@ $^
+PROJECT := hash-join
+WEBDIR := assorted/htdocs/$(PROJECT)
+HTMLFRAG := ../../../assorted-site/trunk
+PANDOC = pandoc -s -S --tab-stop=2 -c ../main.css -H $(HTMLFRAG)/header.html -A $(HTMLFRAG)/google-footer.html -o $@ $^
 
 all: index.html analysis.html
 
@@ -14,10 +15,10 @@
 	ssh shell-sf mkdir -p $(WEBDIR)/
 	scp $^ shell-sf:$(WEBDIR)/
 
-publish-data: times.pdf speedups.pdf
+publish-data: ../tools/data/*.pdf
 	scp $^ shell-sf:$(WEBDIR)/
 
 clean:
 	rm -f index.html analysis.html
 
-.PHONY: clean publish
+.PHONY: clean publish publish-data

Modified: hash-join/trunk/doc/analysis.txt
===================================================================
--- hash-join/trunk/doc/analysis.txt    2008-02-14 20:33:35 UTC (rev 414)
+++ hash-join/trunk/doc/analysis.txt    2008-02-15 01:40:15 UTC (rev 415)
@@ -1,4 +1,4 @@
-% Hash-Join Benchmarks
+% Hash-Join Analysis
 % Yang Zhang
 
 Here are the graphs from the latest experiments and implementation:
@@ -9,7 +9,7 @@
 This implementation was originally not scalable in the hashtable-building
 stage, which performed frequent allocations. The hashtable is stock from the
 SGI/libstdc++ implementation. I removed this bottleneck by providing a custom
-allocator that allocated from a non-freeing local memory arena.
+allocator that allocates from a non-freeing local memory arena.
 
 Profiling reveals that most of the time is spent in the hash functions and the
 function that performs the memcpy during hash-partitioning. `actdb::partition1`
@@ -27,11 +27,14 @@
     ...
 
 Now the hashtable construction phase is the most scalable part of the
-algorithm. The remaining bottlenecks appear to be due to the memory stalls.
+algorithm (despite its random access nature). The remaining bottlenecks appear
+to be due to memory stalls, but these are mostly masked by hardware
+prefetching.
 
-The program does not scale much beyond the 16 threads, though performance does
-improve slightly. This is due to the contention for cache capacity among
-multiple hardware threads per core.
+The program does not scale much beyond 16 threads, though performance does
+improve slightly. The inability to scale beyond 16 is most likely due to the
+contention for cache capacity among multiple hardware threads per core.
 
-This implementation is straightforward, with no fanciness in terms of custom
-scheduling and control over allocation, leaving many things up to the OS.
+I've tried to keep the implementation simple, with no fanciness in terms of
+custom task scheduling or control over allocation, leaving many things up to
+the OS.
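An aside on the profile quoted in analysis.txt: commons::hash_djb2 is presumably Bernstein's well-known djb2 string hash, whose textbook form is a multiply-by-33 loop seeded with 5381. That form (shown below for reference; the commons version may differ in detail) does work proportional to key length, which is consistent with it topping a profile that hashes every title and name.

    // Bernstein's djb2 string hash: h = h*33 + c, seeded with 5381.
    // Textbook form for reference; commons::hash_djb2 may vary.
    unsigned long hash_djb2(const char *s) {
      unsigned long h = 5381;
      for (int c; (c = (unsigned char)*s++) != 0; )
        h = h * 33 + c;  // often written ((h << 5) + h) + c
      return h;
    }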
From: <yan...@us...> - 2008-02-16 04:46:41
Revision: 455
          http://assorted.svn.sourceforge.net/assorted/?rev=455&view=rev
Author:   yangzhang
Date:     2008-02-15 20:46:41 -0800 (Fri, 15 Feb 2008)

Log Message:
-----------
added basic autotools

Added Paths:
-----------
    hash-join/trunk/Makefile.am
    hash-join/trunk/configure.ac

Added: hash-join/trunk/Makefile.am
===================================================================
--- hash-join/trunk/Makefile.am         (rev 0)
+++ hash-join/trunk/Makefile.am 2008-02-16 04:46:41 UTC (rev 455)
@@ -0,0 +1,2 @@
+bin_PROGRAMS = hashjoin
+hashjoin_SOURCES = src/hashjoin.cc

Added: hash-join/trunk/configure.ac
===================================================================
--- hash-join/trunk/configure.ac        (rev 0)
+++ hash-join/trunk/configure.ac        2008-02-16 04:46:41 UTC (rev 455)
@@ -0,0 +1,35 @@
+# TODO: header-checking does nothing
+# TODO: replace with simple-build
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ(2.61)
+AC_INIT(hash-join, 0.1, gmail:yaaang)
+AM_INIT_AUTOMAKE(cppcommons, 0.1)
+AC_CONFIG_SRCDIR([src/hashjoin.cc])
+AC_CONFIG_HEADER([config.h])
+
+# Checks for programs.
+AC_PROG_CXX
+
+# Checks for libraries.
+#### AC_CHECK_LIB([profile], [main])
+AC_CHECK_LIB([pthread], [pthread_create])
+
+# Checks for header files.
+AC_CHECK_HEADERS([fcntl.h sys/time.h])
+AC_CHECK_HEADERS([boost/any.h])
+AC_CHECK_HEADERS([commons/check.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_HEADER_STDBOOL
+AC_C_CONST
+AC_C_INLINE
+AC_TYPE_SIZE_T
+
+# Checks for library functions.
+AC_HEADER_STDC
+AC_CHECK_FUNCS([strchr])
+
+AC_CONFIG_FILES([Makefile])
+AC_OUTPUT
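These two files are only the autotools inputs. Building from them would presumably follow the standard bootstrap: `aclocal`, `autoconf`, `autoheader`, and `automake --add-missing` (or a single `autoreconf --install`), then `./configure && make` to produce the `hashjoin` binary from src/hashjoin.cc. As the TODO at the top of configure.ac admits, the header checks are not yet acted on by the build, so this is a starting point rather than a finished setup.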
From: <yan...@us...> - 2008-02-29 16:49:38
Revision: 548
          http://assorted.svn.sourceforge.net/assorted/?rev=548&view=rev
Author:   yangzhang
Date:     2008-02-29 08:49:43 -0800 (Fri, 29 Feb 2008)

Log Message:
-----------
publisher publishes data

Modified Paths:
--------------
    hash-join/trunk/publish.bash

Removed Paths:
-------------
    hash-join/trunk/doc/Makefile

Deleted: hash-join/trunk/doc/Makefile
===================================================================
--- hash-join/trunk/doc/Makefile        2008-02-29 16:49:18 UTC (rev 547)
+++ hash-join/trunk/doc/Makefile        2008-02-29 16:49:43 UTC (rev 548)
@@ -1,24 +0,0 @@
-PROJECT := hash-join
-WEBDIR := assorted/htdocs/$(PROJECT)
-HTMLFRAG := ../../../assorted-site/trunk
-PANDOC = pandoc -s -S --tab-stop=2 -c ../main.css -H $(HTMLFRAG)/header.html -A $(HTMLFRAG)/google-footer.html -o $@ $^
-
-all: index.html analysis.html
-
-index.html: ../README
-	$(PANDOC)
-
-analysis.html: analysis.txt
-	$(PANDOC)
-
-publish: analysis.html index.html
-	ssh shell-sf mkdir -p $(WEBDIR)/
-	scp $^ shell-sf:$(WEBDIR)/
-
-publish-data: ../tools/data/*.pdf
-	scp $^ shell-sf:$(WEBDIR)/
-
-clean:
-	rm -f index.html analysis.html
-
-.PHONY: clean publish publish-data

Modified: hash-join/trunk/publish.bash
===================================================================
--- hash-join/trunk/publish.bash        2008-02-29 16:49:18 UTC (rev 547)
+++ hash-join/trunk/publish.bash        2008-02-29 16:49:43 UTC (rev 548)
@@ -3,5 +3,5 @@
 project=hash-join
 clean=false
 websrcs=( README doc/analysis.txt )
-webfiles=()
+webfiles=( tools/data/*.pdf )
 . assorted.bash "$@"
From: <yan...@us...> - 2008-03-03 04:45:41
Revision: 580
          http://assorted.svn.sourceforge.net/assorted/?rev=580&view=rev
Author:   yangzhang
Date:     2008-03-02 20:45:44 -0800 (Sun, 02 Mar 2008)

Log Message:
-----------
updated for new publisher

Modified Paths:
--------------
    hash-join/trunk/README
    hash-join/trunk/publish.bash

Modified: hash-join/trunk/README
===================================================================
--- hash-join/trunk/README      2008-03-03 04:41:41 UTC (rev 579)
+++ hash-join/trunk/README      2008-03-03 04:45:44 UTC (rev 580)
@@ -1,6 +1,3 @@
-% Parallel Hash Join
-% Yang Zhang
-
 Overview
 --------
 
@@ -55,7 +52,7 @@
     $ svn --quiet co https://assorted.svn.sourceforge.net/svnroot/assorted/hash-join/trunk hash-join
     $ ln -s "$PWD/cpp-commons/src/commons" hash-join/src/
     $ cd hash-join/src/
-    $ make hashjoin-opt
+    $ CPATH="$PWD" make hashjoin-opt
     $ out/hashjoin-opt 16 $MOVIEDATA/{movies,actresses}.dat
 
 Supporting Tools

Modified: hash-join/trunk/publish.bash
===================================================================
--- hash-join/trunk/publish.bash        2008-03-03 04:41:41 UTC (rev 579)
+++ hash-join/trunk/publish.bash        2008-03-03 04:45:44 UTC (rev 580)
@@ -1,7 +1,10 @@
 #!/usr/bin/env bash
 
-project=hash-join
-clean=false
+fullname='Parallel Hash-Join'
+version=0.1
+license=gpl3
 websrcs=( README doc/analysis.txt )
 webfiles=( tools/data/*.pdf )
+rels=( src-tgz: )
+nodl=true
 . assorted.bash "$@"