Thread: [Assorted-commits] SF.net SVN: assorted: [356] numa-bench/trunk/src
Brought to you by:
yangzhang
From: <yan...@us...> - 2008-02-10 21:48:13
|
Revision: 356 http://assorted.svn.sourceforge.net/assorted/?rev=356&view=rev Author: yangzhang Date: 2008-02-10 13:48:17 -0800 (Sun, 10 Feb 2008) Log Message: ----------- updates Modified Paths: -------------- numa-bench/trunk/src/Makefile numa-bench/trunk/src/malloc.cc Added Paths: ----------- numa-bench/trunk/src/openmp.cc numa-bench/trunk/src/threads.cc Modified: numa-bench/trunk/src/Makefile =================================================================== --- numa-bench/trunk/src/Makefile 2008-02-10 18:52:12 UTC (rev 355) +++ numa-bench/trunk/src/Makefile 2008-02-10 21:48:17 UTC (rev 356) @@ -1,7 +1,8 @@ COMMONS := $(wildcard commons/*.h) -CXX = g++ -I. -lnuma -o $@ $^ +CXX = g++-4.2 -I. -lnuma -lpthread -o $@ $^ -all: avail cache +# all: avail cache malloc threads +all: malloc avail: avail.cc $(COMMONS) $(CXX) @@ -9,6 +10,15 @@ cache: cache.cc $(COMMONS) $(CXX) +malloc: malloc.cc $(COMMONS) + $(CXX) + +threads: threads.cc $(COMMONS) + $(CXX) + +openmp: openmp.cc + $(CXX) -fopenmp + clean: rm -f avail cache Modified: numa-bench/trunk/src/malloc.cc =================================================================== --- numa-bench/trunk/src/malloc.cc 2008-02-10 18:52:12 UTC (rev 355) +++ numa-bench/trunk/src/malloc.cc 2008-02-10 21:48:17 UTC (rev 356) @@ -2,32 +2,73 @@ #include <cstdlib> #include <iostream> -#include <time.h> -#include <pthread.h> +#include <sched.h> + +#include <commons/check.h> +#include <commons/threads.h> +#include <commons/time.h> + +using namespace commons; using namespace std; const size_t size = 10000000; -void -touch(void *pp) +void* +chew(void* pp) { - char *p = (char*) pp; + char* p = (char*) pp; const int reps = 100; - time_t t0 = time(NULL); + pid_t pid = gettid(); + timer t(": "); + + // Pin this thread to the right processor. + cpu_set_t cs; + CPU_ZERO(&cs); + CPU_SET(1, &cs); + sched_setaffinity(pid, sizeof(cs), &cs); + + // TODO: try shuffling indexes for (int c = 0; c < reps; c++) { for (size_t i = 0; i < size; i++) { p[i] = i; } } - time_t t1 = time(NULL); - cout << t1 - t0 << endl; + + // Print the elapsed time; + cout << pid; + t.print(); + return NULL; } int -main() +main(int argc, char** argv) { + if (argc < 2) { + cerr << "malloc <nthreads>" << endl; + return 1; + } + + const int n = atoi(argv[1]); void *p = malloc(size); - touch(p); + + // warmup + chew(p); + pthread_t ts[n]; + + // start thread on each core + for (int i = 0; i < n; i++) { + check(pthread_create(&ts[i], NULL, chew, p) == 0); + } + waitall(ts, n); return 0; + + // THRASH + + // spawn workers + for (int i = 0; i < n; i++) { + check(pthread_create(&ts[i], NULL, chew, p) == 0); + } + waitall(ts, n); + return 0; } Added: numa-bench/trunk/src/openmp.cc =================================================================== --- numa-bench/trunk/src/openmp.cc (rev 0) +++ numa-bench/trunk/src/openmp.cc 2008-02-10 21:48:17 UTC (rev 356) @@ -0,0 +1,7 @@ +#pragma omp + +int +main() +{ + return 0; +} Added: numa-bench/trunk/src/threads.cc =================================================================== --- numa-bench/trunk/src/threads.cc (rev 0) +++ numa-bench/trunk/src/threads.cc 2008-02-10 21:48:17 UTC (rev 356) @@ -0,0 +1,30 @@ +/** + * Demonstrates that each thread has its own unique TID. + */ + +#include <iostream> + +#include <pthread.h> +#include <sys/types.h> +#include <commons/threads.h> + +using namespace std; +using namespace commons; + +void* +print_tid(void*) +{ + pid_t pid = gettid(); + (pid_t)syscall(__NR_gettid); + cout << pid << endl; +} + +int +main() +{ + pthread_t t; + pthread_create(&t, NULL, &print_tid, NULL); + print_tid(NULL); + pthread_join(t, NULL); + return 0; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-02-12 19:17:49
|
Revision: 394 http://assorted.svn.sourceforge.net/assorted/?rev=394&view=rev Author: yangzhang Date: 2008-02-12 11:16:18 -0800 (Tue, 12 Feb 2008) Log Message: ----------- added custom_alloc test Modified Paths: -------------- numa-bench/trunk/src/Makefile Added Paths: ----------- numa-bench/trunk/src/custom_alloc.cc Modified: numa-bench/trunk/src/Makefile =================================================================== --- numa-bench/trunk/src/Makefile 2008-02-12 18:01:24 UTC (rev 393) +++ numa-bench/trunk/src/Makefile 2008-02-12 19:16:18 UTC (rev 394) @@ -16,6 +16,9 @@ threads: threads.cc $(COMMONS) $(CXX) +custom_alloc: custom_alloc.cc + $(CXX) + openmp: openmp.cc $(CXX) -fopenmp Added: numa-bench/trunk/src/custom_alloc.cc =================================================================== --- numa-bench/trunk/src/custom_alloc.cc (rev 0) +++ numa-bench/trunk/src/custom_alloc.cc 2008-02-12 19:16:18 UTC (rev 394) @@ -0,0 +1,60 @@ +// TODO: add in test of tbb allocator. + +#include <commons/region.h> +#include <commons/time.h> +#include <commons/boost/threads.h> +#include <boost/bind.hpp> + +using namespace boost; +using namespace commons; + +void +f(region_alloc<int>* a, int count) +{ + timer t("region: "); + int* j = NULL; + for (int i = 0; i < count; i++) { + j = a->allocate(1); + } + t.print(); + cout << j << endl; +} + +void +g(int count) +{ + timer t("new: "); + int* j = NULL; + for (int i = 0; i < count; i++) { + j = new int; + } + t.print(); + cout << j << endl; +} + +int +main() +{ + int ncores = 16; + int count = 1000000; + pthread_t ts[ncores]; + + // This is much faster. + { + region_alloc<int> as[ncores]; + for (int i = 0; i < ncores; i++) { + check((ts[i] = spawn(bind(f, &as[i], count))) != 0); + } + waitall(ts, ncores); + } + + // This is much slower. + { + for (int i = 0; i < ncores; i++) { + check((ts[i] = spawn(bind(g, count))) != 0); + } + waitall(ts, ncores); + } + + return 0; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-02-24 06:52:49
|
Revision: 490 http://assorted.svn.sourceforge.net/assorted/?rev=490&view=rev Author: yangzhang Date: 2008-02-23 22:52:53 -0800 (Sat, 23 Feb 2008) Log Message: ----------- moved to simple-build Added Paths: ----------- numa-bench/trunk/src/build Removed Paths: ------------- numa-bench/trunk/src/Makefile Deleted: numa-bench/trunk/src/Makefile =================================================================== --- numa-bench/trunk/src/Makefile 2008-02-24 06:49:38 UTC (rev 489) +++ numa-bench/trunk/src/Makefile 2008-02-24 06:52:53 UTC (rev 490) @@ -1,34 +0,0 @@ -COMMONS := $(wildcard commons/*.h) -CXX = g++-4.2 -O3 -I. -lnuma -lpthread -o $@ $< - -# all: avail cache malloc threads -all: malloc - -avail: avail.cc $(COMMONS) - $(CXX) - -cache: cache.cc $(COMMONS) - $(CXX) - -malloc-dbg: malloc.cc $(COMMONS) - $(CXX) -g3 -O0 - -malloc: malloc.cc $(COMMONS) - $(CXX) - -threads: threads.cc $(COMMONS) - $(CXX) - -custom_alloc: custom_alloc.cc - $(CXX) - -openmp: openmp.cc - $(CXX) -fopenmp - -bthreads: bthreads.cc - $(CXX) -lboost_thread-gcc41-mt - -clean: - rm -f avail cache - -.PHONY: clean Added: numa-bench/trunk/src/build =================================================================== --- numa-bench/trunk/src/build (rev 0) +++ numa-bench/trunk/src/build 2008-02-24 06:52:53 UTC (rev 490) @@ -0,0 +1,6 @@ +malloc: + srcs: [malloc.cc] + +avail: + srcs: [avail.cc] + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-02-26 08:36:17
|
Revision: 509 http://assorted.svn.sourceforge.net/assorted/?rev=509&view=rev Author: yangzhang Date: 2008-02-26 00:36:09 -0800 (Tue, 26 Feb 2008) Log Message: ----------- experimenting with (most promising) build systems Added Paths: ----------- numa-bench/trunk/src/CMakeLists.txt numa-bench/trunk/src/Jamroot Added: numa-bench/trunk/src/CMakeLists.txt =================================================================== --- numa-bench/trunk/src/CMakeLists.txt (rev 0) +++ numa-bench/trunk/src/CMakeLists.txt 2008-02-26 08:36:09 UTC (rev 509) @@ -0,0 +1,7 @@ +project (numa-bench) + +add_executable (malloc malloc.cc) +target_link_libraries (malloc pthread) + +add_executable (avail avail.cc) +target_link_libraries (avail numa) Added: numa-bench/trunk/src/Jamroot =================================================================== --- numa-bench/trunk/src/Jamroot (rev 0) +++ numa-bench/trunk/src/Jamroot 2008-02-26 08:36:09 UTC (rev 509) @@ -0,0 +1,2 @@ +lib pthread ; +exe malloc : malloc.cc pthread : <toolset>gcc:<cxxflags>-g3 ; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-02-26 19:48:39
|
Revision: 513 http://assorted.svn.sourceforge.net/assorted/?rev=513&view=rev Author: yangzhang Date: 2008-02-26 11:48:37 -0800 (Tue, 26 Feb 2008) Log Message: ----------- renamed malloc to chew Modified Paths: -------------- numa-bench/trunk/src/CMakeLists.txt numa-bench/trunk/src/GNUmakefile numa-bench/trunk/src/Jamroot numa-bench/trunk/src/build Added Paths: ----------- numa-bench/trunk/src/chew.bash numa-bench/trunk/src/chew.cc Removed Paths: ------------- numa-bench/trunk/src/malloc.bash numa-bench/trunk/src/malloc.cc Modified: numa-bench/trunk/src/CMakeLists.txt =================================================================== --- numa-bench/trunk/src/CMakeLists.txt 2008-02-26 19:41:17 UTC (rev 512) +++ numa-bench/trunk/src/CMakeLists.txt 2008-02-26 19:48:37 UTC (rev 513) @@ -1,7 +1,7 @@ project (numa-bench) -add_executable (malloc malloc.cc) -target_link_libraries (malloc pthread) +add_executable (chew chew.cc) +target_link_libraries (chew pthread) add_executable (avail avail.cc) target_link_libraries (avail numa) Modified: numa-bench/trunk/src/GNUmakefile =================================================================== --- numa-bench/trunk/src/GNUmakefile 2008-02-26 19:41:17 UTC (rev 512) +++ numa-bench/trunk/src/GNUmakefile 2008-02-26 19:48:37 UTC (rev 513) @@ -1,16 +1,16 @@ OUTDIR := /home/yang/work/assorted/numa-bench/trunk/src/out SRCDIR := /home/yang/work/assorted/numa-bench/trunk/src SRCPATH := /home/yang/work/assorted/numa-bench/trunk/src -all: malloc avail -PREFIX_0 := /opt/malloc +all: chew avail +PREFIX_0 := /opt/chew BINDIR_0 := $(PREFIX_0)/bin LIBS_0 := -lpthread -SRCS_0 := $(SRCDIR)/malloc.cc -ORIGTARGET_0 := malloc +SRCS_0 := $(SRCDIR)/chew.cc +ORIGTARGET_0 := chew AUTOLIBS_0 := LANG_0 := cpp -TARGET_0 := malloc-dbg +TARGET_0 := chew-dbg FLAGS_0 := -g3 OBJDIR_0 := $(OUTDIR)/$(TARGET_0)-obj @@ -59,15 +59,15 @@ $(ORIGTARGET_0): $(TARGET_0) .PHONY: $(ORIGTARGET_0) -PREFIX_1 := /opt/malloc +PREFIX_1 := /opt/chew BINDIR_1 := $(PREFIX_1)/bin LIBS_1 := -lpthread -SRCS_1 := $(SRCDIR)/malloc.cc -ORIGTARGET_1 := malloc +SRCS_1 := $(SRCDIR)/chew.cc +ORIGTARGET_1 := chew AUTOLIBS_1 := LANG_1 := cpp -TARGET_1 := malloc-opt +TARGET_1 := chew-opt FLAGS_1 := -O3 OBJDIR_1 := $(OUTDIR)/$(TARGET_1)-obj @@ -116,15 +116,15 @@ $(ORIGTARGET_1): $(TARGET_1) .PHONY: $(ORIGTARGET_1) -PREFIX_2 := /opt/malloc +PREFIX_2 := /opt/chew BINDIR_2 := $(PREFIX_2)/bin LIBS_2 := -lpthread -SRCS_2 := $(SRCDIR)/malloc.cc -ORIGTARGET_2 := malloc +SRCS_2 := $(SRCDIR)/chew.cc +ORIGTARGET_2 := chew AUTOLIBS_2 := LANG_2 := cpp -TARGET_2 := malloc-gprof +TARGET_2 := chew-gprof FLAGS_2 := -pg OBJDIR_2 := $(OUTDIR)/$(TARGET_2)-obj Modified: numa-bench/trunk/src/Jamroot =================================================================== --- numa-bench/trunk/src/Jamroot 2008-02-26 19:41:17 UTC (rev 512) +++ numa-bench/trunk/src/Jamroot 2008-02-26 19:48:37 UTC (rev 513) @@ -1,2 +1,2 @@ lib pthread ; -exe malloc : malloc.cc pthread : <toolset>gcc:<cxxflags>-g3 ; +exe chew : chew.cc pthread : <toolset>gcc:<cxxflags>-g3 ; Modified: numa-bench/trunk/src/build =================================================================== --- numa-bench/trunk/src/build 2008-02-26 19:41:17 UTC (rev 512) +++ numa-bench/trunk/src/build 2008-02-26 19:48:37 UTC (rev 513) @@ -1,5 +1,5 @@ -malloc: - srcs: [malloc.cc] +chew: + srcs: [chew.cc] libs: [pthread] avail: Copied: numa-bench/trunk/src/chew.bash (from rev 511, numa-bench/trunk/src/malloc.bash) =================================================================== --- numa-bench/trunk/src/chew.bash (rev 0) +++ numa-bench/trunk/src/chew.bash 2008-02-26 19:48:37 UTC (rev 513) @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +set -o errexit -o nounset + +make -s chew-opt + +function run { + for i in {1..3} + do out/chew-opt "$@" + done +} + +KB=000 MB=000000 GB=000000000 + +# ncores size nreps shuffle par pin local write cross + +echo writes +run 16 100$MB 1 0 0 1 0 1 0 +run 16 1000$MB 1 0 0 1 0 1 0 +run 16 100$MB 10 0 0 1 0 1 0 +run 16 100$MB 1 1 0 1 0 1 0 + +echo reads +run 16 1000$MB 1 0 0 1 0 0 0 +run 16 100$MB 1 1 0 1 0 0 0 + +for n in 1 2 4 8 12 16 ; do + echo par + run $n 10$MB 1 0 1 1 0 0 0 + run $n 10$MB 1 1 1 1 0 0 0 + run $n 10$MB 1 0 1 1 1 0 0 + run $n 10$MB 1 1 1 1 1 0 0 + run $n 10$MB 1 0 1 1 0 1 0 + run $n 10$MB 1 1 1 1 0 1 0 + run $n 10$MB 1 0 1 1 1 1 0 + run $n 10$MB 1 1 1 1 1 1 0 + + echo cross + run $n 10$MB 1 0 1 1 0 0 1 + run $n 10$MB 1 1 1 1 0 0 1 + run $n 10$MB 1 0 1 1 0 1 1 + run $n 10$MB 1 1 1 1 0 1 1 +done Copied: numa-bench/trunk/src/chew.cc (from rev 512, numa-bench/trunk/src/malloc.cc) =================================================================== --- numa-bench/trunk/src/chew.cc (rev 0) +++ numa-bench/trunk/src/chew.cc 2008-02-26 19:48:37 UTC (rev 513) @@ -0,0 +1,246 @@ +#include <fstream> +#include <iostream> + +#include <sched.h> + +#include <boost/bind.hpp> + +#include <commons/check.h> +#include <commons/rand.h> +#include <commons/threads.h> +#include <commons/time.h> +#include <commons/boost/threads.h> + +using namespace boost; +using namespace commons; +using namespace std; + +pthread_barrier_t cross_barrier; + +struct config +{ + /** + * The number of cores to test. This is a parameter (rather than + * auto-detected) because it additionally serves to mean the number of cores + * we want to test in parallel. As this program evolves, these may be + * separated. + */ + const int ncores; + + /** + * Size in bytes of the buffer to chew. + */ + const size_t size; + + /** + * Number of repetitions to chew. + */ + const int nreps; + + /** + * Perform rand access, otherwise sequential scan. + */ + const bool shuffle; + + /** + * Chew in parallel, otherwise each core chews serially. + */ + const bool par; + + /** + * Pin thread i to core i, otherwise let the OS manage things. + */ + const bool pin; + + /** + * Chew my own memory, otherwise chew the given (shared) memory. + */ + const bool local; + + /** + * Do writes, otherwise just do reads. + */ + const bool write; + + /** + * Test cross-communication (use partitions), otherwise use either the + * global/local buffer. + */ + const bool cross; +}; + +void*** partitions; +int global_sum; + +/** + * \param p The buffer to chew. + * \param config The experiment configuration. + * \param len Length of the buffer. + */ +void +chew1(void* pp, config config, size_t len) +{ + int* p = (int*) pp; + const size_t count = len / sizeof(int); + int sum = 0; + posix_rand rand(current_time_millis() ^ gettid()); + if (config.write) { + // Write to the region. + if (config.shuffle) { + // Random access into the memory region. + for (unsigned int c = 0; c < config.nreps; c++) { + for (size_t i = 0; i < count; i++) { + // NOTE: Using r as the index assumes that rand generates large-enough + // values. + int r = rand(); + sum += p[r % count] += r; + } + } + } else { + // Sequential scan through the memory region. + for (unsigned int c = 0; c < config.nreps; c++) { + for (size_t i = 0; i < count; i++) { + sum += p[i] += rand(); + } + } + } + } else { + // Only read from the region. + if (config.shuffle) { + // Random access into the memory region. + for (unsigned int c = 0; c < config.nreps; c++) { + for (size_t i = 0; i < count; i++) { + // NOTE: Using r as the index assumes that rand generates large-enough + // values. + sum += p[rand() % count]; + } + } + } else { + // Sequential scan through the memory region. + for (unsigned int c = 0; c < config.nreps; c++) { + for (size_t i = 0; i < count; i++) { + sum += p[i] + rand(); + } + } + } + } + global_sum += sum; +} + +/** + * \param pp The start of the buffer to chew. + * \param cpu Which CPU to pin our thread to. + * \param config The experiment configuration parameters. + * \param label Prefix for the elapsed time output. + */ +void* +chew(void* pp, unsigned int cpu, const config & config, bool warmup) +{ + // Pin this thread to cpu `cpu`. + if (config.pin) { + pin_thread(cpu); + } + + void* p = config.local ? malloc(config.size) : pp; + timer t(": "); + + if (!warmup && config.cross) { + size_t len = config.size / config.ncores; + for (int i = 0; i < config.ncores; i++) { + partitions[cpu][i] = new char[len]; + } + int barrier_result = pthread_barrier_wait(&cross_barrier); + check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || barrier_result == 0); + // TODO: make this more interesting than just a sequential traversal over + // the partitions. + for (int i = 0; i < config.ncores; i++) { + chew1(partitions[i][cpu], config, len); + } + } else { + chew1(p, config, config.size); + } + + // Print the elapsed time and "result". + if (warmup) cout << "warmup: "; + cout << cpu; + t.print(); + + if (config.local) free(p); + + return NULL; +} + +int +main(int argc, char** argv) +{ + // So that our global shared malloc takes place on the CPU 0's node. + pin_thread(0); + + if (argc < 10) { + cerr << argv[0] << + " <ncores> <size> <nreps> <shuffle> <par> <pin> <local> <write>" << endl; + return 1; + } + + // Parse command-line arguments. TODO + const config config = { + atoi(argv[1]), + atoi(argv[2]), + atoi(argv[3]), + atoi(argv[4]), + atoi(argv[5]), + atoi(argv[6]), + atoi(argv[7]), + atoi(argv[8]), + atoi(argv[9]) + }; + + cout << "config:" + << " ncores " << config.ncores + << " size " << config.size + << " nreps " << config.nreps + << " shuffle " << config.shuffle + << " par " << config.par + << " pin " << config.pin + << " local " << config.local + << " write " << config.write + << " cross " << config.cross << endl; + + checkmsg(RAND_MAX > config.size / sizeof(int), "PRNG range not large enough"); + + void *p = malloc(config.size); + check(p != NULL); + + if (config.cross) { + partitions = new void**[config.ncores]; + for (unsigned int i = 0; i < config.ncores; i++) + partitions[i] = new void*[config.ncores]; + } + + // Warmup. + chew(p, 0, config, true); + + if (config.par) { + // Chew the memory area from each core in parallel (and also chew own). + pthread_t ts[config.ncores]; + check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncores)); + for (int i = 0; i < config.ncores; i++) { + ts[i] = spawn(bind(chew, p, i, ref(config), false)); + } + for (int i = 0; i < config.ncores; i++) { + check(pthread_join(ts[i], NULL) == 0); + } + check(0 == pthread_barrier_destroy(&cross_barrier)); + } else { + // Chew the memory area from each core in sequence. + for (int i = 0; i < config.ncores; i++) { + chew(p, i, config, false); + } + } + + free(p); + ofstream trash("/dev/null"); + trash << "result: " << global_sum << endl; + + return 0; +} Deleted: numa-bench/trunk/src/malloc.bash =================================================================== --- numa-bench/trunk/src/malloc.bash 2008-02-26 19:41:17 UTC (rev 512) +++ numa-bench/trunk/src/malloc.bash 2008-02-26 19:48:37 UTC (rev 513) @@ -1,43 +0,0 @@ -#!/usr/bin/env bash - -set -o errexit -o nounset - -make -s malloc-opt - -function run { - for i in {1..3} - do out/malloc-opt "$@" - done -} - -KB=000 MB=000000 GB=000000000 - -# ncores size nreps shuffle par pin local write cross - -echo writes -run 16 100$MB 1 0 0 1 0 1 0 -run 16 1000$MB 1 0 0 1 0 1 0 -run 16 100$MB 10 0 0 1 0 1 0 -run 16 100$MB 1 1 0 1 0 1 0 - -echo reads -run 16 1000$MB 1 0 0 1 0 0 0 -run 16 100$MB 1 1 0 1 0 0 0 - -for n in 1 2 4 8 12 16 ; do - echo par - run $n 10$MB 1 0 1 1 0 0 0 - run $n 10$MB 1 1 1 1 0 0 0 - run $n 10$MB 1 0 1 1 1 0 0 - run $n 10$MB 1 1 1 1 1 0 0 - run $n 10$MB 1 0 1 1 0 1 0 - run $n 10$MB 1 1 1 1 0 1 0 - run $n 10$MB 1 0 1 1 1 1 0 - run $n 10$MB 1 1 1 1 1 1 0 - - echo cross - run $n 10$MB 1 0 1 1 0 0 1 - run $n 10$MB 1 1 1 1 0 0 1 - run $n 10$MB 1 0 1 1 0 1 1 - run $n 10$MB 1 1 1 1 0 1 1 -done Deleted: numa-bench/trunk/src/malloc.cc =================================================================== --- numa-bench/trunk/src/malloc.cc 2008-02-26 19:41:17 UTC (rev 512) +++ numa-bench/trunk/src/malloc.cc 2008-02-26 19:48:37 UTC (rev 513) @@ -1,246 +0,0 @@ -#include <fstream> -#include <iostream> - -#include <sched.h> - -#include <boost/bind.hpp> - -#include <commons/check.h> -#include <commons/rand.h> -#include <commons/threads.h> -#include <commons/time.h> -#include <commons/boost/threads.h> - -using namespace boost; -using namespace commons; -using namespace std; - -pthread_barrier_t cross_barrier; - -struct config -{ - /** - * The number of cores to test. This is a parameter (rather than - * auto-detected) because it additionally serves to mean the number of cores - * we want to test in parallel. As this program evolves, these may be - * separated. - */ - const int ncores; - - /** - * Size in bytes of the buffer to chew. - */ - const size_t size; - - /** - * Number of repetitions to chew. - */ - const int nreps; - - /** - * Perform rand access, otherwise sequential scan. - */ - const bool shuffle; - - /** - * Chew in parallel, otherwise each core chews serially. - */ - const bool par; - - /** - * Pin thread i to core i, otherwise let the OS manage things. - */ - const bool pin; - - /** - * Chew my own memory, otherwise chew the given (shared) memory. - */ - const bool local; - - /** - * Do writes, otherwise just do reads. - */ - const bool write; - - /** - * Test cross-communication (use partitions), otherwise use either the - * global/local buffer. - */ - const bool cross; -}; - -void*** partitions; -int global_sum; - -/** - * \param p The buffer to chew. - * \param config The experiment configuration. - * \param len Length of the buffer. - */ -void -chew1(void* pp, config config, size_t len) -{ - int* p = (int*) pp; - const size_t count = len / sizeof(int); - int sum = 0; - posix_rand rand(current_time_millis() ^ gettid()); - if (config.write) { - // Write to the region. - if (config.shuffle) { - // Random access into the memory region. - for (unsigned int c = 0; c < config.nreps; c++) { - for (size_t i = 0; i < count; i++) { - // NOTE: Using r as the index assumes that rand generates large-enough - // values. - int r = rand(); - sum += p[r % count] += r; - } - } - } else { - // Sequential scan through the memory region. - for (unsigned int c = 0; c < config.nreps; c++) { - for (size_t i = 0; i < count; i++) { - sum += p[i] += rand(); - } - } - } - } else { - // Only read from the region. - if (config.shuffle) { - // Random access into the memory region. - for (unsigned int c = 0; c < config.nreps; c++) { - for (size_t i = 0; i < count; i++) { - // NOTE: Using r as the index assumes that rand generates large-enough - // values. - sum += p[rand() % count]; - } - } - } else { - // Sequential scan through the memory region. - for (unsigned int c = 0; c < config.nreps; c++) { - for (size_t i = 0; i < count; i++) { - sum += p[i] + rand(); - } - } - } - } - global_sum += sum; -} - -/** - * \param pp The start of the buffer to chew. - * \param cpu Which CPU to pin our thread to. - * \param config The experiment configuration parameters. - * \param label Prefix for the elapsed time output. - */ -void* -chew(void* pp, unsigned int cpu, const config & config, bool warmup) -{ - // Pin this thread to cpu `cpu`. - if (config.pin) { - pin_thread(cpu); - } - - void* p = config.local ? malloc(config.size) : pp; - timer t(": "); - - if (!warmup && config.cross) { - size_t len = config.size / config.ncores; - for (int i = 0; i < config.ncores; i++) { - partitions[cpu][i] = new char[len]; - } - int barrier_result = pthread_barrier_wait(&cross_barrier); - check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || barrier_result == 0); - // TODO: make this more interesting than just a sequential traversal over - // the partitions. - for (int i = 0; i < config.ncores; i++) { - chew1(partitions[i][cpu], config, len); - } - } else { - chew1(p, config, config.size); - } - - // Print the elapsed time and "result". - if (warmup) cout << "warmup: "; - cout << cpu; - t.print(); - - if (config.local) free(p); - - return NULL; -} - -int -main(int argc, char** argv) -{ - // So that our global shared malloc takes place on the CPU 0's node. - pin_thread(0); - - if (argc < 10) { - cerr << argv[0] << - " <ncores> <size> <nreps> <shuffle> <par> <pin> <local> <write>" << endl; - return 1; - } - - // Parse command-line arguments. TODO - const config config = { - atoi(argv[1]), - atoi(argv[2]), - atoi(argv[3]), - atoi(argv[4]), - atoi(argv[5]), - atoi(argv[6]), - atoi(argv[7]), - atoi(argv[8]), - atoi(argv[9]) - }; - - cout << "config:" - << " ncores " << config.ncores - << " size " << config.size - << " nreps " << config.nreps - << " shuffle " << config.shuffle - << " par " << config.par - << " pin " << config.pin - << " local " << config.local - << " write " << config.write - << " cross " << config.cross << endl; - - checkmsg(RAND_MAX > config.size / sizeof(int), "PRNG range not large enough"); - - void *p = malloc(config.size); - check(p != NULL); - - if (config.cross) { - partitions = new void**[config.ncores]; - for (unsigned int i = 0; i < config.ncores; i++) - partitions[i] = new void*[config.ncores]; - } - - // Warmup. - chew(p, 0, config, true); - - if (config.par) { - // Chew the memory area from each core in parallel (and also chew own). - pthread_t ts[config.ncores]; - check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncores)); - for (int i = 0; i < config.ncores; i++) { - ts[i] = spawn(bind(chew, p, i, ref(config), false)); - } - for (int i = 0; i < config.ncores; i++) { - check(pthread_join(ts[i], NULL) == 0); - } - check(0 == pthread_barrier_destroy(&cross_barrier)); - } else { - // Chew the memory area from each core in sequence. - for (int i = 0; i < config.ncores; i++) { - chew(p, i, config, false); - } - } - - free(p); - ofstream trash("/dev/null"); - trash << "result: " << global_sum << endl; - - return 0; -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-02-26 20:30:58
|
Revision: 514 http://assorted.svn.sourceforge.net/assorted/?rev=514&view=rev Author: yangzhang Date: 2008-02-26 12:31:00 -0800 (Tue, 26 Feb 2008) Log Message: ----------- added round-robin node allocation; renamed cores to cpus Modified Paths: -------------- numa-bench/trunk/src/chew.bash numa-bench/trunk/src/chew.cc Modified: numa-bench/trunk/src/chew.bash =================================================================== --- numa-bench/trunk/src/chew.bash 2008-02-26 19:48:37 UTC (rev 513) +++ numa-bench/trunk/src/chew.bash 2008-02-26 20:31:00 UTC (rev 514) @@ -12,32 +12,32 @@ KB=000 MB=000000 GB=000000000 -# ncores size nreps shuffle par pin local write cross +# ncpus size nreps shuffle par pin local write cross rrnodes echo writes -run 16 100$MB 1 0 0 1 0 1 0 -run 16 1000$MB 1 0 0 1 0 1 0 -run 16 100$MB 10 0 0 1 0 1 0 -run 16 100$MB 1 1 0 1 0 1 0 +run 16 100$MB 1 0 0 1 0 1 0 1 +run 16 1000$MB 1 0 0 1 0 1 0 1 +run 16 100$MB 10 0 0 1 0 1 0 1 +run 16 100$MB 1 1 0 1 0 1 0 1 echo reads -run 16 1000$MB 1 0 0 1 0 0 0 -run 16 100$MB 1 1 0 1 0 0 0 +run 16 1000$MB 1 0 0 1 0 0 0 1 +run 16 100$MB 1 1 0 1 0 0 0 1 for n in 1 2 4 8 12 16 ; do echo par - run $n 10$MB 1 0 1 1 0 0 0 - run $n 10$MB 1 1 1 1 0 0 0 - run $n 10$MB 1 0 1 1 1 0 0 - run $n 10$MB 1 1 1 1 1 0 0 - run $n 10$MB 1 0 1 1 0 1 0 - run $n 10$MB 1 1 1 1 0 1 0 - run $n 10$MB 1 0 1 1 1 1 0 - run $n 10$MB 1 1 1 1 1 1 0 + run $n 10$MB 1 0 1 1 0 0 0 1 + run $n 10$MB 1 1 1 1 0 0 0 1 + run $n 10$MB 1 0 1 1 1 0 0 1 + run $n 10$MB 1 1 1 1 1 0 0 1 + run $n 10$MB 1 0 1 1 0 1 0 1 + run $n 10$MB 1 1 1 1 0 1 0 1 + run $n 10$MB 1 0 1 1 1 1 0 1 + run $n 10$MB 1 1 1 1 1 1 0 1 echo cross - run $n 10$MB 1 0 1 1 0 0 1 - run $n 10$MB 1 1 1 1 0 0 1 - run $n 10$MB 1 0 1 1 0 1 1 - run $n 10$MB 1 1 1 1 0 1 1 + run $n 10$MB 1 0 1 1 0 0 1 1 + run $n 10$MB 1 1 1 1 0 0 1 1 + run $n 10$MB 1 0 1 1 0 1 1 1 + run $n 10$MB 1 1 1 1 0 1 1 1 done Modified: numa-bench/trunk/src/chew.cc =================================================================== --- numa-bench/trunk/src/chew.cc 2008-02-26 19:48:37 UTC (rev 513) +++ numa-bench/trunk/src/chew.cc 2008-02-26 20:31:00 UTC (rev 514) @@ -20,12 +20,12 @@ struct config { /** - * The number of cores to test. This is a parameter (rather than - * auto-detected) because it additionally serves to mean the number of cores + * The number of CPUs to test. This is a parameter (rather than + * auto-detected) because it additionally serves to mean the number of CPUs * we want to test in parallel. As this program evolves, these may be * separated. */ - const int ncores; + const unsigned int ncpus; /** * Size in bytes of the buffer to chew. @@ -35,7 +35,7 @@ /** * Number of repetitions to chew. */ - const int nreps; + const unsigned int nreps; /** * Perform rand access, otherwise sequential scan. @@ -43,12 +43,13 @@ const bool shuffle; /** - * Chew in parallel, otherwise each core chews serially. + * Chew in parallel, otherwise each worker chews serially. */ const bool par; /** - * Pin thread i to core i, otherwise let the OS manage things. + * Pin workers to CPUs (which CPU determined by whether we're using + * round-robin CPU allocation), otherwise let the OS manage things. */ const bool pin; @@ -67,6 +68,17 @@ * global/local buffer. */ const bool cross; + + /** + * When using fewer than the maximum number of CPUs, allocate nodes in + * round-robin fashion. + */ + const bool rrnodes; + + /** + * The number of nodes on this machine. + */ + const unsigned int nnodes; }; void*** partitions; @@ -129,15 +141,21 @@ /** * \param pp The start of the buffer to chew. - * \param cpu Which CPU to pin our thread to. + * \param worker From this we can determine which CPU to pin our thread to. * \param config The experiment configuration parameters. * \param label Prefix for the elapsed time output. */ void* -chew(void* pp, unsigned int cpu, const config & config, bool warmup) +chew(void* pp, unsigned int worker, const config & config, bool warmup) { - // Pin this thread to cpu `cpu`. + // Pin this thread to a CPU. if (config.pin) { + unsigned int ncpus = config.ncpus, + nnodes = config.nnodes, + cpuspernode = ncpus / nnodes, + cpu = !config.rrnodes ? + worker : + (cpuspernode * worker) % ncpus + worker / cpuspernode; pin_thread(cpu); } @@ -145,16 +163,17 @@ timer t(": "); if (!warmup && config.cross) { - size_t len = config.size / config.ncores; - for (int i = 0; i < config.ncores; i++) { - partitions[cpu][i] = new char[len]; + size_t len = config.size / config.ncpus; + for (unsigned int i = 0; i < config.ncpus; i++) { + partitions[worker][i] = new char[len]; } int barrier_result = pthread_barrier_wait(&cross_barrier); - check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || barrier_result == 0); + check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || + barrier_result == 0); // TODO: make this more interesting than just a sequential traversal over // the partitions. - for (int i = 0; i < config.ncores; i++) { - chew1(partitions[i][cpu], config, len); + for (unsigned int i = 0; i < config.ncpus; i++) { + chew1(partitions[i][worker], config, len); } } else { chew1(p, config, config.size); @@ -162,7 +181,7 @@ // Print the elapsed time and "result". if (warmup) cout << "warmup: "; - cout << cpu; + cout << worker; t.print(); if (config.local) free(p); @@ -176,27 +195,30 @@ // So that our global shared malloc takes place on the CPU 0's node. pin_thread(0); - if (argc < 10) { + if (argc < 11) { cerr << argv[0] << - " <ncores> <size> <nreps> <shuffle> <par> <pin> <local> <write>" << endl; + " <ncpus> <size> <nreps> <shuffle> <par> <pin>" + " <local> <write> <rrnodes> <nnodes>" << endl; return 1; } - // Parse command-line arguments. TODO - const config config = { - atoi(argv[1]), - atoi(argv[2]), - atoi(argv[3]), - atoi(argv[4]), - atoi(argv[5]), - atoi(argv[6]), - atoi(argv[7]), - atoi(argv[8]), - atoi(argv[9]) + // Parse command-line arguments. + const config config = { + atoi(argv[ 1]), + atoi(argv[ 2]), + atoi(argv[ 3]), + atoi(argv[ 4]), + atoi(argv[ 5]), + atoi(argv[ 6]), + atoi(argv[ 7]), + atoi(argv[ 8]), + atoi(argv[ 9]), + atoi(argv[10]), + atoi(argv[11]) }; cout << "config:" - << " ncores " << config.ncores + << " ncpus " << config.ncpus << " size " << config.size << " nreps " << config.nreps << " shuffle " << config.shuffle @@ -204,7 +226,10 @@ << " pin " << config.pin << " local " << config.local << " write " << config.write - << " cross " << config.cross << endl; + << " cross " << config.cross + << " rrnodes " << config.rrnodes + << " nnodes " << config.nnodes + << endl; checkmsg(RAND_MAX > config.size / sizeof(int), "PRNG range not large enough"); @@ -212,9 +237,9 @@ check(p != NULL); if (config.cross) { - partitions = new void**[config.ncores]; - for (unsigned int i = 0; i < config.ncores; i++) - partitions[i] = new void*[config.ncores]; + partitions = new void**[config.ncpus]; + for (unsigned int i = 0; i < config.ncpus; i++) + partitions[i] = new void*[config.ncpus]; } // Warmup. @@ -222,18 +247,18 @@ if (config.par) { // Chew the memory area from each core in parallel (and also chew own). - pthread_t ts[config.ncores]; - check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncores)); - for (int i = 0; i < config.ncores; i++) { + pthread_t ts[config.ncpus]; + check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncpus)); + for (unsigned int i = 0; i < config.ncpus; i++) { ts[i] = spawn(bind(chew, p, i, ref(config), false)); } - for (int i = 0; i < config.ncores; i++) { + for (unsigned int i = 0; i < config.ncpus; i++) { check(pthread_join(ts[i], NULL) == 0); } check(0 == pthread_barrier_destroy(&cross_barrier)); } else { // Chew the memory area from each core in sequence. - for (int i = 0; i < config.ncores; i++) { + for (unsigned int i = 0; i < config.ncpus; i++) { chew(p, i, config, false); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-03-04 02:31:12
|
Revision: 594 http://assorted.svn.sourceforge.net/assorted/?rev=594&view=rev Author: yangzhang Date: 2008-03-03 18:29:07 -0800 (Mon, 03 Mar 2008) Log Message: ----------- fix args Modified Paths: -------------- numa-bench/trunk/src/chew.bash numa-bench/trunk/src/chew.cc Modified: numa-bench/trunk/src/chew.bash =================================================================== --- numa-bench/trunk/src/chew.bash 2008-03-03 23:57:43 UTC (rev 593) +++ numa-bench/trunk/src/chew.bash 2008-03-04 02:29:07 UTC (rev 594) @@ -12,32 +12,32 @@ KB=000 MB=000000 GB=000000000 -# ncpus size nreps shuffle par pin local write cross rrnodes +# ncpus size nreps shuffle par pin local write cross rrnodes nnodes echo writes -run 16 100$MB 1 0 0 1 0 1 0 1 -run 16 1000$MB 1 0 0 1 0 1 0 1 -run 16 100$MB 10 0 0 1 0 1 0 1 -run 16 100$MB 1 1 0 1 0 1 0 1 +run 16 100$MB 1 0 0 1 0 1 0 1 4 +run 16 1000$MB 1 0 0 1 0 1 0 1 4 +run 16 100$MB 10 0 0 1 0 1 0 1 4 +run 16 100$MB 1 1 0 1 0 1 0 1 4 echo reads -run 16 1000$MB 1 0 0 1 0 0 0 1 -run 16 100$MB 1 1 0 1 0 0 0 1 +run 16 1000$MB 1 0 0 1 0 0 0 1 4 +run 16 100$MB 1 1 0 1 0 0 0 1 4 for n in 1 2 4 8 12 16 ; do echo par - run $n 10$MB 1 0 1 1 0 0 0 1 - run $n 10$MB 1 1 1 1 0 0 0 1 - run $n 10$MB 1 0 1 1 1 0 0 1 - run $n 10$MB 1 1 1 1 1 0 0 1 - run $n 10$MB 1 0 1 1 0 1 0 1 - run $n 10$MB 1 1 1 1 0 1 0 1 - run $n 10$MB 1 0 1 1 1 1 0 1 - run $n 10$MB 1 1 1 1 1 1 0 1 + run $n 10$MB 1 0 1 1 0 0 0 1 4 + run $n 10$MB 1 1 1 1 0 0 0 1 4 + run $n 10$MB 1 0 1 1 1 0 0 1 4 + run $n 10$MB 1 1 1 1 1 0 0 1 4 + run $n 10$MB 1 0 1 1 0 1 0 1 4 + run $n 10$MB 1 1 1 1 0 1 0 1 4 + run $n 10$MB 1 0 1 1 1 1 0 1 4 + run $n 10$MB 1 1 1 1 1 1 0 1 4 echo cross - run $n 10$MB 1 0 1 1 0 0 1 1 - run $n 10$MB 1 1 1 1 0 0 1 1 - run $n 10$MB 1 0 1 1 0 1 1 1 - run $n 10$MB 1 1 1 1 0 1 1 1 + run $n 10$MB 1 0 1 1 0 0 1 1 4 + run $n 10$MB 1 1 1 1 0 0 1 1 4 + run $n 10$MB 1 0 1 1 0 1 1 1 4 + run $n 10$MB 1 1 1 1 0 1 1 1 4 done Modified: numa-bench/trunk/src/chew.cc =================================================================== --- numa-bench/trunk/src/chew.cc 2008-03-03 23:57:43 UTC (rev 593) +++ numa-bench/trunk/src/chew.cc 2008-03-04 02:29:07 UTC (rev 594) @@ -195,7 +195,7 @@ // So that our global shared malloc takes place on the CPU 0's node. pin_thread(0); - if (argc < 11) { + if (argc < 12) { cerr << argv[0] << " <ncpus> <size> <nreps> <shuffle> <par> <pin>" " <local> <write> <rrnodes> <nnodes>" << endl; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-03-04 06:31:49
|
Revision: 600 http://assorted.svn.sourceforge.net/assorted/?rev=600&view=rev Author: yangzhang Date: 2008-03-03 22:31:56 -0800 (Mon, 03 Mar 2008) Log Message: ----------- updated experiments Modified Paths: -------------- numa-bench/trunk/src/chew.bash numa-bench/trunk/src/chew.cc Modified: numa-bench/trunk/src/chew.bash =================================================================== --- numa-bench/trunk/src/chew.bash 2008-03-04 04:12:34 UTC (rev 599) +++ numa-bench/trunk/src/chew.bash 2008-03-04 06:31:56 UTC (rev 600) @@ -12,32 +12,32 @@ KB=000 MB=000000 GB=000000000 -# ncpus size nreps shuffle par pin local write cross rrnodes nnodes +# ncpus size nreps shuffle par pin local write cross rrnodes nnodes ncpus echo writes -run 16 100$MB 1 0 0 1 0 1 0 1 4 -run 16 1000$MB 1 0 0 1 0 1 0 1 4 -run 16 100$MB 10 0 0 1 0 1 0 1 4 -run 16 100$MB 1 1 0 1 0 1 0 1 4 +run 16 100$MB 1 0 0 1 0 1 0 1 4 16 +run 16 1000$MB 1 0 0 1 0 1 0 1 4 16 +run 16 100$MB 10 0 0 1 0 1 0 1 4 16 +run 16 100$MB 1 1 0 1 0 1 0 1 4 16 echo reads -run 16 1000$MB 1 0 0 1 0 0 0 1 4 -run 16 100$MB 1 1 0 1 0 0 0 1 4 +run 16 1000$MB 1 0 0 1 0 0 0 1 4 16 +run 16 100$MB 1 1 0 1 0 0 0 1 4 16 for n in 1 2 4 8 12 16 ; do echo par - run $n 10$MB 1 0 1 1 0 0 0 1 4 - run $n 10$MB 1 1 1 1 0 0 0 1 4 - run $n 10$MB 1 0 1 1 1 0 0 1 4 - run $n 10$MB 1 1 1 1 1 0 0 1 4 - run $n 10$MB 1 0 1 1 0 1 0 1 4 - run $n 10$MB 1 1 1 1 0 1 0 1 4 - run $n 10$MB 1 0 1 1 1 1 0 1 4 - run $n 10$MB 1 1 1 1 1 1 0 1 4 + run $n 10$MB 1 0 1 1 0 0 0 1 4 16 + run $n 10$MB 1 1 1 1 0 0 0 1 4 16 + run $n 10$MB 1 0 1 1 1 0 0 1 4 16 + run $n 10$MB 1 1 1 1 1 0 0 1 4 16 + run $n 10$MB 1 0 1 1 0 1 0 1 4 16 + run $n 10$MB 1 1 1 1 0 1 0 1 4 16 + run $n 10$MB 1 0 1 1 1 1 0 1 4 16 + run $n 10$MB 1 1 1 1 1 1 0 1 4 16 echo cross - run $n 10$MB 1 0 1 1 0 0 1 1 4 - run $n 10$MB 1 1 1 1 0 0 1 1 4 - run $n 10$MB 1 0 1 1 0 1 1 1 4 - run $n 10$MB 1 1 1 1 0 1 1 1 4 + run $n 10$MB 1 0 1 1 0 0 1 1 4 16 + run $n 10$MB 1 1 1 1 0 0 1 1 4 16 + run $n 10$MB 1 0 1 1 0 1 1 1 4 16 + run $n 10$MB 1 1 1 1 0 1 1 1 4 16 done Modified: numa-bench/trunk/src/chew.cc =================================================================== --- numa-bench/trunk/src/chew.cc 2008-03-04 04:12:34 UTC (rev 599) +++ numa-bench/trunk/src/chew.cc 2008-03-04 06:31:56 UTC (rev 600) @@ -16,16 +16,14 @@ using namespace std; pthread_barrier_t cross_barrier; +pthread_mutex_t iomutex; struct config { /** - * The number of CPUs to test. This is a parameter (rather than - * auto-detected) because it additionally serves to mean the number of CPUs - * we want to test in parallel. As this program evolves, these may be - * separated. + * The number of workers to run (usually this means the CPUs to test). */ - const unsigned int ncpus; + const unsigned int nworkers; /** * Size in bytes of the buffer to chew. @@ -77,8 +75,15 @@ /** * The number of nodes on this machine. + * TODO: auto-detect */ const unsigned int nnodes; + + /** + * The number of CPUs on this machine. + * TODO: auto-detect + */ + const unsigned int ncpus; }; void*** partitions; @@ -95,15 +100,18 @@ int* p = (int*) pp; const size_t count = len / sizeof(int); int sum = 0; + + // TODO: see these with random numbers generated from a global (serial) rand + // NOTE: Using rand as the index assumes that rand generates large-enough + // values. posix_rand rand(current_time_millis() ^ gettid()); + if (config.write) { // Write to the region. if (config.shuffle) { // Random access into the memory region. for (unsigned int c = 0; c < config.nreps; c++) { for (size_t i = 0; i < count; i++) { - // NOTE: Using r as the index assumes that rand generates large-enough - // values. int r = rand(); sum += p[r % count] += r; } @@ -122,8 +130,6 @@ // Random access into the memory region. for (unsigned int c = 0; c < config.nreps; c++) { for (size_t i = 0; i < count; i++) { - // NOTE: Using r as the index assumes that rand generates large-enough - // values. sum += p[rand() % count]; } } @@ -150,10 +156,11 @@ { // Pin this thread to a CPU. if (config.pin) { + // Round-robin is only applicable if we're doing a parallel test. unsigned int ncpus = config.ncpus, nnodes = config.nnodes, cpuspernode = ncpus / nnodes, - cpu = !config.rrnodes ? + cpu = !config.par || !config.rrnodes ? worker : (cpuspernode * worker) % ncpus + worker / cpuspernode; pin_thread(cpu); @@ -163,8 +170,8 @@ timer t(": "); if (!warmup && config.cross) { - size_t len = config.size / config.ncpus; - for (unsigned int i = 0; i < config.ncpus; i++) { + size_t len = config.size / config.nworkers; + for (unsigned int i = 0; i < config.nworkers; i++) { partitions[worker][i] = new char[len]; } int barrier_result = pthread_barrier_wait(&cross_barrier); @@ -172,7 +179,7 @@ barrier_result == 0); // TODO: make this more interesting than just a sequential traversal over // the partitions. - for (unsigned int i = 0; i < config.ncpus; i++) { + for (unsigned int i = 0; i < config.nworkers; i++) { chew1(partitions[i][worker], config, len); } } else { @@ -180,9 +187,11 @@ } // Print the elapsed time and "result". + check(pthread_mutex_lock(&iomutex) == 0); if (warmup) cout << "warmup: "; cout << worker; t.print(); + check(pthread_mutex_unlock(&iomutex) == 0); if (config.local) free(p); @@ -195,9 +204,9 @@ // So that our global shared malloc takes place on the CPU 0's node. pin_thread(0); - if (argc < 12) { + if (argc < 13) { cerr << argv[0] << - " <ncpus> <size> <nreps> <shuffle> <par> <pin>" + " <nworkers> <size> <nreps> <shuffle> <par> <pin>" " <local> <write> <rrnodes> <nnodes>" << endl; return 1; } @@ -214,30 +223,34 @@ atoi(argv[ 8]), atoi(argv[ 9]), atoi(argv[10]), - atoi(argv[11]) + atoi(argv[11]), + atoi(argv[12]) }; cout << "config:" - << " ncpus " << config.ncpus - << " size " << config.size - << " nreps " << config.nreps - << " shuffle " << config.shuffle - << " par " << config.par - << " pin " << config.pin - << " local " << config.local - << " write " << config.write - << " cross " << config.cross - << " rrnodes " << config.rrnodes - << " nnodes " << config.nnodes + << " nworkers " << config.nworkers + << " size " << config.size + << " nreps " << config.nreps + << " shuffle " << config.shuffle + << " par " << config.par + << " pin " << config.pin + << " local " << config.local + << " write " << config.write + << " cross " << config.cross + << " rrnodes " << config.rrnodes + << " nnodes " << config.nnodes + << " ncpus " << config.ncpus << endl; void *p = malloc(config.size); check(p != NULL); + check(pthread_mutex_init(&iomutex, NULL) == 0); + if (config.cross) { - partitions = new void**[config.ncpus]; - for (unsigned int i = 0; i < config.ncpus; i++) - partitions[i] = new void*[config.ncpus]; + partitions = new void**[config.nworkers]; + for (unsigned int i = 0; i < config.nworkers; i++) + partitions[i] = new void*[config.nworkers]; } // Warmup. @@ -245,18 +258,18 @@ if (config.par) { // Chew the memory area from each core in parallel (and also chew own). - pthread_t ts[config.ncpus]; - check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncpus)); - for (unsigned int i = 0; i < config.ncpus; i++) { + pthread_t ts[config.nworkers]; + check(0 == pthread_barrier_init(&cross_barrier, NULL, config.nworkers)); + for (unsigned int i = 0; i < config.nworkers; i++) { ts[i] = spawn(bind(chew, p, i, ref(config), false)); } - for (unsigned int i = 0; i < config.ncpus; i++) { + for (unsigned int i = 0; i < config.nworkers; i++) { check(pthread_join(ts[i], NULL) == 0); } check(0 == pthread_barrier_destroy(&cross_barrier)); } else { // Chew the memory area from each core in sequence. - for (unsigned int i = 0; i < config.ncpus; i++) { + for (unsigned int i = 0; i < config.nworkers; i++) { chew(p, i, config, false); } } @@ -265,5 +278,7 @@ ofstream trash("/dev/null"); trash << "result: " << global_sum << endl; + check(pthread_mutex_destroy(&iomutex) == 0); + return 0; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-03-05 05:49:10
|
Revision: 601 http://assorted.svn.sourceforge.net/assorted/?rev=601&view=rev Author: yangzhang Date: 2008-03-04 21:49:10 -0800 (Tue, 04 Mar 2008) Log Message: ----------- updated chew Modified Paths: -------------- numa-bench/trunk/src/build numa-bench/trunk/src/chew.bash numa-bench/trunk/src/chew.cc Modified: numa-bench/trunk/src/build =================================================================== --- numa-bench/trunk/src/build 2008-03-04 06:31:56 UTC (rev 600) +++ numa-bench/trunk/src/build 2008-03-05 05:49:10 UTC (rev 601) @@ -1,8 +1,7 @@ chew: srcs: [chew.cc] - libs: [pthread] + libs: [pthread, numa] avail: srcs: [avail.cc] libs: [pthread] - Modified: numa-bench/trunk/src/chew.bash =================================================================== --- numa-bench/trunk/src/chew.bash 2008-03-04 06:31:56 UTC (rev 600) +++ numa-bench/trunk/src/chew.bash 2008-03-05 05:49:10 UTC (rev 601) @@ -2,42 +2,67 @@ set -o errexit -o nounset -make -s chew-opt +reps=3 -function run { - for i in {1..3} - do out/chew-opt "$@" +make -s chew-dbg + +run() { + for i in {1..$reps} + do out/chew-dbg "$@" done } +K=000 M=000000 G=000000000 KB=000 MB=000000 GB=000000000 -# ncpus size nreps shuffle par pin local write cross rrnodes nnodes ncpus +# ncpus size opcount nreps shuffle par pin local write cross rrnodes nnodes ncpus -echo writes -run 16 100$MB 1 0 0 1 0 1 0 1 4 16 -run 16 1000$MB 1 0 0 1 0 1 0 1 4 16 -run 16 100$MB 10 0 0 1 0 1 0 1 4 16 -run 16 100$MB 1 1 0 1 0 1 0 1 4 16 +if true ; then -echo reads -run 16 1000$MB 1 0 0 1 0 0 0 1 4 16 -run 16 100$MB 1 1 0 1 0 0 0 1 4 16 +#echo writes +#run 16 100$MB 100$M 1 0 0 1 0 1 0 1 4 16 +#run 16 1000$MB 1000$M 1 0 0 1 0 1 0 1 4 16 +#run 16 100$MB 100$M 10 0 0 1 0 1 0 1 4 16 +#run 16 1000$GB 100$M 1 1 0 1 0 1 0 1 4 16 +# +#echo reads +#run 16 1000$MB 100$M 1 0 0 1 0 0 0 1 4 16 +#run 16 1000$MB 10$M 1 1 0 1 0 0 0 1 4 16 for n in 1 2 4 8 12 16 ; do echo par - run $n 10$MB 1 0 1 1 0 0 0 1 4 16 - run $n 10$MB 1 1 1 1 0 0 0 1 4 16 - run $n 10$MB 1 0 1 1 1 0 0 1 4 16 - run $n 10$MB 1 1 1 1 1 0 0 1 4 16 - run $n 10$MB 1 0 1 1 0 1 0 1 4 16 - run $n 10$MB 1 1 1 1 0 1 0 1 4 16 - run $n 10$MB 1 0 1 1 1 1 0 1 4 16 - run $n 10$MB 1 1 1 1 1 1 0 1 4 16 + run $n 1000$MB 10$M 1 0 1 1 0 0 0 1 4 16 + run $n 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 + run $n 1000$MB 10$M 1 0 1 1 1 0 0 1 4 16 + run $n 1000$MB 10$M 1 1 1 1 1 0 0 1 4 16 + run $n 1000$MB 10$M 1 0 1 1 0 1 0 1 4 16 + run $n 1000$MB 10$M 1 1 1 1 0 1 0 1 4 16 + run $n 1000$MB 10$M 1 0 1 1 1 1 0 1 4 16 + run $n 1000$MB 10$M 1 1 1 1 1 1 0 1 4 16 - echo cross - run $n 10$MB 1 0 1 1 0 0 1 1 4 16 - run $n 10$MB 1 1 1 1 0 0 1 1 4 16 - run $n 10$MB 1 0 1 1 0 1 1 1 4 16 - run $n 10$MB 1 1 1 1 0 1 1 1 4 16 +# echo cross +# run $n 1000$MB 10$M 1 0 1 1 0 0 1 1 4 16 +# run $n 1000$MB 10$M 1 1 1 1 0 0 1 1 4 16 +# run $n 1000$MB 10$M 1 0 1 1 0 1 1 1 4 16 +# run $n 1000$MB 10$M 1 1 1 1 0 1 1 1 4 16 done + +else + +#run 1 1000$MB 10$M 1 1 1 1 0 1 0 1 4 16 +#run 2 1000$MB 10$M 1 1 1 1 0 1 0 1 4 16 +#run 3 1000$MB 10$M 1 1 1 1 0 1 0 1 4 16 +#run 4 1000$MB 10$M 1 1 1 1 0 1 0 1 4 16 +#run 8 1000$MB 10$M 1 1 1 1 0 1 0 1 4 16 +#run 12 1000$MB 10$M 1 1 1 1 0 1 0 1 4 16 +#run 16 1000$MB 10$M 1 1 1 1 0 1 0 1 4 16 + +run 1 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 +#run 2 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 +#run 3 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 +#run 4 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 +#run 8 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 +#run 12 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 +run 16 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 + +fi Modified: numa-bench/trunk/src/chew.cc =================================================================== --- numa-bench/trunk/src/chew.cc 2008-03-04 06:31:56 UTC (rev 600) +++ numa-bench/trunk/src/chew.cc 2008-03-05 05:49:10 UTC (rev 601) @@ -3,6 +3,8 @@ #include <sched.h> +#include <numa.h> + #include <boost/bind.hpp> #include <commons/check.h> @@ -15,8 +17,12 @@ using namespace commons; using namespace std; -pthread_barrier_t cross_barrier; +// TODO Make into command line flags? +const bool debug = false, pretouch = false, do_warmup = false, use_numa = false; +pthread_barrier_t cross_barrier, startup_barrier; pthread_mutex_t iomutex; +void*** partitions; +int global_sum; struct config { @@ -31,6 +37,11 @@ const size_t size; /** + * The number of operations. + */ + const unsigned int opcount; + + /** * Number of repetitions to chew. */ const unsigned int nreps; @@ -86,16 +97,25 @@ const unsigned int ncpus; }; -void*** partitions; -int global_sum; +void * +alloc(size_t sz) +{ + return use_numa ? numa_alloc_local(sz) : malloc(sz); +} +void +dealloc(void *p, size_t sz) +{ + return use_numa ? numa_free(p, sz) : free(p); +} + /** * \param p The buffer to chew. * \param config The experiment configuration. * \param len Length of the buffer. */ void -chew1(void* pp, config config, size_t len) +chew1(void* pp, config config, size_t len, unsigned int seed) { int* p = (int*) pp; const size_t count = len / sizeof(int); @@ -104,14 +124,14 @@ // TODO: see these with random numbers generated from a global (serial) rand // NOTE: Using rand as the index assumes that rand generates large-enough // values. - posix_rand rand(current_time_millis() ^ gettid()); + posix_rand rand(current_time_millis() ^ gettid() ^ seed); if (config.write) { // Write to the region. if (config.shuffle) { // Random access into the memory region. for (unsigned int c = 0; c < config.nreps; c++) { - for (size_t i = 0; i < count; i++) { + for (size_t i = 0; i < config.opcount; i++) { int r = rand(); sum += p[r % count] += r; } @@ -119,7 +139,7 @@ } else { // Sequential scan through the memory region. for (unsigned int c = 0; c < config.nreps; c++) { - for (size_t i = 0; i < count; i++) { + for (size_t i = 0; i < config.opcount; i++) { sum += p[i] += rand(); } } @@ -129,14 +149,14 @@ if (config.shuffle) { // Random access into the memory region. for (unsigned int c = 0; c < config.nreps; c++) { - for (size_t i = 0; i < count; i++) { + for (size_t i = 0; i < config.opcount; i++) { sum += p[rand() % count]; } } } else { // Sequential scan through the memory region. for (unsigned int c = 0; c < config.nreps; c++) { - for (size_t i = 0; i < count; i++) { + for (size_t i = 0; i < config.opcount; i++) { sum += p[i] + rand(); } } @@ -166,7 +186,29 @@ pin_thread(cpu); } - void* p = config.local ? malloc(config.size) : pp; + void* p = config.local ? alloc(config.size) : pp; + if (pretouch) { + int *is = (int*) p; + for (size_t i = 0; i < config.size / sizeof(int); i++) { + is[i] = i; + } + int sum = 0; + for (size_t i = 0; i < config.size / sizeof(int); i++) { + sum += is[i]; + } + global_sum += sum; + } + if (debug) { + check(pthread_mutex_lock(&iomutex) == 0); + cout << worker << " alloc " << p << endl; + check(pthread_mutex_unlock(&iomutex) == 0); + } + if (!warmup) { + int barrier_result = pthread_barrier_wait(&startup_barrier); + check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || + barrier_result == 0); + } + posix_rand rand(current_time_millis()); timer t(": "); if (!warmup && config.cross) { @@ -180,10 +222,10 @@ // TODO: make this more interesting than just a sequential traversal over // the partitions. for (unsigned int i = 0; i < config.nworkers; i++) { - chew1(partitions[i][worker], config, len); + chew1(partitions[i][worker], config, len, rand()); } } else { - chew1(p, config, config.size); + chew1(p, config, config.size, rand()); } // Print the elapsed time and "result". @@ -193,7 +235,7 @@ t.print(); check(pthread_mutex_unlock(&iomutex) == 0); - if (config.local) free(p); + if (config.local) dealloc(p, config.size); return NULL; } @@ -206,8 +248,8 @@ if (argc < 13) { cerr << argv[0] << - " <nworkers> <size> <nreps> <shuffle> <par> <pin>" - " <local> <write> <rrnodes> <nnodes>" << endl; + " <nworkers> <size> <opcount> <nreps> <shuffle> <par> <pin>" + " <local> <write> <rrnodes> <nnodes> <ncpus>" << endl; return 1; } @@ -224,12 +266,14 @@ atoi(argv[ 9]), atoi(argv[10]), atoi(argv[11]), - atoi(argv[12]) + atoi(argv[12]), + atoi(argv[13]) }; cout << "config:" << " nworkers " << config.nworkers << " size " << config.size + << " opcount " << config.opcount << " nreps " << config.nreps << " shuffle " << config.shuffle << " par " << config.par @@ -242,6 +286,8 @@ << " ncpus " << config.ncpus << endl; + check(config.shuffle || config.opcount <= config.size / sizeof(int)); + void *p = malloc(config.size); check(p != NULL); @@ -254,12 +300,13 @@ } // Warmup. - chew(p, 0, config, true); + if (do_warmup) chew(p, 0, config, true); if (config.par) { // Chew the memory area from each core in parallel (and also chew own). pthread_t ts[config.nworkers]; check(0 == pthread_barrier_init(&cross_barrier, NULL, config.nworkers)); + check(0 == pthread_barrier_init(&startup_barrier, NULL, config.nworkers)); for (unsigned int i = 0; i < config.nworkers; i++) { ts[i] = spawn(bind(chew, p, i, ref(config), false)); } @@ -267,6 +314,7 @@ check(pthread_join(ts[i], NULL) == 0); } check(0 == pthread_barrier_destroy(&cross_barrier)); + check(0 == pthread_barrier_destroy(&startup_barrier)); } else { // Chew the memory area from each core in sequence. for (unsigned int i = 0; i < config.nworkers; i++) { @@ -278,6 +326,8 @@ ofstream trash("/dev/null"); trash << "result: " << global_sum << endl; + cout << "result: " << global_sum << endl; + check(pthread_mutex_destroy(&iomutex) == 0); return 0; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <yan...@us...> - 2008-03-05 19:30:12
|
Revision: 608 http://assorted.svn.sourceforge.net/assorted/?rev=608&view=rev Author: yangzhang Date: 2008-03-05 11:30:17 -0800 (Wed, 05 Mar 2008) Log Message: ----------- latest experiments Modified Paths: -------------- numa-bench/trunk/src/chew.bash numa-bench/trunk/src/chew.cc Modified: numa-bench/trunk/src/chew.bash =================================================================== --- numa-bench/trunk/src/chew.bash 2008-03-05 07:09:45 UTC (rev 607) +++ numa-bench/trunk/src/chew.bash 2008-03-05 19:30:17 UTC (rev 608) @@ -2,12 +2,12 @@ set -o errexit -o nounset -reps=3 +reps=1 make -s chew-dbg run() { - for i in {1..$reps} + for ((i = 0; i < reps; i++)) do out/chew-dbg "$@" done } @@ -61,8 +61,8 @@ #run 2 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 #run 3 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 #run 4 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 -#run 8 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 -#run 12 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 +run 8 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 +run 12 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 run 16 1000$MB 10$M 1 1 1 1 0 0 0 1 4 16 fi Modified: numa-bench/trunk/src/chew.cc =================================================================== --- numa-bench/trunk/src/chew.cc 2008-03-05 07:09:45 UTC (rev 607) +++ numa-bench/trunk/src/chew.cc 2008-03-05 19:30:17 UTC (rev 608) @@ -18,7 +18,7 @@ using namespace std; // TODO Make into command line flags? -const bool debug = false, pretouch = false, do_warmup = false, use_numa = false; +const bool debug = false, pretouch = true, do_warmup = false, use_numa = false; pthread_barrier_t cross_barrier, startup_barrier; pthread_mutex_t iomutex; void*** partitions; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |