[Assorted-commits] SF.net SVN: assorted: [600] numa-bench/trunk/src
Brought to you by: yangzhang
From: <yan...@us...> - 2008-03-04 06:31:49
Revision: 600 http://assorted.svn.sourceforge.net/assorted/?rev=600&view=rev Author: yangzhang Date: 2008-03-03 22:31:56 -0800 (Mon, 03 Mar 2008) Log Message: ----------- updated experiments Modified Paths: -------------- numa-bench/trunk/src/chew.bash numa-bench/trunk/src/chew.cc Modified: numa-bench/trunk/src/chew.bash =================================================================== --- numa-bench/trunk/src/chew.bash 2008-03-04 04:12:34 UTC (rev 599) +++ numa-bench/trunk/src/chew.bash 2008-03-04 06:31:56 UTC (rev 600) @@ -12,32 +12,32 @@ KB=000 MB=000000 GB=000000000 -# ncpus size nreps shuffle par pin local write cross rrnodes nnodes +# ncpus size nreps shuffle par pin local write cross rrnodes nnodes ncpus echo writes -run 16 100$MB 1 0 0 1 0 1 0 1 4 -run 16 1000$MB 1 0 0 1 0 1 0 1 4 -run 16 100$MB 10 0 0 1 0 1 0 1 4 -run 16 100$MB 1 1 0 1 0 1 0 1 4 +run 16 100$MB 1 0 0 1 0 1 0 1 4 16 +run 16 1000$MB 1 0 0 1 0 1 0 1 4 16 +run 16 100$MB 10 0 0 1 0 1 0 1 4 16 +run 16 100$MB 1 1 0 1 0 1 0 1 4 16 echo reads -run 16 1000$MB 1 0 0 1 0 0 0 1 4 -run 16 100$MB 1 1 0 1 0 0 0 1 4 +run 16 1000$MB 1 0 0 1 0 0 0 1 4 16 +run 16 100$MB 1 1 0 1 0 0 0 1 4 16 for n in 1 2 4 8 12 16 ; do echo par - run $n 10$MB 1 0 1 1 0 0 0 1 4 - run $n 10$MB 1 1 1 1 0 0 0 1 4 - run $n 10$MB 1 0 1 1 1 0 0 1 4 - run $n 10$MB 1 1 1 1 1 0 0 1 4 - run $n 10$MB 1 0 1 1 0 1 0 1 4 - run $n 10$MB 1 1 1 1 0 1 0 1 4 - run $n 10$MB 1 0 1 1 1 1 0 1 4 - run $n 10$MB 1 1 1 1 1 1 0 1 4 + run $n 10$MB 1 0 1 1 0 0 0 1 4 16 + run $n 10$MB 1 1 1 1 0 0 0 1 4 16 + run $n 10$MB 1 0 1 1 1 0 0 1 4 16 + run $n 10$MB 1 1 1 1 1 0 0 1 4 16 + run $n 10$MB 1 0 1 1 0 1 0 1 4 16 + run $n 10$MB 1 1 1 1 0 1 0 1 4 16 + run $n 10$MB 1 0 1 1 1 1 0 1 4 16 + run $n 10$MB 1 1 1 1 1 1 0 1 4 16 echo cross - run $n 10$MB 1 0 1 1 0 0 1 1 4 - run $n 10$MB 1 1 1 1 0 0 1 1 4 - run $n 10$MB 1 0 1 1 0 1 1 1 4 - run $n 10$MB 1 1 1 1 0 1 1 1 4 + run $n 10$MB 1 0 1 1 0 0 1 1 4 16 + run $n 10$MB 1 1 1 1 0 0 1 1 4 16 + run $n 10$MB 1 0 1 1 0 1 1 
1 4 16 + run $n 10$MB 1 1 1 1 0 1 1 1 4 16 done Modified: numa-bench/trunk/src/chew.cc =================================================================== --- numa-bench/trunk/src/chew.cc 2008-03-04 04:12:34 UTC (rev 599) +++ numa-bench/trunk/src/chew.cc 2008-03-04 06:31:56 UTC (rev 600) @@ -16,16 +16,14 @@ using namespace std; pthread_barrier_t cross_barrier; +pthread_mutex_t iomutex; struct config { /** - * The number of CPUs to test. This is a parameter (rather than - * auto-detected) because it additionally serves to mean the number of CPUs - * we want to test in parallel. As this program evolves, these may be - * separated. + * The number of workers to run (usually this means the CPUs to test). */ - const unsigned int ncpus; + const unsigned int nworkers; /** * Size in bytes of the buffer to chew. @@ -77,8 +75,15 @@ /** * The number of nodes on this machine. + * TODO: auto-detect */ const unsigned int nnodes; + + /** + * The number of CPUs on this machine. + * TODO: auto-detect + */ + const unsigned int ncpus; }; void*** partitions; @@ -95,15 +100,18 @@ int* p = (int*) pp; const size_t count = len / sizeof(int); int sum = 0; + + // TODO: see these with random numbers generated from a global (serial) rand + // NOTE: Using rand as the index assumes that rand generates large-enough + // values. posix_rand rand(current_time_millis() ^ gettid()); + if (config.write) { // Write to the region. if (config.shuffle) { // Random access into the memory region. for (unsigned int c = 0; c < config.nreps; c++) { for (size_t i = 0; i < count; i++) { - // NOTE: Using r as the index assumes that rand generates large-enough - // values. int r = rand(); sum += p[r % count] += r; } @@ -122,8 +130,6 @@ // Random access into the memory region. for (unsigned int c = 0; c < config.nreps; c++) { for (size_t i = 0; i < count; i++) { - // NOTE: Using r as the index assumes that rand generates large-enough - // values. 
sum += p[rand() % count]; } } @@ -150,10 +156,11 @@ { // Pin this thread to a CPU. if (config.pin) { + // Round-robin is only applicable if we're doing a parallel test. unsigned int ncpus = config.ncpus, nnodes = config.nnodes, cpuspernode = ncpus / nnodes, - cpu = !config.rrnodes ? + cpu = !config.par || !config.rrnodes ? worker : (cpuspernode * worker) % ncpus + worker / cpuspernode; pin_thread(cpu); @@ -163,8 +170,8 @@ timer t(": "); if (!warmup && config.cross) { - size_t len = config.size / config.ncpus; - for (unsigned int i = 0; i < config.ncpus; i++) { + size_t len = config.size / config.nworkers; + for (unsigned int i = 0; i < config.nworkers; i++) { partitions[worker][i] = new char[len]; } int barrier_result = pthread_barrier_wait(&cross_barrier); @@ -172,7 +179,7 @@ barrier_result == 0); // TODO: make this more interesting than just a sequential traversal over // the partitions. - for (unsigned int i = 0; i < config.ncpus; i++) { + for (unsigned int i = 0; i < config.nworkers; i++) { chew1(partitions[i][worker], config, len); } } else { @@ -180,9 +187,11 @@ } // Print the elapsed time and "result". + check(pthread_mutex_lock(&iomutex) == 0); if (warmup) cout << "warmup: "; cout << worker; t.print(); + check(pthread_mutex_unlock(&iomutex) == 0); if (config.local) free(p); @@ -195,9 +204,9 @@ // So that our global shared malloc takes place on the CPU 0's node. 
pin_thread(0); - if (argc < 12) { + if (argc < 13) { cerr << argv[0] << - " <ncpus> <size> <nreps> <shuffle> <par> <pin>" + " <nworkers> <size> <nreps> <shuffle> <par> <pin>" " <local> <write> <rrnodes> <nnodes>" << endl; return 1; } @@ -214,30 +223,34 @@ atoi(argv[ 8]), atoi(argv[ 9]), atoi(argv[10]), - atoi(argv[11]) + atoi(argv[11]), + atoi(argv[12]) }; cout << "config:" - << " ncpus " << config.ncpus - << " size " << config.size - << " nreps " << config.nreps - << " shuffle " << config.shuffle - << " par " << config.par - << " pin " << config.pin - << " local " << config.local - << " write " << config.write - << " cross " << config.cross - << " rrnodes " << config.rrnodes - << " nnodes " << config.nnodes + << " nworkers " << config.nworkers + << " size " << config.size + << " nreps " << config.nreps + << " shuffle " << config.shuffle + << " par " << config.par + << " pin " << config.pin + << " local " << config.local + << " write " << config.write + << " cross " << config.cross + << " rrnodes " << config.rrnodes + << " nnodes " << config.nnodes + << " ncpus " << config.ncpus << endl; void *p = malloc(config.size); check(p != NULL); + check(pthread_mutex_init(&iomutex, NULL) == 0); + if (config.cross) { - partitions = new void**[config.ncpus]; - for (unsigned int i = 0; i < config.ncpus; i++) - partitions[i] = new void*[config.ncpus]; + partitions = new void**[config.nworkers]; + for (unsigned int i = 0; i < config.nworkers; i++) + partitions[i] = new void*[config.nworkers]; } // Warmup. @@ -245,18 +258,18 @@ if (config.par) { // Chew the memory area from each core in parallel (and also chew own). 
- pthread_t ts[config.ncpus]; - check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncpus)); - for (unsigned int i = 0; i < config.ncpus; i++) { + pthread_t ts[config.nworkers]; + check(0 == pthread_barrier_init(&cross_barrier, NULL, config.nworkers)); + for (unsigned int i = 0; i < config.nworkers; i++) { ts[i] = spawn(bind(chew, p, i, ref(config), false)); } - for (unsigned int i = 0; i < config.ncpus; i++) { + for (unsigned int i = 0; i < config.nworkers; i++) { check(pthread_join(ts[i], NULL) == 0); } check(0 == pthread_barrier_destroy(&cross_barrier)); } else { // Chew the memory area from each core in sequence. - for (unsigned int i = 0; i < config.ncpus; i++) { + for (unsigned int i = 0; i < config.nworkers; i++) { chew(p, i, config, false); } } @@ -265,5 +278,7 @@ ofstream trash("/dev/null"); trash << "result: " << global_sum << endl; + check(pthread_mutex_destroy(&iomutex) == 0); + return 0; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |