[Assorted-commits] SF.net SVN: assorted: [514] numa-bench/trunk/src
Brought to you by:
yangzhang
From: <yan...@us...> - 2008-02-26 20:30:58
|
Revision: 514 http://assorted.svn.sourceforge.net/assorted/?rev=514&view=rev Author: yangzhang Date: 2008-02-26 12:31:00 -0800 (Tue, 26 Feb 2008) Log Message: ----------- added round-robin node allocation; renamed cores to cpus Modified Paths: -------------- numa-bench/trunk/src/chew.bash numa-bench/trunk/src/chew.cc Modified: numa-bench/trunk/src/chew.bash =================================================================== --- numa-bench/trunk/src/chew.bash 2008-02-26 19:48:37 UTC (rev 513) +++ numa-bench/trunk/src/chew.bash 2008-02-26 20:31:00 UTC (rev 514) @@ -12,32 +12,32 @@ KB=000 MB=000000 GB=000000000 -# ncores size nreps shuffle par pin local write cross +# ncpus size nreps shuffle par pin local write cross rrnodes echo writes -run 16 100$MB 1 0 0 1 0 1 0 -run 16 1000$MB 1 0 0 1 0 1 0 -run 16 100$MB 10 0 0 1 0 1 0 -run 16 100$MB 1 1 0 1 0 1 0 +run 16 100$MB 1 0 0 1 0 1 0 1 +run 16 1000$MB 1 0 0 1 0 1 0 1 +run 16 100$MB 10 0 0 1 0 1 0 1 +run 16 100$MB 1 1 0 1 0 1 0 1 echo reads -run 16 1000$MB 1 0 0 1 0 0 0 -run 16 100$MB 1 1 0 1 0 0 0 +run 16 1000$MB 1 0 0 1 0 0 0 1 +run 16 100$MB 1 1 0 1 0 0 0 1 for n in 1 2 4 8 12 16 ; do echo par - run $n 10$MB 1 0 1 1 0 0 0 - run $n 10$MB 1 1 1 1 0 0 0 - run $n 10$MB 1 0 1 1 1 0 0 - run $n 10$MB 1 1 1 1 1 0 0 - run $n 10$MB 1 0 1 1 0 1 0 - run $n 10$MB 1 1 1 1 0 1 0 - run $n 10$MB 1 0 1 1 1 1 0 - run $n 10$MB 1 1 1 1 1 1 0 + run $n 10$MB 1 0 1 1 0 0 0 1 + run $n 10$MB 1 1 1 1 0 0 0 1 + run $n 10$MB 1 0 1 1 1 0 0 1 + run $n 10$MB 1 1 1 1 1 0 0 1 + run $n 10$MB 1 0 1 1 0 1 0 1 + run $n 10$MB 1 1 1 1 0 1 0 1 + run $n 10$MB 1 0 1 1 1 1 0 1 + run $n 10$MB 1 1 1 1 1 1 0 1 echo cross - run $n 10$MB 1 0 1 1 0 0 1 - run $n 10$MB 1 1 1 1 0 0 1 - run $n 10$MB 1 0 1 1 0 1 1 - run $n 10$MB 1 1 1 1 0 1 1 + run $n 10$MB 1 0 1 1 0 0 1 1 + run $n 10$MB 1 1 1 1 0 0 1 1 + run $n 10$MB 1 0 1 1 0 1 1 1 + run $n 10$MB 1 1 1 1 0 1 1 1 done Modified: numa-bench/trunk/src/chew.cc =================================================================== --- numa-bench/trunk/src/chew.cc 2008-02-26 19:48:37 UTC (rev 513) +++ numa-bench/trunk/src/chew.cc 2008-02-26 20:31:00 UTC (rev 514) @@ -20,12 +20,12 @@ struct config { /** - * The number of cores to test. This is a parameter (rather than - * auto-detected) because it additionally serves to mean the number of cores + * The number of CPUs to test. This is a parameter (rather than + * auto-detected) because it additionally serves to mean the number of CPUs * we want to test in parallel. As this program evolves, these may be * separated. */ - const int ncores; + const unsigned int ncpus; /** * Size in bytes of the buffer to chew. @@ -35,7 +35,7 @@ /** * Number of repetitions to chew. */ - const int nreps; + const unsigned int nreps; /** * Perform rand access, otherwise sequential scan. @@ -43,12 +43,13 @@ const bool shuffle; /** - * Chew in parallel, otherwise each core chews serially. + * Chew in parallel, otherwise each worker chews serially. */ const bool par; /** - * Pin thread i to core i, otherwise let the OS manage things. + * Pin workers to CPUs (which CPU determined by whether we're using + * round-robin CPU allocation), otherwise let the OS manage things. */ const bool pin; @@ -67,6 +68,17 @@ * global/local buffer. */ const bool cross; + + /** + * When using fewer than the maximum number of CPUs, allocate nodes in + * round-robin fashion. + */ + const bool rrnodes; + + /** + * The number of nodes on this machine. + */ + const unsigned int nnodes; }; void*** partitions; @@ -129,15 +141,21 @@ /** * \param pp The start of the buffer to chew. - * \param cpu Which CPU to pin our thread to. + * \param worker From this we can determine which CPU to pin our thread to. * \param config The experiment configuration parameters. * \param label Prefix for the elapsed time output. */ void* -chew(void* pp, unsigned int cpu, const config & config, bool warmup) +chew(void* pp, unsigned int worker, const config & config, bool warmup) { - // Pin this thread to cpu `cpu`. + // Pin this thread to a CPU. if (config.pin) { + unsigned int ncpus = config.ncpus, + nnodes = config.nnodes, + cpuspernode = ncpus / nnodes, + cpu = !config.rrnodes ? + worker : + (cpuspernode * worker) % ncpus + worker / cpuspernode; pin_thread(cpu); } @@ -145,16 +163,17 @@ timer t(": "); if (!warmup && config.cross) { - size_t len = config.size / config.ncores; - for (int i = 0; i < config.ncores; i++) { - partitions[cpu][i] = new char[len]; + size_t len = config.size / config.ncpus; + for (unsigned int i = 0; i < config.ncpus; i++) { + partitions[worker][i] = new char[len]; } int barrier_result = pthread_barrier_wait(&cross_barrier); - check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || barrier_result == 0); + check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || + barrier_result == 0); // TODO: make this more interesting than just a sequential traversal over // the partitions. - for (int i = 0; i < config.ncores; i++) { - chew1(partitions[i][cpu], config, len); + for (unsigned int i = 0; i < config.ncpus; i++) { + chew1(partitions[i][worker], config, len); } } else { chew1(p, config, config.size); @@ -162,7 +181,7 @@ // Print the elapsed time and "result". if (warmup) cout << "warmup: "; - cout << cpu; + cout << worker; t.print(); if (config.local) free(p); @@ -176,27 +195,30 @@ // So that our global shared malloc takes place on the CPU 0's node. pin_thread(0); - if (argc < 10) { + if (argc < 11) { cerr << argv[0] << - " <ncores> <size> <nreps> <shuffle> <par> <pin> <local> <write>" << endl; + " <ncpus> <size> <nreps> <shuffle> <par> <pin>" + " <local> <write> <rrnodes> <nnodes>" << endl; return 1; } - // Parse command-line arguments. TODO - const config config = { - atoi(argv[1]), - atoi(argv[2]), - atoi(argv[3]), - atoi(argv[4]), - atoi(argv[5]), - atoi(argv[6]), - atoi(argv[7]), - atoi(argv[8]), - atoi(argv[9]) + // Parse command-line arguments. + const config config = { + atoi(argv[ 1]), + atoi(argv[ 2]), + atoi(argv[ 3]), + atoi(argv[ 4]), + atoi(argv[ 5]), + atoi(argv[ 6]), + atoi(argv[ 7]), + atoi(argv[ 8]), + atoi(argv[ 9]), + atoi(argv[10]), + atoi(argv[11]) }; cout << "config:" - << " ncores " << config.ncores + << " ncpus " << config.ncpus << " size " << config.size << " nreps " << config.nreps << " shuffle " << config.shuffle @@ -204,7 +226,10 @@ << " pin " << config.pin << " local " << config.local << " write " << config.write - << " cross " << config.cross << endl; + << " cross " << config.cross + << " rrnodes " << config.rrnodes + << " nnodes " << config.nnodes + << endl; checkmsg(RAND_MAX > config.size / sizeof(int), "PRNG range not large enough"); @@ -212,9 +237,9 @@ check(p != NULL); if (config.cross) { - partitions = new void**[config.ncores]; - for (unsigned int i = 0; i < config.ncores; i++) - partitions[i] = new void*[config.ncores]; + partitions = new void**[config.ncpus]; + for (unsigned int i = 0; i < config.ncpus; i++) + partitions[i] = new void*[config.ncpus]; } // Warmup. @@ -222,18 +247,18 @@ if (config.par) { // Chew the memory area from each core in parallel (and also chew own). - pthread_t ts[config.ncores]; - check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncores)); - for (int i = 0; i < config.ncores; i++) { + pthread_t ts[config.ncpus]; + check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncpus)); + for (unsigned int i = 0; i < config.ncpus; i++) { ts[i] = spawn(bind(chew, p, i, ref(config), false)); } - for (int i = 0; i < config.ncores; i++) { + for (unsigned int i = 0; i < config.ncpus; i++) { check(pthread_join(ts[i], NULL) == 0); } check(0 == pthread_barrier_destroy(&cross_barrier)); } else { // Chew the memory area from each core in sequence. - for (int i = 0; i < config.ncores; i++) { + for (unsigned int i = 0; i < config.ncpus; i++) { chew(p, i, config, false); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |