[Assorted-commits] SF.net SVN: assorted: [514] numa-bench/trunk/src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 514
          http://assorted.svn.sourceforge.net/assorted/?rev=514&view=rev
Author:   yangzhang
Date:     2008-02-26 12:31:00 -0800 (Tue, 26 Feb 2008)

Log Message:
-----------
added round-robin node allocation; renamed cores to cpus

Modified Paths:
--------------
    numa-bench/trunk/src/chew.bash
    numa-bench/trunk/src/chew.cc

Modified: numa-bench/trunk/src/chew.bash
===================================================================

--- numa-bench/trunk/src/chew.bash	2008-02-26 19:48:37 UTC (rev 513)
+++ numa-bench/trunk/src/chew.bash	2008-02-26 20:31:00 UTC (rev 514)
@@ -12,32 +12,32 @@
 
 KB=000 MB=000000 GB=000000000
 
-#   ncores    size nreps shuffle par pin local write cross
+#    ncpus    size nreps shuffle par pin local write cross rrnodes
 
 echo writes
-run     16  100$MB     1       0   0   1     0     1     0
-run     16 1000$MB     1       0   0   1     0     1     0
-run     16  100$MB    10       0   0   1     0     1     0
-run     16  100$MB     1       1   0   1     0     1     0
+run     16  100$MB     1       0   0   1     0     1     0       1
+run     16 1000$MB     1       0   0   1     0     1     0       1
+run     16  100$MB    10       0   0   1     0     1     0       1
+run     16  100$MB     1       1   0   1     0     1     0       1
 
 echo reads
-run     16 1000$MB     1       0   0   1     0     0     0
-run     16  100$MB     1       1   0   1     0     0     0
+run     16 1000$MB     1       0   0   1     0     0     0       1
+run     16  100$MB     1       1   0   1     0     0     0       1
 
 for n in 1 2 4 8 12 16 ; do
   echo par
-  run   $n   10$MB     1       0   1   1     0     0     0
-  run   $n   10$MB     1       1   1   1     0     0     0
-  run   $n   10$MB     1       0   1   1     1     0     0
-  run   $n   10$MB     1       1   1   1     1     0     0
-  run   $n   10$MB     1       0   1   1     0     1     0
-  run   $n   10$MB     1       1   1   1     0     1     0
-  run   $n   10$MB     1       0   1   1     1     1     0
-  run   $n   10$MB     1       1   1   1     1     1     0
+  run   $n   10$MB     1       0   1   1     0     0     0       1
+  run   $n   10$MB     1       1   1   1     0     0     0       1
+  run   $n   10$MB     1       0   1   1     1     0     0       1
+  run   $n   10$MB     1       1   1   1     1     0     0       1
+  run   $n   10$MB     1       0   1   1     0     1     0       1
+  run   $n   10$MB     1       1   1   1     0     1     0       1
+  run   $n   10$MB     1       0   1   1     1     1     0       1
+  run   $n   10$MB     1       1   1   1     1     1     0       1
 
   echo cross
-  run   $n   10$MB     1       0   1   1     0     0     1
-  run   $n   10$MB     1       1   1   1     0     0     1
-  run   $n   10$MB     1       0   1   1     0     1     1
-  run   $n   10$MB     1       1   1   1     0     1     1
+  run   $n   10$MB     1       0   1   1     0     0     1       1
+  run   $n   10$MB     1       1   1   1     0     0     1       1
+  run   $n   10$MB     1       0   1   1     0     1     1       1
+  run   $n   10$MB     1       1   1   1     0     1     1       1
 done

Modified: numa-bench/trunk/src/chew.cc
===================================================================
--- numa-bench/trunk/src/chew.cc	2008-02-26 19:48:37 UTC (rev 513)
+++ numa-bench/trunk/src/chew.cc	2008-02-26 20:31:00 UTC (rev 514)
@@ -20,12 +20,12 @@
 struct config
 {
   /**
-   * The number of cores to test. This is a parameter (rather than
-   * auto-detected) because it additionally serves to mean the number of cores
+   * The number of CPUs to test. This is a parameter (rather than
+   * auto-detected) because it additionally serves to mean the number of CPUs
    * we want to test in parallel. As this program evolves, these may be
    * separated.
    */
-  const int ncores;
+  const unsigned int ncpus;
 
   /**
    * Size in bytes of the buffer to chew.
@@ -35,7 +35,7 @@
   /**
    * Number of repetitions to chew.
    */
-  const int nreps;
+  const unsigned int nreps;
 
   /**
    * Perform rand access, otherwise sequential scan.
@@ -43,12 +43,13 @@
   const bool shuffle;
 
   /**
-   * Chew in parallel, otherwise each core chews serially.
+   * Chew in parallel, otherwise each worker chews serially.
    */
   const bool par;
 
   /**
-   * Pin thread i to core i, otherwise let the OS manage things.
+   * Pin each worker to a CPU (which CPU depends on whether we're using
+   * round-robin node allocation); otherwise let the OS manage things.
    */
   const bool pin;
 
@@ -67,6 +68,17 @@
    * global/local buffer.
    */
   const bool cross;
+
+  /**
+   * When using fewer than the maximum number of CPUs, allocate nodes in
+   * round-robin fashion.
+   */
+  const bool rrnodes;
+
+  /**
+   * The number of nodes on this machine.
+   */
+  const unsigned int nnodes;
 };
 
 void*** partitions;
@@ -129,15 +141,21 @@
 
 /**
  * \param pp The start of the buffer to chew.
- * \param cpu Which CPU to pin our thread to.
+ * \param worker From this we can determine which CPU to pin our thread to.
  * \param config The experiment configuration parameters.
 * \param warmup True for the warmup run, whose output is prefixed "warmup: ".
  */
 void*
-chew(void* pp, unsigned int cpu, const config & config, bool warmup)
+chew(void* pp, unsigned int worker, const config & config, bool warmup)
 {
-  // Pin this thread to cpu `cpu`.
+  // Pin this thread to a CPU.
   if (config.pin) {
+    unsigned int ncpus = config.ncpus,
+                 nnodes = config.nnodes,
+                 cpuspernode = ncpus / nnodes,
+                 cpu = !config.rrnodes ?
+                       worker :
+                       (cpuspernode * worker) % ncpus + worker / cpuspernode;
     pin_thread(cpu);
   }
 
@@ -145,16 +163,17 @@
   timer t(": ");
 
   if (!warmup && config.cross) {
-    size_t len = config.size / config.ncores;
-    for (int i = 0; i < config.ncores; i++) {
-      partitions[cpu][i] = new char[len];
+    size_t len = config.size / config.ncpus;
+    for (unsigned int i = 0; i < config.ncpus; i++) {
+      partitions[worker][i] = new char[len];
     }
     int barrier_result = pthread_barrier_wait(&cross_barrier);
-    check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || barrier_result == 0);
+    check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD ||
+          barrier_result == 0);
     // TODO: make this more interesting than just a sequential traversal over
     // the partitions.
-    for (int i = 0; i < config.ncores; i++) {
-      chew1(partitions[i][cpu], config, len);
+    for (unsigned int i = 0; i < config.ncpus; i++) {
+      chew1(partitions[i][worker], config, len);
     }
   } else {
     chew1(p, config, config.size);
@@ -162,7 +181,7 @@
 
   // Print the elapsed time and "result".
   if (warmup) cout << "warmup: ";
-  cout << cpu;
+  cout << worker;
   t.print();
 
   if (config.local) free(p);
@@ -176,27 +195,30 @@
   // So that our global shared malloc takes place on CPU 0's node.
   pin_thread(0);
 
-  if (argc < 10) {
+  if (argc < 11) {
     cerr << argv[0] <<
-      " <ncores> <size> <nreps> <shuffle> <par> <pin> <local> <write>" << endl;
+      " <ncpus> <size> <nreps> <shuffle> <par> <pin>"
+      " <local> <write> <rrnodes> <nnodes>" << endl;
     return 1;
   }
 
-  // Parse command-line arguments. TODO
-  const config config = { 
-    atoi(argv[1]),
-    atoi(argv[2]),
-    atoi(argv[3]),
-    atoi(argv[4]),
-    atoi(argv[5]),
-    atoi(argv[6]),
-    atoi(argv[7]),
-    atoi(argv[8]),
-    atoi(argv[9])
+  // Parse command-line arguments.
+  const config config = {
+    atoi(argv[ 1]),
+    atoi(argv[ 2]),
+    atoi(argv[ 3]),
+    atoi(argv[ 4]),
+    atoi(argv[ 5]),
+    atoi(argv[ 6]),
+    atoi(argv[ 7]),
+    atoi(argv[ 8]),
+    atoi(argv[ 9]),
+    atoi(argv[10]),
+    atoi(argv[11])
   };
 
   cout << "config:"
-       << " ncores "  << config.ncores
+       << " ncpus "   << config.ncpus
        << " size "    << config.size
        << " nreps "   << config.nreps
        << " shuffle " << config.shuffle
@@ -204,7 +226,10 @@
        << " pin "     << config.pin
        << " local "   << config.local
        << " write "   << config.write
-       << " cross "   << config.cross << endl;
+       << " cross "   << config.cross
+       << " rrnodes " << config.rrnodes
+       << " nnodes  " << config.nnodes
+       << endl;
 
   checkmsg(RAND_MAX > config.size / sizeof(int), "PRNG range not large enough");
 
@@ -212,9 +237,9 @@
   check(p != NULL);
 
   if (config.cross) {
-    partitions = new void**[config.ncores];
-    for (unsigned int i  = 0; i < config.ncores; i++)
-      partitions[i] = new void*[config.ncores];
+    partitions = new void**[config.ncpus];
+    for (unsigned int i  = 0; i < config.ncpus; i++)
+      partitions[i] = new void*[config.ncpus];
   }
 
   // Warmup.
@@ -222,18 +247,18 @@
 
   if (config.par) {
     // Chew the memory area from each core in parallel (and also chew own).
-    pthread_t ts[config.ncores];
-    check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncores));
-    for (int i = 0; i < config.ncores; i++) {
+    pthread_t ts[config.ncpus];
+    check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncpus));
+    for (unsigned int i = 0; i < config.ncpus; i++) {
       ts[i] = spawn(bind(chew, p, i, ref(config), false));
     }
-    for (int i = 0; i < config.ncores; i++) {
+    for (unsigned int i = 0; i < config.ncpus; i++) {
       check(pthread_join(ts[i], NULL) == 0);
     }
     check(0 == pthread_barrier_destroy(&cross_barrier));
   } else {
     // Chew the memory area from each core in sequence.
-    for (int i = 0; i < config.ncores; i++) {
+    for (unsigned int i = 0; i < config.ncpus; i++) {
       chew(p, i, config, false);
     }
   }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.