[Assorted-commits] SF.net SVN: assorted: [600] numa-bench/trunk/src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 600
          http://assorted.svn.sourceforge.net/assorted/?rev=600&view=rev
Author:   yangzhang
Date:     2008-03-03 22:31:56 -0800 (Mon, 03 Mar 2008)

Log Message:
-----------
updated experiments

Modified Paths:
--------------
    numa-bench/trunk/src/chew.bash
    numa-bench/trunk/src/chew.cc

Modified: numa-bench/trunk/src/chew.bash
===================================================================

--- numa-bench/trunk/src/chew.bash	2008-03-04 04:12:34 UTC (rev 599)
+++ numa-bench/trunk/src/chew.bash	2008-03-04 06:31:56 UTC (rev 600)
@@ -12,32 +12,32 @@
 
 KB=000 MB=000000 GB=000000000
 
-#    ncpus    size nreps shuffle par pin local write cross rrnodes nnodes
+# nworkers    size nreps shuffle par pin local write cross rrnodes nnodes ncpus
 
 echo writes
-run     16  100$MB     1       0   0   1     0     1     0       1      4
-run     16 1000$MB     1       0   0   1     0     1     0       1      4
-run     16  100$MB    10       0   0   1     0     1     0       1      4
-run     16  100$MB     1       1   0   1     0     1     0       1      4
+run     16  100$MB     1       0   0   1     0     1     0       1      4    16
+run     16 1000$MB     1       0   0   1     0     1     0       1      4    16
+run     16  100$MB    10       0   0   1     0     1     0       1      4    16
+run     16  100$MB     1       1   0   1     0     1     0       1      4    16
 
 echo reads
-run     16 1000$MB     1       0   0   1     0     0     0       1      4
-run     16  100$MB     1       1   0   1     0     0     0       1      4
+run     16 1000$MB     1       0   0   1     0     0     0       1      4    16
+run     16  100$MB     1       1   0   1     0     0     0       1      4    16
 
 for n in 1 2 4 8 12 16 ; do
   echo par
-  run   $n   10$MB     1       0   1   1     0     0     0       1      4
-  run   $n   10$MB     1       1   1   1     0     0     0       1      4
-  run   $n   10$MB     1       0   1   1     1     0     0       1      4
-  run   $n   10$MB     1       1   1   1     1     0     0       1      4
-  run   $n   10$MB     1       0   1   1     0     1     0       1      4
-  run   $n   10$MB     1       1   1   1     0     1     0       1      4
-  run   $n   10$MB     1       0   1   1     1     1     0       1      4
-  run   $n   10$MB     1       1   1   1     1     1     0       1      4
+  run   $n   10$MB     1       0   1   1     0     0     0       1      4    16
+  run   $n   10$MB     1       1   1   1     0     0     0       1      4    16
+  run   $n   10$MB     1       0   1   1     1     0     0       1      4    16
+  run   $n   10$MB     1       1   1   1     1     0     0       1      4    16
+  run   $n   10$MB     1       0   1   1     0     1     0       1      4    16
+  run   $n   10$MB     1       1   1   1     0     1     0       1      4    16
+  run   $n   10$MB     1       0   1   1     1     1     0       1      4    16
+  run   $n   10$MB     1       1   1   1     1     1     0       1      4    16
 
   echo cross
-  run   $n   10$MB     1       0   1   1     0     0     1       1      4
-  run   $n   10$MB     1       1   1   1     0     0     1       1      4
-  run   $n   10$MB     1       0   1   1     0     1     1       1      4
-  run   $n   10$MB     1       1   1   1     0     1     1       1      4
+  run   $n   10$MB     1       0   1   1     0     0     1       1      4    16
+  run   $n   10$MB     1       1   1   1     0     0     1       1      4    16
+  run   $n   10$MB     1       0   1   1     0     1     1       1      4    16
+  run   $n   10$MB     1       1   1   1     0     1     1       1      4    16
 done

Modified: numa-bench/trunk/src/chew.cc
===================================================================
--- numa-bench/trunk/src/chew.cc	2008-03-04 04:12:34 UTC (rev 599)
+++ numa-bench/trunk/src/chew.cc	2008-03-04 06:31:56 UTC (rev 600)
@@ -16,16 +16,14 @@
 using namespace std;
 
 pthread_barrier_t cross_barrier;
+pthread_mutex_t iomutex;
 
 struct config
 {
   /**
-   * The number of CPUs to test. This is a parameter (rather than
-   * auto-detected) because it additionally serves to mean the number of CPUs
-   * we want to test in parallel. As this program evolves, these may be
-   * separated.
+   * The number of workers to run (usually this means the CPUs to test).
    */
-  const unsigned int ncpus;
+  const unsigned int nworkers;
 
   /**
    * Size in bytes of the buffer to chew.
@@ -77,8 +75,15 @@
 
   /**
    * The number of nodes on this machine.
+   * TODO: auto-detect
    */
   const unsigned int nnodes;
+
+  /**
+   * The number of CPUs on this machine.
+   * TODO: auto-detect
+   */
+  const unsigned int ncpus;
 };
 
 void*** partitions;
@@ -95,15 +100,18 @@
   int* p = (int*) pp;
   const size_t count = len / sizeof(int);
   int sum = 0;
+
+  // TODO: seed these with random numbers generated from a global (serial) rand
+  // NOTE: Using rand as the index assumes that rand generates large-enough
+  // values.
   posix_rand rand(current_time_millis() ^ gettid());
+
   if (config.write) {
     // Write to the region.
     if (config.shuffle) {
       // Random access into the memory region.
       for (unsigned int c = 0; c < config.nreps; c++) {
         for (size_t i = 0; i < count; i++) {
-          // NOTE: Using r as the index assumes that rand generates large-enough
-          // values.
           int r = rand();
           sum += p[r % count] += r;
         }
@@ -122,8 +130,6 @@
       // Random access into the memory region.
       for (unsigned int c = 0; c < config.nreps; c++) {
         for (size_t i = 0; i < count; i++) {
-          // NOTE: Using r as the index assumes that rand generates large-enough
-          // values.
           sum += p[rand() % count];
         }
       }
@@ -150,10 +156,11 @@
 {
   // Pin this thread to a CPU.
   if (config.pin) {
+    // Round-robin is only applicable if we're doing a parallel test.
     unsigned int ncpus = config.ncpus,
                  nnodes = config.nnodes,
                  cpuspernode = ncpus / nnodes,
-                 cpu = !config.rrnodes ?
+                 cpu = !config.par || !config.rrnodes ?
                        worker :
                        (cpuspernode * worker) % ncpus + worker / cpuspernode;
     pin_thread(cpu);
@@ -163,8 +170,8 @@
   timer t(": ");
 
   if (!warmup && config.cross) {
-    size_t len = config.size / config.ncpus;
-    for (unsigned int i = 0; i < config.ncpus; i++) {
+    size_t len = config.size / config.nworkers;
+    for (unsigned int i = 0; i < config.nworkers; i++) {
       partitions[worker][i] = new char[len];
     }
     int barrier_result = pthread_barrier_wait(&cross_barrier);
@@ -172,7 +179,7 @@
           barrier_result == 0);
     // TODO: make this more interesting than just a sequential traversal over
     // the partitions.
-    for (unsigned int i = 0; i < config.ncpus; i++) {
+    for (unsigned int i = 0; i < config.nworkers; i++) {
       chew1(partitions[i][worker], config, len);
     }
   } else {
@@ -180,9 +187,11 @@
   }
 
   // Print the elapsed time and "result".
+  check(pthread_mutex_lock(&iomutex) == 0);
   if (warmup) cout << "warmup: ";
   cout << worker;
   t.print();
+  check(pthread_mutex_unlock(&iomutex) == 0);
 
   if (config.local) free(p);
 
@@ -195,9 +204,9 @@
   // So that our global shared malloc takes place on the CPU 0's node.
   pin_thread(0);
 
-  if (argc < 12) {
+  if (argc < 13) {
     cerr << argv[0] <<
-      " <ncpus> <size> <nreps> <shuffle> <par> <pin>"
+      " <nworkers> <size> <nreps> <shuffle> <par> <pin>"
       " <local> <write> <rrnodes> <nnodes>" << endl;
     return 1;
   }
@@ -214,30 +223,34 @@
     atoi(argv[ 8]),
     atoi(argv[ 9]),
     atoi(argv[10]),
-    atoi(argv[11])
+    atoi(argv[11]),
+    atoi(argv[12])
   };
 
   cout << "config:"
-       << " ncpus "   << config.ncpus
-       << " size "    << config.size
-       << " nreps "   << config.nreps
-       << " shuffle " << config.shuffle
-       << " par "     << config.par
-       << " pin "     << config.pin
-       << " local "   << config.local
-       << " write "   << config.write
-       << " cross "   << config.cross
-       << " rrnodes " << config.rrnodes
-       << " nnodes  " << config.nnodes
+       << " nworkers " << config.nworkers
+       << " size "     << config.size
+       << " nreps "    << config.nreps
+       << " shuffle "  << config.shuffle
+       << " par "      << config.par
+       << " pin "      << config.pin
+       << " local "    << config.local
+       << " write "    << config.write
+       << " cross "    << config.cross
+       << " rrnodes "  << config.rrnodes
+       << " nnodes "   << config.nnodes
+       << " ncpus "    << config.ncpus
        << endl;
 
   void *p = malloc(config.size);
   check(p != NULL);
 
+  check(pthread_mutex_init(&iomutex, NULL) == 0);
+
   if (config.cross) {
-    partitions = new void**[config.ncpus];
-    for (unsigned int i  = 0; i < config.ncpus; i++)
-      partitions[i] = new void*[config.ncpus];
+    partitions = new void**[config.nworkers];
+    for (unsigned int i  = 0; i < config.nworkers; i++)
+      partitions[i] = new void*[config.nworkers];
   }
 
   // Warmup.
@@ -245,18 +258,18 @@
 
   if (config.par) {
     // Chew the memory area from each core in parallel (and also chew own).
-    pthread_t ts[config.ncpus];
-    check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncpus));
-    for (unsigned int i = 0; i < config.ncpus; i++) {
+    pthread_t ts[config.nworkers];
+    check(0 == pthread_barrier_init(&cross_barrier, NULL, config.nworkers));
+    for (unsigned int i = 0; i < config.nworkers; i++) {
       ts[i] = spawn(bind(chew, p, i, ref(config), false));
     }
-    for (unsigned int i = 0; i < config.ncpus; i++) {
+    for (unsigned int i = 0; i < config.nworkers; i++) {
       check(pthread_join(ts[i], NULL) == 0);
     }
     check(0 == pthread_barrier_destroy(&cross_barrier));
   } else {
     // Chew the memory area from each core in sequence.
-    for (unsigned int i = 0; i < config.ncpus; i++) {
+    for (unsigned int i = 0; i < config.nworkers; i++) {
       chew(p, i, config, false);
     }
   }
@@ -265,5 +278,7 @@
   ofstream trash("/dev/null");
   trash << "result: " << global_sum << endl;
 
+  check(pthread_mutex_destroy(&iomutex) == 0);
+
   return 0;
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.