[Assorted-commits] SF.net SVN: assorted: [601] numa-bench/trunk/src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 601
          http://assorted.svn.sourceforge.net/assorted/?rev=601&view=rev
Author:   yangzhang
Date:     2008-03-04 21:49:10 -0800 (Tue, 04 Mar 2008)

Log Message:
-----------
updated chew

Modified Paths:
--------------
    numa-bench/trunk/src/build
    numa-bench/trunk/src/chew.bash
    numa-bench/trunk/src/chew.cc

Modified: numa-bench/trunk/src/build
===================================================================

--- numa-bench/trunk/src/build	2008-03-04 06:31:56 UTC (rev 600)
+++ numa-bench/trunk/src/build	2008-03-05 05:49:10 UTC (rev 601)
@@ -1,8 +1,7 @@
 chew:
   srcs: [chew.cc]
-  libs: [pthread]
+  libs: [pthread, numa]
 
 avail:
   srcs: [avail.cc]
   libs: [pthread]
-

Modified: numa-bench/trunk/src/chew.bash
===================================================================
--- numa-bench/trunk/src/chew.bash	2008-03-04 06:31:56 UTC (rev 600)
+++ numa-bench/trunk/src/chew.bash	2008-03-05 05:49:10 UTC (rev 601)
@@ -2,42 +2,67 @@
 
 set -o errexit -o nounset
 
-make -s chew-opt
+reps=3
 
-function run {
-  for i in {1..3}
-  do out/chew-opt "$@"
+make -s chew-dbg
+
+run() {
+  for i in {1..$reps}
+  do out/chew-dbg "$@"
   done
 }
 
+K=000  M=000000  G=000000000
 KB=000 MB=000000 GB=000000000
 
-#    ncpus    size nreps shuffle par pin local write cross rrnodes nnodes ncpus
+#    ncpus    size opcount nreps shuffle par pin local write cross rrnodes nnodes ncpus
 
-echo writes
-run     16  100$MB     1       0   0   1     0     1     0       1      4    16
-run     16 1000$MB     1       0   0   1     0     1     0       1      4    16
-run     16  100$MB    10       0   0   1     0     1     0       1      4    16
-run     16  100$MB     1       1   0   1     0     1     0       1      4    16
+if true ; then
 
-echo reads
-run     16 1000$MB     1       0   0   1     0     0     0       1      4    16
-run     16  100$MB     1       1   0   1     0     0     0       1      4    16
+#echo writes
+#run     16  100$MB   100$M     1       0   0   1     0     1     0       1      4    16
+#run     16 1000$MB  1000$M     1       0   0   1     0     1     0       1      4    16
+#run     16  100$MB   100$M    10       0   0   1     0     1     0       1      4    16
+#run     16 1000$GB   100$M     1       1   0   1     0     1     0       1      4    16
+#
+#echo reads
+#run     16 1000$MB   100$M     1       0   0   1     0     0     0       1      4    16
+#run     16 1000$MB    10$M     1       1   0   1     0     0     0       1      4    16
 
 for n in 1 2 4 8 12 16 ; do
   echo par
-  run   $n   10$MB     1       0   1   1     0     0     0       1      4    16
-  run   $n   10$MB     1       1   1   1     0     0     0       1      4    16
-  run   $n   10$MB     1       0   1   1     1     0     0       1      4    16
-  run   $n   10$MB     1       1   1   1     1     0     0       1      4    16
-  run   $n   10$MB     1       0   1   1     0     1     0       1      4    16
-  run   $n   10$MB     1       1   1   1     0     1     0       1      4    16
-  run   $n   10$MB     1       0   1   1     1     1     0       1      4    16
-  run   $n   10$MB     1       1   1   1     1     1     0       1      4    16
+  run   $n 1000$MB    10$M     1       0   1   1     0     0     0       1      4    16
+  run   $n 1000$MB    10$M     1       1   1   1     0     0     0       1      4    16
+  run   $n 1000$MB    10$M     1       0   1   1     1     0     0       1      4    16
+  run   $n 1000$MB    10$M     1       1   1   1     1     0     0       1      4    16
+  run   $n 1000$MB    10$M     1       0   1   1     0     1     0       1      4    16
+  run   $n 1000$MB    10$M     1       1   1   1     0     1     0       1      4    16
+  run   $n 1000$MB    10$M     1       0   1   1     1     1     0       1      4    16
+  run   $n 1000$MB    10$M     1       1   1   1     1     1     0       1      4    16
 
-  echo cross
-  run   $n   10$MB     1       0   1   1     0     0     1       1      4    16
-  run   $n   10$MB     1       1   1   1     0     0     1       1      4    16
-  run   $n   10$MB     1       0   1   1     0     1     1       1      4    16
-  run   $n   10$MB     1       1   1   1     0     1     1       1      4    16
+#  echo cross
+#  run   $n 1000$MB    10$M     1       0   1   1     0     0     1       1      4    16
+#  run   $n 1000$MB    10$M     1       1   1   1     0     0     1       1      4    16
+#  run   $n 1000$MB    10$M     1       0   1   1     0     1     1       1      4    16
+#  run   $n 1000$MB    10$M     1       1   1   1     0     1     1       1      4    16
 done
+
+else
+
+#run       1 1000$MB    10$M     1       1   1   1     0     1     0       1      4    16
+#run       2 1000$MB    10$M     1       1   1   1     0     1     0       1      4    16
+#run       3 1000$MB    10$M     1       1   1   1     0     1     0       1      4    16
+#run       4 1000$MB    10$M     1       1   1   1     0     1     0       1      4    16
+#run       8 1000$MB    10$M     1       1   1   1     0     1     0       1      4    16
+#run      12 1000$MB    10$M     1       1   1   1     0     1     0       1      4    16
+#run      16 1000$MB    10$M     1       1   1   1     0     1     0       1      4    16
+
+run       1 1000$MB    10$M     1       1   1   1     0     0     0       1      4    16
+#run       2 1000$MB    10$M     1       1   1   1     0     0     0       1      4    16
+#run       3 1000$MB    10$M     1       1   1   1     0     0     0       1      4    16
+#run       4 1000$MB    10$M     1       1   1   1     0     0     0       1      4    16
+#run       8 1000$MB    10$M     1       1   1   1     0     0     0       1      4    16
+#run      12 1000$MB    10$M     1       1   1   1     0     0     0       1      4    16
+run      16 1000$MB    10$M     1       1   1   1     0     0     0       1      4    16
+
+fi

Modified: numa-bench/trunk/src/chew.cc
===================================================================
--- numa-bench/trunk/src/chew.cc	2008-03-04 06:31:56 UTC (rev 600)
+++ numa-bench/trunk/src/chew.cc	2008-03-05 05:49:10 UTC (rev 601)
@@ -3,6 +3,8 @@
 
 #include <sched.h>
 
+#include <numa.h>
+
 #include <boost/bind.hpp>
 
 #include <commons/check.h>
@@ -15,8 +17,12 @@
 using namespace commons;
 using namespace std;
 
-pthread_barrier_t cross_barrier;
+// TODO Make into command line flags?
+const bool debug = false, pretouch = false, do_warmup = false, use_numa = false;
+pthread_barrier_t cross_barrier, startup_barrier;
 pthread_mutex_t iomutex;
+void*** partitions;
+int global_sum;
 
 struct config
 {
@@ -31,6 +37,11 @@
   const size_t size;
 
   /**
+   * The number of operations.
+   */
+  const unsigned int opcount;
+
+  /**
    * Number of repetitions to chew.
    */
   const unsigned int nreps;
@@ -86,16 +97,25 @@
   const unsigned int ncpus;
 };
 
-void*** partitions;
-int global_sum;
+void *
+alloc(size_t sz)  // Allocates sz bytes: numa_alloc_local(sz) when use_numa is set, malloc(sz) otherwise.
+{
+  return use_numa ? numa_alloc_local(sz) : malloc(sz);
+}
 
+void
+dealloc(void *p, size_t sz)  // Counterpart of alloc(): numa_free(p, sz) when use_numa is set, free(p) otherwise.
+{
+  return use_numa ? numa_free(p, sz) : free(p);
+}
+
 /**
  * \param p The buffer to chew.
  * \param config The experiment configuration.
  * \param len Length of the buffer.
  */
 void
-chew1(void* pp, config config, size_t len)
+chew1(void* pp, config config, size_t len, unsigned int seed)
 {
   int* p = (int*) pp;
   const size_t count = len / sizeof(int);
@@ -104,14 +124,14 @@
   // TODO: see these with random numbers generated from a global (serial) rand
   // NOTE: Using rand as the index assumes that rand generates large-enough
   // values.
-  posix_rand rand(current_time_millis() ^ gettid());
+  posix_rand rand(current_time_millis() ^ gettid() ^ seed);
 
   if (config.write) {
     // Write to the region.
     if (config.shuffle) {
       // Random access into the memory region.
       for (unsigned int c = 0; c < config.nreps; c++) {
-        for (size_t i = 0; i < count; i++) {
+        for (size_t i = 0; i < config.opcount; i++) {
           int r = rand();
           sum += p[r % count] += r;
         }
@@ -119,7 +139,7 @@
     } else {
       // Sequential scan through the memory region.
       for (unsigned int c = 0; c < config.nreps; c++) {
-        for (size_t i = 0; i < count; i++) {
+        for (size_t i = 0; i < config.opcount; i++) {
           sum += p[i] += rand();
         }
       }
@@ -129,14 +149,14 @@
     if (config.shuffle) {
       // Random access into the memory region.
       for (unsigned int c = 0; c < config.nreps; c++) {
-        for (size_t i = 0; i < count; i++) {
+        for (size_t i = 0; i < config.opcount; i++) {
           sum += p[rand() % count];
         }
       }
     } else {
       // Sequential scan through the memory region.
       for (unsigned int c = 0; c < config.nreps; c++) {
-        for (size_t i = 0; i < count; i++) {
+        for (size_t i = 0; i < config.opcount; i++) {
           sum += p[i] + rand();
         }
       }
@@ -166,7 +186,29 @@
     pin_thread(cpu);
   }
 
-  void* p = config.local ? malloc(config.size) : pp;
+  void* p = config.local ? alloc(config.size) : pp;
+  if (pretouch) {
+    int *is = (int*) p;
+    for (size_t i = 0; i < config.size / sizeof(int); i++) {
+      is[i] = i;
+    }
+    int sum = 0;
+    for (size_t i = 0; i < config.size / sizeof(int); i++) {
+      sum += is[i];
+    }
+    global_sum += sum;
+  }
+  if (debug) {
+    check(pthread_mutex_lock(&iomutex) == 0);
+    cout << worker << " alloc " << p << endl;
+    check(pthread_mutex_unlock(&iomutex) == 0);
+  }
+  if (!warmup) {
+    int barrier_result = pthread_barrier_wait(&startup_barrier);
+    check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD ||
+          barrier_result == 0);
+  }
+  posix_rand rand(current_time_millis());
   timer t(": ");
 
   if (!warmup && config.cross) {
@@ -180,10 +222,10 @@
     // TODO: make this more interesting than just a sequential traversal over
     // the partitions.
     for (unsigned int i = 0; i < config.nworkers; i++) {
-      chew1(partitions[i][worker], config, len);
+      chew1(partitions[i][worker], config, len, rand());
     }
   } else {
-    chew1(p, config, config.size);
+    chew1(p, config, config.size, rand());
   }
 
   // Print the elapsed time and "result".
@@ -193,7 +235,7 @@
   t.print();
   check(pthread_mutex_unlock(&iomutex) == 0);
 
-  if (config.local) free(p);
+  if (config.local) dealloc(p, config.size);
 
   return NULL;
 }
@@ -206,8 +248,8 @@
 
   if (argc < 13) {
     cerr << argv[0] <<
-      " <nworkers> <size> <nreps> <shuffle> <par> <pin>"
-      " <local> <write> <rrnodes> <nnodes>" << endl;
+      " <nworkers> <size> <opcount> <nreps> <shuffle> <par> <pin>"
+      " <local> <write> <rrnodes> <nnodes> <ncpus>" << endl;
     return 1;
   }
 
@@ -224,12 +266,14 @@
     atoi(argv[ 9]),
     atoi(argv[10]),
     atoi(argv[11]),
-    atoi(argv[12])
+    atoi(argv[12]),
+    atoi(argv[13])
   };
 
   cout << "config:"
        << " nworkers " << config.nworkers
        << " size "     << config.size
+       << " opcount "  << config.opcount
        << " nreps "    << config.nreps
        << " shuffle "  << config.shuffle
        << " par "      << config.par
@@ -242,6 +286,8 @@
        << " ncpus "    << config.ncpus
        << endl;
 
+  check(config.shuffle || config.opcount <= config.size / sizeof(int));
+
   void *p = malloc(config.size);
   check(p != NULL);
 
@@ -254,12 +300,13 @@
   }
 
   // Warmup.
-  chew(p, 0, config, true);
+  if (do_warmup) chew(p, 0, config, true);
 
   if (config.par) {
     // Chew the memory area from each core in parallel (and also chew own).
     pthread_t ts[config.nworkers];
     check(0 == pthread_barrier_init(&cross_barrier, NULL, config.nworkers));
+    check(0 == pthread_barrier_init(&startup_barrier, NULL, config.nworkers));
     for (unsigned int i = 0; i < config.nworkers; i++) {
       ts[i] = spawn(bind(chew, p, i, ref(config), false));
     }
@@ -267,6 +314,7 @@
       check(pthread_join(ts[i], NULL) == 0);
     }
     check(0 == pthread_barrier_destroy(&cross_barrier));
+    check(0 == pthread_barrier_destroy(&startup_barrier));
   } else {
     // Chew the memory area from each core in sequence.
     for (unsigned int i = 0; i < config.nworkers; i++) {
@@ -278,6 +326,8 @@
   ofstream trash("/dev/null");
   trash << "result: " << global_sum << endl;
 
+  cout << "result: " << global_sum << endl;
+
   check(pthread_mutex_destroy(&iomutex) == 0);
 
   return 0;


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.