Thread: [Assorted-commits] SF.net SVN: assorted: [365] numa-bench/trunk/src/malloc.cc

Brought to you by: yangzhang

assorted-commits

[Assorted-commits] SF.net SVN: assorted: [365] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-11 04:57:26

Revision: 365
          http://assorted.svn.sourceforge.net/assorted/?rev=365&view=rev
Author:   yangzhang
Date:     2008-02-10 20:57:31 -0800 (Sun, 10 Feb 2008)

Log Message:
-----------
added cpu pinning to malloc

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-11 04:57:12 UTC (rev 364)
+++ numa-bench/trunk/src/malloc.cc	2008-02-11 04:57:31 UTC (rev 365)
@@ -5,17 +5,21 @@
 
 #include <sched.h>
 
+#include <boost/bind.hpp>
+
 #include <commons/check.h>
 #include <commons/threads.h>
 #include <commons/time.h>
+#include <commons/boost/threads.h>
 
+using namespace boost;
 using namespace commons;
 using namespace std;
 
 const size_t size = 10000000;
 
 void*
-chew(void* pp)
+chew(void* pp, int core)
 {
   char* p = (char*) pp;
   const int reps = 100;
@@ -25,17 +29,16 @@
   // Pin this thread to the right processor.
   cpu_set_t cs;
   CPU_ZERO(&cs);
-  CPU_SET(1, &cs);
+  CPU_SET(core, &cs);
   sched_setaffinity(pid, sizeof(cs), &cs);
 
-  // TODO: try shuffling indexes
   for (int c = 0; c < reps; c++) {
     for (size_t i = 0; i < size; i++) {
       p[i] = i;
     }
   }
 
-  // Print the elapsed time;
+  // Print the elapsed time.
   cout << pid;
   t.print();
   return NULL;
@@ -53,21 +56,23 @@
   void *p = malloc(size);
 
   // warmup
-  chew(p);
+  chew(p, 0);
   pthread_t ts[n];
 
   // start thread on each core
   for (int i = 0; i < n; i++) {
-    check(pthread_create(&ts[i], NULL, chew, p) == 0);
+    pthread_t t;
+    check((t = spawn(bind(chew, p, i))) != 0);
+    check(pthread_join(t, NULL) == 0);
   }
-  waitall(ts, n);
+  // waitall(ts, n);
   return 0;
 
   // THRASH
 
   // spawn workers
   for (int i = 0; i < n; i++) {
-    check(pthread_create(&ts[i], NULL, chew, p) == 0);
+    check((ts[i] = spawn(bind(chew, p, i))) == 0);
   }
   waitall(ts, n);
   return 0;


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [386] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-12 01:48:02

Revision: 386
          http://assorted.svn.sourceforge.net/assorted/?rev=386&view=rev
Author:   yangzhang
Date:     2008-02-11 17:47:59 -0800 (Mon, 11 Feb 2008)

Log Message:
-----------
nice malloc test

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-12 01:22:31 UTC (rev 385)
+++ numa-bench/trunk/src/malloc.cc	2008-02-12 01:47:59 UTC (rev 386)
@@ -1,5 +1,22 @@
-// Does malloc tend to allocate locally?
+// Questions this program answers:
+//
+// - Does malloc tend to allocate locally?
+//   - Yes. Times working from local node is lower.
+// - How much does working from another node affect throughput?
+//   - A bit: 647x from local, 649x from neighbor, 651x from remote
+// - Is there difference from repeatedly fetching the same (large) area n times
+//   vs. fetching an area n times larger?
+//   - No. The times are identical for 1GB*1 and 100MB*10.
+// - How much difference is there between sequential scan and random access?
+//   - Huge difference. Also magnifies the locality effects more.
+//   - 1700 from local, 1990 from one neighbor, 2020 from another neighbor,
+//     and 2310 from remote.
+// - Can we observe prefetching's effects? (Random access but chew the full
+//   cache line of data.)
+//   - TODO!
 
+// TODO: use real shuffling? or is rand ok?
+
 #include <cstdlib>
 #include <iostream>
 
@@ -16,30 +33,44 @@
 using namespace commons;
 using namespace std;
 
-const size_t size = 10000000;
-
+/**
+ * \param pp The start of the buffer to chew.
+ * \param core Which core to pin our thread to.
+ * \param size The size of the buffer.
+ * \param nreps The number of times to chew through the buffer.
+ * \param shuffle If false, sequentially chew through; otherwise, randomly
+ *     shuffle the indexes we chew through.
+ */
 void*
-chew(void* pp, int core)
+chew(void* pp, int core, size_t size, int nreps, bool shuffle)
 {
   char* p = (char*) pp;
-  const int reps = 100;
   pid_t pid = gettid();
   timer t(": ");
 
-  // Pin this thread to the right processor.
+  // Pin this thread to core `core`.
   cpu_set_t cs;
   CPU_ZERO(&cs);
   CPU_SET(core, &cs);
   sched_setaffinity(pid, sizeof(cs), &cs);
 
-  for (int c = 0; c < reps; c++) {
-    for (size_t i = 0; i < size; i++) {
-      p[i] = i;
+  // Write sequentially to the memory region.
+  if (shuffle) {
+    for (int c = 0; c < nreps; c++) {
+      for (size_t i = 0; i < size; i++) {
+        p[rand() % size] = i;
+      }
     }
+  } else {
+    for (int c = 0; c < nreps; c++) {
+      for (size_t i = 0; i < size; i++) {
+        p[i] = i;
+      }
+    }
   }
 
   // Print the elapsed time.
-  cout << pid;
+  cout << core;
   t.print();
   return NULL;
 }
@@ -47,33 +78,31 @@
 int
 main(int argc, char** argv)
 {
-  if (argc < 2) {
-    cerr << "malloc <nthreads>" << endl;
+  if (argc < 5) {
+    cerr << argv[0] << " <ncores> <size> <nreps> <shuffle>" << endl;
     return 1;
   }
 
-  const int n = atoi(argv[1]);
+  // Parse command-line arguments.
+  const int ncores = atoi(argv[1]);
+  const size_t size = atoi(argv[2]);
+  const int nreps = atoi(argv[3]);
+  const bool shuffle = atoi(argv[4]);
+
   void *p = malloc(size);
 
-  // warmup
-  chew(p, 0);
-  pthread_t ts[n];
+  // Warmup.
+  cout << "warmup: ";
+  chew(p, 0, size, nreps, shuffle);
 
-  // start thread on each core
-  for (int i = 0; i < n; i++) {
+  // Chew the memory area from each core.
+  for (int i = 0; i < ncores; i++) {
     pthread_t t;
-    check((t = spawn(bind(chew, p, i))) != 0);
+    check((t = spawn(bind(chew, p, i, size, nreps, shuffle))) != 0);
     check(pthread_join(t, NULL) == 0);
   }
-  // waitall(ts, n);
-  return 0;
 
-  // THRASH
+  free(p);
 
-  // spawn workers
-  for (int i = 0; i < n; i++) {
-    check((ts[i] = spawn(bind(chew, p, i))) == 0);
-  }
-  waitall(ts, n);
   return 0;
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [398] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-13 03:26:39

Revision: 398
          http://assorted.svn.sourceforge.net/assorted/?rev=398&view=rev
Author:   yangzhang
Date:     2008-02-12 19:26:37 -0800 (Tue, 12 Feb 2008)

Log Message:
-----------
calling rand() in seq to make the test more fair; writing by ints not chars; added some notes

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-12 22:15:13 UTC (rev 397)
+++ numa-bench/trunk/src/malloc.cc	2008-02-13 03:26:37 UTC (rev 398)
@@ -11,6 +11,8 @@
 //   - Huge difference. Also magnifies the locality effects more.
 //   - 1700 from local, 1990 from one neighbor, 2020 from another neighbor,
 //     and 2310 from remote.
+// - What's the difference between reading and writing?
+//   - TODO!
 // - Can we observe prefetching's effects? (Random access but chew the full
 //   cache line of data.)
 //   - TODO!
@@ -42,9 +44,10 @@
  *     shuffle the indexes we chew through.
  */
 void*
-chew(void* pp, int core, size_t size, int nreps, bool shuffle)
+chew(void* pp, unsigned int core, size_t size, unsigned int nreps, bool shuffle)
 {
-  char* p = (char*) pp;
+  int* p = (int*) pp;
+  const size_t count = size / sizeof(int);
   pid_t pid = gettid();
   timer t(": ");
 
@@ -56,15 +59,18 @@
 
   // Write sequentially to the memory region.
   if (shuffle) {
-    for (int c = 0; c < nreps; c++) {
-      for (size_t i = 0; i < size; i++) {
-        p[rand() % size] = i;
+    for (unsigned int c = 0; c < nreps; c++) {
+      for (size_t i = 0; i < count; i++) {
+        // NOTE: Using r as the index assumes that rand generates large-enough
+        // values.
+        int r = rand();
+        p[r % count] += r;
       }
     }
   } else {
-    for (int c = 0; c < nreps; c++) {
-      for (size_t i = 0; i < size; i++) {
-        p[i] = i;
+    for (unsigned int c = 0; c < nreps; c++) {
+      for (size_t i = 0; i < count; i++) {
+        p[i] += rand();
       }
     }
   }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [401] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-13 07:58:08

Revision: 401
          http://assorted.svn.sourceforge.net/assorted/?rev=401&view=rev
Author:   yangzhang
Date:     2008-02-12 23:58:13 -0800 (Tue, 12 Feb 2008)

Log Message:
-----------
beefed up the test

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-13 07:58:00 UTC (rev 400)
+++ numa-bench/trunk/src/malloc.cc	2008-02-13 07:58:13 UTC (rev 401)
@@ -1,7 +1,7 @@
 // Questions this program answers:
 //
 // - Does malloc tend to allocate locally?
-//   - Yes. Times working from local node is lower.
+//   - TODO!
 // - How much does working from another node affect throughput?
 //   - A bit: 647x from local, 649x from neighbor, 651x from remote
 // - Is there difference from repeatedly fetching the same (large) area n times
@@ -21,6 +21,7 @@
 
 #include <cstdlib>
 #include <iostream>
+#include <iomanip>
 
 #include <sched.h>
 
@@ -35,77 +36,166 @@
 using namespace commons;
 using namespace std;
 
+struct config
+{
+  /**
+   * The number of cores to test. This is a parameter (rather than
+   * auto-detected) because it additionally serves to mean the number of cores
+   * we want to test in parallel. As this program evolves, these may be
+   * separated.
+   */
+  const int ncores;
+
+  /**
+   * Size in bytes of the buffer to chew.
+   */
+  const size_t size;
+
+  /**
+   * Number of repetitions to chew.
+   */
+  const int nreps;
+
+  /**
+   * Perform rand access, otherwise sequential scan.
+   */
+  const bool shuffle;
+
+  /**
+   * Chew in parallel, otherwise each core chews serially.
+   */
+  const bool par;
+
+  /**
+   * Pin thread i to core i, otherwise let the OS manage things.
+   */
+  const bool pin;
+
+  /**
+   * Chew my own memory, otherwise chew the given (shared) memory.
+   */
+  const bool local;
+
+  /**
+   * Do writes, otherwise just do reads.
+   */
+  const bool write;
+};
+
 /**
  * \param pp The start of the buffer to chew.
- * \param core Which core to pin our thread to.
- * \param size The size of the buffer.
- * \param nreps The number of times to chew through the buffer.
- * \param shuffle If false, sequentially chew through; otherwise, randomly
- *     shuffle the indexes we chew through.
+ * \param cpu Which CPU to pin our thread to.
+ * \param config The experiment configuration parameters.
  */
 void*
-chew(void* pp, unsigned int core, size_t size, unsigned int nreps, bool shuffle)
+chew(void* pp, unsigned int cpu, const config & config, const char* label)
 {
-  int* p = (int*) pp;
-  const size_t count = size / sizeof(int);
-  pid_t pid = gettid();
+  int* p = (int*) (config.local ? malloc(config.size) : pp);
+  const size_t count = config.size / sizeof(int);
   timer t(": ");
 
-  // Pin this thread to core `core`.
-  cpu_set_t cs;
-  CPU_ZERO(&cs);
-  CPU_SET(core, &cs);
-  sched_setaffinity(pid, sizeof(cs), &cs);
+  // Pin this thread to cpu `cpu`.
+  if (config.pin) {
+    pin_thread(cpu);
+  }
 
-  // Write sequentially to the memory region.
-  if (shuffle) {
-    for (unsigned int c = 0; c < nreps; c++) {
-      for (size_t i = 0; i < count; i++) {
-        // NOTE: Using r as the index assumes that rand generates large-enough
-        // values.
-        int r = rand();
-        p[r % count] += r;
+  if (config.write) {
+    // Write to the region.
+    if (config.shuffle) {
+      // Random access into the memory region.
+      for (unsigned int c = 0; c < config.nreps; c++) {
+        for (size_t i = 0; i < count; i++) {
+          // NOTE: Using r as the index assumes that rand generates large-enough
+          // values.
+          int r = rand();
+          p[r % count] += r;
+        }
       }
+    } else {
+      // Sequential scan through the memory region.
+      for (unsigned int c = 0; c < config.nreps; c++) {
+        for (size_t i = 0; i < count; i++) {
+          p[i] += rand();
+        }
+      }
     }
   } else {
-    for (unsigned int c = 0; c < nreps; c++) {
-      for (size_t i = 0; i < count; i++) {
-        p[i] += rand();
+    // Only read from the region.
+    int sum = 0;
+    if (config.shuffle) {
+      // Random access into the memory region.
+      for (unsigned int c = 0; c < config.nreps; c++) {
+        for (size_t i = 0; i < count; i++) {
+          // NOTE: Using r as the index assumes that rand generates large-enough
+          // values.
+          sum += p[rand() % count];
+        }
       }
+    } else {
+      // Sequential scan through the memory region.
+      for (unsigned int c = 0; c < config.nreps; c++) {
+        for (size_t i = 0; i < count; i++) {
+          sum += p[i] + rand();
+        }
+      }
     }
+    cout << sum << endl;
   }
 
   // Print the elapsed time.
-  cout << core;
+  cout << label << cpu;
   t.print();
+
+  if (config.local) free(p);
+
   return NULL;
 }
 
 int
 main(int argc, char** argv)
 {
-  if (argc < 5) {
-    cerr << argv[0] << " <ncores> <size> <nreps> <shuffle>" << endl;
+  // So that our global shared malloc takes place on the CPU 0's node.
+  pin_thread(0);
+
+  if (argc < 9) {
+    cerr << argv[0] <<
+      " <ncores> <size> <nreps> <shuffle> <par> <pin> <local> <write>" << endl;
     return 1;
   }
 
-  // Parse command-line arguments.
-  const int ncores = atoi(argv[1]);
-  const size_t size = atoi(argv[2]);
-  const int nreps = atoi(argv[3]);
-  const bool shuffle = atoi(argv[4]);
+  // Parse command-line arguments. TODO
+  const config config = { 
+    atoi(argv[1]),
+    atoi(argv[2]),
+    atoi(argv[3]),
+    atoi(argv[4]),
+    atoi(argv[5]),
+    atoi(argv[6]),
+    atoi(argv[7]),
+    atoi(argv[8])
+  };
 
-  void *p = malloc(size);
+  checkmsg(RAND_MAX > config.size / sizeof(int), "PRNG range not large enough");
 
+  void *p = malloc(config.size);
+
   // Warmup.
-  cout << "warmup: ";
-  chew(p, 0, size, nreps, shuffle);
+  chew(p, 0, config, "warmup: ");
 
-  // Chew the memory area from each core.
-  for (int i = 0; i < ncores; i++) {
-    pthread_t t;
-    check((t = spawn(bind(chew, p, i, size, nreps, shuffle))) != 0);
-    check(pthread_join(t, NULL) == 0);
+  if (config.par) {
+    // Chew the memory area from each core in parallel (and also chew own).
+    pthread_t ts[config.ncores];
+    for (int i = 0; i < config.ncores; i++) {
+      ts[i] = spawn(bind(chew, p, i, ref(config), ""));
+    }
+    for (int i = 0; i < config.ncores; i++) {
+      check(pthread_join(ts[i], NULL) == 0);
+    }
+  } else {
+    // Chew the memory area from each core in sequence.
+    for (int i = 0; i < config.ncores; i++) {
+      chew(p, i, config, "");
+    }
   }
 
   free(p);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [403] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-13 07:59:08

Revision: 403
          http://assorted.svn.sourceforge.net/assorted/?rev=403&view=rev
Author:   yangzhang
Date:     2008-02-12 23:59:10 -0800 (Tue, 12 Feb 2008)

Log Message:
-----------
tweak

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-13 07:58:40 UTC (rev 402)
+++ numa-bench/trunk/src/malloc.cc	2008-02-13 07:59:10 UTC (rev 403)
@@ -21,7 +21,6 @@
 
 #include <cstdlib>
 #include <iostream>
-#include <iomanip>
 
 #include <sched.h>
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [404] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-13 08:14:07

Revision: 404
          http://assorted.svn.sourceforge.net/assorted/?rev=404&view=rev
Author:   yangzhang
Date:     2008-02-13 00:14:10 -0800 (Wed, 13 Feb 2008)

Log Message:
-----------
added config logging; added result (sum) printing

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-13 07:59:10 UTC (rev 403)
+++ numa-bench/trunk/src/malloc.cc	2008-02-13 08:14:10 UTC (rev 404)
@@ -98,6 +98,7 @@
     pin_thread(cpu);
   }
 
+  int sum = 0;
   if (config.write) {
     // Write to the region.
     if (config.shuffle) {
@@ -107,20 +108,19 @@
           // NOTE: Using r as the index assumes that rand generates large-enough
           // values.
           int r = rand();
-          p[r % count] += r;
+          sum += p[r % count] += r;
         }
       }
     } else {
       // Sequential scan through the memory region.
       for (unsigned int c = 0; c < config.nreps; c++) {
         for (size_t i = 0; i < count; i++) {
-          p[i] += rand();
+          sum += p[i] += rand();
         }
       }
     }
   } else {
     // Only read from the region.
-    int sum = 0;
     if (config.shuffle) {
       // Random access into the memory region.
       for (unsigned int c = 0; c < config.nreps; c++) {
@@ -138,12 +138,12 @@
         }
       }
     }
-    cout << sum << endl;
   }
 
-  // Print the elapsed time.
+  // Print the elapsed time and "result".
   cout << label << cpu;
   t.print();
+  cout << "result: " << sum;
 
   if (config.local) free(p);
 
@@ -174,6 +174,16 @@
     atoi(argv[8])
   };
 
+  cout << "config:"
+       << " ncores "  << config.ncores
+       << " size "    << config.size
+       << " nreps "   << config.nreps
+       << " shuffle " << config.shuffle
+       << " par "     << config.par
+       << " pin "     << config.pin
+       << " local "   << config.local
+       << " write "   << config.write << endl;
+
   checkmsg(RAND_MAX > config.size / sizeof(int), "PRNG range not large enough");
 
   void *p = malloc(config.size);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [407] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-13 17:53:35

Revision: 407
          http://assorted.svn.sourceforge.net/assorted/?rev=407&view=rev
Author:   yangzhang
Date:     2008-02-13 09:53:30 -0800 (Wed, 13 Feb 2008)

Log Message:
-----------
added cross-comm

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-13 17:53:16 UTC (rev 406)
+++ numa-bench/trunk/src/malloc.cc	2008-02-13 17:53:30 UTC (rev 407)
@@ -20,6 +20,7 @@
 // TODO: use real shuffling? or is rand ok?
 
 #include <cstdlib>
+#include <fstream>
 #include <iostream>
 
 #include <sched.h>
@@ -35,6 +36,8 @@
 using namespace commons;
 using namespace std;
 
+pthread_barrier_t cross_barrier;
+
 struct config
 {
   /**
@@ -79,25 +82,27 @@
    * Do writes, otherwise just do reads.
    */
   const bool write;
+
+  /**
+   * Test cross-communication (use partitions), otherwise use either the
+   * global/local buffer.
+   */
+  const bool cross;
 };
 
+void*** partitions;
+int global_sum;
+
 /**
- * \param pp The start of the buffer to chew.
- * \param cpu Which CPU to pin our thread to.
- * \param config The experiment configuration parameters.
+ * \param pp The buffer to chew.
+ * \param config The experiment configuration.
+ * \param len Length of the buffer in bytes.
  */
-void*
-chew(void* pp, unsigned int cpu, const config & config, const char* label)
+void
+chew1(void* pp, config config, size_t len)
 {
-  int* p = (int*) (config.local ? malloc(config.size) : pp);
-  const size_t count = config.size / sizeof(int);
-  timer t(": ");
-
-  // Pin this thread to cpu `cpu`.
-  if (config.pin) {
-    pin_thread(cpu);
-  }
-
+  int* p = (int*) pp;
+  const size_t count = len / sizeof(int);
   int sum = 0;
   if (config.write) {
     // Write to the region.
@@ -139,11 +144,44 @@
       }
     }
   }
+  global_sum += sum;
+}
 
+/**
+ * \param pp The start of the buffer to chew.
+ * \param cpu Which CPU to pin our thread to.
+ * \param config The experiment configuration parameters.
+ * \param warmup If true, skip the cross test and label the output "warmup: ".
+ */
+void*
+chew(void* pp, unsigned int cpu, const config & config, bool warmup)
+{
+  // Pin this thread to cpu `cpu`.
+  if (config.pin) {
+    pin_thread(cpu);
+  }
+
+  void* p = config.local ? malloc(config.size) : pp;
+  timer t(": ");
+
+  if (!warmup && config.cross) {
+    size_t len = config.size / config.ncores;
+    for (int i = 0; i < config.ncores; i++) {
+      partitions[cpu][i] = new char[len];
+    }
+    int barrier_result = pthread_barrier_wait(&cross_barrier);
+    check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || barrier_result == 0);
+    for (int i = 0; i < config.ncores; i++) {
+      chew1(partitions[i][cpu], config, len);
+    }
+  } else {
+    chew1(p, config, config.size);
+  }
+
   // Print the elapsed time and "result".
-  cout << label << cpu;
+  if (warmup) cout << "warmup: " << endl;
+  cout << cpu;
   t.print();
-  cout << "result: " << sum;
 
   if (config.local) free(p);
 
@@ -156,7 +194,7 @@
   // So that our global shared malloc takes place on the CPU 0's node.
   pin_thread(0);
 
-  if (argc < 9) {
+  if (argc < 10) {
     cerr << argv[0] <<
       " <ncores> <size> <nreps> <shuffle> <par> <pin> <local> <write>" << endl;
     return 1;
@@ -171,7 +209,8 @@
     atoi(argv[5]),
     atoi(argv[6]),
     atoi(argv[7]),
-    atoi(argv[8])
+    atoi(argv[8]),
+    atoi(argv[9])
   };
 
   cout << "config:"
@@ -182,24 +221,34 @@
        << " par "     << config.par
        << " pin "     << config.pin
        << " local "   << config.local
-       << " write "   << config.write << endl;
+       << " write "   << config.write
+       << " cross "   << config.cross << endl;
 
   checkmsg(RAND_MAX > config.size / sizeof(int), "PRNG range not large enough");
 
   void *p = malloc(config.size);
+  check(p != NULL);
 
+  if (config.cross) {
+    partitions = new void**[config.ncores];
+    for (unsigned int i  = 0; i < config.ncores; i++)
+      partitions[i] = new void*[config.ncores];
+  }
+
   // Warmup.
-  chew(p, 0, config, "warmup: ");
+  chew(p, 0, config, true);
 
   if (config.par) {
     // Chew the memory area from each core in parallel (and also chew own).
     pthread_t ts[config.ncores];
+    check(0 == pthread_barrier_init(&cross_barrier, NULL, config.ncores));
     for (int i = 0; i < config.ncores; i++) {
-      ts[i] = spawn(bind(chew, p, i, ref(config), ""));
+      ts[i] = spawn(bind(chew, p, i, ref(config), false));
     }
     for (int i = 0; i < config.ncores; i++) {
       check(pthread_join(ts[i], NULL) == 0);
     }
+    check(0 == pthread_barrier_destroy(&cross_barrier));
   } else {
     // Chew the memory area from each core in sequence.
     for (int i = 0; i < config.ncores; i++) {
@@ -208,6 +257,8 @@
   }
 
   free(p);
+  ofstream trash("/dev/null");
+  trash << "result: " << global_sum << endl;
 
   return 0;
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [409] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-13 18:04:06

Revision: 409
          http://assorted.svn.sourceforge.net/assorted/?rev=409&view=rev
Author:   yangzhang
Date:     2008-02-13 10:03:22 -0800 (Wed, 13 Feb 2008)

Log Message:
-----------
fixed warmup messages

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-13 18:03:09 UTC (rev 408)
+++ numa-bench/trunk/src/malloc.cc	2008-02-13 18:03:22 UTC (rev 409)
@@ -179,7 +179,7 @@
   }
 
   // Print the elapsed time and "result".
-  if (warmup) cout << "warmup: " << endl;
+  if (warmup) cout << "warmup: ";
   cout << cpu;
   t.print();
 
@@ -252,7 +252,7 @@
   } else {
     // Chew the memory area from each core in sequence.
     for (int i = 0; i < config.ncores; i++) {
-      chew(p, i, config, "");
+      chew(p, i, config, false);
     }
   }
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [419] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-15 01:44:38

Revision: 419
          http://assorted.svn.sourceforge.net/assorted/?rev=419&view=rev
Author:   yangzhang
Date:     2008-02-14 17:44:42 -0800 (Thu, 14 Feb 2008)

Log Message:
-----------
cleaned up content

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-15 01:44:23 UTC (rev 418)
+++ numa-bench/trunk/src/malloc.cc	2008-02-15 01:44:42 UTC (rev 419)
@@ -1,24 +1,3 @@
-// Questions this program answers:
-//
-// - Does malloc tend to allocate locally?
-//   - TODO!
-// - How much does working from another node affect throughput?
-//   - A bit: 647x from local, 649x from neighbor, 651x from remote
-// - Is there difference from repeatedly fetching the same (large) area n times
-//   vs. fetching an area n times larger?
-//   - No. The times are identical for 1GB*1 and 100MB*10.
-// - How much difference is there between sequential scan and random access?
-//   - Huge difference. Also magnifies the locality effects more.
-//   - 1700 from local, 1990 from one neighbor, 2020 from another neighbor,
-//     and 2310 from remote.
-// - What's the difference between reading and writing?
-//   - TODO!
-// - Can we observe prefetching's effects? (Random access but chew the full
-//   cache line of data.)
-//   - TODO!
-
-// TODO: use real shuffling? or is rand ok?
-
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
@@ -171,6 +150,8 @@
     }
     int barrier_result = pthread_barrier_wait(&cross_barrier);
     check(barrier_result == PTHREAD_BARRIER_SERIAL_THREAD || barrier_result == 0);
+    // TODO: make this more interesting than just a sequential traversal over
+    // the partitions.
     for (int i = 0; i < config.ncores; i++) {
       chew1(partitions[i][cpu], config, len);
     }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Assorted-commits] SF.net SVN: assorted: [512] numa-bench/trunk/src/malloc.cc

From: <yan...@us...> - 2008-02-26 19:41:20

Revision: 512
          http://assorted.svn.sourceforge.net/assorted/?rev=512&view=rev
Author:   yangzhang
Date:     2008-02-26 11:41:17 -0800 (Tue, 26 Feb 2008)

Log Message:
-----------
added custom rng

Modified Paths:
--------------
    numa-bench/trunk/src/malloc.cc

Modified: numa-bench/trunk/src/malloc.cc
===================================================================
--- numa-bench/trunk/src/malloc.cc	2008-02-26 19:41:06 UTC (rev 511)
+++ numa-bench/trunk/src/malloc.cc	2008-02-26 19:41:17 UTC (rev 512)
@@ -1,4 +1,3 @@
-#include <cstdlib>
 #include <fstream>
 #include <iostream>
 
@@ -7,6 +6,7 @@
 #include <boost/bind.hpp>
 
 #include <commons/check.h>
+#include <commons/rand.h>
 #include <commons/threads.h>
 #include <commons/time.h>
 #include <commons/boost/threads.h>
@@ -83,6 +83,7 @@
   int* p = (int*) pp;
   const size_t count = len / sizeof(int);
   int sum = 0;
+  posix_rand rand(current_time_millis() ^ gettid());
   if (config.write) {
     // Write to the region.
     if (config.shuffle) {


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.