[X10-commits] SF.net SVN: x10:[12118] trunk

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 12118
          http://x10.svn.sourceforge.net/x10/?rev=12118&view=rev
Author:   sparksparkspark
Date:     2009-11-18 13:52:57 +0000 (Wed, 18 Nov 2009)

Log Message:
-----------
1) Ignore wget failures when building x10rt, so the build can proceed if a tarball is already present
2) Implement automatic detection of blocks/threads at the x10rt level

Modified Paths:
--------------
    trunk/x10.dist/samples/KMeansCUDA.x10
    trunk/x10.runtime/src-cpp/x10aux/network.cc
    trunk/x10.runtime/src-cpp/x10aux/network.h
    trunk/x10.runtime/src-cpp/x10rt/common/x10rt_cuda.cc
    trunk/x10.runtime/src-cpp/x10rt/common/x10rt_front.cc
    trunk/x10.runtime/src-cpp/x10rt/common/x10rt_logical.cc
    trunk/x10.runtime/src-cpp/x10rt/include/x10rt_cuda.h
    trunk/x10.runtime/src-cpp/x10rt/include/x10rt_front.h
    trunk/x10.runtime/src-cpp/x10rt/include/x10rt_logical.h
    trunk/x10.runtime/src-cpp/x10rt/pgas/pgas.mk

Modified: trunk/x10.dist/samples/KMeansCUDA.x10
===================================================================

--- trunk/x10.dist/samples/KMeansCUDA.x10	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.dist/samples/KMeansCUDA.x10	2009-11-18 13:52:57 UTC (rev 12118)
@@ -80,7 +80,7 @@
 
                         val kernel_start_time = System.currentTimeMillis();
                         // classify kernel
-                        val blocks = 8, threads = 64;
+                        val blocks = 4, threads = 256;
                         at (gpu) @CUDA {
                             for ((block) in 0..blocks-1) {
                                 val clustercache = Rail.make[Float](num_clusters*4, clusters_copy);

Modified: trunk/x10.runtime/src-cpp/x10aux/network.cc
===================================================================
--- trunk/x10.runtime/src-cpp/x10aux/network.cc	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10aux/network.cc	2009-11-18 13:52:57 UTC (rev 12118)
@@ -31,6 +31,64 @@
 volatile x10_long x10aux::serialized_bytes = 0;
 volatile x10_long x10aux::deserialized_bytes = 0;
 
+
+const int cuda_kernel_cfgs[] = {
+  /*1024*/ 8, 128,
+  /*1024*/ 4, 256,
+  /*1024*/ 2, 512,
+
+  /*960*/ 5, 192,
+  /*960*/ 3, 320,
+
+  /*896*/ 7, 128,
+  /*896*/ 2, 448,
+
+  /*768*/ 6, 128,
+  /*768*/ 4, 192,
+  /*768*/ 3, 256,
+  /*768*/ 2, 384,
+
+  /*640*/ 5, 128,
+  /*640*/ 2, 320,
+
+  /*576*/ 3, 192,
+
+  /*512*/ 8, 64,
+  /*512*/ 4, 128,
+  /*512*/ 2, 256,
+  /*512*/ 1, 512,
+
+  /*448*/ 7, 64,
+  /*448*/ 1, 448,
+
+  /*384*/ 6, 64,
+  /*384*/ 3, 128,
+  /*384*/ 2, 192,
+  /*384*/ 1, 384,
+
+  /*320*/ 5, 64,
+  /*320*/ 1, 320,
+
+  /*256*/ 4, 64,
+  /*256*/ 2, 128,
+  /*256*/ 1, 256,
+
+  /*192*/ 3, 64,
+  /*192*/ 1, 192,
+
+  /*128*/ 2, 64,
+  /*128*/ 1, 128,
+
+  /*64*/ 1, 64,
+
+  0 /* terminator */
+};
+
+void x10aux::blocks_threads (place p, msg_type t, int shm, int &bs, int &ts, const int *cfgs)
+{ x10rt_blocks_threads(p,t,shm,bs,ts,cfgs); }
+
+
+
 void *kernel_put_finder (const x10rt_msg_params &p, x10rt_copy_sz)
 {
     x10aux::deserialization_buffer buf(static_cast<char*>(p.msg));

Modified: trunk/x10.runtime/src-cpp/x10aux/network.h
===================================================================
--- trunk/x10.runtime/src-cpp/x10aux/network.h	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10aux/network.h	2009-11-18 13:52:57 UTC (rev 12118)
@@ -36,7 +36,10 @@
     inline x10_boolean is_cuda (place p)      { return x10rt_is_cuda(p); }
     inline void event_probe (void)            { x10rt_probe(); }
 
+    extern const int cuda_cfgs[];
+    void blocks_threads (place p, msg_type t, int shm, int &bs, int &ts, const int *cfgs=cuda_cfgs);
 
+
     inline x10_ulong remote_alloc (place p, size_t sz) {
         _X_(ANSI_BOLD<<ANSI_X10RT<<"Remote alloc: "<<ANSI_RESET
             <<"size "<<sz<<" to place: "<<p);

Modified: trunk/x10.runtime/src-cpp/x10rt/common/x10rt_cuda.cc
===================================================================
--- trunk/x10.runtime/src-cpp/x10rt/common/x10rt_cuda.cc	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10rt/common/x10rt_cuda.cc	2009-11-18 13:52:57 UTC (rev 12118)
@@ -238,8 +238,9 @@
         };
     };
 
-    static void ensure_initialized (void)
+    void ensure_initialized (void)
     {
+        // only do once per process
         static int done = 0;
         if (!done) {
             CU_SAFE(cuInit(0));
@@ -247,6 +248,16 @@
         }
     }
 
+    bool stream_ready (CUstream s)
+    {
+        CUresult r = cuStreamQuery(s);
+        if (r==CUDA_ERROR_NOT_READY) return false;
+        CU_SAFE(r);
+        return true;
+    }
+
+    int round_up (int x, int y) { return (x + (y-1)) / y * y; }
+
 }
 
 struct x10rt_cuda_ctx {
@@ -509,20 +520,20 @@
     pthread_mutex_lock(&big_lock_of_doom);
 
     if (ctx->cbs.arrc <= p.type) {
-        fprintf(stderr,"X10RT: Kernel %llu is invalid.\n", (unsigned long long)p.type);
+        fprintf(stderr,"X10RT: async %lu is invalid.\n", (unsigned long)p.type);
         abort();
     }
-    if (ctx->cbs[p.type].kernel_cbs.pre == NULL) {
-        fprintf(stderr,"X10RT: Kernel %llu has no 'pre' registered.\n", (unsigned long long)p.type);
-        abort();
-    }
     if (ctx->cbs[p.type].kernel_cbs.kernel == NULL) {
-        fprintf(stderr,"X10RT: Kernel %llu has no kernel registered.\n",(unsigned long long)p.type);
+        fprintf(stderr,"X10RT: async %lu is not a CUDA kernel.\n",(unsigned long)p.type);
 
         abort();
     }
+    if (ctx->cbs[p.type].kernel_cbs.pre == NULL) {
+        fprintf(stderr,"X10RT: CUDA Kernel %lu has no 'pre' registered.\n", (unsigned long)p.type);
+        abort();
+    }
     if (ctx->cbs[p.type].kernel_cbs.post == NULL) {
-        fprintf(stderr,"X10RT: Kernel %llu has no 'post' registered.\n",(unsigned long long)p.type);
+        fprintf(stderr,"X10RT: CUDA Kernel %lu has no 'post' registered.\n",(unsigned long)p.type);
         abort();
     }
 
@@ -539,15 +550,69 @@
 }
 
 
+
+void x10rt_cuda_blocks_threads (x10rt_cuda_ctx *ctx, x10rt_msg_type type, int dyn_shm,
+                                int &blocks, int &threads, const int *cfg)
+{
 #ifdef ENABLE_CUDA
-static bool stream_ready (CUstream s)
-{
-    CUresult r = cuStreamQuery(s);
-    if (r==CUDA_ERROR_NOT_READY) return false;
-    CU_SAFE(r);
-    return true;
+    if (ctx->cbs.arrc <= type) {
+        fprintf(stderr,"X10RT: async %lu is invalid.\n", (unsigned long)type);
+        abort();
+    }
+    if (ctx->cbs[type].kernel_cbs.kernel == NULL) {
+        fprintf(stderr,"X10RT: async %lu is not a CUDA kernel.\n",(unsigned long)type);
+        abort();
+    }
+    CUfunction k = ctx->cbs[type].kernel_cbs.kernel;
+
+    pthread_mutex_lock(&big_lock_of_doom);
+    CU_SAFE(cuCtxPushCurrent(ctx->ctx));
+
+    int mps, max_shm;
+    CU_SAFE(cuDeviceGetAttribute(&mps, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, ctx->hw));
+    CU_SAFE(cuDeviceGetAttribute(&max_shm,CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,ctx->hw));
+
+    CUdevprop prop;
+    CU_SAFE(cuDeviceGetProperties(&prop, ctx->hw));
+    int max_regs = prop.regsPerBlock;
+
+    int major, minor;
+    CU_SAFE(cuDeviceComputeCapability(&major, &minor, ctx->hw));
+
+
+    int static_shm, regs;
+    CU_SAFE(cuFuncGetAttribute(&static_shm, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, k));
+    CU_SAFE(cuFuncGetAttribute(&regs, CU_FUNC_ATTRIBUTE_NUM_REGS, k));
+
+    CU_SAFE(cuCtxPopCurrent(NULL));
+    pthread_mutex_unlock(&big_lock_of_doom);
+
+    // round up to 512 bytes (the granularity of shm allocation)
+    int shm = round_up(dyn_shm + static_shm, 512);
+
+    int alloc_size = (minor>=2) ? 512 : 256;
+    int max_threads = (minor>=2) ? 1024 : 768;
+
+    while (*cfg) {
+        int b = *(cfg++);
+        int t = *(cfg++);
+        if (b*shm > max_shm) continue;
+        if (t*b > max_threads) continue;
+        int block_regs = round_up(regs*round_up(t,64), alloc_size);
+        if (b*block_regs > max_regs) continue;
+        blocks = b * mps;
+        threads = t;
+        return;
+    }
+
+    blocks = 0;
+    threads = 0;
+
+#else
+    (void)ctx; (void)msg_type; (void)dyn_shm; (void)blocks; (void)threads; (void)cfg;
+    abort();
+#endif
 }
-#endif
 
 
 void x10rt_cuda_probe (x10rt_cuda_ctx *ctx)

Modified: trunk/x10.runtime/src-cpp/x10rt/common/x10rt_front.cc
===================================================================
--- trunk/x10.runtime/src-cpp/x10rt/common/x10rt_front.cc	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10rt/common/x10rt_front.cc	2009-11-18 13:52:57 UTC (rev 12118)
@@ -96,6 +96,11 @@
 { x10rt_lgl_remote_op_fence(); }
 
 
+void x10rt_blocks_threads (x10rt_place d, x10rt_msg_type type, int dyn_shm,
+                           int &blocks, int &threads, const int *cfg)
+{ x10rt_lgl_blocks_threads (d, type, dyn_shm, blocks, threads, cfg); }
+
+
 void x10rt_probe (void)
 { x10rt_lgl_probe(); }
 

Modified: trunk/x10.runtime/src-cpp/x10rt/common/x10rt_logical.cc
===================================================================
--- trunk/x10.runtime/src-cpp/x10rt/common/x10rt_logical.cc	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10rt/common/x10rt_logical.cc	2009-11-18 13:52:57 UTC (rev 12118)
@@ -705,6 +705,36 @@
     x10rt_net_remote_op_fence();
 }
 
+void x10rt_lgl_blocks_threads (x10rt_place d, x10rt_msg_type type, int dyn_shm,
+                               int &blocks, int &threads, const int *cfg)
+{
+    assert(d < x10rt_lgl_nplaces());
+
+    if (d < x10rt_lgl_nhosts()) {
+        blocks = 8; threads = 1;
+    } else if (x10rt_lgl_parent(d) == x10rt_lgl_here()) {
+        // local accelerator
+        switch (x10rt_lgl_type(d)) {
+            case X10RT_LGL_CUDA: {
+                x10rt_cuda_ctx *cctx = static_cast<x10rt_cuda_ctx*>(g.accel_ctxs[g.index[d]]);
+                x10rt_cuda_blocks_threads(cctx, type, dyn_shm, blocks, threads, cfg);
+            } break;
+            case X10RT_LGL_SPE: {
+                blocks = 8; threads = 1;
+            } break;
+            default: {
+                fprintf(stderr,"Place %lu has invalid type %d in remote_op_xor.\n",
+                               d, x10rt_lgl_type(d));
+                abort();
+            }
+        }
+    } else {
+        fprintf(stderr,"Routing of remote ops still unsupported.\n");
+        abort();
+    }
+}
+
+
 void x10rt_lgl_probe (void)
 {
     x10rt_net_probe();

Modified: trunk/x10.runtime/src-cpp/x10rt/include/x10rt_cuda.h
===================================================================
--- trunk/x10.runtime/src-cpp/x10rt/include/x10rt_cuda.h	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10rt/include/x10rt_cuda.h	2009-11-18 13:52:57 UTC (rev 12118)
@@ -28,6 +28,9 @@
 
 void x10rt_cuda_send_put (x10rt_cuda_ctx *ctx, x10rt_msg_params &, void *buf, x10rt_copy_sz len);
 
+void x10rt_cuda_blocks_threads (x10rt_cuda_ctx *ctx, x10rt_msg_type type, int dyn_shm,
+                                int &blocks, int &threads, const int *cfg);
+
 void *x10rt_cuda_device_alloc (x10rt_cuda_ctx *ctx, size_t sz);
 void x10rt_cuda_device_free (x10rt_cuda_ctx *ctx, void *ptr);
 

Modified: trunk/x10.runtime/src-cpp/x10rt/include/x10rt_front.h
===================================================================
--- trunk/x10.runtime/src-cpp/x10rt/include/x10rt_front.h	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10rt/include/x10rt_front.h	2009-11-18 13:52:57 UTC (rev 12118)
@@ -43,6 +43,9 @@
 
 void x10rt_remote_op_fence (void);
 
+void x10rt_blocks_threads (x10rt_place d, x10rt_msg_type type, int dyn_shm,
+                           int &blocks, int &threads, const int *cfg);
+
 void x10rt_probe (void);
 
 void x10rt_finalize (void); 

Modified: trunk/x10.runtime/src-cpp/x10rt/include/x10rt_logical.h
===================================================================
--- trunk/x10.runtime/src-cpp/x10rt/include/x10rt_logical.h	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10rt/include/x10rt_logical.h	2009-11-18 13:52:57 UTC (rev 12118)
@@ -58,6 +58,9 @@
 
 void x10rt_lgl_remote_op_fence (void);
 
+void x10rt_lgl_blocks_threads (x10rt_place d, x10rt_msg_type type, int dyn_shm,
+                               int &blocks, int &threads, const int *cfg);
+
 void x10rt_lgl_probe (void);
 
 void x10rt_lgl_finalize (void); 

Modified: trunk/x10.runtime/src-cpp/x10rt/pgas/pgas.mk
===================================================================
--- trunk/x10.runtime/src-cpp/x10rt/pgas/pgas.mk	2009-11-18 10:41:46 UTC (rev 12117)
+++ trunk/x10.runtime/src-cpp/x10rt/pgas/pgas.mk	2009-11-18 13:52:57 UTC (rev 12118)
@@ -101,7 +101,7 @@
 	$(AR) $(ARFLAGS) $@ $(COMMON_OBJS)
 else
 $(SOCKETS_TGZ).phony:
-	$(WGET) -N  "http://dist.codehaus.org/x10/binaryReleases/svn head/$(SOCKETS_TGZ)"
+	-$(WGET) -N  "http://dist.codehaus.org/x10/binaryReleases/svn head/$(SOCKETS_TGZ)"
 
 $(SOCKETS_TGZ): $(SOCKETS_TGZ).phony
 
@@ -142,7 +142,7 @@
 	$(AR) $(ARFLAGS) $@ $(COMMON_OBJS)
 else
 $(LAPI_TGZ).phony:
-	$(WGET) -N  "http://dist.codehaus.org/x10/binaryReleases/svn head/$(LAPI_TGZ)"
+	-$(WGET) -N  "http://dist.codehaus.org/x10/binaryReleases/svn head/$(LAPI_TGZ)"
 
 $(LAPI_TGZ): $(LAPI_TGZ).phony
 
@@ -179,7 +179,7 @@
 	$(AR) $(ARFLAGS) $@ $(COMMON_OBJS)
 else
 $(BGP_TGZ).phony:
-	$(WGET) -N  "http://dist.codehaus.org/x10/binaryReleases/svn head/$(BGP_TGZ)"
+	-$(WGET) -N  "http://dist.codehaus.org/x10/binaryReleases/svn head/$(BGP_TGZ)"
 
 $(BGP_TGZ): $(BGP_TGZ).phony
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[X10-commits] SF.net SVN: x10:[12118] trunk

Performance and Productivity at Scale

[X10-commits] SF.net SVN: x10:[12118] trunk