From: <spa...@us...> - 2009-11-18 13:53:06
|
Revision: 12118 http://x10.svn.sourceforge.net/x10/?rev=12118&view=rev Author: sparksparkspark Date: 2009-11-18 13:52:57 +0000 (Wed, 18 Nov 2009) Log Message: ----------- 1) Ignore wget failure when building x10rt if there is a tar already present 2) Implement automatic blocks/threads detection at x10rt level Modified Paths: -------------- trunk/x10.dist/samples/KMeansCUDA.x10 trunk/x10.runtime/src-cpp/x10aux/network.cc trunk/x10.runtime/src-cpp/x10aux/network.h trunk/x10.runtime/src-cpp/x10rt/common/x10rt_cuda.cc trunk/x10.runtime/src-cpp/x10rt/common/x10rt_front.cc trunk/x10.runtime/src-cpp/x10rt/common/x10rt_logical.cc trunk/x10.runtime/src-cpp/x10rt/include/x10rt_cuda.h trunk/x10.runtime/src-cpp/x10rt/include/x10rt_front.h trunk/x10.runtime/src-cpp/x10rt/include/x10rt_logical.h trunk/x10.runtime/src-cpp/x10rt/pgas/pgas.mk Modified: trunk/x10.dist/samples/KMeansCUDA.x10 =================================================================== --- trunk/x10.dist/samples/KMeansCUDA.x10 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.dist/samples/KMeansCUDA.x10 2009-11-18 13:52:57 UTC (rev 12118) @@ -80,7 +80,7 @@ val kernel_start_time = System.currentTimeMillis(); // classify kernel - val blocks = 8, threads = 64; + val blocks = 4, threads = 256; at (gpu) @CUDA { for ((block) in 0..blocks-1) { val clustercache = Rail.make[Float](num_clusters*4, clusters_copy); Modified: trunk/x10.runtime/src-cpp/x10aux/network.cc =================================================================== --- trunk/x10.runtime/src-cpp/x10aux/network.cc 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10aux/network.cc 2009-11-18 13:52:57 UTC (rev 12118) @@ -31,6 +31,64 @@ volatile x10_long x10aux::serialized_bytes = 0; volatile x10_long x10aux::deserialized_bytes = 0; + +const int cuda_kernel_cfgs[] = { + /*1024*/ 8, 128, + /*1024*/ 4, 256, + /*1024*/ 2, 512, + + /*960*/ 5, 192, + /*960*/ 3, 320, + + /*896*/ 7, 128, + /*896*/ 2, 448, + + /*768*/ 6, 128, + /*768*/ 4, 192, + 
/*768*/ 3, 256, + /*768*/ 2, 384, + + /*640*/ 5, 128, + /*640*/ 2, 320, + + /*576*/ 3, 192, + + /*512*/ 8, 64, + /*512*/ 4, 128, + /*512*/ 2, 256, + /*512*/ 1, 512, + + /*448*/ 7, 64, + /*448*/ 1, 448, + + /*384*/ 6, 64, + /*384*/ 3, 128, + /*384*/ 2, 192, + /*384*/ 1, 384, + + /*320*/ 5, 64, + /*320*/ 1, 320, + + /*256*/ 4, 64, + /*256*/ 2, 128, + /*256*/ 1, 256, + + /*192*/ 3, 64, + /*192*/ 1, 192, + + /*128*/ 2, 64, + /*128*/ 1, 128, + + /*64*/ 1, 64, + + 0 /* terminator */ +}; + +void x10aux::blocks_threads (place p, msg_type t, int shm, int &bs, int &ts, const int *cfgs) +{ x10rt_blocks_threads(p,t,shm,bs,ts,cfgs); } + + + void *kernel_put_finder (const x10rt_msg_params &p, x10rt_copy_sz) { x10aux::deserialization_buffer buf(static_cast<char*>(p.msg)); Modified: trunk/x10.runtime/src-cpp/x10aux/network.h =================================================================== --- trunk/x10.runtime/src-cpp/x10aux/network.h 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10aux/network.h 2009-11-18 13:52:57 UTC (rev 12118) @@ -36,7 +36,10 @@ inline x10_boolean is_cuda (place p) { return x10rt_is_cuda(p); } inline void event_probe (void) { x10rt_probe(); } + extern const int cuda_cfgs[]; + void blocks_threads (place p, msg_type t, int shm, int &bs, int &ts, const int *cfgs=cuda_cfgs); + inline x10_ulong remote_alloc (place p, size_t sz) { _X_(ANSI_BOLD<<ANSI_X10RT<<"Remote alloc: "<<ANSI_RESET <<"size "<<sz<<" to place: "<<p); Modified: trunk/x10.runtime/src-cpp/x10rt/common/x10rt_cuda.cc =================================================================== --- trunk/x10.runtime/src-cpp/x10rt/common/x10rt_cuda.cc 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10rt/common/x10rt_cuda.cc 2009-11-18 13:52:57 UTC (rev 12118) @@ -238,8 +238,9 @@ }; }; - static void ensure_initialized (void) + void ensure_initialized (void) { + // only do once per process static int done = 0; if (!done) { CU_SAFE(cuInit(0)); @@ -247,6 +248,16 @@ } } + bool 
stream_ready (CUstream s) + { + CUresult r = cuStreamQuery(s); + if (r==CUDA_ERROR_NOT_READY) return false; + CU_SAFE(r); + return true; + } + + int round_up (int x, int y) { return (x + (y-1)) / y * y; } + } struct x10rt_cuda_ctx { @@ -509,20 +520,20 @@ pthread_mutex_lock(&big_lock_of_doom); if (ctx->cbs.arrc <= p.type) { - fprintf(stderr,"X10RT: Kernel %llu is invalid.\n", (unsigned long long)p.type); + fprintf(stderr,"X10RT: async %lu is invalid.\n", (unsigned long)p.type); abort(); } - if (ctx->cbs[p.type].kernel_cbs.pre == NULL) { - fprintf(stderr,"X10RT: Kernel %llu has no 'pre' registered.\n", (unsigned long long)p.type); - abort(); - } if (ctx->cbs[p.type].kernel_cbs.kernel == NULL) { - fprintf(stderr,"X10RT: Kernel %llu has no kernel registered.\n",(unsigned long long)p.type); + fprintf(stderr,"X10RT: async %lu is not a CUDA kernel.\n",(unsigned long)p.type); abort(); } + if (ctx->cbs[p.type].kernel_cbs.pre == NULL) { + fprintf(stderr,"X10RT: CUDA Kernel %lu has no 'pre' registered.\n", (unsigned long)p.type); + abort(); + } if (ctx->cbs[p.type].kernel_cbs.post == NULL) { - fprintf(stderr,"X10RT: Kernel %llu has no 'post' registered.\n",(unsigned long long)p.type); + fprintf(stderr,"X10RT: CUDA Kernel %lu has no 'post' registered.\n",(unsigned long)p.type); abort(); } @@ -539,15 +550,69 @@ } + +void x10rt_cuda_blocks_threads (x10rt_cuda_ctx *ctx, x10rt_msg_type type, int dyn_shm, + int &blocks, int &threads, const int *cfg) +{ #ifdef ENABLE_CUDA -static bool stream_ready (CUstream s) -{ - CUresult r = cuStreamQuery(s); - if (r==CUDA_ERROR_NOT_READY) return false; - CU_SAFE(r); - return true; + if (ctx->cbs.arrc <= type) { + fprintf(stderr,"X10RT: async %lu is invalid.\n", (unsigned long)type); + abort(); + } + if (ctx->cbs[type].kernel_cbs.kernel == NULL) { + fprintf(stderr,"X10RT: async %lu is not a CUDA kernel.\n",(unsigned long)type); + abort(); + } + CUfunction k = ctx->cbs[type].kernel_cbs.kernel; + + pthread_mutex_lock(&big_lock_of_doom); + 
CU_SAFE(cuCtxPushCurrent(ctx->ctx)); + + int mps, max_shm; + CU_SAFE(cuDeviceGetAttribute(&mps, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, ctx->hw)); + CU_SAFE(cuDeviceGetAttribute(&max_shm,CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,ctx->hw)); + + CUdevprop prop; + CU_SAFE(cuDeviceGetProperties(&prop, ctx->hw)); + int max_regs = prop.regsPerBlock; + + int major, minor; + CU_SAFE(cuDeviceComputeCapability(&major, &minor, ctx->hw)); + + + int static_shm, regs; + CU_SAFE(cuFuncGetAttribute(&static_shm, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, k)); + CU_SAFE(cuFuncGetAttribute(&regs, CU_FUNC_ATTRIBUTE_NUM_REGS, k)); + + CU_SAFE(cuCtxPopCurrent(NULL)); + pthread_mutex_unlock(&big_lock_of_doom); + + // round up to 512 bytes (the granularity of shm allocation) + int shm = round_up(dyn_shm + static_shm, 512); + + int alloc_size = (minor>=2) ? 512 : 256; + int max_threads = (minor>=2) ? 1024 : 768; + + while (*cfg) { + int b = *(cfg++); + int t = *(cfg++); + if (b*shm > max_shm) continue; + if (t*b > max_threads) continue; + int block_regs = round_up(regs*round_up(t,64), alloc_size); + if (b*block_regs > max_regs) continue; + blocks = b * mps; + threads = t; + return; + } + + blocks = 0; + threads = 0; + +#else + (void)ctx; (void)msg_type; (void)dyn_shm; (void)blocks; (void)threads; (void)cfg; + abort(); +#endif } -#endif void x10rt_cuda_probe (x10rt_cuda_ctx *ctx) Modified: trunk/x10.runtime/src-cpp/x10rt/common/x10rt_front.cc =================================================================== --- trunk/x10.runtime/src-cpp/x10rt/common/x10rt_front.cc 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10rt/common/x10rt_front.cc 2009-11-18 13:52:57 UTC (rev 12118) @@ -96,6 +96,11 @@ { x10rt_lgl_remote_op_fence(); } +void x10rt_blocks_threads (x10rt_place d, x10rt_msg_type type, int dyn_shm, + int &blocks, int &threads, const int *cfg) +{ x10rt_lgl_blocks_threads (d, type, dyn_shm, blocks, threads, cfg); } + + void x10rt_probe (void) { x10rt_lgl_probe(); } 
Modified: trunk/x10.runtime/src-cpp/x10rt/common/x10rt_logical.cc =================================================================== --- trunk/x10.runtime/src-cpp/x10rt/common/x10rt_logical.cc 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10rt/common/x10rt_logical.cc 2009-11-18 13:52:57 UTC (rev 12118) @@ -705,6 +705,36 @@ x10rt_net_remote_op_fence(); } +void x10rt_lgl_blocks_threads (x10rt_place d, x10rt_msg_type type, int dyn_shm, + int &blocks, int &threads, const int *cfg) +{ + assert(d < x10rt_lgl_nplaces()); + + if (d < x10rt_lgl_nhosts()) { + blocks = 8; threads = 1; + } else if (x10rt_lgl_parent(d) == x10rt_lgl_here()) { + // local accelerator + switch (x10rt_lgl_type(d)) { + case X10RT_LGL_CUDA: { + x10rt_cuda_ctx *cctx = static_cast<x10rt_cuda_ctx*>(g.accel_ctxs[g.index[d]]); + x10rt_cuda_blocks_threads(cctx, type, dyn_shm, blocks, threads, cfg); + } break; + case X10RT_LGL_SPE: { + blocks = 8; threads = 1; + } break; + default: { + fprintf(stderr,"Place %lu has invalid type %d in remote_op_xor.\n", + d, x10rt_lgl_type(d)); + abort(); + } + } + } else { + fprintf(stderr,"Routing of remote ops still unsupported.\n"); + abort(); + } +} + + void x10rt_lgl_probe (void) { x10rt_net_probe(); Modified: trunk/x10.runtime/src-cpp/x10rt/include/x10rt_cuda.h =================================================================== --- trunk/x10.runtime/src-cpp/x10rt/include/x10rt_cuda.h 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10rt/include/x10rt_cuda.h 2009-11-18 13:52:57 UTC (rev 12118) @@ -28,6 +28,9 @@ void x10rt_cuda_send_put (x10rt_cuda_ctx *ctx, x10rt_msg_params &, void *buf, x10rt_copy_sz len); +void x10rt_cuda_blocks_threads (x10rt_cuda_ctx *ctx, x10rt_msg_type type, int dyn_shm, + int &blocks, int &threads, const int *cfg); + void *x10rt_cuda_device_alloc (x10rt_cuda_ctx *ctx, size_t sz); void x10rt_cuda_device_free (x10rt_cuda_ctx *ctx, void *ptr); Modified: trunk/x10.runtime/src-cpp/x10rt/include/x10rt_front.h 
=================================================================== --- trunk/x10.runtime/src-cpp/x10rt/include/x10rt_front.h 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10rt/include/x10rt_front.h 2009-11-18 13:52:57 UTC (rev 12118) @@ -43,6 +43,9 @@ void x10rt_remote_op_fence (void); +void x10rt_blocks_threads (x10rt_place d, x10rt_msg_type type, int dyn_shm, + int &blocks, int &threads, const int *cfg); + void x10rt_probe (void); void x10rt_finalize (void); Modified: trunk/x10.runtime/src-cpp/x10rt/include/x10rt_logical.h =================================================================== --- trunk/x10.runtime/src-cpp/x10rt/include/x10rt_logical.h 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10rt/include/x10rt_logical.h 2009-11-18 13:52:57 UTC (rev 12118) @@ -58,6 +58,9 @@ void x10rt_lgl_remote_op_fence (void); +void x10rt_lgl_blocks_threads (x10rt_place d, x10rt_msg_type type, int dyn_shm, + int &blocks, int &threads, const int *cfg); + void x10rt_lgl_probe (void); void x10rt_lgl_finalize (void); Modified: trunk/x10.runtime/src-cpp/x10rt/pgas/pgas.mk =================================================================== --- trunk/x10.runtime/src-cpp/x10rt/pgas/pgas.mk 2009-11-18 10:41:46 UTC (rev 12117) +++ trunk/x10.runtime/src-cpp/x10rt/pgas/pgas.mk 2009-11-18 13:52:57 UTC (rev 12118) @@ -101,7 +101,7 @@ $(AR) $(ARFLAGS) $@ $(COMMON_OBJS) else $(SOCKETS_TGZ).phony: - $(WGET) -N "http://dist.codehaus.org/x10/binaryReleases/svn head/$(SOCKETS_TGZ)" + -$(WGET) -N "http://dist.codehaus.org/x10/binaryReleases/svn head/$(SOCKETS_TGZ)" $(SOCKETS_TGZ): $(SOCKETS_TGZ).phony @@ -142,7 +142,7 @@ $(AR) $(ARFLAGS) $@ $(COMMON_OBJS) else $(LAPI_TGZ).phony: - $(WGET) -N "http://dist.codehaus.org/x10/binaryReleases/svn head/$(LAPI_TGZ)" + -$(WGET) -N "http://dist.codehaus.org/x10/binaryReleases/svn head/$(LAPI_TGZ)" $(LAPI_TGZ): $(LAPI_TGZ).phony @@ -179,7 +179,7 @@ $(AR) $(ARFLAGS) $@ $(COMMON_OBJS) else $(BGP_TGZ).phony: - $(WGET) 
-N "http://dist.codehaus.org/x10/binaryReleases/svn head/$(BGP_TGZ)" + -$(WGET) -N "http://dist.codehaus.org/x10/binaryReleases/svn head/$(BGP_TGZ)" $(BGP_TGZ): $(BGP_TGZ).phony This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |