From: <spa...@us...> - 2010-03-20 07:20:06
|
Revision: 13478 http://x10.svn.sourceforge.net/x10/?rev=13478&view=rev Author: sparksparkspark Date: 2010-03-20 07:19:59 +0000 (Sat, 20 Mar 2010) Log Message: ----------- WIP on outstanding CUDA issues Modified Paths: -------------- trunk/x10.compiler/src/x10cuda/visit/CUDACodeGenerator.java trunk/x10.runtime/src-cpp/x10aux/deserialization_dispatcher.h trunk/x10.runtime/src-cpp/x10aux/network.cc trunk/x10.runtime/x10rt/common/x10rt_cuda.cc Modified: trunk/x10.compiler/src/x10cuda/visit/CUDACodeGenerator.java =================================================================== --- trunk/x10.compiler/src/x10cuda/visit/CUDACodeGenerator.java 2010-03-20 05:45:37 UTC (rev 13477) +++ trunk/x10.compiler/src/x10cuda/visit/CUDACodeGenerator.java 2010-03-20 07:19:59 UTC (rev 13478) @@ -579,7 +579,7 @@ if (nodeHasCUDAAnnotation(block)) { - inc.write("static x10_ulong "+SharedVarsMethods.DESERIALIZE_CUDA_METHOD+"("+DESERIALIZATION_BUFFER+" &__buf, x10aux::place __gpu, size_t &__blocks, size_t &__threads, size_t &__shm) {"); + inc.write("static void "+SharedVarsMethods.DESERIALIZE_CUDA_METHOD+"("+DESERIALIZATION_BUFFER+" &__buf, x10aux::place __gpu, size_t &__blocks, size_t &__threads, size_t &__shm, size_t &argc, char *&argv, size_t &cmemc, char *&cmemv) {"); inc.newline(4); inc.begin(0); inc.write(make_ref(cnamet)+" __this = "+cnamet+"::"+DESERIALIZE_METHOD+"<"+cnamet+">(__buf);"); @@ -647,10 +647,16 @@ inc.write(";"); inc.newline(); } - inc.write("x10_ulong __remote_env = x10aux::remote_alloc(__gpu, sizeof(__env));"); inc.newline(); - inc.write("x10aux::cuda_put(__gpu, __remote_env, &__env, sizeof(__env));"); inc.newline(); - inc.write("return __remote_env;"); inc.end(); inc.newline(); - inc.write("}"); inc.newline(); inc.forceNewline(); + if (true) { + inc.write("x10_ulong __remote_env = x10aux::remote_alloc(__gpu, sizeof(__env));"); inc.newline(); + inc.write("x10aux::cuda_put(__gpu, __remote_env, &__env, sizeof(__env));"); inc.newline(); + inc.write("::memcpy(argv, &__remote_env, sizeof (void*));"); inc.newline(); + inc.write("argc = sizeof(void*);"); inc.end(); inc.newline(); + } else { + inc.write("memcpy(argv, __env, sizeof(__env));"); inc.newline(); + inc.write("argc = sizeof(__env);"); inc.end(); inc.newline(); + } + inc.write("}"); inc.newline(); inc.forceNewline(); } } @@ -660,14 +666,8 @@ } @Override - public void visit(ArrayInit_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override public void visit(Assert_c n) { - // TODO Auto-generated method stub + assert !generatingKernel() : "Throwing exceptions not allowed in @CUDA code."; super.visit(n); } @@ -685,7 +685,7 @@ @Override public void visit(Await_c n) { - // TODO Auto-generated method stub + assert !generatingKernel() : "Await not allowed in @CUDA code."; super.visit(n); } @@ -715,7 +715,7 @@ @Override public void visit(Catch_c n) { - // TODO Auto-generated method stub + assert !generatingKernel() : "Catching exceptions not allowed in @CUDA code."; super.visit(n); } @@ -726,14 +726,8 @@ } @Override - public void visit(ClassBody_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override public void visit(ClosureCall_c c) { - // TODO Auto-generated method stub + assert !generatingKernel() : "Closure calls not allowed in @CUDA code."; super.visit(c); } @@ -744,18 +738,6 @@ } @Override - public void visit(ConstantDistMaker_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override - public void visit(ConstructorDecl_c dec) { - // TODO Auto-generated method stub - super.visit(dec); - } - - @Override public void visit(Do_c n) { // TODO Auto-generated method stub super.visit(n); @@ -780,12 +762,6 @@ } @Override - public void visit(FieldDecl_c dec) { - // TODO Auto-generated method stub - super.visit(dec); - } - - @Override public void visit(FloatLit_c n) { // TODO Auto-generated method stub super.visit(n); @@ -822,12 +798,6 @@ } @Override - public void visit(Import_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override public void visit(Initializer_c n) { // TODO Auto-generated method stub super.visit(n); @@ -877,60 +847,12 @@ } @Override - public void visit(LocalClassDecl_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override - public void visit(LocalDecl_c dec) { - // TODO Auto-generated method stub - super.visit(dec); - } - - @Override - public void visit(MethodDecl_c dec) { - // TODO Auto-generated method stub - super.visit(dec); - } - - @Override - public void visit(Node n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override public void visit(NullLit_c n) { // TODO Auto-generated method stub super.visit(n); } @Override - public void visit(PackageNode_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override - public void visit(ParExpr_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override - public void visit(PropertyDecl_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override - public void visit(RegionMaker_c n) { - // TODO Auto-generated method stub - super.visit(n); - } - - @Override public void visit(Return_c ret) { // TODO Auto-generated method stub super.visit(ret); @@ -962,13 +884,13 @@ @Override public void visit(Throw_c n) { - // TODO Auto-generated method stub + assert !generatingKernel() : "Throwing exceptions not allowed in @CUDA code."; super.visit(n); } @Override public void visit(Try_c n) { - // TODO Auto-generated method stub + assert !generatingKernel() : "Catching exceptions not allowed in @CUDA code."; super.visit(n); } @@ -1004,7 +926,8 @@ @Override public void visit(X10Call_c n) { - // TODO Auto-generated method stub + // In fact they are allowed, as long as they are implemented with @Native + //assert !generatingKernel() : "Calling functions not allowed in @CUDA code."; super.visit(n); } @@ -1028,7 +951,7 @@ @Override public void visit(X10Instanceof_c n) { - // TODO Auto-generated method stub + assert !generatingKernel() : "Runtime types not available in @CUDA code."; super.visit(n); } Modified: trunk/x10.runtime/src-cpp/x10aux/deserialization_dispatcher.h =================================================================== --- trunk/x10.runtime/src-cpp/x10aux/deserialization_dispatcher.h 2010-03-20 05:45:37 UTC (rev 13477) +++ trunk/x10.runtime/src-cpp/x10aux/deserialization_dispatcher.h 2010-03-20 07:19:59 UTC (rev 13478) @@ -26,8 +26,8 @@ typedef ref<x10::lang::Object> (*Deserializer)(deserialization_buffer &buf); template<> inline const char *typeName<Deserializer>() { return "Deserializer"; } - typedef x10_ulong (*CUDAPre)(deserialization_buffer &buf, place p, - size_t &blocks, size_t &threads, size_t &shm); + typedef void (*CUDAPre)(deserialization_buffer &buf, place p, + size_t &blocks, size_t &threads, size_t &shm, size_t &argc, char *&argv, size_t &cmemc, char *&cmemv); template<> inline const char *typeName<CUDAPre>() { return "CUDAPre"; } typedef void *(*BufferFinder)(deserialization_buffer &buf, x10_int len); Modified: trunk/x10.runtime/src-cpp/x10aux/network.cc =================================================================== --- trunk/x10.runtime/src-cpp/x10aux/network.cc 2010-03-20 05:45:37 UTC (rev 13477) +++ trunk/x10.runtime/src-cpp/x10aux/network.cc 2010-03-20 07:19:59 UTC (rev 13478) @@ -294,16 +294,14 @@ // note: high bytes thrown away in implicit conversion serialization_id_t sid = x10aux::DeserializationDispatcher::getSerializationId(p->type); x10aux::CUDAPre pre = x10aux::DeserializationDispatcher::getCUDAPre(sid); - x10_ulong env = pre(buf, p->dest_place, *blocks, *threads, *shm); + pre(buf, p->dest_place, *blocks, *threads, *shm, *argc, *argv, *cmemc, *cmemv); assert(buf.consumed() <= p->len); - *argv = (char*)(size_t)env; - *argc = sizeof(void*); } static void cuda_post (const x10rt_msg_params *p, void *env) { _X_(ANSI_X10RT<<"Receiving a kernel post callback, deserialising..."<<ANSI_RESET); - remote_free(p->dest_place, (x10_ulong)(size_t)env); + //remote_free(p->dest_place, (x10_ulong)(size_t)env); x10aux::deserialization_buffer buf(static_cast<char*>(p->msg)); x10aux::ref<x10::lang::Reference> fs = buf.read<x10aux::ref<x10::lang::Reference> >(); x10aux::ref<x10::lang::Runtime> rt = x10::lang::PlaceLocalHandle_methods<x10aux::ref<x10::lang::Runtime> >::apply(x10::lang::Runtime::FMGL(runtime)); Modified: trunk/x10.runtime/x10rt/common/x10rt_cuda.cc =================================================================== --- trunk/x10.runtime/x10rt/common/x10rt_cuda.cc 2010-03-20 05:45:37 UTC (rev 13477) +++ trunk/x10.runtime/x10rt/common/x10rt_cuda.cc 2010-03-20 07:19:59 UTC (rev 13478) @@ -22,7 +22,7 @@ // TODO: fine grained synchronisation, lock free datastructures pthread_mutex_t big_lock_of_doom; - static inline void DEBUG(const char *fmt, ...) { + inline void DEBUG(const char *fmt, ...) { (void) fmt; va_list ap; va_start(ap, fmt); @@ -108,7 +108,7 @@ /* }}} */ - static size_t dma_slice_sz (void) { + size_t dma_slice_sz (void) { static size_t sz = 0; if (sz == 0) { const char *env_var = "X10RT_CUDA_DMA_SLICE"; @@ -275,6 +275,7 @@ void *pinned_mem2; void *front; void *back; + size_t commit; op_queue<x10rt_cuda_kernel> kernel_q; op_queue<x10rt_cuda_copy> dma_q; Table<x10rt_functions> cbs; @@ -433,7 +434,6 @@ #endif } - void *x10rt_cuda_device_alloc (x10rt_cuda_ctx *ctx, size_t len) { @@ -441,6 +441,8 @@ pthread_mutex_lock(&big_lock_of_doom); CU_SAFE(cuCtxPushCurrent(ctx->ctx)); CUdeviceptr ptr; + ctx->commit += len; + //fprintf(stderr,"CUDA committed memory: %llu bytes\n", (unsigned long long)ctx->commit); CU_SAFE(cuMemAlloc(&ptr, len)); CU_SAFE(cuCtxPopCurrent(NULL)); pthread_mutex_unlock(&big_lock_of_doom); @@ -684,7 +686,8 @@ CUfunction k = ctx->cbs[type].kernel_cbs.kernel; // y and z params we leave as 1, as threads can vary from 1 to 512 CU_SAFE(cuFuncSetBlockShape(k, kop->threads, 1, 1)); - CU_SAFE(cuParamSetv(k, 0, &kop->argv, kop->argc)); + //fprintf(stderr,"%p<<<%d,%d,%d>>> argc: %d argv: %p\n", (void*)k, kop->blocks, kop->threads, kop->shm, kop->argc, *(void**)kop->argv); + CU_SAFE(cuParamSetv(k, 0, &kop->argv[0], kop->argc)); CU_SAFE(cuParamSetSize(k, kop->argc)); CU_SAFE(cuFuncSetSharedSize(k, kop->shm)); CU_SAFE(cuLaunchGridAsync(k, kop->blocks, 1, ctx->kernel_q.stream)); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |