From: <gs...@us...> - 2010-08-13 18:25:21
|
Revision: 74 http://cudawrapper.svn.sourceforge.net/cudawrapper/?rev=74&view=rev Author: gshi Date: 2010-08-13 18:25:14 +0000 (Fri, 13 Aug 2010) Log Message: ----------- disable profiling by default; to enable it, set the environment variable CUDA_WRAPPER_PROFILE_ENABLE Modified Paths: -------------- trunk/Makefile trunk/cuda_wrapper.c Modified: trunk/Makefile =================================================================== --- trunk/Makefile 2010-05-26 18:29:58 UTC (rev 73) +++ trunk/Makefile 2010-08-13 18:25:14 UTC (rev 74) @@ -100,4 +100,3 @@ install -cD -m 700 wrapper_terminate /usr/local/cuda_wrapper/bin/wrapper_terminate install -cD wrapper_query /usr/local/cuda_wrapper/bin/wrapper_query install -cD cuda_memscrubber /usr/local/cuda_wrapper/bin/cuda_memscrubber - Modified: trunk/cuda_wrapper.c =================================================================== --- trunk/cuda_wrapper.c 2010-05-26 18:29:58 UTC (rev 73) +++ trunk/cuda_wrapper.c 2010-08-13 18:25:14 UTC (rev 74) @@ -84,6 +84,7 @@ static __host__ cudaError_t CUDARTAPI (*real_cudaChooseDevice)(int *device, const struct cudaDeviceProp *prop); static __host__ cudaError_t CUDARTAPI (*real_cudaSetValidDevices)(int *device, int n); static __host__ cudaError_t CUDARTAPI (*real_cudaLaunch)(const char* entry); +static __host__ cudaError_t CUDARTAPI (*real_cudaConfigureCall)(dim3, dim3, size_t, cudaStream_t); /* These event API are not wrappered but we need to resolve them so that we can use them * even if the user application is not linked to the cuda runtime library. 
@@ -110,6 +111,7 @@ static int cuda_exec = 0; extern int cuda_wrapper_magic_job_enabled; +static __thread cudaStream_t kernel_stream = (cudaStream_t)0; static __thread cudaEvent_t start_event, stop_event; static __thread double gpu_kernel_time =0; static __thread int cuda_exec_type=0; @@ -118,6 +120,7 @@ static pthread_mutex_t total_gpu_kernel_time_lock = PTHREAD_MUTEX_INITIALIZER; static int cuda_wrapper_num_affinity_disabled = 0; static int cuda_device_constrain_disabled = 0; +static int profile_enabled = 0; #define CUDA_GDB_ENV_STRING "__CUDA_GDB_RUNNING__" @@ -262,6 +265,10 @@ if (getenv("CUDA_WRAPPER_DEVICE_CONSTRAIN_DISABLED")){ cuda_device_constrain_disabled=1; } + if (getenv("CUDA_WRAPPER_PROFILE_ENABLE")){ + profile_enabled = 1; + } + /* Attach to the shared memory segment for this node */ wrapper_info = wrapper_attach_shmem_segment(); if (!real_dlopen){ @@ -284,6 +291,7 @@ real_cudaChooseDevice = get_symbol(cuda_hdl, "cudaChooseDevice"); real_cudaSetValidDevices = get_symbol(cuda_hdl, "cudaSetValidDevices"); real_cudaLaunch = get_symbol(cuda_hdl, "cudaLaunch"); + real_cudaConfigureCall = get_symbol(cuda_hdl, "cudaConfigureCall"); real_cudaEventCreate = get_symbol(cuda_hdl, "cudaEventCreate"); real_cudaEventDestroy = get_symbol(cuda_hdl, "cudaEventDestroy"); @@ -532,7 +540,7 @@ if(cuda_exec && wrapper_info ){ wrapper_info->pid = -1; - if (cuda_exec_type == CUDA_EXEC_RUNTIME){ + if (cuda_exec_type == CUDA_EXEC_RUNTIME && profile_enabled){ if (real_cudaEventQuery(stop_event) == cudaSuccess){ float tmp; if (real_cudaEventElapsedTime(&tmp, start_event, stop_event) == cudaSuccess){ @@ -1116,6 +1124,16 @@ } __host__ cudaError_t CUDARTAPI +cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, cudaStream_t stream) +{ + + kernel_stream = stream; + return real_cudaConfigureCall(gridDim, blockDim, sharedMem, stream); + +} + + +__host__ cudaError_t CUDARTAPI cudaLaunch(const char* entry) { @@ -1132,6 +1150,10 @@ wrapper_info->cuda_exec_type |= 
CUDA_EXEC_RUNTIME; cuda_exec_type |= CUDA_EXEC_RUNTIME; + if (!profile_enabled){ + return real_cudaLaunch(entry); + } + static __thread int first_time = 1; if (first_time){ real_cudaEventCreate(&start_event); @@ -1139,14 +1161,12 @@ } - //printf("%s is called in file %s\n", __FUNCTION__, __FILE__); - cudaError_t ret; if (first_time){ - real_cudaEventRecord(start_event, 0); + real_cudaEventRecord(start_event, kernel_stream); ret = (*real_cudaLaunch)(entry); - real_cudaEventRecord(stop_event, 0); + real_cudaEventRecord(stop_event, kernel_stream); }else{ if (real_cudaEventQuery(stop_event) == cudaSuccess){ float tmp; @@ -1158,9 +1178,9 @@ real_cudaEventCreate(&stop_event); gpu_kernel_time += 0.001*tmp; - real_cudaEventRecord(start_event, 0); + real_cudaEventRecord(start_event, kernel_stream); ret = (*real_cudaLaunch)(entry); - real_cudaEventRecord(stop_event, 0); + real_cudaEventRecord(stop_event, kernel_stream); }else{ @@ -1172,7 +1192,7 @@ real_cudaEventDestroy(stop_event); real_cudaEventCreate(&stop_event); - real_cudaEventRecord(stop_event,0); + real_cudaEventRecord(stop_event,kernel_stream); } } @@ -1224,7 +1244,7 @@ static void update_gpu_kernel_time() { - if(cuda_exec && wrapper_info && (cuda_exec_type == CUDA_EXEC_RUNTIME) ){ + if(cuda_exec && wrapper_info && (cuda_exec_type == CUDA_EXEC_RUNTIME) && profile_enabled ){ if (real_cudaEventQuery(stop_event) == cudaSuccess){ float tmp; real_cudaEventElapsedTime(&tmp, start_event, stop_event); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |