[Cudawrapper-svn] SF.net SVN: cudawrapper:[74] trunk

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 74
          http://cudawrapper.svn.sourceforge.net/cudawrapper/?rev=74&view=rev
Author:   gshi
Date:     2010-08-13 18:25:14 +0000 (Fri, 13 Aug 2010)

Log Message:
-----------
disable profiling by default
to enable it, set the environment variable CUDA_WRAPPER_PROFILE_ENABLE

Modified Paths:
--------------
    trunk/Makefile
    trunk/cuda_wrapper.c

Modified: trunk/Makefile
===================================================================

--- trunk/Makefile	2010-05-26 18:29:58 UTC (rev 73)
+++ trunk/Makefile	2010-08-13 18:25:14 UTC (rev 74)
@@ -100,4 +100,3 @@
 	install -cD -m 700 wrapper_terminate /usr/local/cuda_wrapper/bin/wrapper_terminate
 	install -cD wrapper_query /usr/local/cuda_wrapper/bin/wrapper_query
 	install -cD cuda_memscrubber /usr/local/cuda_wrapper/bin/cuda_memscrubber
-

Modified: trunk/cuda_wrapper.c
===================================================================
--- trunk/cuda_wrapper.c	2010-05-26 18:29:58 UTC (rev 73)
+++ trunk/cuda_wrapper.c	2010-08-13 18:25:14 UTC (rev 74)
@@ -84,6 +84,7 @@
 static __host__ cudaError_t CUDARTAPI (*real_cudaChooseDevice)(int *device, const struct cudaDeviceProp *prop);
 static __host__ cudaError_t CUDARTAPI (*real_cudaSetValidDevices)(int *device,  int n);
 static __host__ cudaError_t CUDARTAPI (*real_cudaLaunch)(const char* entry);
+static __host__ cudaError_t CUDARTAPI (*real_cudaConfigureCall)(dim3, dim3, size_t, cudaStream_t);
 
 /* These event API are not wrappered but we need to resolve them so that we can use them
  * even if the user application is not linked to the cuda runtime library.
@@ -110,6 +111,7 @@
 static int cuda_exec = 0;
 extern int cuda_wrapper_magic_job_enabled;
 
+static __thread cudaStream_t kernel_stream = (cudaStream_t)0;
 static __thread cudaEvent_t start_event, stop_event;
 static __thread double gpu_kernel_time =0;
 static __thread int cuda_exec_type=0;
@@ -118,6 +120,7 @@
 static pthread_mutex_t total_gpu_kernel_time_lock = PTHREAD_MUTEX_INITIALIZER;
 static int cuda_wrapper_num_affinity_disabled = 0;
 static int cuda_device_constrain_disabled = 0;
+static int profile_enabled = 0;
 
 #define CUDA_GDB_ENV_STRING "__CUDA_GDB_RUNNING__"
 
@@ -262,6 +265,10 @@
     if (getenv("CUDA_WRAPPER_DEVICE_CONSTRAIN_DISABLED")){
       cuda_device_constrain_disabled=1;
     }
+    if (getenv("CUDA_WRAPPER_PROFILE_ENABLE")){
+	profile_enabled = 1;
+    }
+
     /* Attach to the shared memory segment for this node */
     wrapper_info = wrapper_attach_shmem_segment();
     if (!real_dlopen){
@@ -284,6 +291,7 @@
     real_cudaChooseDevice	 = get_symbol(cuda_hdl, "cudaChooseDevice");
     real_cudaSetValidDevices = get_symbol(cuda_hdl, "cudaSetValidDevices");
     real_cudaLaunch		 = get_symbol(cuda_hdl, "cudaLaunch");    
+    real_cudaConfigureCall	 = get_symbol(cuda_hdl, "cudaConfigureCall");    
     
     real_cudaEventCreate  = get_symbol(cuda_hdl, "cudaEventCreate");
     real_cudaEventDestroy = get_symbol(cuda_hdl, "cudaEventDestroy");
@@ -532,7 +540,7 @@
 
     if(cuda_exec && wrapper_info ){
 	wrapper_info->pid = -1;
-	if (cuda_exec_type == CUDA_EXEC_RUNTIME){
+	if (cuda_exec_type == CUDA_EXEC_RUNTIME && profile_enabled){
 	    if (real_cudaEventQuery(stop_event) == cudaSuccess){
 		float tmp;
 		if (real_cudaEventElapsedTime(&tmp, start_event, stop_event) == cudaSuccess){
@@ -1116,6 +1124,16 @@
 }
 
 __host__ cudaError_t CUDARTAPI
+cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, cudaStream_t stream)
+{
+    
+    kernel_stream = stream;
+    return real_cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
+    
+}
+
+
+__host__ cudaError_t CUDARTAPI
 cudaLaunch(const char* entry)
 {
     
@@ -1132,6 +1150,10 @@
     wrapper_info->cuda_exec_type |= CUDA_EXEC_RUNTIME;
     cuda_exec_type |= CUDA_EXEC_RUNTIME;
 
+    if (!profile_enabled){
+      return real_cudaLaunch(entry);
+    }
+
     static __thread int first_time = 1;
     if (first_time){   
 	real_cudaEventCreate(&start_event);
@@ -1139,14 +1161,12 @@
     }
     
  
-    //printf("%s is called in file %s\n", __FUNCTION__, __FILE__);
-    
     cudaError_t ret;
     
     if (first_time){
-	real_cudaEventRecord(start_event, 0);
+	real_cudaEventRecord(start_event, kernel_stream);
 	ret =  (*real_cudaLaunch)(entry);	
-	real_cudaEventRecord(stop_event, 0);
+	real_cudaEventRecord(stop_event, kernel_stream);
     }else{
 	if (real_cudaEventQuery(stop_event) == cudaSuccess){
 	    float tmp;
@@ -1158,9 +1178,9 @@
 	    real_cudaEventCreate(&stop_event);
 	    
 	    gpu_kernel_time  += 0.001*tmp;
-	    real_cudaEventRecord(start_event, 0);
+	    real_cudaEventRecord(start_event, kernel_stream);
 	    ret =  (*real_cudaLaunch)(entry);	
-	    real_cudaEventRecord(stop_event, 0);
+	    real_cudaEventRecord(stop_event, kernel_stream);
 	    
 	}else{
 	
@@ -1172,7 +1192,7 @@
 	    real_cudaEventDestroy(stop_event);
 	    real_cudaEventCreate(&stop_event);
 	    
-	    real_cudaEventRecord(stop_event,0);
+	    real_cudaEventRecord(stop_event,kernel_stream);
 	}
     }
     
@@ -1224,7 +1244,7 @@
 static void
 update_gpu_kernel_time()
 {
-    if(cuda_exec && wrapper_info && (cuda_exec_type == CUDA_EXEC_RUNTIME) ){
+  if(cuda_exec && wrapper_info && (cuda_exec_type == CUDA_EXEC_RUNTIME) && profile_enabled ){
 	if (real_cudaEventQuery(stop_event) == cudaSuccess){
 	    float tmp;
 	    real_cudaEventElapsedTime(&tmp, start_event, stop_event);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.