[Cudawrapper-svn] SF.net SVN: cudawrapper:[53] trunk

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 53
          http://cudawrapper.svn.sourceforge.net/cudawrapper/?rev=53&view=rev
Author:   gshi
Date:     2010-04-07 18:13:12 +0000 (Wed, 07 Apr 2010)

Log Message:
-----------
add the gpu/cpu affinity test utility 

Modified Paths:
--------------
    trunk/makefile

Added Paths:
-----------
    trunk/cuda_affinity_test.c

Added: trunk/cuda_affinity_test.c
===================================================================

--- trunk/cuda_affinity_test.c	                        (rev 0)
+++ trunk/cuda_affinity_test.c	2010-04-07 18:13:12 UTC (rev 53)
@@ -0,0 +1,343 @@
+
+#include <cuda_runtime.h>
+#include <numa.h>
+#include <stdio.h>
+
+#define CUERR  do{ cudaError_t err;					\
+        if ((err = cudaGetLastError()) != cudaSuccess) {		\
+	    printf("ERROR: CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
+	    exit(-1);							\
+        }}while(0)
+
+#define MAX_NODES   32
+#define MAX_STRING_LEN 128
+#define VERBOSE_PRINT(fmt,...) do {		\
+	if (verbose >=2){			\
+	    printf(fmt, ##__VA_ARGS__);		\
+	}					\
+    }while(0)
+
+#define PRINT(fmt,...) do {			\
+	if (verbose){				\
+	    printf(fmt, ##__VA_ARGS__);		\
+	}					\
+    }while(0)
+
+static int verbose = 0;
+
+static double
+gettime(void)
+{
+    struct timeval t;
+    gettimeofday(&t, NULL);
+
+    return t.tv_sec + 0.000001* t.tv_usec;
+}
+
+
+/* This function tests the gpu bandwidth (H2D and D2H) with given cpu bitmask
+ * The highest number on either direction are summed together
+ */
+
+double
+bandwidth_test(int node, struct bitmask* bitmask, int deviceid)
+{
+    double sum;
+    char* A = NULL;
+    char* hostA = NULL;
+    char* hostB = NULL;
+    double t0, t1;
+    unsigned int max_data_size = 1 << 28;
+
+    int i;
+    int repeat_num = 100;
+    int cpu_index;
+    
+    numa_sched_setaffinity(0, bitmask);
+    
+    unsigned int datasize = 1048576;
+    cudaSetDevice(deviceid);CUERR;
+    
+    cudaMallocHost((void**)&hostA, max_data_size); CUERR;
+    cudaMallocHost((void**)&hostB, max_data_size); CUERR;
+    cudaMalloc((void**)&A, max_data_size); CUERR;
+    
+    VERBOSE_PRINT("\nTesting node=%d, gpu deviceid =%d\n",node, deviceid);
+    VERBOSE_PRINT("size(bytes) \th2d_time(us) \t bandwidth(MB/s)\td2htime(us) \t banwidth(MB/s)\n");
+    
+    
+    double max_h2d_bandwidth = 0;
+    double max_d2h_bandwidth = 0;
+
+    while( datasize <= max_data_size){
+	repeat_num = 10;
+	
+	t0 = gettime();
+	for (i =0 ;i <repeat_num ;i ++){
+	    cudaMemcpy(A, hostA, datasize, cudaMemcpyHostToDevice);
+	    cudaThreadSynchronize();
+	}
+	t1 = gettime();
+	
+	double h2d_time = t1 - t0;
+	
+	t0 = gettime();
+	for (i =0 ;i <repeat_num ;i ++){
+	    cudaMemcpy(hostB, A, datasize, cudaMemcpyDeviceToHost);
+	    cudaThreadSynchronize();
+	}
+	t1 = gettime();
+	
+	double d2h_time = t1 - t0;	
+	
+	double h2d_bandwidth = 1.0*datasize*repeat_num/h2d_time*1e-6;
+	double d2h_bandwidth = 1.0*datasize*repeat_num/d2h_time*1e-6;
+	
+	if (h2d_bandwidth> max_h2d_bandwidth){
+	    max_h2d_bandwidth = h2d_bandwidth;
+	}
+	
+	if (d2h_bandwidth > max_d2h_bandwidth){
+	    max_d2h_bandwidth = d2h_bandwidth;
+	}
+	
+	VERBOSE_PRINT("%10d \t%10.2f \t%10.2f \t%10.2f \t%10.2f\n",
+	       datasize,
+	       h2d_time/repeat_num*(1e+6), h2d_bandwidth,
+	       d2h_time/repeat_num*(1e+6), d2h_bandwidth );
+	fflush(stdout);
+	
+	datasize = datasize* 2;
+    }
+       
+    sum = max_h2d_bandwidth + max_d2h_bandwidth;
+    
+    
+    cudaFreeHost(hostA);
+    cudaFreeHost(hostB);
+    cudaFree(A);    
+    
+    cudaThreadExit();
+    
+    VERBOSE_PRINT(" max_d2h_bandwidth=%f, max_h2d_bandwidht=%f, sum=%f\n",
+	   max_d2h_bandwidth, max_h2d_bandwidth, sum);
+
+    return sum;
+
+}
+
+void
+process_bandwidth_table(double * bandwidth_table, int nodes, int device_count, 
+			int* gpu_affinity_table)
+{
+    int i, j;
+    for(i=0;i < device_count; i++){
+	double max = 0;
+	int max_idx = 0;
+	for(j=0; j < nodes; j++){
+	    if (bandwidth_table[i*nodes + j]> max){
+		max = bandwidth_table[i*nodes + j];
+		max_idx = j;
+	    }   
+	}	
+	gpu_affinity_table[i]= max_idx;
+    }
+    
+    return;
+}
+
+void 
+display_bandwidth_table(double* table, int nodes, int device_count)
+{
+    int i, j;
+    for(i=0; i < nodes; i++){
+	PRINT("\tnode%d", i);
+    }
+    PRINT("\n");
+    
+    for(i =0;i < device_count; i++){
+	PRINT("gpu%d\t", i);
+	for(j=0; j < nodes; j++){
+	    PRINT("%f\t", table[i*nodes + j]);   
+	}
+	PRINT("\n");
+    }
+    
+}
+
+void
+output_to_file(char node_affinity_string[MAX_NODES][MAX_STRING_LEN], int* gpu_affinity_table, int device_count, char* output_file)
+{
+    char filebuf[1024*16];       
+    char* s =  filebuf;
+    char buf[1024];
+    int buflen=1024;
+    int i;
+    FILE* file = fopen(output_file, "r");
+    if (file){
+	while(fgets(buf, buflen, file) != NULL){
+	    char keyword[] = "affinity";
+	    if (strncmp(buf, keyword, strlen(keyword)) !=0){
+		s+= sprintf(s,"%s", buf);
+	    }
+	}
+	fclose(file);
+    }
+    
+    for(i = 0;i < device_count; i++){
+	s+= sprintf(s, "affinity     %d    %s\n", i, node_affinity_string[gpu_affinity_table[i]]);
+    }
+    
+    file = fopen(output_file, "w");
+    if (file == NULL){
+	printf("ERROR: opening file %s for write failed\n");
+	exit(1);
+    }
+    fprintf(file, "%s", filebuf);
+    fclose(file);    
+}
+
+void 
+usage(char** argv)
+{
+    printf("This utility detects the GPU/CPU affinity by performing exhaustive bandwidth tests\n");
+    printf("Usage: %s [options]\n", argv[0]);
+    printf("options\n");
+    printf("-v                 Verbose, print out the bandwidth table\n");
+    printf("-vv                More verbose message, print out all measurements and related information\n");
+    printf("-o <output_file>   Output file for GPU/CPU affinity. The file is created if it does not exists.\n");
+    printf("                   If the file exists, the old affinity entries is removed. Other content in the file remains the same\n");	   
+    printf("--help             This message\n");
+
+    exit(1);
+}
+
+int
+main(int argc, char** argv)
+{
+    char* output_file = NULL;    
+    int i;
+    for (i =1;i < argc; i++){
+	
+	if( strcmp(argv[i], "--help")== 0){
+	    usage(argv);
+	}
+	
+	if( strcmp(argv[i], "-o") == 0){
+	    if (i+1 >= argc){
+		usage(argv);
+	    }
+	    output_file =  argv[i+1];
+	    i++;
+	    continue;
+	}
+
+	if( strcmp(argv[i], "-v") == 0){
+	    verbose = 1;
+	    continue;
+	}	
+	
+	if( strcmp(argv[i], "-vv") == 0){
+	    verbose = 2;
+	    continue;
+	}		
+	
+
+	printf("ERROR: invalid option\n");
+	usage(argv);
+	
+    }
+    
+
+    int nodes;
+    
+    nodes = numa_num_configured_nodes();
+    if (nodes <= 0){
+	printf("ERROR: no nodes found\n");
+	exit(1);
+    }
+    
+    int device_count;
+
+    cudaGetDeviceCount(&device_count); CUERR;
+    
+    
+    double* bandwidth_table;
+    bandwidth_table = (double*) malloc(nodes*device_count*sizeof(double));
+    if (bandwidth_table == NULL){
+	printf("ERROR: malloc failed for bandwidth table\n");
+	exit(1);
+    }
+    memset(bandwidth_table, 0, nodes*device_count*sizeof(double));
+
+    int* gpu_affinity_table;
+    gpu_affinity_table = (int*)malloc(device_count*sizeof(int));
+    if (gpu_affinity_table == NULL){
+	printf("ERROR: malloc failed for gpu_affinity_table\n");
+	exit(1);
+    }
+    memset(gpu_affinity_table, 0, device_count* sizeof(int));
+
+    char node_affinity_string[MAX_NODES][MAX_STRING_LEN];
+    
+    int j;
+    unsigned long mask;
+    struct bitmask bitmask;
+    bitmask.size = sizeof(unsigned long);
+    bitmask.maskp = &mask;
+
+    for(i=0; i < nodes;i++){
+	int rc;
+	rc =numa_node_to_cpus(i, &bitmask);
+	if (rc != 0){
+	    printf("Error: changing node %d to cpu mask failed\n", i);
+	    exit(1);
+	}
+	VERBOSE_PRINT("node %d has mask: %x\n", i,mask);
+	
+	for(j = 0; j < device_count; j++){
+	    double bandwidth  = bandwidth_test(i, &bitmask, j);	    
+	    bandwidth_table[j*nodes + i] = bandwidth;
+	}
+	int cpu = 0;
+	int firsttime= 1;
+	char* s = node_affinity_string[i];
+	for(j =0; j < sizeof(unsigned long)*8; j++){
+	    if (mask & 1){
+		if (firsttime){
+		    s += sprintf(s, "%d", cpu);
+		    firsttime =0;
+		}else{
+		    s+=sprintf(s,",%d", cpu);
+		}
+	    }
+	    mask = mask >> 1;
+	    cpu++;
+	}
+	//sprintf(s, "\n");
+	VERBOSE_PRINT("node_affiity_string[%d]: %s\n", i, node_affinity_string[i]);
+	
+    }
+    
+    
+    display_bandwidth_table(bandwidth_table, nodes, device_count);
+    
+    process_bandwidth_table(bandwidth_table, nodes, device_count, gpu_affinity_table);
+    
+    if (!output_file){
+	printf("## affinity  <GPU>  cpu1,cpu2,etc\n");
+	for(i = 0;i < device_count; i++){
+	    printf("affinity     %d    %s\n", i, node_affinity_string[gpu_affinity_table[i]]);
+	}
+    }else{
+	output_to_file(node_affinity_string, gpu_affinity_table, device_count, output_file);
+    }
+    free(bandwidth_table);
+    free(gpu_affinity_table);
+    
+    return 0;
+
+}
+
+
+

Modified: trunk/makefile
===================================================================
--- trunk/makefile	2010-03-24 01:19:11 UTC (rev 52)
+++ trunk/makefile	2010-04-07 18:13:12 UTC (rev 53)
@@ -45,7 +45,7 @@
 CC=gcc
 CFLAGS = -fPIC -Wall
 CUDA_INC_DIR = ${CUDA_HOME}/include
-CUDA_LIB_DIR =${CUDA_HOME}/lib
+CUDA_LIB_DIR =${CUDA_HOME}/lib64
 
 
 ifeq (${COMPILE_CUDA_OCL_WRAPPER}, 1)
@@ -56,7 +56,7 @@
 	OPENCL_LINK_LIB=
 endif
 
-all: wrapper_init wrapper_terminate wrapper_query cuda_wrapper.so cuda_memscrubber
+all: wrapper_init wrapper_terminate wrapper_query cuda_wrapper.so cuda_memscrubber cuda_affinity_test
 
 wrapper_common.o: wrapper_common.c cuda_wrapper.h
 	${CC} -c ${CFLAGS}  -I. -D_GNU_SOURCE wrapper_common.c 
@@ -85,6 +85,9 @@
 cuda_memscrubber: cuda_memscrubber.c 
 	$(CC) -O3 -funroll-all-loops -I. -I$(CUDA_INC_DIR) cuda_memscrubber.c -o cuda_memscrubber -L$(CUDA_LIB_DIR) -lcuda -lpthread
 
+cuda_affinity_test: cuda_affinity_test.c
+	${CC} -O3 -I${CUDA_INC_DIR} -o $@ $< -L${CUDA_LIB_DIR} -lnuma -lcudart
+
 clean:
 	rm -f *.o cuda_wrapper.so wrapper_terminate wrapper_init wrapper_query cuda_memscrubber
 install:


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.