We use a simple sum of vectors example: given three vectors of integers a, b and c, we want to compute c = a + b, i.e. c[i] = a[i] + b[i] for all i.
In the CUDA version we must:
1. allocate the memory on host and device
2. copy data from host memory to device memory before calling the kernel
3. determine how many blocks we need as a function of the size of the vectors
4. find the exact formula to get the global thread index inside the kernel
5. copy the result from device to host memory after call to kernel
6. free the data in device memory
For the kernel, we must determine the global thread index (gtid) as a function of the block dimensions. In this example we use a grid of type (x,1,1) composed of blocks of type (32,1,1). The gtid is then given by the formula:
int gtid = blockDim.x * blockIdx.x + threadIdx.x; // 32 * blockIdx.x + threadIdx.x
Here is the CUDA version
/**
 * Example of the sum of vectors: we compute c = a + b
 * by using the raw CUDA API.
 */
#include <cuda.h>
#include <iostream>
#include <iterator>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <cstdlib>

using namespace std;

// ------------------------------------------------------------------
// Macro that checks whether a CUDA runtime call was successful.
// If the call returned an error we display it (with file and line
// context) and stop the program.
// Wrapped in do { } while (0) so the expansion behaves as a single
// statement (safe inside an un-braced if/else), and the local
// variable has an unlikely name so it cannot shadow a variable at
// the call site (main below declares no 'err' of its own anymore,
// but callers should not have to care).
// ------------------------------------------------------------------
#define cuda_check(value)                                                  \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (value);                             \
        if (cuda_check_err_ != cudaSuccess) {                              \
            cerr << "============================================";       \
            cerr << "Error: " << cudaGetErrorString(cuda_check_err_)       \
                 << " at line ";                                           \
            cerr << __LINE__ << " in file " << __FILE__;                   \
            cerr << endl;                                                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/**
 * Kernel that computes the sum c[i] = a[i] + b[i] for vectors a, b
 * and c of 'size' integers.
 * Expected launch configuration: 1D grid (x,1,1) of 1D blocks of
 * 32 threads (32,1,1).
 * The bounds check guards the tail: the grid is rounded up, so the
 * last block has threads with gtid >= size when size is not a
 * multiple of the block size (here size = 1027, not divisible by 32).
 */
__global__ void kernel_sum(const int *a, const int *b, int *c, int size) {
    // global thread index for a 1D grid of 1D blocks
    int gtid = blockDim.x * blockIdx.x + threadIdx.x;
    if (gtid < size) {
        c[gtid] = a[gtid] + b[gtid];
    }
}

/**
 * main function: allocate, copy, launch, verify, free.
 */
int main() {
    // vectors a, b and c in host memory
    int *cpu_a, *cpu_b, *cpu_c;
    // vectors a, b and c in device memory
    int *gpu_a, *gpu_b, *gpu_c;
    // size of vectors (deliberately not a multiple of the block size)
    const int SIZE = 1027;

    cout << "Example of vector sum with CUDA API" << endl;

    // create vectors a, b and c in host memory
    cpu_a = new int[SIZE];
    cpu_b = new int[SIZE];
    cpu_c = new int[SIZE];

    // initialize vectors a and b in host memory: a = 1..., b = 2...
    std::fill(&cpu_a[0], &cpu_a[SIZE], 1);
    std::fill(&cpu_b[0], &cpu_b[SIZE], 2);

    // create vectors a, b and c in device memory
    cuda_check(cudaMalloc((void **)&gpu_a, SIZE * sizeof(int)));
    cuda_check(cudaMalloc((void **)&gpu_b, SIZE * sizeof(int)));
    cuda_check(cudaMalloc((void **)&gpu_c, SIZE * sizeof(int)));

    // copy vectors a and b from host to device memory
    // (note: no trailing ';' inside the macro argument — the original
    // had stray semicolons there, which expand to a double statement)
    cuda_check(cudaMemcpy(gpu_a, cpu_a, SIZE * sizeof(int),
                          cudaMemcpyHostToDevice));
    cuda_check(cudaMemcpy(gpu_b, cpu_b, SIZE * sizeof(int),
                          cudaMemcpyHostToDevice));

    // compute number of blocks in the grid: ceil(SIZE / block)
    int block = 32;
    int grid = (SIZE + block - 1) / block;

    // call kernel with grid and block
    kernel_sum<<<grid, block>>>(gpu_a, gpu_b, gpu_c, SIZE);

    // launch-configuration errors surface via cudaGetLastError();
    // asynchronous execution errors surface at the synchronization
    cuda_check(cudaGetLastError());
    cuda_check(cudaDeviceSynchronize());

    // copy the result (vector c) from device to host memory
    cuda_check(cudaMemcpy(cpu_c, gpu_c, SIZE * sizeof(int),
                          cudaMemcpyDeviceToHost));

    // print vector
    //copy(&cpu_c[0], &cpu_c[SIZE], ostream_iterator<int>(cout, ", "));
    //cout << endl;

    // **************************************************************
    // check if result is ok or not: every element is 1 + 2 = 3,
    // so the total must be 3 * SIZE
    // **************************************************************
    int sum = accumulate(&cpu_c[0], &cpu_c[SIZE], 0);
    cout << "computed sum = " << sum << endl;
    int expected_sum = 3 * SIZE;
    cout << "expected sum = " << expected_sum << endl;
    assert(sum == expected_sum);

    // delete vectors a, b and c in device memory
    cuda_check(cudaFree(gpu_a));
    cuda_check(cudaFree(gpu_b));
    cuda_check(cudaFree(gpu_c));

    // delete vectors a, b and c in host memory
    delete[] cpu_a;
    delete[] cpu_b;
    delete[] cpu_c;

    cout << "OK" << endl;
    return EXIT_SUCCESS;
}
In the CUME version we will use two data structures: the Array class, which holds a vector in both host and device memory (with push/pull methods to move the data), and the Kernel class, which describes the grid/block configuration of the launch.
As a result the code is easier to write and the global thread index can be obtained from the Kernel::Resource structure.
#include "cume.h"

using namespace cume;

/**
 * Kernel that computes the sum c = a + b for vectors a, b and c
 * of 'size' integers, launched on a 1D grid (x,1,1) of 1D blocks
 * of 32 threads (32,1,1).
 * %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 * Invoked through the kernel_call macro instruction, so the first
 * parameter must be a pointer to a Kernel::Resource structure;
 * it lets the kernel retrieve the correct global thread index
 * without hard-coding the index formula.
 * %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 */
__global__ void kernel_sum(Kernel::Resource *res, int *a, int *b, int *c, int size) {
    // **************************************************************
    // the index formula is chosen automatically from the kernel
    // configuration: no need to wonder which formula to use
    // **************************************************************
    const int tid = res->get_global_tid();
    if (tid < size) {
        c[tid] = a[tid] + b[tid];
    }
}

/**
 * main function: set up the arrays, run the kernel, check the result.
 */
int main() {
    const int SIZE = 1027;

    cout << "Example of vector sum with CUDA and CUME" << endl;
    cout << "with usage of array and resource" << endl;

    // **************************************************************
    // Array<int> keeps a copy of the data in host AND device memory
    // **************************************************************
    Array<int> a(SIZE), b(SIZE), c(SIZE);

    // initialize the host-side copies with std::fill
    std::fill(a.begin(), a.end(), 1);
    std::fill(b.begin(), b.end(), 2);

    // push() copies the host data to device memory
    a.push();
    b.push();

    // **************************************************************
    // let the Kernel class guess the number of blocks (GRID_GUESS)
    // for a 1D grid of 32-thread blocks, and enable a timer to
    // evaluate the time spent in the kernel
    // **************************************************************
    Kernel k(SIZE);
    k.configure(GRID_GUESS | GRID_X, BLOCK_X, 32);
    k.set_timer_needed(true);
    cout << k << endl;

    // **************************************************************
    // kernel_call passes a resource held by 'k' as first kernel
    // argument, giving the kernel the right global-thread-index
    // evaluation
    // **************************************************************
    kernel_call(kernel_sum, k, &a, &b, &c, a.get_size());

    // pull() copies the result back from device to host memory
    c.pull();

    // print result
    //cout << c << endl;

    // **************************************************************
    // check if result is ok or not: every element should be 1 + 2
    // **************************************************************
    int sum = accumulate(c.begin(), c.end(), 0);
    cout << "computed sum = " << sum << endl;
    int expected_sum = 3*SIZE;
    cout << "expected sum = " << expected_sum << endl;
    assert(sum == expected_sum);

    // **************************************************************
    // no explicit cudaFree/delete needed: device and host memory
    // are released by the Array destructors
    // **************************************************************
    cout << "OK" << endl;
    return EXIT_SUCCESS;
}