
Jean-Michel Richer

Comparison of CUDA and CUME

We use a simple vector sum example: given three integer vectors a, b and c, we want to compute c = a + b, i.e. c[i] = a[i] + b[i] for all i.
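For reference, the sequential CPU version is a single loop (a trivial sketch, not part of the original listings):

void cpu_sum(const int *a, const int *b, int *c, int size) {
    for (int i = 0; i < size; ++i) {
        c[i] = a[i] + b[i];
    }
}

Both versions below perform these size independent additions in parallel on the GPU, one thread per element.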

CUDA version

In the CUDA version we must:
1. allocate the memory on host and device
2. copy data from host memory to device memory before calling the kernel
3. determine how many blocks we need based on the size of the vectors
4. find the exact formula to get the global thread index inside the kernel
5. copy the result from device to host memory after the kernel call
6. free the data in device memory

For the kernel, we must determine the global thread index (gtid) as a function of the dimensions of the grid and blocks. In this example we use a grid of type (x,1,1) composed of blocks of type (32,1,1). The gtid is then given by the formula:

int gtid = blockDim.x * blockIdx.x + threadIdx.x; // 32 * blockIdx.x + threadIdx.x
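Since SIZE = 1027 is not a multiple of 32, the grid contains (1027 + 31) / 32 = 33 blocks, i.e. 1056 threads; the test if (gtid < size) inside the kernel prevents the last 29 threads from writing past the end of the vectors.

Note that this formula is specific to a 1D grid of 1D blocks. For a 2D grid of 2D blocks, for example, the standard CUDA indexing (shown here only for contrast, it is not used in this example) becomes:

int blockId = blockIdx.y * gridDim.x + blockIdx.x;
int gtid = blockId * (blockDim.x * blockDim.y)
         + threadIdx.y * blockDim.x + threadIdx.x;

This is precisely the kind of bookkeeping that CUME automates.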

Here is the CUDA version:

/**
 * Example of the sum of vectors, we compute c = a + b
 * by using the CUDA API
 */

#include <cuda.h>
#include <iostream>
#include <iterator>
#include <algorithm>
#include <numeric>
#include <cassert>
using namespace std;

// ------------------------------------------------------------------
// definition of a macro instruction that checks if a CUDA function
// was successful or not. If the execution of the function resulted
// in some error we display it and stop the program
// ------------------------------------------------------------------
#define cuda_check(value) { \
        cudaError_t err = value; \
        if (err != cudaSuccess) {   \
            cerr << "============================================"; \
            cerr << "Error: " << cudaGetErrorString(err) << " at line "; \
            cerr << __LINE__ << " in file " <<  __FILE__;   \
            cerr <<  endl; \
            exit(EXIT_FAILURE); \
        } \
}

/**
 * kernel that computes the sum c = a + b for vectors a, b and c
 * of 'size' integers each
 * we use a 1D grid (x,1,1) of 1D blocks of 32 threads (32,1,1)
 */
__global__ void kernel_sum(int *a, int *b, int *c, int size) {
    // evaluate global thread index
    int gtid = blockDim.x * blockIdx.x + threadIdx.x;

    if (gtid < size) {
        c[gtid] = a[gtid] + b[gtid];
    }
}

/**
 * main function
 */
int main() {
    // vectors a, b and c in host memory
    int *cpu_a, *cpu_b, *cpu_c;
    // vectors a, b and c in device memory
    int *gpu_a, *gpu_b, *gpu_c;

    // size of vectors
    const int SIZE = 1027;

    cout << "Example of vector sum with CUDA API" << endl;

    // create vectors a, b and c in host memory
    cpu_a = new int [SIZE];
    cpu_b = new int [SIZE];
    cpu_c = new int [SIZE];

    // initialize vectors a and b in host memory
    std::fill(&cpu_a[0], &cpu_a[SIZE], 1);
    std::fill(&cpu_b[0], &cpu_b[SIZE], 2);

    // create vectors a, b, and c in device memory
    cuda_check( cudaMalloc((void **)&gpu_a, SIZE*sizeof(int)) );
    cuda_check( cudaMalloc((void **)&gpu_b, SIZE*sizeof(int)) );
    cuda_check( cudaMalloc((void **)&gpu_c, SIZE*sizeof(int)) );

    // copy vectors a and b from host to device memory
    cuda_check( cudaMemcpy(gpu_a, cpu_a, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
    cuda_check( cudaMemcpy(gpu_b, cpu_b, SIZE*sizeof(int), cudaMemcpyHostToDevice) );

    // compute number of blocks in the grid
    int block = 32;
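    // round up so that all SIZE elements are covered:
    // for SIZE = 1027 this gives (1027 + 31) / 32 = 33 blocks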
    int grid = (SIZE + block-1) / block;

    // call kernel with grid and block
    kernel_sum<<<grid, block>>>(gpu_a, gpu_b, gpu_c, SIZE);

    // check if the kernel launch produced an error; note that errors
    // occurring during kernel execution only surface at the next
    // synchronizing call (here the cudaMemcpy below)
    cudaError_t err = cudaGetLastError(); 
    if (err != cudaSuccess)  { 
        cerr << "============================================"; 
        cerr << "Error: " << cudaGetErrorString(err) << " at line "; 
        cerr << __LINE__ << " in file " <<  __FILE__;   
        cerr <<  endl; 
        exit(EXIT_FAILURE); 
    }

    // copy the result (vector c) from device to host memory
    cuda_check( cudaMemcpy(cpu_c, gpu_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost) );

    // print vector
    //copy(&cpu_c[0], &cpu_c[SIZE], ostream_iterator<int>(cout, ", "));
    //cout << endl;

    // **************************************************************
    // check if result is ok or not
    // **************************************************************
    int sum = accumulate(&cpu_c[0], &cpu_c[SIZE], 0);
    cout << "computed sum = " << sum << endl;
    int expected_sum = 3*SIZE;
    cout << "expected sum = " << expected_sum << endl;
    assert(sum == expected_sum);

    // delete vectors a, b and c in device memory
    cuda_check( cudaFree(gpu_a) );
    cuda_check( cudaFree(gpu_b) );
    cuda_check( cudaFree(gpu_c) );

    // delete vectors a, b and c in host memory
    delete [] cpu_a;
    delete [] cpu_b;
    delete [] cpu_c;

    cout << "OK" << endl;

    return EXIT_SUCCESS;
}
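Assuming the listing is saved as sum_cuda.cu (the file name is not given here), it can be compiled and run with the NVIDIA compiler:

nvcc -o sum_cuda sum_cuda.cu
./sum_cuda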

CUME version

In the CUME version we will use two data structures:

  1. the Array<int> class the helps represent a 1D array which stores data in host and device memory so we won't have to take care of allocation and transfer from host to device memory
  2. the Kernel class that will automatically determine the number of blocks of the grid

As a result the code is easier to write, and the global thread index can be obtained from the Kernel::Resource structure.
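To give an idea of what such a class does under the hood, here is a minimal sketch of a host/device mirrored array. This is not CUME's actual implementation, only an illustration of the push/pull idea, using member names (begin, end, push, pull, get_size) that mirror those used in the listing below:

#include <cuda.h>

template <typename T>
class MirroredArray {
public:
    MirroredArray(int n) : size(n) {
        // allocate the array twice: once on the host, once on the device
        host = new T[n];
        cudaMalloc((void **) &device, n * sizeof(T));
    }
    ~MirroredArray() {
        // the destructor releases both copies, so no explicit free is needed
        delete [] host;
        cudaFree(device);
    }
    // copy host data to device memory
    void push() {
        cudaMemcpy(device, host, size * sizeof(T), cudaMemcpyHostToDevice);
    }
    // copy device data back to host memory
    void pull() {
        cudaMemcpy(host, device, size * sizeof(T), cudaMemcpyDeviceToHost);
    }
    T *begin() { return host; }
    T *end() { return host + size; }
    int get_size() const { return size; }
private:
    int size;
    T *host;
    T *device;
};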

#include "cume.h"

using namespace cume;

/**
 * kernel that computes the sum c = a + b for vectors a, b and c
 * of 'size' integers each
 * we use a 1D grid (x,1,1) of 1D blocks of 32 threads (32,1,1)
 * %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 * as we call the kernel with the kernel_call
 * macro instruction, we need to declare a pointer to a
 * Kernel::Resource structure as the first parameter.
 * The res variable will help us automatically get the right
 * global thread index formula
 * %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 */
__global__ void kernel_sum(Kernel::Resource *res, int *a, int *b, int *c, int size) {
    // **************************************************************
    // automatically get the global thread index according to the
    // kernel configuration: no need to wonder which formula to use
    // **************************************************************
    int gtid = res->get_global_tid();

    if (gtid < size) {
        c[gtid] = a[gtid] + b[gtid];
    }
}

/**
 * main function
 */
int main() {
    const int SIZE = 1027;

    cout << "Example of vector sum with CUDA and CUME" << endl;
    cout << "with usage of array and resource" << endl;

    // **************************************************************
    // improvement: create arrays of integers that will be stored
    // on host and device memory
    // **************************************************************
    Array<int> a(SIZE), b(SIZE), c(SIZE);

    // **************************************************************
    // fill the arrays in host memory using the std::fill function
    // **************************************************************
    std::fill(a.begin(), a.end(), 1);
    std::fill(b.begin(), b.end(), 2);

    // **************************************************************
    // copy data from host to device memory
    // **************************************************************
    a.push();
    b.push();

    // **************************************************************
    // automatically determine number of blocks using the Kernel
    // class and the GRID_GUESS constant
    // we also use a timer to evaluate the time spent in the kernel
    // **************************************************************
    Kernel k(SIZE);
    k.configure(GRID_GUESS | GRID_X, BLOCK_X, 32);
    k.set_timer_needed(true);
    cout << k << endl;

    // **************************************************************
    // execute kernel by passing as first argument a resource 
    // held by 'k' that will automatically give to the kernel the
    // right evaluation of the global thread index
    // **************************************************************
    kernel_call(kernel_sum, k, &a, &b, &c, a.get_size());

    // **************************************************************
    // get back result by copying data from device to host memory 
    // **************************************************************
    c.pull();

    // print result
    //cout << c << endl;

    // **************************************************************
    // check if result is ok or not
    // **************************************************************
    int sum = accumulate(c.begin(), c.end(), 0);
    cout << "computed sum = " << sum << endl;
    int expected_sum = 3*SIZE;
    cout << "expected sum = " << expected_sum << endl;
    assert(sum == expected_sum);

    // **************************************************************
    // no need to free resources explicitly: it is done by the
    // destructor of the Array class
    // **************************************************************

    cout << "OK" << endl;

    return EXIT_SUCCESS;
}
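Assuming the CUME headers are installed and the listing is saved as sum_cume.cu (again, names not given here), compilation is similar, with the include path of the library added, for example:

nvcc -I/path/to/cume -o sum_cume sum_cume.cu

The overall structure is the same as the CUDA version, but the allocation, the transfers, the grid computation and the thread index formula are all handled by the library.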