Imran Ashraf - 2014-04-09

I am trying to use shared memory for a kernel in the following way, but the results
are incorrect. Am I doing something wrong here?

// SGFKernel: for every interior pixel (x, y) sum the gradient products
// gxx, gxy, gyy over the window [y-window_hh, y+window_hh] x [x-window_hw, x+window_hw]
// and store (x, y, score) in pointlist, where score is the smaller eigenvalue of
// [[gxx, gxy], [gxy, gyy]] clamped to limit.
//
// Geometry: the outermost partitioned loop (y, NROWS) is paired with the first
// tblock/thread dimension and the inner loop (x, NCOLS) with the second, following
// the loop-nest/geometry pairing used in the hiCUDA matrix-multiply example.
#pragma hicuda kernel SGFKernel tblock(NROWS/16, NCOLS/16) thread(16,16)
#pragma hicuda loop_partition over_tblock over_thread
for (y = bordery; y < NROWS - bordery ; y++)
{
#pragma hicuda loop_partition over_tblock over_thread
    for (x = borderx ; x < NCOLS - borderx ; x++)
    {
        // Sum the gradients in the surrounding window
        gxx = 0.0f;  gxy = 0.0f;  gyy = 0.0f;

        // Stage the gradient data read by the window loops below.
        // NOTE(review): the hard-coded 7 only covers the reads if window_hh <= 7
        // and window_hw <= 7 -- confirm against the caller.
        // NOTE(review): gradx/grady are indexed as flat 1-D arrays, so each section
        // is ONE contiguous range of 14*NCOLS + 15 elements (whole image rows), not
        // a 15x15 tile. Merged over a 16x16 thread block this is presumably ~29 rows
        // of NCOLS floats per array, which likely exceeds the shared-memory capacity
        // and would explain wrong results -- confirm against the generated code.
        // A true tile needs a 2-D section ([y-7:y+7][x-7:x+7]), which in turn needs
        // gradx/grady to be declared with a 2-D shape outside this snippet.
#pragma hicuda shared alloc gradx[NCOLS*(y-7)+(x-7):NCOLS*(y+7)+(x+7)] copyin
#pragma hicuda shared alloc grady[NCOLS*(y-7)+(x-7):NCOLS*(y+7)+(x+7)] copyin
        // Wait until the copy-in has finished before any thread reads the staged data.
#pragma hicuda barrier
        for (yy = y-window_hh ; yy <= y+window_hh ; yy++)
        {
            for (xx = x-window_hw ; xx <= x+window_hw ; xx++)
            {
                gx = gradx[NCOLS*yy+xx];
                gy = grady[NCOLS*yy+xx];
                gxx += gx * gx;
                gxy += gx * gy;
                gyy += gy * gy;
            }
        }
        // All threads must be done reading before the shared copies are released.
#pragma hicuda barrier
#pragma hicuda shared remove gradx grady

        // Store the trackability of the pixel as the minimum of the two eigenvalues.
        // sqrtf keeps the whole expression in single precision (sqrt would promote
        // it to double).
        val = (gxx + gyy - sqrtf((gxx - gyy)*(gxx - gyy) + 4.0f*gxy*gxy)) / 2.0f;
        // pointlist holds three consecutive entries (x, y, score) per interior pixel,
        // laid out row by row over the (NCOLS - 2*borderx)-wide interior.
        pointlist[ (y-bordery)*3*(NCOLS - 2*borderx) + 3*(x-borderx) + 0] = x;
        pointlist[ (y-bordery)*3*(NCOLS - 2*borderx) + 3*(x-borderx) + 1] = y;
        pointlist[ (y-bordery)*3*(NCOLS - 2*borderx) + 3*(x-borderx) + 2] = (val > limit)? (float)limit : val;
    }
}
#pragma hicuda kernel_end

Thanks in advance for the help.