GPU Histogram Statistics

0. Introduction

This post tries two ways to build a histogram on the GPU with CUDA: a hand-rolled kernel (histo) that accumulates partial bins in shared memory, and a straightforward kernel (simple_histo) that issues one atomicAdd per input element. The input is 128 integers (0..127) bucketed into 3 bins.

1. Code

#include <stdio.h>
#include <cuda_runtime.h>



// First attempt: a hand-rolled shared-memory histogram, run by 8 threads that
// each handle 16 of the 128 input elements. As section 2 notes, it is broken.
__global__ void histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    int idx = threadIdx.x;
    volatile __shared__ int sdata[128];
    __shared__ int tmpbin[24];   // bins for the 8 threads (8 x BIN_COUNT = 24)
                                 // BUG: never initialized to zero

    // Each thread loads 16 strided elements into shared memory and bumps a
    // counter in tmpbin.  BUG: for a given k, all 8 threads update the same
    // group of three tmpbin entries with a plain (non-atomic) ++, so
    // increments from different threads race and get lost.
    for (int k = 0; k < 16; ++k)
    {
        int e = idx + 8 * k;
        sdata[e] = d_in[e];
        ++tmpbin[sdata[e] % BIN_COUNT + (k / 2) * 3];
    }

    __syncthreads();
/*
    if(idx > 1) 
    {
        d_bins[0] += tmpbin[idx * 3 - 0];
        d_bins[1] += tmpbin[idx * 3 - 2];
        d_bins[2] += tmpbin[idx * 3 - 1]; 
    } 
    if(idx == 0) d_bins[0] + tmpbin[0];
*/
    // BUG: every thread runs this reduction, i % 3 only ever reads
    // tmpbin[0..2], and the += on global d_bins is not atomic, so the
    // final counts are garbage.
    for(int i = 0; i < 24; ++i)
    {
        d_bins[0] += tmpbin[i%3];
        d_bins[1] += tmpbin[i%3];
        d_bins[2] += tmpbin[i%3];
    }
}


// Straightforward histogram: one thread per input element.
__global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    int myId = threadIdx.x;
    int myItem = d_in[myId];
    int myBin = myItem % BIN_COUNT;
    atomicAdd(&(d_bins[myBin]), 1);  // atomic add: concurrent updates to the same bin are serialized
}
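
// Not part of the original post: a hypothetical host-side reference that can
// be used to check either kernel's output against a CPU histogram.
void cpu_histo(int *bins, const int *in, int n, int bin_count)
{
    for (int b = 0; b < bin_count; ++b) bins[b] = 0;
    for (int i = 0; i < n; ++i) ++bins[in[i] % bin_count];
}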



int main(int argc, char **argv)
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == cudaSuccess)
    {
        printf("Using device %d:\n", dev);
        printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (int)devProps.totalGlobalMem, 
               (int)devProps.major, (int)devProps.minor, 
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE = 128;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
    const int BIN_COUNT = 3;
    const int BIN_BYTES = BIN_COUNT * sizeof(int);

    // generate the input array on the host   0~127
    int h_in[ARRAY_SIZE];
    for(int i = 0; i < ARRAY_SIZE; i++) {
        h_in[i] = i;
    }
    int h_bins[BIN_COUNT];
    for(int i = 0; i < BIN_COUNT; i++) {
        h_bins[i] = 0;
    }

    // declare GPU memory pointers
    int * d_in;
    int * d_bins;

    // allocate GPU memory
    cudaMalloc((void **) &d_in, ARRAY_BYTES);
    cudaMalloc((void **) &d_bins, BIN_BYTES);

    // transfer the arrays to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_bins, h_bins, BIN_BYTES, cudaMemcpyHostToDevice); 

    // buggy shared-memory version: 8 threads, each handling 16 elements
    histo<<<1, ARRAY_SIZE/16>>>(d_bins, d_in, BIN_COUNT);
    // correct alternative: one thread per element, one atomicAdd each
    //simple_histo<<<1, 128>>>(d_bins, d_in, BIN_COUNT);
    // copy the bin counts back from the GPU
    cudaMemcpy(h_bins, d_bins, BIN_BYTES, cudaMemcpyDeviceToHost);

    for(int i = 0; i < BIN_COUNT; i++) {
        printf("bin %d: count %d\n", i, h_bins[i]);
    }

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_bins);
        
    return 0;
}
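
Assuming the listing is saved as histo.cu, it can be built with nvcc histo.cu -o histo and run with ./histo on any CUDA-capable device.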

2. Result

It's wrong. Shit!! Three things break the histo kernel: tmpbin is never zeroed, the ++ increments on shared memory race between threads, and the final reduction loop is run by every thread, only ever reads tmpbin[0..2], and updates d_bins with a non-atomic +=. For the input 0..127 and BIN_COUNT = 3, the correct output (which simple_histo produces) is:

bin 0: count 43
bin 1: count 43
bin 2: count 42
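
For reference, here is a minimal sketch of how the shared-memory idea could be fixed (not from the original post: the kernel name histo_shared and its signature are my own). Each block zeroes a private copy of the bins in shared memory, counts its slice of the input into it with atomicAdd on shared memory, and then adds the per-block totals into the global histogram with a handful of global atomicAdds.

// Hypothetical corrected shared-memory histogram; launched with
// BIN_COUNT * sizeof(int) bytes of dynamic shared memory.
__global__ void histo_shared(int *d_bins, const int *d_in,
                             const int BIN_COUNT, const int n)
{
    extern __shared__ int s_bins[];               // per-block bins

    // zero the per-block bins
    for (int b = threadIdx.x; b < BIN_COUNT; b += blockDim.x)
        s_bins[b] = 0;
    __syncthreads();

    // each thread counts a strided slice of the input
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x)
        atomicAdd(&s_bins[d_in[i] % BIN_COUNT], 1);
    __syncthreads();

    // merge the per-block counts into the global histogram
    for (int b = threadIdx.x; b < BIN_COUNT; b += blockDim.x)
        atomicAdd(&d_bins[b], s_bins[b]);
}

// in main(), replacing the histo launch:
// histo_shared<<<1, ARRAY_SIZE, BIN_COUNT * sizeof(int)>>>(d_bins, d_in, BIN_COUNT, ARRAY_SIZE);

Compared with simple_histo, most of the atomic traffic stays in shared memory and each block issues only BIN_COUNT atomics to global memory, which is the usual way to cut contention when the input is much larger than 128 elements.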
