CUDA에서 배열 원소 수가 스레드 수보다 많을 때의 처리 방법

8002 단어
처리 방법은 다음 Stack Overflow 게시물을 참고했습니다: https://stackoverflow.com/questions/26913683/different-way-to-index-threads-in-cuda-c
코드의 cuda_gridsize 함수는 YOLO의 구현을 참고했습니다.
코드는 다음과 같습니다.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>

using namespace std;
#define BLOCK 512

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// if it did not return cudaSuccess. Ignored return codes leave a sticky error
// behind that makes later calls fail far from the real cause.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t err_ = (call);                                             \
        if (err_ != cudaSuccess) {                                             \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                         cudaGetErrorString(err_));                            \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)

// Chooses a grid for n work items processed by blocks of BLOCK threads.
//
// Returns a 1-D grid of ceil(n / BLOCK) blocks while that count fits in 65535;
// otherwise the blocks are folded into a roughly square 2-D grid (x, y) with
// x * y * BLOCK >= n. For n == 0 a single block is returned so that the result
// is still a valid launch configuration.
// NOTE(review): for n above roughly 65535 * 65535 * BLOCK the y dimension
// itself would exceed 65535; that case is not handled here.
dim3 cuda_gridsize(size_t n)
{
    // (n - 1) would wrap around for n == 0.
    if (n == 0) return dim3(1, 1, 1);

    // Work in size_t so nothing is narrowed before the range check below.
    const size_t k = (n - 1) / BLOCK + 1;   // number of blocks needed
    size_t x = k;
    size_t y = 1;
    if (x > 65535) {
        x = (size_t)std::ceil(std::sqrt((double)k));
        y = (n - 1) / (x * BLOCK) + 1;
    }
    //printf("%zu %zu %zu %zu\n", n, x, y, x*y*BLOCK);
    return dim3((unsigned int)x, (unsigned int)y, 1);
}

// Inverts every byte of img in place: img[i] = 255 - img[i] for i in [0, H*W).
//
// Works with any 1-D or 2-D grid of 1-D or 2-D blocks: each thread starts at
// its flattened global index and then advances by the total number of launched
// threads, so the grid does not have to cover H*W exactly.
// img must point to at least H*W bytes of device-accessible memory.
__global__ void gpuCalc(unsigned char *img, long H, long W)
{
    if (H <= 0 || W <= 0) return;

    // All index arithmetic is done in size_t: products of the built-in
    // unsigned int dimensions (and of long H, W on platforms where long is
    // 32 bits) can overflow for large images.
    const size_t total           = (size_t)H * (size_t)W;
    const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y;
    const size_t threadId_2D     = threadIdx.x + (size_t)threadIdx.y * blockDim.x;
    const size_t blockId_2D      = blockIdx.x + (size_t)blockIdx.y * gridDim.x;
    const size_t stride          = threadsPerBlock * gridDim.x * gridDim.y;

    for (size_t i = threadId_2D + threadsPerBlock * blockId_2D; i < total; i += stride)
        img[i] = 255 - img[i];
}

// Inverts the H*W-byte host buffer img in place using the GPU (device 0).
//
// Copies img to a temporary device buffer, runs gpuCalc on it and copies the
// result back. Any CUDA failure terminates the program via CUDA_CHECK.
// Does nothing when img is null or H/W is not positive.
void addWithCuda(unsigned char *img, long H, long W)
{
    if (img == NULL || H <= 0 || W <= 0) return;

    const size_t count = (size_t)H * (size_t)W;
    const size_t bytes = count * sizeof(unsigned char);
    unsigned char *dev_a = NULL;

    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaMalloc((void**)&dev_a, bytes));
    CUDA_CHECK(cudaMemcpy(dev_a, img, bytes, cudaMemcpyHostToDevice));

    // 1-D blocks of BLOCK threads on the grid chosen by cuda_gridsize.
    gpuCalc<<<cuda_gridsize(count), BLOCK>>>(dev_a, H, W);
    // Launch-configuration errors are only visible through cudaGetLastError,
    // and only if it is queried right after the launch.
    CUDA_CHECK(cudaGetLastError());

    // The blocking copy waits for the kernel and reports execution errors.
    CUDA_CHECK(cudaMemcpy(img, dev_a, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(dev_a));
}

// CPU reference: inverts every byte of img in place, img[i] = 255 - img[i]
// for i in [0, W*H). Does nothing when W or H is not positive.
void cpuCalc(unsigned char *img, long W, long H)
{
    const size_t total = (W > 0 && H > 0) ? (size_t)W * (size_t)H : 0;
    for (size_t i = 0; i < total; i++)
        img[i] = 255 - img[i];
}

// Self-check: inverts a random image once on the CPU and once on the GPU.
// Two inversions must restore the original bytes, which are kept in cmp.
// Returns 0 when the round trip matches and 1 otherwise.
int main()
{
    long W = 20000;
    long H = 20000;
    const size_t total = (size_t)W * (size_t)H;

    unsigned char *img = new unsigned char[total];
    unsigned char *cmp = new unsigned char[total];

    for (size_t i = 0; i < total; i++)
        img[i] = rand() % 100;
    memcpy(cmp, img, total);

    cpuCalc(img, W, H);
    printf("cpu calc end\n");

    addWithCuda(img, H, W);
    printf("gpu calc end\n");

    const bool flag = (memcmp(img, cmp, total) == 0);
    if (flag)
        printf("pass");
    else
        printf("no pass\n");

    delete[] cmp;
    delete[] img;

    getchar();   // keep the console window open until a key is pressed
    return flag ? 0 : 1;
}

원문 출처: https://www.cnblogs.com/tiandsp/p/9460130.html

좋은 웹페이지 즐겨찾기