Cuda全局函数的运行速度比主机函数慢

问题描述

我有两个 void 函数,它们的作用相同:计算向量与一个数的乘积。一个是普通的 C 函数(在主机上运行),另一个是 __global__ 内核函数(在设备上运行)。但是测量时间之后,我发现普通 C 函数的运行速度比 __global__ 函数快得多。代码如下:

(注:提问者的原始代码在页面抓取过程中丢失;此处残留的下面这一行是 PHP 代码,与本问题无关。)

preg_replace('/\s+/u',' ',$arr_return ['name']);

为什么会这样? 预先谢谢你。

解决方法

代码中有不少问题:pointerToU 没有被使用,dim3 grid、threads 也没有被实际用到。不管怎样,据我理解,您启动了 N 个块,而每个块只有一个线程,因此内核无法受益于合并(coalesced)内存访问,这很可能是 CUDA 版本比 CPU 版本慢的主要原因。可以试试:

// Suggested launch: N/32+1 blocks of 32 threads each; pointerToVector is
// passed as both the first and the third argument.
VectorOnNumber<<<N/32+1,32>>>(pointerToVector,10,pointerToVector);

这是我的代码。GPU 内核:

// Kernel: resultVector[i] = vector1[i] * number for every i in [0, N).
//
// Expects a 1D grid of 1D blocks. Each thread handles at most one element,
// so the launch must provide at least N threads in total; threads whose
// index is >= N do nothing.
// It is defined ahead of the host wrapper so the wrapper can name it
// without a separate forward declaration.
__global__
void VectorOnNumber_K(double *vector1,double number,double *resultVector,int N)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    if(tid < N){
        resultVector[tid] = vector1[tid]*number;
    }
}

// Host wrapper: launches VectorOnNumber_K over N elements with 256 threads
// per block.
//
//   vector1      - input array of N doubles; handed straight to the kernel,
//                  so it must be a pointer the device can dereference
//   number       - scalar every element is multiplied by
//   resultVector - output array of N doubles (same requirement as vector1)
//   N            - element count; nothing is launched when N <= 0
//
// No synchronisation is performed after the launch; callers that need the
// result or an accurate timing must synchronise themselves.
void VectorOnNumber(double *vector1,double number,double *resultVector,int N)
{
    // A non-positive N would otherwise still launch a block (N == 0) or
    // request an invalid grid size (N < 0).
    if(N <= 0) return;

    const int threadsPerBlock = 256;
    // Ceiling division written so that it cannot overflow for large N and
    // does not add a spare block when N is a multiple of the block size.
    dim3 grid((N - 1)/threadsPerBlock + 1),threads(threadsPerBlock);

    VectorOnNumber_K<<<grid,threads>>>(vector1,number,resultVector,N);
}


// Kernel: resultVector[i] = vector1[i] * number for every i in [0, N),
// single-precision version.
//
// Expects a 1D grid of 1D blocks. Each thread handles at most one element,
// so the launch must provide at least N threads in total; threads whose
// index is >= N do nothing.
// It is defined ahead of the host wrapper so the wrapper can name it
// without a separate forward declaration.
__global__
void VectorOnNumberf_K(float *vector1,float number,float *resultVector,int N)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    if(tid < N){
        resultVector[tid] = vector1[tid]*number;
    }
}

// Host wrapper: launches VectorOnNumberf_K over N elements with 256 threads
// per block.
//
//   vector1      - input array of N floats; handed straight to the kernel,
//                  so it must be a pointer the device can dereference
//   number       - scalar every element is multiplied by
//   resultVector - output array of N floats (same requirement as vector1)
//   N            - element count; nothing is launched when N <= 0
//
// No synchronisation is performed after the launch; callers that need the
// result or an accurate timing must synchronise themselves.
void VectorOnNumberf(float *vector1,float number,float *resultVector,int N)
{
    // A non-positive N would otherwise still launch a block (N == 0) or
    // request an invalid grid size (N < 0).
    if(N <= 0) return;

    const int threadsPerBlock = 256;
    // Ceiling division written so that it cannot overflow for large N and
    // does not add a spare block when N is a multiple of the block size.
    dim3 grid((N - 1)/threadsPerBlock + 1),threads(threadsPerBlock);

    VectorOnNumberf_K<<<grid,threads>>>(vector1,number,resultVector,N);
}


CPU:

// Host reference: writes vec[k] * n into res[k] for k = 0 .. N-1.
// Does nothing when N is zero or negative.
void Stack_von(double *vec,double n,double *res,int N)
{
    const double *src = vec;
    double *dst = res;

    // Walk both arrays in step, counting down the elements still to do.
    for(int remaining = N; remaining > 0; --remaining){
        *dst = *src * n;
        ++src;
        ++dst;
    }
}
// Host reference, single precision: writes vec[k] * n into res[k] for
// k = 0 .. N-1. Does nothing when N is zero or negative.
void Stack_vonf(float *vec,float n,float *res,int N)
{
    int k = 0;
    while(k < N){
        const float scaled = vec[k] * n;
        res[k] = scaled;
        ++k;
    }
}

全面测试:

// Reports a failed CUDA runtime call on stdout; returns true on success.
static bool Stack_cudaOk(cudaError_t status,const char *what)
{
    if(status == cudaSuccess) return true;
    printf("CUDA error in %s: %s\n",what,cudaGetErrorString(status));
    return false;
}

// Benchmark driver: times the host loops (Stack_von / Stack_vonf) against
// the device wrappers (VectorOnNumber / VectorOnNumberf) on 10^6 elements
// and prints one line per measurement.
//
// NOTE(review): the scalar multiplier was missing from every call in the
// pasted code; 10 is used here because that is the value in the launch
// example quoted earlier on the page - confirm against the original.
void Stack()
{
    const int N = 1000000;
    const int hostRepeats = 1000;
    double *x,*u,*dx = 0,*du = 0;
    float  *fx,*fu,*dfx = 0,*dfu = 0;

    x=new double[N];
    u=new double[N];
    fx=new float[N];
    fu=new float[N];

    for(int i = 0; i < N; i++){
        // Square in floating point: i*i evaluated as int overflows once i
        // exceeds 46340, long before i reaches N.
        x[i] = (double)i*i;
        fx[i] = (float)i*i;
    }

    // cpu
    // Each host figure is the average time of one call in milliseconds:
    // total seconds for hostRepeats calls, scaled by 1000/hostRepeats.
    printf("start\n");
    clock_t start = clock();
    for(int k=0; k < hostRepeats; k++) Stack_von(x,10.0,u,N);
    clock_t end = clock();
    float seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("host double %f ms\n",seconds*1000.0f/hostRepeats);

    start = clock();
    for(int k=0; k < hostRepeats; k++) Stack_vonf(fx,10.0f,fu,N);
    end = clock();
    seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("host float  %f ms\n",seconds*1000.0f/hostRepeats);

    // gpu
    // The && chain stops at the first failing call, so later steps never
    // run against a buffer that could not be allocated or filled.
    bool ok =
        Stack_cudaOk(cudaMalloc(&dfx,N*sizeof(float)),"cudaMalloc dfx") &&
        Stack_cudaOk(cudaMalloc(&dfu,N*sizeof(float)),"cudaMalloc dfu") &&
        Stack_cudaOk(cudaMemcpy(dfx,fx,N*sizeof(float),cudaMemcpyHostToDevice),"cudaMemcpy fx") &&
        Stack_cudaOk(cudaMalloc(&dx,N*sizeof(double)),"cudaMalloc dx") &&
        Stack_cudaOk(cudaMalloc(&du,N*sizeof(double)),"cudaMalloc du") &&
        Stack_cudaOk(cudaMemcpy(dx,x,N*sizeof(double),cudaMemcpyHostToDevice),"cudaMemcpy x");

    cudaEvent_t dstart = 0,dstop = 0;
    float elapsedTime = 0.0f;
    ok = ok &&
        Stack_cudaOk(cudaEventCreate(&dstart),"cudaEventCreate start") &&
        Stack_cudaOk(cudaEventCreate(&dstop),"cudaEventCreate stop");

    if(ok){
        // Device figures are for a single launch, bracketed by two events
        // recorded on stream 0.
        // NOTE(review): this is the first kernel launch in the function, so
        // the double figure may include one-time start-up cost.
        cudaEventRecord(dstart,0);
        VectorOnNumber(dx,10.0,du,N);
        cudaEventRecord(dstop,0);
        if(Stack_cudaOk(cudaGetLastError(),"VectorOnNumber") &&
           Stack_cudaOk(cudaEventSynchronize(dstop),"cudaEventSynchronize") &&
           Stack_cudaOk(cudaEventElapsedTime(&elapsedTime,dstart,dstop),"cudaEventElapsedTime")){
            printf("device double %f ms\n",elapsedTime);
        }

        // The float run needs its own stop/synchronise/elapsed sequence;
        // otherwise the previous measurement would be printed again.
        cudaEventRecord(dstart,0);
        VectorOnNumberf(dfx,10.0f,dfu,N);
        cudaEventRecord(dstop,0);
        if(Stack_cudaOk(cudaGetLastError(),"VectorOnNumberf") &&
           Stack_cudaOk(cudaEventSynchronize(dstop),"cudaEventSynchronize") &&
           Stack_cudaOk(cudaEventElapsedTime(&elapsedTime,dstart,dstop),"cudaEventElapsedTime")){
            printf("device float  %f ms\n",elapsedTime);
        }
    }

    // Release everything that was acquired, including on the failure path.
    // Pointers that were never allocated are still 0 here.
    if(dstart) cudaEventDestroy(dstart);
    if(dstop) cudaEventDestroy(dstop);
    cudaFree(dx);
    cudaFree(du);
    cudaFree(dfx);
    cudaFree(dfu);
    delete [] x;
    delete [] u;
    delete [] fx;
    delete [] fu;
}

结果:主机 double:1.35 ms,float:0.45 ms;设备 double:0.067 ms,float:0.037 ms。

设备(GTX 1080)比主机(Xeon 3.50 GHz,8 核)快 10 倍。我将 N 设置为 10^6,以使耗时可以测量。

如果每个块只有一个线程,则设备上的时间为1.37毫秒!

相关问答

错误1:Request method ‘DELETE‘ not supported 错误还原:...
错误1:启动docker镜像时报错:Error response from daemon:...
错误1:private field ‘xxx‘ is never assigned 按Alt...
报错如下,通过源不能下载,最后警告pip需升级版本 Requirem...