问题描述
我是 cuda 的新手,我正在尝试找到三个矩阵的逆矩阵。矩阵是 [4 8;3 9]。我想编写一个 cuda 内核来计算 GPU 上所有三个矩阵的逆。这是我写的代码。但我收到此错误:ptxas fatal : Unresolved extern function 'cublasCreate_v2'
#include <stdio.h>
#include <cublas_v2.h>
__global__ void copy(float** a,float* b,float** c,float* d,int count){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx < count){
a[idx] = b + idx*2*2 ;
c[idx] = d + idx*2*2;
}
__syncthreads();
}
__global__ void inv(float** a,float** c){
int Nmatrices = 3; int N = 2;
cublasHandle_t handle;
cublasCreate(&handle);
int *h_PivotArray = (int *)malloc(N*Nmatrices*sizeof(int));
int *h_InfoArray = (int *)malloc( Nmatrices*sizeof(int));
cublasSgetrfBatched(handle,N,a,h_PivotArray,h_InfoArray,Nmatrices);
cublasSgetriBatched(handle,c,Nmatrices);
cublasDestroy(handle);
}
int main() {
int N = 2;
int Nmatrices = 3;//number of batches
// --- Matrices to be inverted
float *h_A = new float[N*N*Nmatrices];
float *r_A = new float[N*N*Nmatrices];//result
h_A[0] = 4.f;
h_A[1] = 3.f;
h_A[2] = 8.f;
h_A[3] = 9.f;
h_A[4] = 4.f;
h_A[5] = 3.f;
h_A[6] = 8.f;
h_A[7] = 9.f;
h_A[8] = 4.f;
h_A[9] = 3.f;
h_A[10] = 8.f;
h_A[11] = 9.f;
int count = Nmatrices;
// --- Allocate device matrices
float *d_A; cudamalloc((void**)&d_A,N*N*Nmatrices*sizeof(float));
float *c_A; cudamalloc((void**)&c_A,N*N*Nmatrices*sizeof(float));
// --- Move the matrix to be inverted from host to device
// --- Creating the array of pointers needed as input to the batched getrf
float **d_inout_pointers;
cudamalloc((void**)&d_inout_pointers,Nmatrices*sizeof(float *));
float **rd_inout_pointers;
cudamalloc((void**)&rd_inout_pointers,Nmatrices*sizeof(float *));
int *d_PivotArray; cudamalloc((void**)&d_PivotArray,N*Nmatrices*sizeof(int));
int *d_InfoArray; cudamalloc((void**)&d_InfoArray,Nmatrices*sizeof(int));
for(int i = 0; i<2; i++){
cudamemcpy(d_A,h_A,N*N*Nmatrices*sizeof(float),cudamemcpyHostToDevice);
copy<<<1,10>>>(d_inout_pointers,d_A,rd_inout_pointers,c_A,count);
inv<<<1,1>>>(d_inout_pointers,rd_inout_pointers);
cudamemcpy(h_A,N*N*sizeof(float),cudamemcpyDevicetoHost);
cudamemcpy(r_A,Nmatrices*N*N*sizeof(float),cudamemcpyDevicetoHost);
for (int i=0; i<N*N*Nmatrices; i++) printf("A[%i]=%f\n",i,r_A[i]);
h_A = r_A;
}
cudaFree(c_A);
cudaFree(d_A);
cudaFree(d_inout_pointers);
cudaFree(rd_inout_pointers);
cudaFree(d_InfoArray);
cudaFree(d_PivotArray);
return 0;
}
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)