问题描述
我目前正在尝试使 cublasSgelsBatched (https://docs.nvidia.com/cuda/cublas/index.html) 版本正常工作。我首先制作了一个小测试用例,以查看确切需要哪些参数以及如何输入它们。然而,经过多次反复试验,我仍然无法让它工作:我得到的状态返回值为 13,对应于 CUBLAS_STATUS_EXECUTION_FAILED,这是一个非常模糊的错误。我还尝试了其他一些 cuBLAS 测试用例,它们似乎工作正常。我也在 MATLAB 中测试了输入矩阵,它确实存在最小二乘 (LS) 解。
#include "stdafx.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cmath>
#include <Windows.h>
int main()
{
    // Get the current device id (note: reads `id` before it is set — harmless
    // here, cudaGetDevice overwrites it; kept as in the original question).
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat;
    // Create the cuBLAS handle.
    stat = cublasCreate(&m_cuBLAS);
    // Problem size: A is C x M (column-major, as cuBLAS expects), Y is C x 1.
    const int C = 3;
    const int M = 2;
    long lda = C;
    long ldb = M;
    // Host and device buffers.
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;
    // Allocate host memory. Xmat gets C entries because gYmat (C floats) is
    // copied back into it and Xmat[2] is printed below.
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C * sizeof(float));
    Xmat = (float*) malloc(C * sizeof(float));
    srand(100);
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
    // Allocate device memory.
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    // Copy host -> device.
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    // Info outputs for the batched solver.
    int info = 0;
    int devInfoArray[1] = { 0 };
    // Synchronize (not necessary, but kept from the question for testing).
    cudaDeviceSynchronize();
    // Run the batched least-squares solve.
    // NOTE(review): this is the bug the question is about — &gAmat and &gYmat
    // are HOST pointers to device pointers, but cublasSgelsBatched requires a
    // DEVICE-resident array of device pointers (Aarray/Carray). Passing host
    // addresses type-checks but fails at run time with
    // CUBLAS_STATUS_EXECUTION_FAILED (status 13).
    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS, CUBLAS_OP_N, C, M, 1,
                                               &gAmat, lda, // or 1
                                               &gYmat, lda, // or 1
                                               &info, NULL, 1);
    // Report status.
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
    // Copy the (overwritten) right-hand side back; the first M entries hold
    // the least-squares solution on success.
    cudaMemcpy(Xmat, gYmat, C * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << Xmat[0] << "," << Xmat[1] << "," << Xmat[2] << std::endl;
    // Free host memory.
    free(Amat);
    free(Ymat);
    free(Xmat);
    // Free device memory.
    cudaFree(gAmat);
    cudaFree(gYmat);
    // Destroy the handle.
    cublasDestroy(m_cuBLAS);
    return 0;
}
我在使用 CUDA 9.0 在 MVS 中运行的 Windows 10 上
非常感谢您的帮助
解决方法
正如评论中所指出的,您没有在设备上创建正确的指针数组。 batched function 使用位于设备内存中的指针数组,用于数据参数,例如:
Aarray device 输入/输出:指向矩阵的指针数组,每个矩阵的维度为 m x n,且 lda>=max(1,m)。矩阵 Aarray[i] 不应重叠;否则,会出现未定义的行为。
例如传递 &gAmat
似乎满足类型要求,但是那个指针不指向设备内存。
对您的代码的以下修改侧重于正确处理 gAmat
和 gYmat
对我来说似乎没有错误:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
    // Get the current device id.
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat;
    // Create the cuBLAS handle.
    stat = cublasCreate(&m_cuBLAS);
    // Problem size: A is C x M (column-major), Y is C x 1.
    const int C = 3;
    const int M = 2;
    long lda = C;
    long ldb = M;
    // Host and device buffers.
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;
    // Allocate host memory. Xmat holds C entries: gYmat (C floats) is copied
    // back into it and Xmat[2] is printed below (M entries would overflow).
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C * sizeof(float));
    Xmat = (float*) malloc(C * sizeof(float));
    srand(100);
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
    // Allocate device memory for the data.
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    // Copy host -> device.
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    // THE FIX: cublasSgelsBatched needs DEVICE-resident arrays of device
    // pointers (batch size 1 here), not host addresses of device pointers.
    float **ggAmat, **ggYmat;
    cudaMalloc(&ggAmat, sizeof(float*));
    cudaMalloc(&ggYmat, sizeof(float*));
    cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
    // Info outputs for the batched solver.
    int info = 0;
    int devInfoArray[1] = { 0 };
    // Synchronize (not necessary, but kept for testing).
    cudaDeviceSynchronize();
    // Run the batched least-squares solve with the device pointer arrays.
    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS, CUBLAS_OP_N, C, M, 1,
                                               ggAmat, lda, // or 1
                                               ggYmat, lda, // or 1
                                               &info, NULL, 1);
    // Report status.
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
    // Copy the right-hand side back; the first M entries are the solution.
    cudaMemcpy(Xmat, gYmat, C * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << Xmat[0] << "," << Xmat[1] << "," << Xmat[2] << std::endl;
    // Free host memory.
    free(Amat);
    free(Ymat);
    free(Xmat);
    // Free device memory, including the pointer arrays.
    cudaFree(gAmat);
    cudaFree(gYmat);
    cudaFree(ggAmat);
    cudaFree(ggYmat);
    // Destroy the handle.
    cublasDestroy(m_cuBLAS);
    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
t130.cu(15): warning: variable "stat" was set but never used
t130.cu(24): warning: variable "ldb" was declared but never referenced
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
info = 0
devInfoArray = 0
-0.0226168,0.514827,-4.29722
========= ERROR SUMMARY: 0 errors
$
您的代码仅显示一个数组。如果您有一批数组,您将为 A 和 Y 中的每一个传递一个实际的设备分配指针数组。
根据下面的评论,这里是使用非随机输入的代码版本:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
    // NOTE(review): this listing was badly garbled in transcription (several
    // lines fused or missing); reconstructed to match the previous listing
    // with the non-random inputs and printed results shown below.
    // Get the current device id.
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t status;
    // Create the cuBLAS handle and report its status.
    status = cublasCreate(&m_cuBLAS);
    std::cout << "status = " << status << std::endl;
    // Problem size: A is C x M (column-major), Y is C x 1.
    const int C = 3;
    const int M = 2;
    long lda = C;
    // Host and device buffers.
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;
    // Allocate host memory (Xmat holds C entries; Xmat[2] is printed).
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C * sizeof(float));
    Xmat = (float*) malloc(C * sizeof(float));
    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }
    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    // Fixed (non-random) inputs, column-major: columns of A are
    // (6,7,6) and (5,5,5); right-hand side Y is (9,3,10).
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;
    // Allocate device memory for the data.
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    // Copy host -> device.
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    // Device-resident arrays of device pointers (batch size 1), as required
    // by cublasSgelsBatched.
    float **ggAmat, **ggYmat;
    cudaMalloc(&ggAmat, sizeof(float*));
    cudaMalloc(&ggYmat, sizeof(float*));
    cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
    // Info outputs for the batched solver.
    int info = 0;
    int devInfoArray[1] = { 0 };
    // Synchronize (not necessary, but just to test).
    cudaDeviceSynchronize();
    // Run the batched least-squares solve.
    status = cublasSgelsBatched(m_cuBLAS, CUBLAS_OP_N, C, M, 1,
                                ggAmat, lda,
                                ggYmat, lda,
                                &info, NULL, 1);
    // Report status.
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
    // Copy the right-hand side back; the first M entries are the solution.
    cudaMemcpy(Xmat, gYmat, C * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << Xmat[0] << "," << Xmat[1] << "," << Xmat[2] << std::endl;
    // Free host memory.
    free(Amat);
    free(Ymat);
    free(Xmat);
    // Free device memory, including the pointer arrays.
    cudaFree(gAmat);
    cudaFree(gYmat);
    cudaFree(ggAmat);
    cudaFree(ggYmat);
    // Destroy the handle.
    cublasDestroy(m_cuBLAS);
    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
status = 0
info = 0
devInfoArray = 0
-6.5,9.7,0.707106
========= ERROR SUMMARY: 0 errors
$