问题描述
我正在尝试使用下面给出的代码进行 SVD,但是我没有得到 d_S 的正确结果,当 num_Matrices > 128 时,比如说 256,当 num_Matrices = 128*128 时我得到正确的结果,我指的是代码基于对堆栈溢出问题的回答,链接为 Parallel implementation for multiple SVDs using CUDA。
下面给出了我的代码,它将前 26 个矩阵的奇异值全部设为 0。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define FULLSVD
#define PRINTRESULTS
#define M_SQR(x) ((x)*(x))
__global__ void initSIGPU(cuComplex *SI,float *SDiag,int N,int M,int len,float epsilon);
void initSI(cuComplex *SI,float epsilon,int threadsPerBlock);
__global__ void initCZeros(cuComplex *in,int len);
void checkArray(cuComplex *ptr,int rows,int cols,const char *name);
void printArray(cuComplex *ptr,char mode,const char *name);
cuComplex complexMul(cuComplex a,cuComplex b);
double verifyDecomposition(cuComplex *U,float *S,cuComplex *V,cuComplex *A,int N);
void GPU_Multi(cuComplex **M,cuComplex **N,cuComplex **P,size_t pr,size_t pc,size_t mc,size_t num_mat,cuComplex alpha,cuComplex beta);
/********/
/* MAIN */
/********/
int main() {
const int M = 5;
const int N = 3;
const int K = 2;
int lda = M;
const int numMatrices = 128*3;
//const int numMatrices = 256;
cublasHandle_t cublasHandle = NULL;
/* residual and executed_sweeps are not supported on gesvdjBatched */
//double residual = 0;
int executed_sweeps = 0;
TimingGPU timerGPU;
cudaEvent_t start,stop;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cublasCreate(&cublasHandle);
cudaEventCreate(&start);
cudaEventCreate(&stop);
int major = -1,minor = -1,patch = -1;
cusolverGetProperty(MAJOR_VERSION,&major);
cusolverGetProperty(MInor_VERSION,&minor);
cusolverGetProperty(PATCH_LEVEL,&patch);
printf("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d\n",major,minor,patch);
// --- Setting the host matrix
cuComplex *h_A = (cuComplex *)malloc(lda * N * numMatrices * sizeof(cuComplex));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++) {
for (unsigned int j = 0; j < N; j++) {
h_A[k * M * N + j * M + i] = make_float2((rand() * 1.0 + k*0.1)/ RAND_MAX,(rand() * 1.0 + k*0.1) / RAND_MAX); //make_float2((1. / (k + 1)) * (i + j * j) * (i + j),(1. / (k + 1)) * (i + j * j) * (i + j));
//printf("[%d,%d] %f\n",i,j,h_A[j*M + i]);
//printf("%f %f ",h_A[j * M + i].x,h_A[j * M + i].y);
}
//printf("\n");
}
printArray(h_A,M,N,'T',"h_A1");
printArray(h_A + M * N,"h_A2");
cuComplex *d_b = NULL;
checkCudaErrors(cudamalloc((void **)&d_b,sizeof(cuComplex) * M * K * numMatrices));
cuComplex *h_b = (cuComplex *)malloc(sizeof(cuComplex) * M * K * numMatrices);
for (int k = 0; k < numMatrices; k++)
{
for (int i = 0; i < M; i++)
{
for (int j = 0; j < K; j++)
{
//h_b[i] = make_float2((i + 1) * 1.0,(i + 1) * 1.0);
if ((i == 0) || (i == 1) || (i == 2) || (i == 3) || (i == 4))
h_b[k * M * K + i * K + j] = make_float2(1.0,0.0);
else
h_b[k * M * K + i * K + j] = make_float2(2.0,0.0);
}
}
}
gpuErrchk(cudamemcpy(d_b,h_b,sizeof(cuComplex) * M * K * numMatrices,cudamemcpyHostToDevice));
printArray(h_b,K,'N',"h_b1");
printArray(h_b + M * K,"h_b2");
// --- Setting the device matrix and moving the host matrix to the device
cuComplex *d_A; gpuErrchk(cudamalloc(&d_A,M * N * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudamemcpy(d_A,h_A,M * N * numMatrices * sizeof(cuComplex),cudamemcpyHostToDevice));
// --- host side SVD results space
float *h_S = (float *)malloc(N * numMatrices * sizeof(float));
cuComplex *h_SI = (cuComplex *)malloc(N * M * numMatrices * sizeof(cuComplex));
cuComplex *h_x = (cuComplex *)malloc(N * numMatrices * sizeof(cuComplex));
cuComplex *h_U = NULL;
cuComplex *h_V = NULL;
#ifdef FULLSVD
h_U = (cuComplex *)malloc(M * M * numMatrices * sizeof(cuComplex));
h_V = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudamalloc(&devInfo,sizeof(int)));
float *d_S; gpuErrchk(cudamalloc(&d_S,N * numMatrices * sizeof(float)));
cuComplex *d_SI; gpuErrchk(cudamalloc(&d_SI,N * M * numMatrices * sizeof(cuComplex)));
cuComplex *d_x; gpuErrchk(cudamalloc(&d_x,N * numMatrices * sizeof(cuComplex)));
cuComplex *d_U = NULL;
cuComplex *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudamalloc(&d_U,M * M * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudamalloc(&d_V,N * N * numMatrices * sizeof(cuComplex)));
#endif
cuComplex *d_work = NULL; /* device workspace for gesvdj */
int devInfo_h = 0; /* host copy of error devInfo_h */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 100;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- Numerical result parameters of gesvdj
double residual = 0;
int executedSweeps = 0;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance,since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params,tol));
// --- Set the maximum number of sweeps,since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params,maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnCgesvdjBatched_bufferSize(
solver_handle,jobz,// --- Compute the singular vectors or not
M,// --- Number of rows of A,0 <= M
N,// --- Number of columns of A,0 <= N
d_A,// --- M x N
lda,// --- Leading dimension of A
d_S,// --- Square matrix of size min(M,N) x min(M,N)
d_U,// --- M x M if econ = 0,M x min(M,N) if econ = 1
lda,// --- Leading dimension of U,ldu >= max(1,M)
d_V,// --- N x N if econ = 0,N x min(M,// --- Leading dimension of V,ldv >= max(1,N)
&work_size,gesvdj_params,numMatrices));
gpuErrchk(cudamalloc(&d_work,sizeof(cuComplex) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnCgesvdjBatched(
solver_handle,N) if econ = 1
N,N)
d_work,work_size,devInfo,numMatrices));
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
printf("Calculation of the singular values only: %f ms\n\n",timerGPU.GetCounter());
/*
* The folowing two functions do not support batched version.
* The error CUSOLVER_STATUS_NOT_SUPPORTED is returned.
*/
status = cusolverDnXgesvdjGetSweeps(
solver_handle,&executed_sweeps);
assert(CUSOLVER_STATUS_NOT_SUPPORTED == status);
status = cusolverDnXgesvdjGetResidual(
solver_handle,&residual);
assert(CUSOLVER_STATUS_NOT_SUPPORTED == status);
gpuErrchk(cudamemcpy(&devInfo_h,sizeof(int),cudamemcpyDevicetoHost));
gpuErrchk(cudamemcpy(h_S,d_S,sizeof(float) * N * numMatrices,cudamemcpyDevicetoHost));
#ifdef FULLSVD
gpuErrchk(cudamemcpy(h_U,d_U,sizeof(cuComplex) * lda * M * numMatrices,cudamemcpyDevicetoHost));
gpuErrchk(cudamemcpy(h_V,d_V,sizeof(cuComplex) * N * N * numMatrices,cudamemcpyDevicetoHost));
#endif
}
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)