运行具有大数据集的程序时,程序命中cudaErrorIllegalAdress时没有cuda-memcheck错误

问题描述

我是cuda的新手,正在做一个小项目,以了解在执行程序时遇到以下错误时如何使用它:

CUDA error at main.cu:89 code=77(cudaErrorIllegalAddress) "cudaMemcpy(HMAT,MAT,int(size*sizeof(float)),cudaMemcpyDeviceToHost)"

该程序的目的是获得kmer距离矩阵,因此该程序将带有序列的fasta文件和将用于计算序列之间距离的k作为输入。 * fasta文件结构非常简单,它具有一系列标题和序列:

>Header 1
ACGTAGT...TCG
>Header 2
CAGAGT...ACT

程序的结构如下:

  1. 读取fasta文件,这是在主机上完成的。标头存储在vector<string>上,序列存储在vector<string>以及char**上,当我第一次编写程序时,没有并行化,我使用了向量和字符串,但是由于我意识到将这些向量和字符串转换为我决定使用选项char**的其他设备兼容的字符串和类似向量的结构是很困难的。

  2. 计算出kmer距离,这是通过kmdist函数在设备上完成的,该函数针对每个字符串(序列)比较调用(此部分内部调用了许多函数):>

    2.1。将字符串转换为所有大小为k的子字符串的字符串,例如,输入字符串“ abcde”,如果ak = 3,则出现字符串“ abcbcdcde”,因此,如果原始字符串的大小为N,那么输出的字符串的大小为(N-k + 1)* k。

    2.2。子字符串字符串按字母顺序排序(可以改进,但稍后会做),例如,如果输入字符串“ cddabcaba”,则出现字符串“ abaabccdd”。

    2.3。还比较了两个子字符串,以查看共享了多少个子字符串,例如:子字符串“ abccde”和子字符串“ abccddcde”将返回2,因为它们共享子字符串“ abc”和“ cde”。

    2.4。 kmer距离是使用公式计算的。

值得一提的是,即使保留了N * N(N为序列总数)线程,也仅进行必要的比较,这是因为矩阵是对称的,对角线为0,我知道肯定有更好的方法,但我想不出任何其他选择。

源代码为:

aligner.h:

#ifndef ALIGNER_H
#define ALIGNER_H
#include <string>
#include <vector>
#include <iostream>
#include <stdio.h>
#include <fstream>
#include <sstream>
using namespace std;

class aligner{
public:
  __host__ aligner();
  __host__ aligner(string name);
  __host__ ~aligner();
  __host__ double length();
  __host__ void getReads();
  char** h_reads;
  vector<string>* reads;
  vector<string>* headers;
  __device__ static void swapp(char* a,int id1,int id2,int k);
  __device__ static double compare(char* v1,char* v2,int* k);
  __device__ static float kmdist(char* A,char* B,int* k);
  __device__ static void veckm(char *test,char *a,int k);
private:
  string fileName;
  double noReads;
};
#endif

aligner.cu:

#include "aligner.h"

__host__ aligner::aligner(){}
__host__ aligner::aligner(string name){
  fileName = name;
}
__host__ aligner::~aligner(){}

__host__ double aligner::length(){
    double counter = 0;
    ifstream file(fileName.c_str());
    string data;
    while(getline(file,data)){
        if(data[0] == '>')counter++;
    }
    noReads =  counter;
    return counter;
}
__host__ void aligner::getReads(){
  reads = new vector<string>(noReads);
  h_reads = new char*[int(noReads)];
  headers =  new vector<string>(noReads);
  ifstream file(fileName.c_str());
  stringstream buffer;
  buffer << file.rdbuf();
  string data,id,seq;
  double indx = 0;
  while (getline(buffer,data)) {
    if(data.empty())
        continue;
    if(data[0] == '>') {
      if(!id.empty())
        {reads->at(indx) = seq;
        h_reads[int(indx)] = (char*)malloc((seq.size()+1)*sizeof(char));
        strcpy(h_reads[int(indx)],seq.c_str());
        h_reads[seq.size()] = '\0';
        headers->at(indx) = id;
        indx++;}
      id = data.substr(1);
      seq.clear();
    }
    else {
      seq += data;
    }
  }
  if(!id.empty()){
    reads->at(indx) = seq;
    h_reads[int(indx)] = (char*)malloc((seq.size()+1)*sizeof(char));
    strcpy(h_reads[int(indx)],seq.c_str());
    h_reads[seq.size()] = '\0';
    headers->at(indx) = id;
  }
}
__device__ void aligner::swapp(char* a,int k){
  char c;
  for(int i=0;i<k;i++){
    c = a[id1+i];
    a[id1+i] = a[id2+i];
    a[id2+i] = c;
  }
}
__device__ bool minstr(char* a,char* b,int k){
  for(int i =0;i<k;i++){
    if(a[i] < b[i]){return true;}
    if(a[i] > b[i]){return false;}
  }
  return false;
}
__device__ void alphsort(char* arr,int size,int k){
  char *tmp1,*tmp2;
  tmp2 = (char*)malloc(k*sizeof(char));
  tmp1 = (char*)malloc(k*sizeof(char));
  for(int i = 0;i<(size*k);i+=k){
    for(int l = 0;l<(size-1)*k;l+=k){
      for(int j = 0;j<k;j++)
        {tmp1[j] = arr[j+l];
        tmp2[j] = arr[j+l+k];}
      if(minstr(tmp2,tmp1,k))
      {aligner::swapp(arr,l,l+k,k);
      }
    }
  }
  free(tmp2);free(tmp1);
}
__device__ int cmpr(char* a,int k){
  for(int i =0;i<k;i++){
    if(a[i] < b[i]){return 0;} //a es menor
    if(a[i] > b[i]){return 1;} //b es mayor
  }
  return 2; //son iguales
}

__device__ double aligner::compare(char* v1,int* k){
  int len1 = 0,len2=0;
  double res = 0;
  int dk = *k;
  while (v1[len1] !='\0') {
    len1++;}
  while (v2[len2] !='\0') {
    len2++;}
  int size1 = len1/dk;
  int size2 = len2/dk;
  alphsort(v1,size1,dk);
  alphsort(v2,size2,dk);
  int i = 0,j=0;
  char *tmp1,*tmp2;
  tmp1 = (char*)malloc(dk*sizeof(char));
  tmp2 = (char*)malloc(dk*sizeof(char));
  int mm;
  while (i<len1&&j<len2) {
    for(int c=0;c<dk;c++){
      tmp1[c] = v1[c+i];
      tmp2[c] = v2[c+j];
    }
    mm = cmpr(tmp1,tmp2,dk);
    if(mm == 2){
      i+=dk;
      j+=dk;
      res++;
    }
    else{
        if (mm==0){i+=dk;}
        else{j+=dk;}
    }
  }
  free(tmp1);free(tmp2);
  return res;
}
__device__ float aligner::kmdist(char* A,int* k){
  float d = 1;
  int dk = *k;
  double m;
  double l;
  int len1 = 0,len2=0;
  while (A[len1] !='\0') {
    len1++;}
  while (B[len2] !='\0') {
    len2++;}
  char *v1,*v2;
  v1 = (char*)malloc((len1-dk+1)*dk*sizeof(char));
  v2 = (char*)malloc((len2-dk+1)*dk*sizeof(char));
  aligner::veckm(A,v1,dk);
  aligner::veckm(B,v2,dk);
  m = aligner::compare(v1,k);
  if(len1<len2){
    l = len1;}
  else{l = len2;}
  d -= m/(l-dk+1);
  free(v1);free(v2);
  return d;
}
__device__ void aligner::veckm(char *A,int k) {
  //Define size of string and size of array of substring
  int Size = 0;
  while (A[Size] != '\0') Size++;
  int n=0;
  for (int i=0; i < Size-(k-1); i++) {
    //Substrings obtainment
    for (int j = i; j < i+k ; j++){
      a[n] = A[j];
      n++;
    }
  }
}

main.cu:

#include "aligner.h"
#include <helper_cuda.h>
#include <time.h>
__global__ void kernel(float* MAT,char** a,int *k,double *len){
  int col = blockIdx.y*blockDim.y+threadIdx.y;
  int row = blockIdx.x*blockDim.x+threadIdx.x;
  int l = int(*len);
  if(col < l && row < l){
    if(col>row){
      MAT[row*l+col] = aligner::kmdist(a[row],a[col],k);
    }
  }
}
void compare(float* MAT,float* HMAT,double size,double len,char* r[],int k);
int main(int argc,char const *argv[]) {
  string file = argv[1];
  int K = atoi(argv[2]);
  aligner objAl(file);
  double len = objAl.length();
  objAl.getReads();
  double size = len*len;
  float* h_mat;
  h_mat = (float*)malloc(int(size*sizeof(float)));
  memset(h_mat,int(size*sizeof(float)));
  float* d_mat;
  checkCudaErrors(cudaMalloc((void**)&d_mat,int(size*sizeof(float))));
  checkCudaErrors(cudaMemcpy(d_mat,h_mat,cudaMemcpyHostToDevice));
  compare(d_mat,size,len,objAl.h_reads,K);
  //This was used to print the matrix and observe the output when it was small
  /*for(int i = 0;i<int(len);i++){
    for(int j = 0;j<(len);j++){
      printf("%f  ",h_mat[i*int(len)+j]);
    }
    printf("\n");
  }*/
  return 0;
}
//Call to the global function and make everything
void compare(float* MAT,int k){
  char **d_reads,**d_tmp;
  checkCudaErrors(cudaMalloc((void**)&d_reads,len*sizeof(char*)));
  d_tmp = (char**)malloc(len*sizeof(char*));
  int slen = 0;
  for(int i=0;i<len;i++){
    slen = strlen(r[i]);
    checkCudaErrors(cudaMalloc(&(d_tmp[i]),slen*sizeof(char)));
    checkCudaErrors(cudaMemcpy(d_tmp[i],r[i],slen*sizeof(char),cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_reads+i,&(d_tmp[i]),sizeof(char*),cudaMemcpyHostToDevice));
  }
  int *d_k;
  int* ptr_max_len = &k;
  checkCudaErrors(cudaMalloc((void**)&d_k,int(sizeof(int))));
  checkCudaErrors(cudaMemcpy(d_k,ptr_max_len,int(sizeof(int)),cudaMemcpyHostToDevice));
  double *d_len;
  double* d_tmp_len = &len;
  checkCudaErrors(cudaMalloc((void**)&d_len,int(sizeof(double))));
  checkCudaErrors(cudaMemcpy(d_len,d_tmp_len,int(sizeof(double)),cudaMemcpyHostToDevice));
  dim3 threadsPerBlock(len,len);
  dim3 blocksPerGrid(1,1);
  if (len*len > 256){
    threadsPerBlock.x = 16;
    threadsPerBlock.y = 16;
    blocksPerGrid.x = ceil(double(len)/double(threadsPerBlock.x));
    blocksPerGrid.y = ceil(double(len)/double(threadsPerBlock.y));
  }
  //Take time
  cudaEvent_t start,stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  //Parallel function
  cudaEventRecord(start,0);
  kernel<<<blocksPerGrid,threadsPerBlock>>>(MAT,d_reads,d_k,d_len);
  cudaDeviceSynchronize();
  cudaEventRecord(stop,0);
  cudaEventSynchronize(stop);
  float timer = 0;
  cudaEventElapsedTime(&timer,start,stop);
  cout << "Elapsed parallel time:" << timer/1000 << "seconds" << endl;
  cudaDeviceSynchronize();
  checkCudaErrors(cudaMemcpy(HMAT,cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
}

它是用以下命令编译的:

nvcc -rdc=true -lineinfo main.cu aligner.cu -o main

当它运行时,发生了这种情况:

-bash-4.1$ ./main ../prueba100.fasta 4
Elapsed parallel time:0seconds
CUDA error at main.cu:89 code=77(cudaErrorIllegalAddress) "cudaMemcpy(HMAT,cudaMemcpyDeviceToHost)" 

当我使用cuda-memcheck运行代码时,这是输出:

-bash-4.1$ cuda-memcheck ./main ../prueba100.fasta 4
========= CUDA-MEMCHECK
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaDeviceSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x404d6]
=========     Host Frame:./main [0x51a5]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaEventRecord. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x57572]
=========     Host Frame:./main [0x51b6]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaEventSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x513ae]
=========     Host Frame:./main [0x51c2]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaEventElapsedTime. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x5106c]
=========     Host Frame:./main [0x51e7]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
Elapsed parallel time:0seconds
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaDeviceSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x404d6]
=========     Host Frame:./main [0x5244]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
CUDA error at main.cu:89 code=77(cudaErrorIllegalAddress) "cudaMemcpy(HMAT,cudaMemcpyDeviceToHost)" 
========= Program hit cudaErrorIllegalAddress (error 77) due to "an illegal memory access was encountered" on CUDA API call to cudaMemcpy. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x3ac5a3]
=========     Host Frame:./main [0x42cbf]
=========     Host Frame:./main [0x527d]
=========     Host Frame:./main [0x4c29]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
=========     Host Frame:./main [0x4139]
=========
========= Error: process didn't terminate successfully
=========        The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors.
========= No CUDA-MEMCHECK results found

然后我使用cuda-gdb来查看是否可以诊断问题,但是我不知道我是否使用不正确,因此我不知道输出告诉了我什么,或者它根本没有帮助。

(cuda-gdb) run ../prueba100.fasta 3
Starting program: /home/ulises2/parallel_programing/ivan/Proyecto/Parallel/main ../prueba100.fasta 3
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: File "/share/apps/gcc/gcc6/lib64/libstdc++.so.6.0.22-gdb.py" auto-loading has been declined by your `auto-load safe-path' set to "$debugdir:$datadir/auto-load".
To enable execution of this file add
    add-auto-load-safe-path /share/apps/gcc/gcc6/lib64/libstdc++.so.6.0.22-gdb.py
line to your configuration file "/export/home/ulises2/.cuda-gdbinit".
To completely disable this security protection add
    set auto-load safe-path /
line to your configuration file "/export/home/ulises2/.cuda-gdbinit".
For more information about this security protection see the
"Auto-loading safe path" section in the GDB manual.  E.g.,run from the shell:
    info "(gdb)Auto-loading safe path"
[New Thread 0x7ffff6853700 (LWP 11540)]
[New Thread 0x7ffff5e52700 (LWP 11541)]

CUDA Exception: Device Illegal Address
The exception was triggered in device 0.

Thread 1 "main" received signal CUDA_EXCEPTION_10,Device Illegal Address.
[Switching focus to CUDA kernel 0,grid 1,block (1,1,0),thread (0,6,device 0,sm 6,warp 3,lane 0]
0x0000000000bb72b8 in aligner::kmdist(char*,char*,int*) ()

我从cuda-gdb输出中了解到,即使cuda-memcheck指出执行de cudaMemcpy时发生了错误,但该错误实际上发生在函数aligner::kmdist(char*,int*)上,该函数在内核内部被调用。

当我尝试只有2个序列的函数时,代码可以工作,但是需要指出的是,它非常慢,在比较100个序列时,这2个序列的实现与串行实现相当。

值得一提的是,当我用这2个序列和其他3个序列运行代码时,针对这2个序列计算的kmer距离发生了变化,总共进行了5个序列比较。

-bash-4.1$ ./main ../prueba2.fasta 3
Elapsed parallel time:8.58585seconds
0.000000  0.002770  
0.000000  0.000000  
-bash-4.1$ ./main ../prueba5.fasta 3
Elapsed parallel time:185.891seconds
0.000000  0.177778  0.000000  0.000000  0.000000  
0.000000  0.000000  0.005540  0.000000  0.000000  
0.000000  0.000000  0.000000  0.023438  0.150875  
0.000000  0.000000  0.000000  0.000000  0.021137  
0.000000  0.000000  0.000000  0.000000  0.000000  

如您所见,0.002770的值更改为0.177778,在某些时候我打印了veckm函数的输出以查看其是否正常工作,并且我注意到在第一个示例中它可以正常工作,输出是一个应该,但是在第二个示例中,即使函数相同,也不是,所以我假设它与我管理内存的方式有关,但是我不确定什么地方出错了。

我尝试与__shared__一起定义__syncthreads()数组,但是程序的输出保持不变,我还尝试减少每个块的线程数(这就是我使用的原因16),但最初的代码是:

if (len*len > 1024){
    threadsPerBlock.x = 32;
    threadsPerBlock.y = 32;
    blocksPerGrid.x = ceil(double(len)/double(threadsPerBlock.x));
    blocksPerGrid.y = ceil(double(len)/double(threadsPerBlock.y));
  }

用于尝试该程序的文件可以找到here

一些额外的信息:

  • 该程序是在使用Slurm的群集上执行的。所使用的群集分区具有2个Tesla K40m。

  • nvcc版本:

-bash-4.1 $ nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools,release 10.0,V10.0.130

我知道有很多可以改进的地方,但是我希望它在开始优化之前能够运行并起作用。

解决方法

您的内核malloc操作超过了the device heap size

每当您遇到在内核mallocnew中使用的CUDA代码遇到麻烦时,最好检查一下{{1} },然后再尝试使用(即取消引用)。

NULL中执行malloc操作之后,立即在您的代码中执行此操作时,我得到了命中断言,指示返回值为NULL。这表明您已超出设备堆。您可以增加设备堆的大小。

当我将设备堆大小增加到1GB时,此特定问题消失了,此时aligner::kmdist可能开始报告其他错误(我不知道,您的应用程序可能还存在其他缺陷,但最接近的问题这里超出了设备堆)。

顺便说一句,我还建议您编译代码以匹配所运行的体系结构:

cuda-memcheck

似乎您的代码中也存在另一个问题。

据我所见,strlen()返回字符串的长度不包括空终止符。您正在使用此长度来确定分配给字符串并传递给内核的字符串数组的大小:

nvcc -arch=sm_35 -rdc=true ... 

这意味着您没有为null终止符分配空间,也没有将null终止符复制到设备。

但是在您的设备代码( for(int i=0;i<len;i++){ slen = strlen(r[i]); checkCudaErrors(cudaMalloc(&(d_tmp[i]),slen*sizeof(char))); checkCudaErrors(cudaMemcpy(d_tmp[i],r[i],slen*sizeof(char),cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_reads+i,&(d_tmp[i]),sizeof(char*),cudaMemcpyHostToDevice)); } )中,您正在以下字符串上执行此操作:

aligner::kmdist

因此设计被破坏了。 (不确定为什么只传递长度数组,但是我离题了……)

我相信一个简单的解决方法可能是:

  while (A[len1] !='\0') {
    len1++;}
  while (B[len2] !='\0') {
    len2++;}

slen = strlen(r[i])+1; aligner::compare中也出现了类似的问题。 (即,此设计错误可能至少需要修复3个。)

您将aligner::kmdist变量用于doublesize之类的事情也让我感到奇怪。

使用以下修改的文件,

aligner.cu:

len

和main.cu:

#include "aligner.h"
#include <assert.h>
__host__ aligner::aligner(){}
__host__ aligner::aligner(string name){
  fileName = name;
}
__host__ aligner::~aligner(){}

__host__ double aligner::length(){
    double counter = 0;
    ifstream file(fileName.c_str());
    string data;
    while(getline(file,data)){
        if(data[0] == '>')counter++;
    }
    noReads =  counter;
    return counter;
}
__host__ void aligner::getReads(){
  reads = new vector<string>(noReads);
  h_reads = new char*[int(noReads)];
  headers =  new vector<string>(noReads);
  ifstream file(fileName.c_str());
  stringstream buffer;
  buffer << file.rdbuf();
  string data,id,seq;
  double indx = 0;
  while (getline(buffer,data)) {
    if(data.empty())
        continue;
    if(data[0] == '>') {
      if(!id.empty())
        {reads->at(indx) = seq;
        h_reads[int(indx)] = (char*)malloc((seq.size()+1)*sizeof(char));
        strcpy(h_reads[int(indx)],seq.c_str());
        h_reads[seq.size()] = '\0';
        headers->at(indx) = id;
        indx++;}
      id = data.substr(1);
      seq.clear();
    }
    else {
      seq += data;
    }
  }
  if(!id.empty()){
    reads->at(indx) = seq;
    h_reads[int(indx)] = (char*)malloc((seq.size()+1)*sizeof(char));
    strcpy(h_reads[int(indx)],seq.c_str());
    h_reads[seq.size()] = '\0';
    headers->at(indx) = id;
  }
}
__device__ void aligner::swapp(char* a,int id1,int id2,int k){
  char c;
  for(int i=0;i<k;i++){
    c = a[id1+i];
    a[id1+i] = a[id2+i];
    a[id2+i] = c;
  }
}
__device__ bool minstr(char* a,char* b,int k){
  for(int i =0;i<k;i++){
    if(a[i] < b[i]){return true;}
    if(a[i] > b[i]){return false;}
  }
  return false;
}
__device__ void alphsort(char* arr,int size,int k){
  char *tmp1,*tmp2;
  tmp2 = (char*)malloc(k*sizeof(char));
  tmp1 = (char*)malloc(k*sizeof(char));
  for(int i = 0;i<(size*k);i+=k){
    for(int l = 0;l<(size-1)*k;l+=k){
      for(int j = 0;j<k;j++)
        {tmp1[j] = arr[j+l];
        tmp2[j] = arr[j+l+k];}
      if(minstr(tmp2,tmp1,k))
      {aligner::swapp(arr,l,l+k,k);
      }
    }
  }
  free(tmp2);free(tmp1);
}
__device__ int cmpr(char* a,int k){
  for(int i =0;i<k;i++){
    if(a[i] < b[i]){return 0;} //a es menor
    if(a[i] > b[i]){return 1;} //b es mayor
  }
  return 2; //son iguales
}

__device__ double aligner::compare(char* v1,char* v2,int* k){
  int len1 = 0,len2=0;
  double res = 0;
  int dk = *k;
  while (v1[len1] !='\0') {
    len1++;}
  while (v2[len2] !='\0') {
    len2++;}
  int size1 = len1/dk;
  int size2 = len2/dk;
  alphsort(v1,size1,dk);
  alphsort(v2,size2,dk);
  int i = 0,j=0;
  char *tmp1,*tmp2;
  tmp1 = (char*)malloc(dk*sizeof(char));
  tmp2 = (char*)malloc(dk*sizeof(char));
  int mm;
  while (i<len1&&j<len2) {
    for(int c=0;c<dk;c++){
      tmp1[c] = v1[c+i];
      tmp2[c] = v2[c+j];
    }
    mm = cmpr(tmp1,tmp2,dk);
    if(mm == 2){
      i+=dk;
      j+=dk;
      res++;
    }
    else{
        if (mm==0){i+=dk;}
        else{j+=dk;}
    }
  }
  free(tmp1);free(tmp2);
  return res;
}
__device__ float aligner::kmdist(char* A,char* B,int* k){
  float d = 1;
  int dk = *k;
  double m;
  double l;
  int len1 = 0,len2=0;
  while (A[len1] !='\0') {
    len1++;}
  while (B[len2] !='\0') {
    len2++;}
  char *v1,*v2;
  len1++;
  len2++;
  v1 = (char*)malloc((len1-dk+1)*dk*sizeof(char));
  v2 = (char*)malloc((len2-dk+1)*dk*sizeof(char));
  assert(v1 != NULL);
  assert(v2 != NULL);
  aligner::veckm(A,v1,dk);
  aligner::veckm(B,v2,dk);
  m = aligner::compare(v1,k);
  if(len1<len2){
    l = len1;}
  else{l = len2;}
  d -= m/(l-dk+1);
  free(v1);free(v2);
  return d;
}
__device__ void aligner::veckm(char *A,char *a,int k) {
  //Define size of string and size of array of substring
  int Size = 0;
  while (A[Size] != '\0') Size++;
  Size++;
  int n=0;
  for (int i=0; i < Size-(k-1); i++) {
    //Substrings obtainment
    for (int j = i; j < i+k ; j++){
      a[n] = A[j];
      n++;
    }
  }
}

该代码将在K20X上无误地为我运行:

#include "aligner.h"
#include <helper_cuda.h>
#include <time.h>
__global__ void kernel(float* MAT,char** a,int *k,double *len){
  int col = blockIdx.y*blockDim.y+threadIdx.y;
  int row = blockIdx.x*blockDim.x+threadIdx.x;
  int l = int(*len);
  if(col < l && row < l){
    if(col>row){
      MAT[row*l+col] = aligner::kmdist(a[row],a[col],k);
    }
  }
}
void compare(float* MAT,float* HMAT,double size,double len,char* r[],int k);
int main(int argc,char const *argv[]) {
  checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize,1048576ULL*256ULL));
  string file = argv[1];
  int K = atoi(argv[2]);
  aligner objAl(file);
  double len = objAl.length();
  objAl.getReads();
  double size = len*len;
  float* h_mat;
  h_mat = (float*)malloc(int(size*sizeof(float)));
  memset(h_mat,int(size*sizeof(float)));
  float* d_mat;
  checkCudaErrors(cudaMalloc((void**)&d_mat,int(size*sizeof(float))));
  checkCudaErrors(cudaMemcpy(d_mat,h_mat,int(size*sizeof(float)),cudaMemcpyHostToDevice));
  compare(d_mat,size,len,objAl.h_reads,K);
  //This was used to print the matrix and observe the output when it was small
  /*for(int i = 0;i<int(len);i++){
    for(int j = 0;j<(len);j++){
      printf("%f  ",h_mat[i*int(len)+j]);
    }
    printf("\n");
  }*/
  return 0;
}
//Call to the global function and make everything
void compare(float* MAT,int k){
  char **d_reads,**d_tmp;
  checkCudaErrors(cudaMalloc((void**)&d_reads,len*sizeof(char*)));
  d_tmp = (char**)malloc(len*sizeof(char*));
  int slen = 0;
  for(int i=0;i<len;i++){
    slen = strlen(r[i])+1;
    checkCudaErrors(cudaMalloc(&(d_tmp[i]),cudaMemcpyHostToDevice));
  }
  int *d_k;
  int* ptr_max_len = &k;
  checkCudaErrors(cudaMalloc((void**)&d_k,int(sizeof(int))));
  checkCudaErrors(cudaMemcpy(d_k,ptr_max_len,int(sizeof(int)),cudaMemcpyHostToDevice));
  double *d_len;
  double* d_tmp_len = &len;
  checkCudaErrors(cudaMalloc((void**)&d_len,int(sizeof(double))));
  checkCudaErrors(cudaMemcpy(d_len,d_tmp_len,int(sizeof(double)),cudaMemcpyHostToDevice));
  dim3 threadsPerBlock(len,len);
  dim3 blocksPerGrid(1,1);
  if (len*len > 256){
    threadsPerBlock.x = 16;
    threadsPerBlock.y = 16;
    blocksPerGrid.x = ceil(double(len)/double(threadsPerBlock.x));
    blocksPerGrid.y = ceil(double(len)/double(threadsPerBlock.y));
  }
  //Take time
  cudaEvent_t start,stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  //Parallel function
  cudaEventRecord(start,0);
  printf("threads x = %d,threads y = %d,block x = %d,block y = %d\n",threadsPerBlock.x,threadsPerBlock.y,blocksPerGrid.x,blocksPerGrid.y);
  kernel<<<blocksPerGrid,threadsPerBlock>>>(MAT,d_reads,d_k,d_len);
  cudaDeviceSynchronize();
  cudaEventRecord(stop,0);
  cudaEventSynchronize(stop);
  float timer = 0;
  cudaEventElapsedTime(&timer,start,stop);
  cout << "Elapsed parallel time:" << timer/1000 << "seconds" << endl;
  cudaDeviceSynchronize();
  checkCudaErrors(cudaMemcpy(HMAT,MAT,cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
}

相关问答

错误1:Request method ‘DELETE‘ not supported 错误还原:...
错误1:启动docker镜像时报错:Error response from daemon:...
错误1:private field ‘xxx‘ is never assigned 按Alt...
报错如下,通过源不能下载,最后警告pip需升级版本 Requirem...