Problem description
Nvprof reports roughly 200 million shared_ld_bank_conflict events and some shared_st_bank_conflict events in my sgemm kernel. I tried the padding trick __shared__ float smem[SIZE + OFFSET]; which reduced the store bank conflicts to zero, but the load bank conflicts are still there. I don't know how to improve it any further.
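For reference, shared memory on this architecture has 32 four-byte banks, and a float at flat offset off within shared memory is served by bank off % 32. A minimal sketch of the padding idea, with illustrative names not taken from the kernel below:

__shared__ float tile[32][32];       // tile[r][c] sits in bank (r*32 + c) % 32 = c,
                                     // so walking a column is a 32-way conflict
__shared__ float padded[32][32 + 1]; // padded[r][c] sits in bank (r*33 + c) % 32,
                                     // so each row shifts by one bank and walking
                                     // a column touches 32 distinct banks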
__global__ void sgemm(
    const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C,
    int M, int N, int K
){
    int tid = threadIdx.x;
    int gStartx = blockIdx.x * 128;
    int gStarty = blockIdx.y * 128;

    // (dx, dy): 8-wide (k) by 32-tall layout used for global loads;
    // (vx, vy): 16x16 layout used for compute, each thread owning
    // an 8x8 patch of C.
    int dx = tid % 8;
    int dy = tid / 8;
    int vx = tid % 16;
    int vy = tid / 16;

    __shared__ volatile float aSM[8][128+4];
    __shared__ volatile float bSM[8][128+4];
    float aBuffer1[4];
    float bBuffer1[4];
    float aBuffer2[4];
    float bBuffer2[4];

    float cCache[8][8];
#pragma unroll
    for (int i=0; i<8; i++)
#pragma unroll
        for (int j=0; j<8; j++)
            cCache[i][j] = 0.f;

    // load the first A and B tiles into the register buffers
#pragma unroll
    for (int i=0; i<4; i++){
        aBuffer1[i] = A[(gStarty + dy + i*32)*K + (dx)];
        bBuffer1[i] = B[(gStartx + dy + i*32)*K + (dx)];
    }

    // main loop over K in steps of 8, double-buffered between
    // aBuffer1/bBuffer1 and aBuffer2/bBuffer2
    int nIt = (K + 8 - 1) / 8;
#pragma unroll
    for (int itr=0; itr<nIt; itr++){
        int gStartk = itr * 8;
        int is_odd = itr & 1;
        if (is_odd == 0){
#pragma unroll
            for (int i=0; i<4; i++){
                if (itr != (nIt - 1)){
                    // prefetch next tiles
                    aBuffer2[i] = A[(gStarty + i*32 + dy)*K + (gStartk + 8 + dx)];
                    bBuffer2[i] = B[(gStartx + i*32 + dy)*K + (gStartk + 8 + dx)];
                }
                // move current tiles to SMEM
                aSM[dx][dy+i*32] = aBuffer1[i];
                bSM[dx][dy+i*32] = bBuffer1[i];
            }
        } else {
#pragma unroll
            for (int i=0; i<4; i++){
                if (itr != (nIt - 1)){
                    // prefetch next tiles into the other buffer
                    aBuffer1[i] = A[(gStarty + i*32 + dy)*K + (gStartk + 8 + dx)];
                    bBuffer1[i] = B[(gStartx + i*32 + dy)*K + (gStartk + 8 + dx)];
                }
                aSM[dx][dy+i*32] = aBuffer2[i];
                bSM[dx][dy+i*32] = bBuffer2[i];
            }
        }
        __syncthreads();

        // compute the 8x8 patch in two halves of 4 rows each
        float aCache[8][4];
#pragma unroll
        for (int p=0; p<2; p++){
#pragma unroll
            for (int ki=0; ki<8; ki++){
#pragma unroll
                for (int mi=0; mi<4; mi++){
                    aCache[ki][mi] = aSM[ki][8*vy + 4*p + mi];
                }
            }
#pragma unroll
            for (int ki=0; ki<8; ki++){
#pragma unroll
                for (int ni=0; ni<8; ni++){
                    float b = bSM[ki][8*vx + ni];
#pragma unroll
                    for (int mi=0; mi<4; mi++){
                        float a = aCache[ki][mi];
                        cCache[mi + 4*p][ni] = fma(a, b, cCache[mi + 4*p][ni]);
                    }
                }
            }
        }
        __syncthreads();
    }

    // write back this thread's 8x8 patch of C
#pragma unroll
    for (int i=0; i<8; i++){
        for (int j=0; j<8; j++){
            C[(gStarty + vy*8 + i)*N + (gStartx + vx*8 + j)] = cCache[i][j];
        }
    }
}
A (2048x2048) is row-major and B (2048x2048) is column-major; each block has 256 threads, each block computes a 128x128 tile of C, and each thread computes 8x8x8. The GPU is a Tesla P100.
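For context, a minimal host-side launch consistent with this setup could look like the following; the grid mapping is inferred from gStartx/gStarty in the kernel, and allocation/copies are omitted:

// Assumed launch for M = N = K = 2048: one 128x128 tile of C per block,
// 256 threads per block.
dim3 block(256);
dim3 grid(N / 128, M / 128); // blockIdx.x walks the N (column) tiles,
                             // blockIdx.y the M (row) tiles
sgemm<<<grid, block>>>(A, B, C, M, N, K);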
Solution
Well, I found a solution: when storing into bSM, insert one padding word between every 32 words along the second dimension:
//bSM[dx][dy+i*32] = bBuffer1[i];
bSM[dx][dy+i*33] = bBuffer1[i]; // we're skipping columns 32, 65, 98, 131
And when reading bSM[i][j], read it as bSM[i][j/32 + j]:
//float b = bSM[ki][8*vx + ni];
float b = bSM[ki][(8*vx)/32 + 8*vx + ni];
// (8*vx + ni)/32 is the same as (8*vx)/32, since ni is always less than 8
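To see why this removes the load conflicts (my own bank arithmetic, not from the original post): within a warp, threads whose vx values differ by 4 read offsets 32 floats apart, i.e. the same bank, so the unpadded read is a 4-way conflict; the skipped word shifts each 32-float group by one bank, spreading the 16 vx values over 16 distinct banks. (The two half-warps share vx values but read identical addresses, which the hardware serves as a broadcast.) A small host-side check of the two mappings:

#include <cstdio>

// Prints the bank hit by each vx value: 'before' repeats every 4 threads
// (4-way conflict), 'after' gives 16 distinct banks (conflict-free).
int main() {
    for (int vx = 0; vx < 16; vx++) {
        int before = (8 * vx) % 32;                 // unpadded mapping
        int after  = (8 * vx + (8 * vx) / 32) % 32; // padded mapping
        printf("vx=%2d  bank before=%2d  after=%2d\n", vx, before, after);
    }
    return 0;
}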
Now it reaches about 55% of cuBLAS gemm performance on a Tesla P4.