Using Numba

Problem description

I need to multiply a matrix with its transpose, but my GPU runs out of memory and reports the error message numba.cuda.cudadrv.driver.CudaAPIError: [2] Call to cuMemAlloc results in CUDA_ERROR_OUT_OF_MEMORY

I expect the matrix to have roughly 10k rows and 100k columns, so multiplying it with its transpose gives a square result of 10k rows by 10k columns. The matrix contains only 0s and 1s.

Here is the script I am running.

from numba import cuda,uint16
import numba
import numpy
import math
import time

TPB = 16

@cuda.jit()
def matmul_shared_mem(A,B,C):
    sA = cuda.shared.array((TPB,TPB),dtype=uint16)
    sB = cuda.shared.array((TPB,TPB),dtype=uint16)
    x,y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] and y >= C.shape[1]:
        return
    tmp = 0.
    for i in range(int(A.shape[1] / TPB)):
        sA[tx,ty] = A[x,ty + i * TPB]
        sB[tx,ty] = B[tx + i * TPB,y]
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[tx,j] * sB[j,ty]
        cuda.syncthreads()
    C[x,y] = tmp

A = numpy.random.randint(2,size=(TPB * 625,50000))

B = A.transpose()

C_shared_mem = cuda.device_array((A.shape[0],B.shape[1]))

threads_per_block = (TPB,TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(B.shape[1] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x,blocks_per_grid_y)

start_gpu_shared_memory = time.time()
matmul_shared_mem[blocks_per_grid,threads_per_block](A,B,C_shared_mem)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))

Update 1:

Based on your suggestions, I made some changes, but I am still running out of memory.

import numpy as np
import numba as nb
colm = int(200000/8)
rows = 100000
cols = int(colm*8)
AU = np.random.randint(2,size=(rows,cols),dtype=np.int8)
A = np.empty((rows,colm),dtype=np.uint8)

@nb.njit('void(uint8[:,:],int8[:,:])',parallel=True)
def compute(A,AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res

compute(A,AU)
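
# Optional cross-check (not in the original post): when the column count is an exact
# multiple of 8, NumPy's np.packbits performs the same MSB-first packing, so it can be
# used to verify (or replace) compute(), e.g.:
#   assert np.array_equal(A, np.packbits(AU.astype(np.uint8), axis=1))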

from numba import cuda,uint8,int32
import numba
import numpy as np
import math
import time

TPB = 8
TPB1 = 9

@cuda.jit()
def bit_A_AT(A,C):
    sA = cuda.shared.array((TPB,TPB1),dtype=uint8)
    sB = cuda.shared.array((TPB,TPB1),dtype=uint8)
    x,y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    if bx >= by:
        tmp = int32(0)
        for i in range((A.shape[1]+TPB-1) // TPB):
            if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sA[ty,tx] = A[y,i*TPB+tx]
            else:
                sA[ty,tx] = 0
            if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
                sB[ty,tx] = A[TPB*bx+ty,i*TPB+tx]
            else:
                sB[ty,tx] = 0
            cuda.syncthreads()
            for j in range(TPB):
                tmp1 = sA[ty,j] & sB[tx,j]
                test = uint8(1)
                for k in range(8):
                    if (tmp1 & test) > 0:
                        tmp += 1
                    test <<= 1
            cuda.syncthreads()
        if y < C.shape[0] and x < C.shape[1]:
            C[y,x] = tmp

C = np.empty((A.shape[0],A.shape[0]),dtype=np.int32)
threads_per_block = (TPB,TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x,blocks_per_grid_y)

start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid,threads_per_block](A,C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))

Any idea how I can fix this?

Solution

The following method should reduce the amount of device memory needed to compute A x AT. We will use the following ideas:

  • Since the input array (A) only takes the values 0 and 1, we reduce the storage for that array to the smallest convenient size, int8, i.e. one byte per element
  • Since the B array is simply the transpose of the A array, there is no need to handle it explicitly. We can derive it from the A array, somewhat like here, although that example performs AT x A
  • The matrix multiplication A x AT involves taking the dot products of the rows of matrix A, as described here (a small NumPy check of this appears after the list)
  • We provide the transposed version of A in the sB array by using adjusted indexing
  • A range of other changes were made to your code to address various errors and to improve load/store efficiency, such as a general reversal of your usage of the x,y indices
  • I also fixed your usage of syncthreads and modified the code to allow arbitrary values for the row and column dimensions
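
As a quick illustration of the second and third bullets (a minimal NumPy sketch, not part of the GPU code): element (i,j) of A x AT is just the dot product of rows i and j of A, so the transpose never has to be materialized:

import numpy as np
A = np.random.randint(2, size=(4, 7))
C = np.matmul(A, A.T)                        # A.T is a view, not a copy
i, j = 1, 3
assert C[i, j] == np.dot(A[i, :], A[j, :])   # entry (i,j) == dot product of rows i and j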

Here is a worked example:

$ cat t62.py
from numba import cuda,int32,int8
import numba
import numpy as np
import math
import time

TPB = 32
TPB1 = TPB+1
@cuda.jit()
def byte_A_AT(A,C):
    sA = cuda.shared.array((TPB,TPB),dtype=int8)
    sB = cuda.shared.array((TPB,TPB1),dtype=int8)
    x,y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
# uncomment and indent remainder of kernel to only do the "symmetric half" of calculation
#    if bx >= by:
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1)// TPB):
        if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty,tx] = A[y,i*TPB+tx]
        else:
            sA[ty,tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty,tx] = A[TPB*bx+ty,i*TPB+tx]
        else:
            sB[ty,tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp += int32(sA[ty,j]) * int32(sB[tx,j])
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y,x] = tmp
rows = 1041
cols = 1043
print('host mem: ',(rows*cols*2+rows*rows*4*2)//1048576,'MB device mem: ',(rows*cols+rows*rows*4)//1048576,'MB')
A = np.random.randint(2,size=(rows,cols),dtype=np.int8)
AT = A.transpose()
CU = np.matmul(A,AT,dtype = np.int32)
C = np.empty((A.shape[0],A.shape[0]),dtype=np.int32)
threads_per_block = (TPB,TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x,blocks_per_grid_y)
byte_A_AT[blocks_per_grid,threads_per_block](A,C)
cuda.synchronize()
start_gpu_shared_memory = time.time()
byte_A_AT[blocks_per_grid,threads_per_block](A,C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C,CU)
print(test)
if test == False:
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            if C[i,j] != CU[i,j]:
                print(i,' ',j,C[i,j],CU[i,j])
$ python t62.py
host mem:  10 MB device mem:  5 MB
GPU time(shared memory):0.019593000411987305
True
$

Notes:

  • Most of the runtime of the code above is spent in Python (in the np.matmul() operation, which is really only used to verify the result and should not be needed in an actual implementation), not in the GPU portion. As the matrices are made larger, the code runs much more slowly.
  • As mentioned in the comments, the result of A x AT is a symmetric matrix. My code does not take advantage of this, but we could roughly do so by uncommenting the if test at the beginning of the kernel and indenting the remainder of the kernel. However, that will of course cause the host-side np.array_equal test to fail. (A way to rebuild the full matrix from the computed half is sketched after this list.)
  • The device memory consumption for this is calculated in the code. For your largest values in the comments (rows = 30k, columns = 200k) this comes to about 10GB, so it still will not run in your 8GB GPU. (See the back-of-envelope arithmetic after this list.)
  • I have created a version of this code which packs 8 elements of the A matrix per byte, which reduces memory demand further, but writing that code to handle the case of arbitrary column dimensions (rather than multiples of 8) proved to be rather messy. However, that version can bring the total device memory consumption for the 30k-row, 200k-column case down to about 5GB.
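
To make the two memory figures above concrete, here is the back-of-envelope arithmetic for the quoted sizes, together with one way (my suggestion, not part of the original code) to rebuild the full symmetric result on the host if the "symmetric half" kernel variant is used:

import numpy as np

rows, cols = 30000, 200000
A_int8_bytes   = rows * cols            # one byte per element
A_packed_bytes = rows * cols // 8       # 8 elements per byte
C_bytes        = rows * rows * 4        # int32 result
print((A_int8_bytes   + C_bytes) / 1e9) # ~9.6 GB  -> "about 10GB"
print((A_packed_bytes + C_bytes) / 1e9) # ~4.35 GB -> "about 5GB"

# If only the block-upper-triangle of C is computed (the commented-out "if bx >= by"
# variant), the full matrix can be recovered afterwards with:
#   C = np.triu(C) + np.triu(C, 1).T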

Here is the one-bit-per-element version; it requires that the column count be divisible by 8:

$ cat t61.py
from numba import cuda,uint8,int32
import numba
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_A_AT(A,C):
    sA = cuda.shared.array((TPB,TPB1),dtype=uint8)
    sB = cuda.shared.array((TPB,TPB1),dtype=uint8)
    x,y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty,tx] = A[y,i*TPB+tx]
        else:
            sA[ty,tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty,tx] = A[TPB*bx+ty,i*TPB+tx]
        else:
            sB[ty,tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx,j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y,x] = tmp
colm = 131
rows = 1041
cols = int(colm*8)
print('host mem: ',(rows*cols*2+rows*rows*4*2)//1048576,'MB device mem: ',(((rows*cols)//8)+rows*rows*4)//1048576,'MB')
AU = np.random.randint(2,size=(rows,cols),dtype=np.int8)
AUT = AU.transpose()
CU = np.matmul(AU,AUT,dtype=np.int32)
A = np.empty((rows,colm),dtype=np.uint8)
for i in range(A.shape[0]):
    for j in range(A.shape[1]):
        A[i,j] = 0
        for k in range(8):
            if AU[i,(j*8)+k] == 1:
                A[i,j] = A[i,j] | (1<<(7-k))
C = np.empty((A.shape[0],A.shape[0]),dtype=np.int32)
threads_per_block = (TPB,TPB)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(math.ceil(A.shape[0] / threads_per_block[1]))
blocks_per_grid = (blocks_per_grid_x,blocks_per_grid_y)
bit_A_AT[blocks_per_grid,threads_per_block](A,C)
cuda.synchronize()

start_gpu_shared_memory = time.time()
bit_A_AT[blocks_per_grid,threads_per_block](A,C)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
test = np.array_equal(C,CU)
print(test)
if test == False:
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            if C[i,j] != CU[i,j]:
                print(i,' ',j,C[i,j],CU[i,j])
                break
$ python t61.py
host mem:  10 MB device mem:  4 MB
GPU time(shared memory):0.009343624114990234
True
$
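
One simple way around the divisible-by-8 restriction of this version (my suggestion, not part of the original answer) is to zero-pad the columns of AU to the next multiple of 8 before packing; the extra zero columns add nothing to any dot product, so A x AT is unchanged:

pad = (-cols) % 8                         # columns needed to reach a multiple of 8
if pad:
    AU = np.pad(AU, ((0, 0), (0, pad)))   # append zero-valued columns
colm = AU.shape[1] // 8                   # packed width seen by the kernel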

EDIT: Responding to some questions in the comments and in the update, which now indicates that the A matrix may have significantly more than 30k rows, which also causes the C matrix to grow. If the A matrix fits in GPU memory, we can reduce the memory demand of the C matrix by computing it in pieces. Each piece is a group of rows computed together, which I will refer to as a row_slice of the C matrix. The following code demonstrates that this can be achieved with relatively minor changes to the code above:

$ cat t63.py
from numba import cuda,uint8,int32
import numba as nb
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_slice_A_AT(A,C,row_offset):
    sA = cuda.shared.array((TPB,TPB1),dtype=uint8)
    sB = cuda.shared.array((TPB,TPB1),dtype=uint8)
    x,y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    tmp = int32(0)
    for i in range((A.shape[1]+TPB-1) // TPB):
        if y+row_offset < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sA[ty,tx] = A[y+row_offset,i*TPB+tx]
        else:
            sA[ty,tx] = 0
        if (TPB*bx+ty) < A.shape[0] and (i*TPB+tx) < A.shape[1]:
            sB[ty,tx] = A[TPB*bx+ty,i*TPB+tx]
        else:
            sB[ty,tx] = 0
        cuda.syncthreads()
        for j in range(TPB):
            tmp1 = sA[ty,j] & sB[tx,j]
            test = uint8(1)
            for k in range(8):
                if (tmp1 & test) > 0:
                    tmp += 1
                test <<= 1
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y,x] = tmp

@nb.njit('void(uint8[:,:],int8[:,:])',parallel=True)
def bitpack(A,AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = AU[i,offset] << 7
            res |= AU[i,offset+1] << 6
            res |= AU[i,offset+2] << 5
            res |= AU[i,offset+3] << 4
            res |= AU[i,offset+4] << 3
            res |= AU[i,offset+5] << 2
            res |= AU[i,offset+6] << 1
            res |= AU[i,offset+7]
            A[i,j] = res

colm = 131
rows = 1535
cols = int(colm*8)
row_slice = 512
print('host mem: ',(rows*cols*2+rows*rows*4*2)//1048576,'MB device mem: ',(((rows*cols)//8)+row_slice*rows*4)//1048576,'MB')
AU = np.random.randint(2,size=(rows,cols),dtype=np.int8)
CU = np.matmul(AU,AU.T,dtype=np.int32)
A = np.empty((rows,colm),dtype=np.uint8)
bitpack(A,AU)
A_dev = cuda.to_device(A)
threads_per_block = (TPB,TPB)
C = np.empty((row_slice,A.shape[0]),dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x,blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid,threads_per_block](A_dev,C,i*row_slice)
    lower = i*row_slice
    upper = min(lower+row_slice,CU.shape[0])
    width = upper-lower
    test = np.array_equal(C[:width,:],CU[lower:upper,:])
    print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid,threads_per_block](A_dev,C_dev,i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t63.py
host mem:  21 MB device mem:  3 MB
True
True
True
GPU time(shared memory):0.010116815567016602
$

This means that, as suggested, for the case of rows = 100k and columns = 200k given in the latest update to the question, we should be able to break the C matrix into chunks of, say, 5k rows. The memory usage for the A matrix would be 2.5GB, but for the C matrix, since we only compute a 5k-row slice at a time, the device memory required is 100k*5k*4 bytes, i.e. 2GB for this example.
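
The arithmetic behind those numbers (using the sizes from the latest update to the question):

rows, cols, row_slice = 100000, 200000, 5000
A_packed_bytes = rows * (cols // 8)      # bit-packed A: 2.5 GB
C_slice_bytes  = rows * row_slice * 4    # one int32 row_slice of C: 2 GB
print(A_packed_bytes / 1e9, 'GB', C_slice_bytes / 1e9, 'GB')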

On further study, we can speed up the host-side matmul operation by switching from the int8 datatype to float32. This makes that operation considerably faster (most likely because NumPy can then dispatch the multiplication to an optimized BLAS routine), but the GPU code still appears to be roughly 4x faster than it:

$ cat t64.py
from numba import cuda,uint8,int32
import numba as nb
import numpy as np
import math
import time

TPB = 32
TPB1 = 33

@cuda.jit()
def bit_slice_A_AT(A,C,row_offset):
    # (kernel body omitted here; it is identical to bit_slice_A_AT in t63.py above)
    ...

@nb.njit('void(uint8[:,:],float32[:,:])',parallel=True)
def bitpack(A,AU):
    for i in nb.prange(A.shape[0]):
        for j in range(A.shape[1]):
            offset = j * 8
            res = int(AU[i,offset]) << 7
            res |= int(AU[i,offset+1]) << 6
            res |= int(AU[i,offset+2]) << 5
            res |= int(AU[i,offset+3]) << 4
            res |= int(AU[i,offset+4]) << 3
            res |= int(AU[i,offset+5]) << 2
            res |= int(AU[i,offset+6]) << 1
            res |= int(AU[i,offset+7])
            A[i,j] = res

colm = 1000
rows = 6000
cols = int(colm*8)
row_slice = 512
print('host mem: ',(rows*cols*4+rows*colm+rows*rows*4+rows*row_slice*4)//1048576,'MB device mem: ',(rows*colm+rows*row_slice*4)//1048576,'MB')
t1 = time.time()
AU = np.random.randint(2,size=(rows,cols),dtype=np.int8)
AU = AU.astype(np.float32)
print("randint:" + str(time.time()-t1))
t1 = time.time()
#CU = np.empty((rows,rows),dtype=np.int32)
CU = np.matmul(AU,AU.T,dtype=np.float32)
print("matmul:" + str(time.time()-t1))
t1 = time.time()
A = np.empty((rows,colm),dtype=np.uint8)
print("np.empty:" + str(time.time()-t1))
t1 = time.time()
bitpack(A,AU)
print("bitpack:" + str(time.time()-t1))
t1 = time.time()
A_dev = cuda.to_device(A)
threads_per_block = (TPB,TPB)
C = np.empty((row_slice,A.shape[0]),dtype=np.int32)
blocks_per_grid_x = int(math.ceil(A.shape[0] / threads_per_block[0]))
blocks_per_grid_y = int(row_slice / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x,blocks_per_grid_y)
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid,threads_per_block](A_dev,C,i*row_slice)
    lower = i*row_slice
    upper = min(lower+row_slice,CU.shape[0])
    width = upper-lower
    test = np.array_equal(C[:width,:],CU[lower:upper,:])
    print(test)
cuda.synchronize()
C_dev = cuda.device_array_like(C)
start_gpu_shared_memory = time.time()
for i in range((A.shape[0]+row_slice-1)//row_slice):
    bit_slice_A_AT[blocks_per_grid,threads_per_block](A_dev,C_dev,i*row_slice)
cuda.synchronize()
end_gpu_shared_memory = time.time()

time_gpu_shared = end_gpu_shared_memory - start_gpu_shared_memory
print("GPU time(shared memory):" + str(time_gpu_shared))
$ python t64.py
host mem:  337 MB device mem:  17 MB
randint:0.1817936897277832
matmul:3.498671531677246
np.empty:7.62939453125e-05
bitpack:0.03707313537597656
True
True
True
True
True
True
True
True
True
True
True
True
GPU time(shared memory):0.8318064212799072
$

I have not tested this code thoroughly. There may be defects. Use at your own risk.

For attribution, the numba bit-packing code appears to have come from here.