调用 Fortran Intel MKL dgemm 的函数性能不佳：不仅远慢于 numpy，也慢于 matmul

问题描述

我最近刚开始接触 fortran 以及 python 中的 ctypes，因为我计划把一个使用 intel MKL 的 fortran 库暴露给 python；这个库目前由一个 C 库调用，而该 C 库又被一个 C++ 库调用……最终由 C# 使用。

一开始，我设法编写了一段简单的 fortran 代码，其中定义了几个矩阵乘法函数：一个朴素的手写实现，一个使用 fortran 内置的 matmul，一个使用 intel MKL 的 dgemm：

        ! Matrix product M3 = M1 * M2 computed with the intrinsic matmul.
        !
        ! Exported from the DLL under the C name 'matmultorig'. All
        ! arguments are passed by reference (no VALUE attribute), so a C
        ! or ctypes caller must pass pointers for M, N and K as well.
        !
        !   M1 : (M,N) left operand, column-major
        !   M2 : (N,K) right operand, column-major
        !   M3 : (M,K) result, overwritten
        !   M,N,K : matrix extents
        subroutine matmultorig(M1,M2,M3,M,N,K) bind(c,name='matmultorig')
            !DEC$ ATTRIBUTES DLLEXPORT :: matmultorig
            use iso_c_binding,only: c_double,c_int
            implicit none

            ! N is declared explicitly together with M and K: it sizes
            ! M1 and M2 and must be an interoperable c_int like the
            ! other extents instead of relying on implicit typing.
            integer(c_int),intent(in) :: M,N,K
            real(c_double),intent(in) :: M1(M,N),M2(N,K)
            real(c_double),intent(inout):: M3(M,K)

            M3 = matmul(M1,M2)

        end subroutine matmultorig

        ! Hand-written triple-loop matrix product M3 = M1 * M2, kept as
        ! a baseline to compare against matmul and DGEMM.
        !
        ! Exported from the DLL under the C name 'matmultmy'. All
        ! arguments are passed by reference, matching the six-argument
        ! call made from the Python side.
        !
        !   M1 : (M,N) left operand, column-major
        !   M2 : (N,K) right operand, column-major
        !   M3 : (M,K) result, overwritten
        !   M,N,K : matrix extents
        subroutine matmultmy(M1,M2,M3,M,N,K) bind(c,name='matmultmy')
            !DEC$ ATTRIBUTES DLLEXPORT :: matmultmy
            use iso_c_binding,only: c_double,c_int
            implicit none

            integer(c_int),intent(in) :: M,N,K
            real(c_double),intent(in) :: M1(M,N),M2(N,K)
            real(c_double),intent(inout):: M3(M,K)

            integer :: i,j,l

            ! Loop nest ordered j-l-i so the innermost loop walks the
            ! first (fastest-varying) index of M1 and M3. Each M3(i,j)
            ! still accumulates its terms in ascending l.
            do j = 1,K
                M3(:,j) = 0.0_c_double
                do l = 1,N
                    do i = 1,M
                        M3(i,j) = M3(i,j) + M1(i,l) * M2(l,j)
                    end do
                end do
            end do

        end subroutine matmultmy

        ! Matrix product M3 = M1 * M2 computed with BLAS DGEMM (Intel
        ! MKL is linked by the build script).
        !
        ! Exported from the DLL under the C name 'matmultmkl'. All
        ! arguments are passed by reference.
        !
        !   M1 : (M,N) left operand, column-major
        !   M2 : (N,K) right operand, column-major
        !   M3 : (M,K) result, overwritten
        !   M,N,K : matrix extents
        subroutine matmultmkl(M1,M2,M3,M,N,K) bind(c,name='matmultmkl')
            !DEC$ ATTRIBUTES DLLEXPORT :: matmultmkl
            use iso_c_binding,only: c_double,c_int
            implicit none

            integer(c_int),intent(in) :: M,N,K
            real(c_double),intent(in) :: M1(M,N),M2(N,K)
            real(c_double),intent(inout):: M3(M,K)

            ! DGEMM is called through an implicit interface, so the
            ! compiler cannot convert arguments. alpha and beta must
            ! already be double precision: default-real literals such
            ! as 1. and 0. would be read as 8-byte values by DGEMM and
            ! yield garbage scaling factors.
            real(c_double),parameter :: one = 1.0_c_double
            real(c_double),parameter :: zero = 0.0_c_double
            integer(c_int) :: ldm,ldn
            external :: dgemm

            ! BLAS requires leading dimensions of at least 1, even for
            ! empty matrices.
            ldm = max(1_c_int,M)
            ldn = max(1_c_int,N)

            ! C(M x K) = one * A(M x N) * B(N x K) + zero * C
            call dgemm('N','N',M,K,N,one,M1,ldm,M2,ldn,zero,M3,ldm)

        end subroutine matmultmkl

我使用 .bat 文件编译 fortran（我在 windows 下）：

@Echo off
rem Build script: compiles test.f with Intel ifort and links it into
rem mylib.dll against Intel MKL (paths are for the 2020.4.311 install).

rem NOTE(review): no !var! delayed-expansion reference appears in this
rem script, so ENABLEDELAYEDEXPANSION looks unnecessary here - confirm.
setlocal ENABLEDELAYEDEXPANSION
rem Flags placed on the compile command line below.
rem Presumably -c = compile only and -fpp = run the Fortran
rem preprocessor; verify against the ifort documentation.
SET "IFORT_INITIAL_FLAGS=-c -fpp"
rem Optimization level used for the compile step.
SET "IFORT_OPTIMIZATION_FLAGS=/O3"

rem Step 1: compile test.f to test.obj, with the MKL include directory
rem on the include path.
ifort %IFORT_OPTIMIZATION_FLAGS% %IFORT_INITIAL_FLAGS% /I"C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2020.4.311\windows\mkl\include" -o test.obj test.f
rem Step 2: link test.obj into mylib.dll against the MKL libraries
rem listed below (mkl_intel_lp64, mkl_intel_thread, mkl_core) and
rem libiomp5md, searched for in the MKL intel64_win library directory.
ifort -dll -o mylib.dll test.obj /link /LIBPATH:"C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2020.4.311\windows\mkl\lib\intel64_win" mkl_intel_lp64.lib mkl_intel_thread.lib mkl_core.lib libiomp5md.lib

最后，我编写了以下 python 脚本，我正在从 Visual Studio Code 执行：

"""Benchmark three Fortran matrix products exported from mylib.dll against NumPy.

Each Fortran routine has the C signature (M1, M2, M3, M, N, K): three
pointers to double followed by three pointers to int, because the Fortran
dummies are passed by reference.
"""
import os
import time
from ctypes import CDLL, POINTER, byref, c_double, c_int

# The DLL depends on the Intel MKL and compiler runtime DLLs; their
# directories must be registered before mylib.dll is loaded.
os.add_dll_directory(r"C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.4.311/windows/redist/intel64_win/mkl")
os.add_dll_directory(r"C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.4.311/windows/redist/intel64_win/compiler")

import numpy as np

mylib = CDLL(r"C:/path/to/the/fortran/mylib.dll")

# All three routines share one signature; declare it once so the prototypes
# cannot drift apart from the six-argument calls below.
_MATMUL_ARGTYPES = [POINTER(c_double)] * 3 + [POINTER(c_int)] * 3
for _routine in (mylib.matmultmy, mylib.matmultorig, mylib.matmultmkl):
    _routine.argtypes = _MATMUL_ARGTYPES
    _routine.restype = None  # Fortran subroutines return nothing

# Setup
M = 500
N = 500
K = 500

# Fortran stores arrays column-major, so allocate in Fortran order; a default
# C-ordered array would be read by the routines as its transpose.
a = np.empty((M, N), dtype=c_double, order="F")
b = np.empty((N, K), dtype=c_double, order="F")
c = np.empty((M, K), dtype=c_double, order="F")

a[:] = np.random.rand(M, N)
b[:] = np.random.rand(N, K)

a_ptr = a.ctypes.data_as(POINTER(c_double))
b_ptr = b.ctypes.data_as(POINTER(c_double))
c_ptr = c.ctypes.data_as(POINTER(c_double))


def _call(routine):
    """Invoke one Fortran routine on (a, b) -> c, passing extents by reference."""
    routine(a_ptr, b_ptr, c_ptr, byref(c_int(M)), byref(c_int(N)), byref(c_int(K)))


# Warm-up: run everything once untimed so that one-time costs (DLL symbol
# resolution and, presumably, the threaded BLAS runtime start-up) are not
# attributed to whichever call happens to run first.
for _routine in (mylib.matmultmy, mylib.matmultorig, mylib.matmultmkl):
    _call(_routine)
a.dot(b)

# time.perf_counter() is used instead of time.time(): it is the
# high-resolution clock intended for measuring short durations.

# Fortran my call
start = time.perf_counter()
_call(mylib.matmultmy)
stop = time.perf_counter()
print(f"Fortran my \t {stop - start}s")

# Fortran matmul call
start = time.perf_counter()
_call(mylib.matmultorig)
stop = time.perf_counter()
print(f"Fortran matmul \t {stop - start}s")

# Fortran mkl call
start = time.perf_counter()
_call(mylib.matmultmkl)
stop = time.perf_counter()
print(f"Fortran mkl \t {stop - start}s")

# Numpy
start = time.perf_counter()
c = a.dot(b)
stop = time.perf_counter()
print(f"Numpy \t\t {stop - start}s")

结果是：

Fortran my       0.11234903335571289s
Fortran matmul   0.023325443267822266s
Fortran mkl      0.5279343128204346s
Numpy            0.001001596450805664s

我之前也尝试过同样的想法，不过是在 python / c++ pybind11 的场景下（矩阵大小相同）：

pybind11 vs numpy for a matrix product

那里的数据与我现在的情况完全不可比：在那里，numpy 和 intel mkl 的性能至少大致处于同一量级；而在这里，调用 dgemm 的函数比 numpy 的矩阵乘积慢约 500 倍。我怀疑小部分原因是数据编组（marshalling），主要原因是“c 绑定”。不过，这些东西我两天前才刚接触，所以如果有行家有什么想法……

解决方法

暂无找到可以解决该程序问题的有效方法，小编努力寻找整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@）

ctypes fortran fortran-iso-c-binding intel-mkl python