问题描述
我有一些旧代码,它们以B = B + A*E
的形式执行矩阵运算
DO I = 1,N
DO L = 1,N
DO K = 1,N
B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
end do
end do
end do
为了提高可读性并利用现代的fortran内在函数,我想将以上代码编写为
B = B + matmul( A,E(:,1:N,J-1) )
我注意到提高的可读性是以性能为代价的。我确定问题不在于内在函数matmul
-左图显示了matmul
的性能与手动写入N
的所有值一样好。
将矩阵乘法与矩阵加法结合使用时,对于N
较小的值,手动写操作的性能要优于内在函数。我通常使用N < 10
;我想在不损失性能的情况下提高可读性。可能有个建议吗?
我正在使用的代码如下。我将Mac OS 10.14.6与gfortran 8.2.0结合使用,并使用-O3优化选项进行编译。
program test
implicit none
integer :: loop_max = 1000
integer :: j ! loop index
integer :: i ! loop index
real :: t1,t2 ! start and end times
real :: t_manual,t_intrinsic,t_man_add,t_intrn_add
integer :: N ! matrix dimension
integer,parameter :: NJ = 12
real,dimension(:,:),allocatable :: A,B ! matrices
real,allocatable :: D
real,dimension(:),allocatable :: G
real,:,allocatable :: E
open(1,file = 'Delete.txt',status = 'unkNown')
do N = 1,40
allocate(A(N,N),B(N,G(N),D(N,2*N+1),E(N,N+1,NJ))
! ##########################################################################
! manual matrix multiplication vs matmul
call rand_fill
call cpu_time(t1)
do i = 1,loop_max
do j = 2,12
call matmul_manual(j,N,NJ,A,B,D,G,E)
end do
end do
call cpu_time(t2)
t_manual = t2 - t1
write(1,*) A,E
call rand_fill
call cpu_time(t1)
do i = 1,12
B = matmul( A,j-1) )
end do
end do
call cpu_time(t2)
t_intrinsic = t2 - t1
write(1,E
! --------------------------------------------------------------------------
! ##########################################################################
! manual matrix multiplication with matrix addition
call rand_fill
call cpu_time(t1)
do i = 1,12
call manual_matmul_add(j,E)
end do
end do
call cpu_time(t2)
t_man_add = t2 - t1
write(1,E
! --------------------------------------------------------------------------
! ##########################################################################
! intrinsic matrix multiplication (matmul) with matrix addition
call rand_fill
call cpu_time(t1)
do i = 1,12
call intrinsic_matmul_add(j,E)
end do
end do
call cpu_time(t2)
t_intrn_add = t2 - t1
write(1,E
! --------------------------------------------------------------------------
deallocate(A,E)
print*,t_manual,t_intrn_add
end do
contains
subroutine rand_fill
! fill the matrices with random numbers
call random_number(A)
call random_number(B)
call random_number(D)
call random_number(G)
call random_number(E)
end subroutine
end program test
subroutine matmul_manual(j,E)
implicit none
integer,intent(in) :: j
integer,intent(in) :: N,NJ
real,dimension(N,intent(in out) :: A,B
real,intent(in out) :: D
real,dimension(N),intent(in out) :: G
real,NJ),intent(in out) :: E
integer :: I,L,K ! loop indices
B = 0.0
DO I = 1,N
DO L = 1,N
DO K = 1,N
B(I,J-1)
end do
end do
end do
end subroutine matmul_manual
subroutine manual_matmul_add(j,K ! loop indices
DO I = 1,N
D(I,N+1) = -G(I)
DO L = 1,N
D(I,N+1) = D(I,N+1)+A(I,J-1)
DO K = 1,J-1)
end do
end do
end do
end subroutine manual_matmul_add
subroutine intrinsic_matmul_add(j,intent(in out) :: E
real,N+1) :: temp1
real,N) :: temp2
D(:,N+1) = -G + matmul( A,j-1) )
B = B + matmul( A,j-1) )
end subroutine intrinsic_matmul_add
subroutine mat_sub_new(j,intent(in out) :: E
if (N == 1) then ! matmul seems to be inefficient when N = 1
D(N,N+1) = -G(N) + A(N,N)*E(N,J-1)
B(N,N) = B(N,N) + A(N,J-1)
else
D(:,j-1) )
B = B + matmul( A,j-1) )
end if
end subroutine mat_sub_new
解决方法
我怀疑这与两个问题有关:
-
编译器如何将
MATMUL
通用接口解析为对正确例程(REAL
与COMPLEX
与INTEGER
或矩阵时间的正确调用向量与矩阵乘以矩阵乘以矩阵):我不知道是在编译时还是在运行时系统地完成此操作,还是基于优化级别做出此选择; (这将证明额外的开销是合理的,尤其是在小尺寸情况下) -
通常,内部“通用”算法可能不是最适合您的问题的,因为在某些情况下,蛮力编译器优化看起来更好。例如,gfortran的
MATMUL
内部函数是否基于BLAS?如果是这样,那可能不是最快的。
我在PC上使用NORM2
做过类似的测试(Windows,i7,gfortran 9.2.0,使用-O3 -march=core-avx2
编译):事实证明,对于NORM2
:
-
BLAS
实现始终是最慢的,尽管已经进行了一些重构以向编译器提供PURE
版本 -
内部函数(
NORM2
或SQRT(SUM(X**2))
)的使用总是很慢,无论数组大小如何 -
最快的情况是使用简单循环或具有固定大小数组的内在函数:
ARRAY SIZE assumed-shape fixed-size intrinsic NORM2 LOOP BLAS N [ms/N] [ms/N] [ms/N] [ms/N] [ms/N] 2 5.93750E-09 4.06250E-09 8.43750E-09 4.06250E-09 1.03125E-08 12 1.03125E-08 7.81250E-09 3.12500E-08 7.81250E-09 5.09375E-08 22 1.65625E-08 1.40625E-08 5.50000E-08 1.43750E-08 9.15625E-08 32 2.25000E-08 2.00000E-08 7.81250E-08 2.00000E-08 1.29375E-07
顺便说一句,代码粘贴在下面(注意要占用大量内存!):
program test_norm
use iso_fortran_env
implicit none
integer :: xsize,i,iunit,icase
integer,parameter :: testSize = 50000000
real(real64),allocatable :: set(:,:),setNorm(:)
real(real64) :: t0,t1,setSum(5),timeTable(5,35)
intrinsic :: norm2
open(newunit=iunit,file='garbage.txt',action='write')
print '(6(1x,a15))','ARRAY SIZE','assumed-shape','fixed-size','intrinsic NORM2','LOOP','BLAS'
print '(6(1x,'N',('[ms/N]',i=1,5)
icase = 0
do xsize = 2,32,10
! Initialize test set
icase = icase+1
allocate(set(xsize,testSize),setNorm(testSize))
call random_number(set)
! Test 1: intrinsic SQRT/SUM,assumed-shape array
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v1(set(:,i)); call cpu_time(t1)
setSum(1) = sum(setNorm); timeTable(1,icase) = t1-t0
! Test 2: intrinsic SQRT/SUM,fixed-size array
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v2(xsize,set(:,i)); call cpu_time(t1)
setSum(2) = sum(setNorm); timeTable(2,icase) = t1-t0
! Test 3: intrinsic NORM2
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm2(set(:,i)); call cpu_time(t1)
setSum(3) = sum(setNorm); timeTable(3,icase) = t1-t0
! Test 4: LOOP
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v3(xsize,i)); call cpu_time(t1)
setSum(4) = sum(setNorm); timeTable(4,icase) = t1-t0
! Test 5: BLAS
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = DNRM2(xsize,i),1); call cpu_time(t1)
setSum(5) = sum(setNorm); timeTable(5,icase) = t1-t0
! Print output
print '(7x,i2,9x,5(2x,1pe13.5e2,1x))',xsize,timeTable(:,icase)/testSize
write (iunit,*) 'setSum = ',setSum
deallocate(set,setNorm)
end do
close(iunit)
contains
pure real(real64) function norm_v1(x) result(L2)
real(real64),intent(in) :: x(:)
L2 = sqrt(sum(x**2))
end function norm_v1
pure real(real64) function norm_v2(n,x) result(L2)
integer,intent(in) :: n
real(real64),intent(in) :: x(n)
L2 = sqrt(sum(x**2))
end function norm_v2
pure real(real64) function norm_v3(n,intent(in) :: x(n)
integer :: i
L2 = 0.0_real64
do i=1,n
L2 = L2 + x(i)**2
end do
L2 = sqrt(L2)
end function norm_v3
PURE REAL(REAL64) FUNCTION DNRM2 ( N,X,INCX )
INTEGER,INTENT(IN) :: N,INCX
REAL(REAL64),INTENT(IN) :: X( * )
REAL(REAL64),PARAMETER :: ONE = 1.0_REAL64
REAL(REAL64),PARAMETER :: ZERO = 0.0_REAL64
INTEGER :: IX
REAL(REAL64) :: ABSXI,NORM,SCALE,SSQ
INTRINSIC :: ABS,SQRT
IF( N<1 .OR. INCX<1 )THEN
NORM = ZERO
ELSE IF( N==1 )THEN
NORM = ABS( X( 1 ) )
ELSE
SCALE = ZERO
SSQ = ONE
DO IX = 1,1 + ( N - 1 )*INCX,INCX
IF( X( IX )/=ZERO )THEN
ABSXI = ABS( X( IX ) )
IF( SCALE<ABSXI )THEN
SSQ = ONE + SSQ*( SCALE/ABSXI )**2
SCALE = ABSXI
ELSE
SSQ = SSQ + ( ABSXI/SCALE )**2
END IF
END IF
END DO
NORM = SCALE * SQRT( SSQ )
END IF
DNRM2 = NORM
RETURN
END FUNCTION DNRM2
end program test_norm