为什么手动编程的矩阵乘法与矩阵加法相结合会比固有函数具有更好的性能？

问题描述

我有一些旧代码，它们以B = B + A*E的形式执行矩阵运算

DO I = 1,N
  DO L = 1,N
    DO K = 1,N
      B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
    end do
  end do
end do

为了提高可读性并利用现代的fortran内在函数，我想将以上代码编写为

B = B + matmul( A,E(:,1:N,J-1) )

我注意到提高的可读性是以性能为代价的。我确定问题不在于内在函数matmul －左图显示了matmul的性能与手动写入N的所有值一样好。

将矩阵乘法与矩阵加法结合使用时，对于N较小的值，手动写操作的性能要优于内在函数。我通常使用N < 10；我想在不损失性能的情况下提高可读性。可能有个建议吗？

我正在使用的代码如下。我将Mac OS 10.14.6与gfortran 8.2.0结合使用，并使用-O3优化选项进行编译。

program test
  implicit none

  integer :: loop_max = 1000
  integer :: j                                        ! loop index
  integer :: i                                        ! loop index
  real    :: t1,t2                                   ! start and end times
  real    :: t_manual,t_intrinsic,t_man_add,t_intrn_add


  integer                               :: N          ! matrix dimension
  integer,parameter                    :: NJ = 12

  real,dimension(:,:),allocatable :: A,B       ! matrices
  real,allocatable :: D
  real,dimension(:),allocatable :: G
  real,:,allocatable :: E

  open(1,file = 'Delete.txt',status = 'unkNown')


  do N = 1,40
    allocate(A(N,N),B(N,G(N),D(N,2*N+1),E(N,N+1,NJ))


    ! ##########################################################################
    ! manual matrix multiplication vs matmul
    call rand_fill
    call cpu_time(t1)

    do i = 1,loop_max
      do j = 2,12
        call matmul_manual(j,N,NJ,A,B,D,G,E)
      end do
    end do

    call cpu_time(t2)
    t_manual = t2 - t1
    write(1,*) A,E


    call rand_fill
    call cpu_time(t1)

    do i = 1,12
        B         =  matmul( A,j-1) )
      end do
    end do

    call cpu_time(t2)
    t_intrinsic = t2 - t1
    write(1,E
    ! --------------------------------------------------------------------------




    ! ##########################################################################
    ! manual matrix multiplication with matrix addition
    call rand_fill
    call cpu_time(t1)

    do i = 1,12
        call manual_matmul_add(j,E)
      end do
    end do

    call cpu_time(t2)
    t_man_add = t2 - t1
    write(1,E
    ! --------------------------------------------------------------------------



    ! ##########################################################################
    ! intrinsic matrix multiplication (matmul) with matrix addition
    call rand_fill
    call cpu_time(t1)

    do i = 1,12
        call intrinsic_matmul_add(j,E)
      end do
    end do

    call cpu_time(t2)
    t_intrn_add = t2 - t1
    write(1,E
    ! --------------------------------------------------------------------------


    deallocate(A,E)

    print*,t_manual,t_intrn_add

  end do





contains
  subroutine rand_fill
    ! fill the matrices with random numbers
    call random_number(A)
    call random_number(B)
    call random_number(D)
    call random_number(G)
    call random_number(E)

  end subroutine


end program test











subroutine matmul_manual(j,E)
  implicit none

  integer,intent(in)                          :: j
  integer,intent(in)                          :: N,NJ
  real,dimension(N,intent(in out) :: A,B
  real,intent(in out) :: D
  real,dimension(N),intent(in out) :: G
  real,NJ),intent(in out) :: E

  integer :: I,L,K  ! loop indices

  B = 0.0
  DO I = 1,N
    DO L = 1,N
      DO K = 1,N
        B(I,J-1)
      end do
    end do
  end do

end subroutine matmul_manual






subroutine manual_matmul_add(j,K  ! loop indices

  DO I = 1,N
    D(I,N+1) = -G(I)
    DO L = 1,N
      D(I,N+1) = D(I,N+1)+A(I,J-1)
      DO K = 1,J-1)
      end do
    end do
  end do

end subroutine manual_matmul_add




subroutine intrinsic_matmul_add(j,intent(in out) :: E

  real,N+1) :: temp1
  real,N)   :: temp2

  D(:,N+1) = -G + matmul( A,j-1) )
  B         =  B + matmul( A,j-1) )

end subroutine intrinsic_matmul_add




subroutine mat_sub_new(j,intent(in out) :: E


  if (N == 1) then        ! matmul seems to be inefficient when N = 1
    D(N,N+1) = -G(N)   + A(N,N)*E(N,J-1)
    B(N,N)   =  B(N,N) + A(N,J-1)

  else
    D(:,j-1) )
    B         =  B + matmul( A,j-1) )
  end if

end subroutine mat_sub_new

解决方法

我怀疑这与两个问题有关：

编译器如何将MATMUL通用接口解析为对正确例程（REAL与COMPLEX与INTEGER或矩阵时间的正确调用向量与矩阵乘以矩阵乘以矩阵）：我不知道是在编译时还是在运行时系统地完成此操作，还是基于优化级别做出此选择；（这将证明额外的开销是合理的，尤其是在小尺寸情况下）
通常，内部“通用”算法可能不是最适合您的问题的，因为在某些情况下，蛮力编译器优化看起来更好。例如，gfortran的MATMUL内部函数是否基于BLAS？如果是这样，那可能不是最快的。

我在PC上使用NORM2做过类似的测试（Windows，i7，gfortran 9.2.0，使用-O3 -march=core-avx2编译）：事实证明，对于NORM2：

BLAS实现始终是最慢的，尽管已经进行了一些重构以向编译器提供PURE版本
内部函数（NORM2或SQRT(SUM(X**2))）的使用总是很慢，无论数组大小如何

最快的情况是使用简单循环或具有固定大小数组的内在函数：

ARRAY SIZE   assumed-shape      fixed-size intrinsic NORM2            LOOP            BLAS
         N          [ms/N]          [ms/N]          [ms/N]          [ms/N]          [ms/N]
  2             5.93750E-09     4.06250E-09     8.43750E-09     4.06250E-09     1.03125E-08
 12             1.03125E-08     7.81250E-09     3.12500E-08     7.81250E-09     5.09375E-08
 22             1.65625E-08     1.40625E-08     5.50000E-08     1.43750E-08     9.15625E-08
 32             2.25000E-08     2.00000E-08     7.81250E-08     2.00000E-08     1.29375E-07

顺便说一句，代码粘贴在下面（注意要占用大量内存！）：

program test_norm
   use iso_fortran_env
   implicit none

   integer :: xsize,i,iunit,icase
   integer,parameter :: testSize = 50000000
   real(real64),allocatable :: set(:,:),setNorm(:)
   real(real64) :: t0,t1,setSum(5),timeTable(5,35)
   intrinsic :: norm2 

   open(newunit=iunit,file='garbage.txt',action='write')

   print '(6(1x,a15))','ARRAY SIZE','assumed-shape','fixed-size','intrinsic NORM2','LOOP','BLAS'
   print '(6(1x,'N',('[ms/N]',i=1,5)

   icase = 0
   do xsize = 2,32,10

     ! Initialize test set 
     icase = icase+1
     allocate(set(xsize,testSize),setNorm(testSize))
     call random_number(set)

     ! Test 1: intrinsic SQRT/SUM,assumed-shape array
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v1(set(:,i)); call cpu_time(t1)
     setSum(1) = sum(setNorm); timeTable(1,icase) = t1-t0
      
     ! Test 2: intrinsic SQRT/SUM,fixed-size array 
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v2(xsize,set(:,i)); call cpu_time(t1)
     setSum(2) = sum(setNorm); timeTable(2,icase) = t1-t0

     ! Test 3: intrinsic NORM2
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm2(set(:,i)); call cpu_time(t1)
     setSum(3) = sum(setNorm); timeTable(3,icase) = t1-t0

     ! Test 4: LOOP 
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v3(xsize,i)); call cpu_time(t1)
     setSum(4) = sum(setNorm); timeTable(4,icase) = t1-t0

     ! Test 5: BLAS
     call cpu_time(t0); forall(i=1:testSize) setNorm(i) = DNRM2(xsize,i),1); call cpu_time(t1)
     setSum(5) = sum(setNorm); timeTable(5,icase) = t1-t0

     ! Print output
     print '(7x,i2,9x,5(2x,1pe13.5e2,1x))',xsize,timeTable(:,icase)/testSize
     write (iunit,*) 'setSum = ',setSum
     deallocate(set,setNorm)
      
   end do 
   
   close(iunit)

   contains

   pure real(real64) function norm_v1(x) result(L2)
      real(real64),intent(in) :: x(:)

      L2 = sqrt(sum(x**2))
   end function norm_v1      

   pure real(real64) function norm_v2(n,x) result(L2)
      integer,intent(in) :: n 
      real(real64),intent(in) :: x(n)

      L2 = sqrt(sum(x**2))
   end function norm_v2
       
   pure real(real64) function norm_v3(n,intent(in) :: x(n)
      integer :: i

      L2 = 0.0_real64
      do i=1,n
        L2 = L2 + x(i)**2
      end do  
      L2 = sqrt(L2)
       
   end function norm_v3

   PURE REAL(REAL64) FUNCTION DNRM2 ( N,X,INCX )
   INTEGER,INTENT(IN) :: N,INCX
   REAL(REAL64),INTENT(IN) :: X( * )
   REAL(REAL64),PARAMETER :: ONE  = 1.0_REAL64
   REAL(REAL64),PARAMETER :: ZERO = 0.0_REAL64
   
   INTEGER :: IX
   REAL(REAL64) :: ABSXI,NORM,SCALE,SSQ
   INTRINSIC :: ABS,SQRT
   IF( N<1 .OR. INCX<1 )THEN
     NORM  = ZERO
   ELSE IF( N==1 )THEN
     NORM  = ABS( X( 1 ) )
   ELSE
     SCALE = ZERO
     SSQ   = ONE
     DO IX = 1,1 + ( N - 1 )*INCX,INCX
        IF( X( IX )/=ZERO )THEN
           ABSXI = ABS( X( IX ) )
           IF( SCALE<ABSXI )THEN
              SSQ   = ONE   + SSQ*( SCALE/ABSXI )**2
              SCALE = ABSXI
           ELSE
              SSQ   = SSQ   +     ( ABSXI/SCALE )**2
           END IF
        END IF
     END DO
     NORM  = SCALE * SQRT( SSQ )
  END IF
  DNRM2 = NORM
  RETURN
  END FUNCTION DNRM2

end program test_norm

fortran gfortran