带 BLAS 的 OpenMP

问题描述

我试图扩展上述链接的答案中的代码，以包括交叉检查和 openmp。

Program reshape_for_blas

  Use,Intrinsic :: iso_fortran_env,Only :  wp => real64,li => int64

  Implicit None

  Real( wp ),Dimension( :,:    ),Allocatable :: a
  Real( wp ),:,: ),Allocatable :: b
  Real( wp ),Allocatable :: c1,c2,c3,c4,c5
  Real( wp ),Allocatable :: d
  Real( wp ),Allocatable :: e

  Integer :: na,nb,nc,nd,ne
  Integer :: la,lb,lc,ld  
  Integer( li ) :: start,finish,rate,numthreads

  numthreads = 2

  call omp_set_num_threads(numthreads)  
  
  
  Write( *,* ) 'na,nd ?'
  Read( *,* ) na,nd
  ne = nc * nd
  Allocate( a ( 1:na,1:nb ) ) 
  Allocate( b ( 1:nb,1:nc,1:nd ) ) 
  Allocate( c1( 1:na,1:nd ) ) 
  Allocate( c2( 1:na,1:nd ) ) 
  Allocate( c3( 1:na,1:nd ) )
  Allocate( c4( 1:na,1:nd ) )
  Allocate( c5( 1:na,1:nd ) )  
  Allocate( d ( 1:nb,1:ne ) ) 
  Allocate( e ( 1:na,1:ne ) ) 

  ! Set up some data
  Call Random_number( a )
  Call Random_number( b )

  ! With reshapes
  Call System_clock( start,rate )
  
  !write (*,*) 'clock',start,rate
  
  
  d = Reshape( b,Shape( d ) )
  Call dgemm( 'N','N',na,ne,1.0_wp,a,Size( a,Dim = 1 ),&
                                            d,Size( d,&
                                    0.0_wp,e,Size( e,Dim = 1 ) )
  c1 = Reshape( e,Shape( c1 ) )
  Call System_clock( finish,rate
  
  
  
  Write( *,* ) 'Time for reshaping method ',Real( finish - start,wp ) / rate
  Write( *,* ) 'Difference between result matrices ',Maxval( Abs( c1 - c2 ) )
  
  
  ! Direct
  Call System_clock( start,rate )
  Call dgemm( 'N',&
                                            b,Size( b,&
                                            0.0_wp,Size( c2,Dim = 1 ) )
  Call System_clock( finish,rate )
  Write( *,* ) 'Time for straight  method ',wp ) / rate
    
    
  
  Call System_clock( start,rate )
  
  !$omp parallel
  ! Direct
  Call dgemm( 'N',Size( c4,Dim = 1 ) )
  !$omp end parallel
  Call System_clock( finish,rate )  
  Write( *,* ) 'Time for straight  method omp',wp ) / rate
    
  
 
  !naive
  Call System_clock( start,rate )

  do la = 1,na 
    do lc = 1,nc
      do ld = 1,nd
         c3(la,ld) = 0.0_wp
      enddo
    enddo
  enddo
  
  do la = 1,na 
    do lb = 1,nb
      do lc = 1,nc
        do ld = 1,nd
          c3(la,ld) = c3(la,ld)  + a(la,lb) * b(lb,ld)
        enddo  
      enddo
    enddo
  enddo  
 
  
  Call System_clock( finish,* ) 'Time for loop',wp ) / rate  
 
   

  !naive omp
  Call System_clock( start,rate )  
  !$omp parallel

  do la = 1,nd
         c5(la,ld) = 0.0_wp
      enddo
    enddo
  enddo
  
  !$omp do private(la,ld) schedule(dynamic) reduction(+: c5)    
  do la = 1,nd
          c5(la,ld) = c5(la,ld)
        enddo  
      enddo
    enddo
  enddo  
  !$omp end do  
  !$omp end parallel
 
  
  Call System_clock( finish,* ) 'Time for loop omp',wp ) / rate  
 
  
  
  
  
  do la = 1,nd
         
         if ( dabs(c3(la,ld) - c1(la,ld))  > 1.e-6 ) then 
           write (*,*) '!!! c1',c3(la,ld)
         endif         
         
         
         if ( dabs(c3(la,ld) - c2(la,*) '!!! c2',ld)
         endif
         
         if ( dabs(c3(la,ld) - c4(la,*) '!!! c4',la,ld,ld) - c5(la,*) '!!! c5',ld)
         endif         
      enddo
    enddo
  enddo  
  
End Program reshape_for_blas

我有两个问题：

无论是 BLAS 还是朴素循环，都没有显着的加速。例如，通过 gfortran -std=f2008 -Wall -Wextra -fcheck=all reshape.f90 -lblas -fopenmp，输入30 30 30 60，我得到

30 30 30 60
 Time for reshaping method    2.9443999999999998E-003
 Difference between result matrices    12.380937791257775
 Time for straight  method    1.0016000000000001E-003
 Time for straight  method omp   2.4878000000000001E-003
 Time for loop   6.6072500000000006E-002
 Time for loop omp  0.100242600000000002

当维度变大时，例如输入中的 60 60 60 60，openmp BLAS 结果可以获得与原始循环不同的值，似乎我错过了一些控制选项。

在这里使用 OpenMP 会有什么问题？

编辑我在 c5 部分的初始化中添加了 omp 行并注释掉了两个打印行，


Program reshape_for_blas

  Use,wp ) / rate
    
    

  !naive loop
  Call System_clock( start,wp ) / rate  
   



  !dgemm omp 
  Call System_clock( start,wp ) / rate
    
  
 

  !loop omp
  Call System_clock( start,wp ) / rate  
 
  


!single core: c1 c2 c3
! c1 reshape blas
! c2 blas
! c3 naive loop (reference)
! parallel: c4 c5
! c4 dgemm parallel
! c5 naive loop parallel


  do la = 1,ld)
         endif         
      enddo
    enddo
  enddo  
  
End Program reshape_for_blas

然后gfortran reshape.f90 -lblas -fopenmp 和30 30 30 30输入导致

 Time for reshaping method    1.3519000000000001E-003
 Difference between result matrices    12.380937791257775
 Time for straight  method    6.2549999999999997E-004
 Time for straight  method omp   1.2600000000000001E-003
 Time for naive loop   1.0008599999999999E-002
 Time for naive loop omp   1.6678999999999999E-002

虽然不是很好的加速。

解决方法

您正在使用同一组变量并行调用 DGEMM（因为并行区域中的变量在 Fortran 中默认共享）。由于数据竞争，这不起作用并产生奇怪的结果。您有两个选择：

找到一个并行的 BLAS 实现，其中 DGEMM 已经被线程化。英特尔 MKL 和 OpenBLAS 是主要候选者。英特尔 MKL 使用 OpenMP，更具体地说，它是使用英特尔 OpenMP 运行时构建的，因此它可能无法很好地与使用 GCC 编译的 OpenMP 代码配合使用，但它可以完美地与非线程代码配合使用。
并行调用 DGEMM 但不要使用相同的参数集。相反，对一个或两个张量执行块分解，并让每个线程为单独的块进行收缩。由于 Fortran 使用列优先存储，因此分解第二个张量可能是合适的：
```
C[i,k,l=1..L] = A[i,j] * B[j,l=1..L]
```
带有两个线程：
```
thread 0: C[i,l=1..L/2] = A[i,l=1..L/2]
thread 1: C[i,l=L/2+1..L] = A[i,l=L/2+1..L]
```
对于任意数量的线程，它归结为计算每个线程中 l 索引的开始和结束值，并相应地调整 DGEMM 的参数。

就我个人而言，我会采用并行 BLAS 实现。使用英特尔 MKL，您只需要链接并行驱动程序，它就会自动使用所有可用的 CPU。

块分解的示例实现如下。仅显示对原始代码的添加和更改：

  ! ADD: Use the OpenMP module
  Use :: omp_lib

  ! ADD: Variables used for the decomposition
  Integer :: ithr,istart,iend

  ! CHANGE: OpenMP with block decomposition
  !$omp parallel private(ithr,iend)
    ithr = omp_get_thread_num()

    ! First index (plane) in B for the current thread
    istart = ithr * nd / omp_get_num_threads()
    ! First index (plane) in B for the next thread
    iend = (ithr + 1) * nd / opm_get_num_threads()

    Call dgemm('N','N',na,nc * (iend - istart),nb,1.0_wp,a,nd,&
               b(1,1,1 + istart),Size(b,Dim = 1),&
               0.0_wp,c4(1,Size(c4,Dim = 1))
  !$omp end parallel

istart 是 B 的第一个平面的索引，每个单独的线程都在其上工作。 iend 是下一个线程的第一个平面，所以 iend - istart 是当前线程的平面数。 b(1,1 + istart) 是 B 中平面块的起点。 c4(1,1 + istart) 是结果张量中的块开始的位置。

确保您执行其中一项操作，但不要同时执行两项操作。即，如果您的 BLAS 实现是线程化的，但您决定使用块分解，请在 BLAS 库中禁用线程化。相反，如果您在 BLAS 实现中使用线程，则不要在您的代码中执行块分解。