我的程序的核心操作之一是对N个速度场求和:每个速度场由一个高斯随机变量加权,叠加成一个单一的速度场。用LaTeX表示法,
U(x,y,z) = \sum_{n=1}^{N} \phi_{n}(x,y,z) \xi_{n}
其中\phi是速度场,U是结果。我目前正在考虑不同的实现方案,以获得尽可能多的加速,因为我的矩阵可能非常大:每个场\phi的维度都是(100,100,30),总共有720个这样的场。
我目前正在研究两种存储这些数据的方法,以便高效使用:第一种是把所有模态沿第三个维度堆叠成一个三维数组A(100,100,30×720);第二种是把每个模态展平成一列,存成一个二维矩阵B(300 000, 720)。
第二个方法的方便之处在于,一旦我有了随机向量\xi,就可以使用BLAS dgemv计算矩阵向量积,结果表明,第二个选项是更有效的。
但是,由于我们讨论的是维度为(300 000 × 720)的矩阵,我认为使用OpenMP会带来一些好处。我已经设置了手动调度:将矩阵划分为子矩阵,用私有指针指向矩阵的各个部分,并对每个子矩阵调用dgemv。结果是,即使只用两个线程(这不应该带来太多通信开销),它也比在整个矩阵上的原始BLAS操作花费更多时间,所以我显然遗漏了什么。我用简单的 gfortran -O2 -o main.out MWE.F90 -llapack -lblas -fopenmp 编译,我使用的代码是:
PROGRAM new_modes
!! MWE: compare five ways of computing U = sum_{n=1}^{N} phi_n * xi_n,
!! where each mode phi_n is a (jpi,jpj,jpk) field and xi_n a scalar weight.
!!   Option 1: whole-array summation over the 3D storage A.
!!   Option 2: one DGEMV on the flattened (np x nmod) matrix B.
!!   Option 3: static row blocks of B, one DGEMV per OpenMP thread.
!!   Option 4: OpenMP-parallel loop of DDOT over the rows of B.
!!   Option 5: static column blocks of the transposed matrix Bt, DGEMV('t').
!! Options 2-5 are checked against option 1 via MAXVAL(ABS(U-U_ref)).
IMPLICIT NONE
INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND(12,307)
! Hand-written interfaces for the OpenMP runtime routines (the MWE
! deliberately avoids USE omp_lib).
interface
function OMP_get_wtime()
INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND(12,307)
real(wp) :: OMP_get_wtime
end function OMP_get_wtime
function OMP_get_thread_num()
integer(kind = 4) :: OMP_get_thread_num
end function OMP_get_thread_num
subroutine OMP_set_num_threads(nthreads)
integer(kind = 4), intent(in) :: nthreads
end subroutine OMP_set_num_threads
end interface
INTEGER, PARAMETER :: nn_tlu_nmod = 720      ! number of modes N
INTEGER, PARAMETER :: jpi = 100! 32
INTEGER, PARAMETER :: jpj = 100! 17
INTEGER, PARAMETER :: jpk = 30! 31
INTEGER, PARAMETER :: np = jpk * jpj * jpi   ! flattened size of one field
!
REAL(wp), ALLOCATABLE, DIMENSION(:,:,:) :: A         ! modes stacked along dim 3
REAL(wp), ALLOCATABLE, TARGET, DIMENSION(:,:) :: B   ! (np, nmod) flattened modes
REAL(wp), ALLOCATABLE, TARGET, DIMENSION(:,:) :: Bt  ! (nmod, np) transpose of B
                                                     ! (declaration was missing in the original)
REAL(wp), POINTER, DIMENSION(:,:) :: sub_B
REAL(wp), ALLOCATABLE, DIMENSION(:,:,:) :: U_ref
REAL(wp), ALLOCATABLE, DIMENSION(:,:,:) :: U
REAL(wp), ALLOCATABLE, TARGET, DIMENSION(:) :: vec_U
REAL(wp), POINTER, DIMENSION(:) :: sub_vec_U
REAL(wp), ALLOCATABLE, TARGET, DIMENSION(:) :: tcoeff
REAL(wp), POINTER, DIMENSION(:) :: tcoef_pt
!
REAL(wp) :: tic, toc
INTEGER :: ji, jj, jk, jm
INTEGER :: m_idx
INTEGER :: cnt
!
INTEGER :: ierr
!
REAL(wp) :: ddot                 ! external BLAS dot-product function
!
!-------------------------------------------------------------------------------------
! Declaration for the Parallel parameters
!-------------------------------------------------------------------------------------
INTEGER, PARAMETER :: num_threads = 1
INTEGER, SAVE :: myID
!$OMP THREADPRIVATE(myID)
!-------------------------------------------------------------------------------------
! Declaration for the row scheduling: work_sharing(t) is the first row owned
! by thread t, and work_sharing(num_threads+1) = np + 1.
!-------------------------------------------------------------------------------------
INTEGER :: sub_n
INTEGER :: istart
INTEGER, ALLOCATABLE :: work_sharing(:)
WRITE(6,*) 'Number of threads employed in the solver:', num_threads
! Pin the team size so it matches the work_sharing partition below.
CALL OMP_set_num_threads(num_threads)
ALLOCATE( A(jpi,jpj,jpk * nn_tlu_nmod), stat=ierr )
ALLOCATE( B(jpi*jpj*jpk, nn_tlu_nmod), stat=ierr )
ALLOCATE( Bt( nn_tlu_nmod, jpi*jpj*jpk), stat=ierr )
ALLOCATE( tcoeff(nn_tlu_nmod), stat=ierr )
ALLOCATE( U(jpi,jpj,jpk), &
   &      U_ref(jpi,jpj,jpk), &
   &      vec_U(jpi*jpj*jpk), stat=ierr )
! Static row partition of B among the threads. In the original code
! work_sharing was never allocated nor filled, so options 3 and 5
! accessed an unallocated array.
ALLOCATE( work_sharing(num_threads + 1), stat=ierr )
work_sharing(1) = 1
DO jm = 1, num_threads
   work_sharing(jm + 1) = 1 + ( np * jm ) / num_threads
END DO
! Fill the modes with reproducible dummy data, then build B and Bt.
cnt = 0
DO jm = 1, nn_tlu_nmod
   !
   m_idx = ( jm - 1 ) * jpk        ! zero-based slab offset of mode jm inside A
   !
   DO jk = 1, jpk
      DO jj = 1, jpj
         DO ji = 1, jpi
            cnt = cnt + 1
            A(ji,jj,m_idx + jk) = cnt * jm
         END DO
      END DO
   END DO
   B(:,jm) = RESHAPE(A(:,:,m_idx + 1: m_idx + jpk), (/jpi*jpj*jpk/) )
   Bt(jm,:) = B(:,jm)
END DO
tcoeff = 1._wp
! First option, whole-array summation (already tested faster than an
! explicit do loop). This is the reference result.
tic = omp_get_wtime()
U = 0._wp
!
DO jm = 1, nn_tlu_nmod
   ! Define zero-th indexed mode
   m_idx = ( jm - 1 ) * jpk
   !
   U(:,:,:) = U(:,:,:) + A(:,:,m_idx + 1 : m_idx + jpk ) * tcoeff(jm)
   !
ENDDO
!
toc = omp_get_wtime()
print '("Time, operation without for loops = ",f13.7," seconds.")', toc-tic
U_ref = U
! Second option, exploit a different shape of the matrix in order to use BLAS
! (usage of pointers keeps the argument contiguous, no array temporary).
tic = omp_get_wtime()
U = 0._wp
sub_B => B(:, :)
sub_n = np
sub_vec_U => vec_U(:)
tcoef_pt => tcoeff(:)
! CALL DGEMV( trans, m, n, alpha, A, ldA, x, incx, beta, y, incy )
CALL DGEMV( 'n', sub_n, nn_tlu_nmod, 1._wp, sub_B, sub_n, tcoef_pt, 1, 0._wp, sub_vec_U, 1 )
U = RESHAPE(vec_U, (/ jpi, jpj, jpk /) )
!
toc = omp_get_wtime()
print '("Time, 2D matrix with BLAS = ",f13.7," seconds.", f25.7)', toc-tic, MAXVAL(ABS(U-U_ref))
! Third option, divide the job statically between the threads and call DGEMV
! on each row block. Passing B(istart,1) together with ldA = np hands BLAS
! the block in place; a section like B(istart:iend,:) would be non-contiguous
! and force an expensive copy-in/copy-out.
tic = omp_get_wtime()
vec_U = 0._wp
!$OMP PARALLEL PRIVATE(istart, sub_n) NUM_THREADS(num_threads)
myID = omp_get_thread_num()+1
istart = work_sharing(myID)
sub_n = work_sharing(myID+1)-istart
! CALL DGEMV( trans, m, n, alpha, A, ldA, x, incx, beta, y, incy )
CALL DGEMV( 'n', sub_n, nn_tlu_nmod, 1._wp, B(istart,1), np, tcoeff, 1, 0._wp, vec_U(istart), 1 )
!$OMP END PARALLEL
U = RESHAPE(vec_U, (/ jpi, jpj, jpk /) )
toc = omp_get_wtime()
print '("Time, Static scheduling BLAS OpenMP = ",f13.7," seconds.", f25.7)', toc-tic, MAXVAL(ABS(U-U_ref))
! Fourth option, run a parallel DO and use DDOT. NOTE: B(ji,:) is a strided
! section, so every call builds a length-nmod temporary copy.
tic = omp_get_wtime()
vec_U = 0._wp
!$OMP PARALLEL PRIVATE(ji) NUM_THREADS(num_threads)
!$OMP DO SCHEDULE (STATIC)
DO ji=1,np
   ! res = DDOT( n, x, inc_x, y, inc_y )
   vec_U(ji) = DDOT( nn_tlu_nmod, B(ji, :), 1, tcoeff, 1 )
END DO
!$OMP END DO
!$OMP END PARALLEL
U = RESHAPE(vec_U, (/ jpi, jpj, jpk /) )
toc = omp_get_wtime()
print '("Time, Parallelized for loop DDOT = ",f13.7," seconds.", f25.7)', toc-tic, MAXVAL(ABS(U-U_ref))
! Fifth option, same static split but on the transposed matrix Bt, so each
! thread's block Bt(:,istart:istart+sub_n-1) is contiguous and DGEMV('t')
! computes the same row-block product.
tic = omp_get_wtime()
vec_U = 0._wp
!$OMP PARALLEL PRIVATE(istart, sub_n) NUM_THREADS(num_threads)
myID = omp_get_thread_num()+1
istart = work_sharing(myID)
sub_n = work_sharing(myID+1) - istart
! CALL DGEMV( trans, m, n, alpha, A, ldA, x, incx, beta, y, incy )
CALL DGEMV( 't', nn_tlu_nmod, sub_n, 1._wp, Bt(1,istart), nn_tlu_nmod, tcoeff, 1, 0._wp, vec_U(istart), 1 )
!$OMP END PARALLEL
U = RESHAPE(vec_U, (/ jpi, jpj, jpk /) )
toc = omp_get_wtime()
print '("Time, Static scheduling tran OpenMP = ",f13.7," seconds.", f25.7)', toc-tic, MAXVAL(ABS(U-U_ref))
DEALLOCATE( A, B, Bt, tcoeff, U, U_ref, vec_U, work_sharing )
END PROGRAM new_modes
编辑:——我已经按照PierU的所有建议修改了初始代码,并添加了第五种方案,即从一开始就直接存储转置后的矩阵(在我的情形中是可行的),以便让每个线程处理连续的内存区域。出于某些原因,这最后一种方案反而比第三种慢,这对我来说有违直觉。
发布于 2022-09-14 13:11:26
在第三个选项中(在并行区域中使用DGEMV),对DGEMV的调用传入了大的非连续子数组,这会触发复制进/复制出(copy-in/copy-out)行为。把
sub_B => B(istart:inext-1, :)
...
CALL DGEMV('n', sub_n, nn_tlu_nmod, 1._wp, sub_B, sub_n, tcoef_pt, 1, 0._wp, sub_vec_U, 1)
替换为
...
CALL DGEMV('n', sub_n, nn_tlu_nmod, 1._wp, B(istart,1), np, tcoef_pt, 1, 0._wp, sub_vec_U, 1)
就避免了大拷贝,而且效率高得多:使用2个线程时,我测得了约1.6倍的加速,而不是像初始代码那样慢3倍。
一般来说,只要可能,就应当避免复制进/复制出行为,尤其是在处理大数组时(这里的A和B数组各占约1.6GB)。
我在这里不再重复我上面就这段代码所作的其他评论。
https://stackoverflow.com/questions/73714643
复制相似问题