首先請看下面的代碼:
!> Device code for the assumed-shape variant of the example kernel.
module add
  use cudafor
  implicit none
contains
  !> Kernel: each thread stores its 1-based global x index into its own
  !! element of a.
  !!
  !! a is an assumed-shape dummy. According to the PGI explanation quoted
  !! with this listing, the array descriptor for such a dummy is allocated,
  !! initialized and passed on every launch.
  attributes(global) subroutine call_global(a)
    integer, intent(inout) :: a(:)
    integer :: i
    ! 1-based global thread index along x.
    i = threadIdx%x + (blockIdx%x - 1)*blockDim%x
    ! Skip threads beyond the end of a, in case the launch configuration
    ! provides more threads than elements.
    if (i <= size(a)) a(i) = i
  end subroutine call_global
end module add
! Host driver: launches call_global several times on a device array,
! copies the array back to the host and prints its first element.
program test
  use add
  implicit none
  integer, parameter :: n_elems = 256
  integer, parameter :: n_blocks = 2, n_threads = 128
  integer, parameter :: n_launches = 10
  integer, device :: d_a(n_elems)
  integer :: a(n_elems)
  integer :: launch

  do launch = 1, n_launches
    call call_global<<<n_blocks, n_threads>>>(d_a)
  end do

  ! Assignment from the device array to the host array.
  a = d_a
  print *, "a(1)=", a(1)
end program test
把COMPUTE_PROFILE設置爲1,profile的分析結果:
# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 13859dd8cdd69c55
method,gputime,cputime,occupancy
method=[ memcpyHtoD ] gputime=[ 0.960 ] cputime=[ 6.234 ]
method=[ add_call_global_ ] gputime=[ 3.360 ] cputime=[ 18.620 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 4.235 ]
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 5.075 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.925 ]
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 4.782 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.893 ]
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.502 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.729 ]
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.471 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.835 ]
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.420 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.800 ] cputime=[ 3.744 ]
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.660 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 3.731 ]
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.437 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.689 ]
method=[ add_call_global_ ] gputime=[ 1.600 ] cputime=[ 4.476 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 3.696 ]
method=[ add_call_global_ ] gputime=[ 1.568 ] cputime=[ 4.375 ] occupancy=[ 1.000 ]
method=[ memcpyDtoH ] gputime=[ 2.080 ] cputime=[ 14.631 ]
什麼情況?每次調用kernel之前都有一次HtoD數據拷貝!!然後給出PGI的解釋:
The array is declared assumed-shape in the global subroutine, so we have to allocate, initialize, and pass the descriptor every time.
PGI給出兩個解決辦法:
1、參數以reference形式傳遞(即把kernel的啞元聲明爲假定大小數組a(*),只傳遞地址,不需要descriptor)
修改代碼如下:
module add
  use cudafor
contains
  !> Kernel: each thread writes its 1-based global x index into the
  !! matching element of a.
  !! a is declared assumed-size (a(*)), so its extent is not available here.
  attributes(global) subroutine call_global(a)
    integer :: a(*)
    integer :: gid
    gid = (blockIdx%x - 1)*blockDim%x + threadIdx%x
    a(gid) = gid
  end subroutine call_global
end module add
! Host driver for the assumed-size kernel variant: repeated launches,
! then a copy back to the host and a print of the first element.
program test
  use add
  implicit none
  integer, parameter :: blocks = 2
  integer, parameter :: threads = 128
  integer, parameter :: n = blocks*threads   ! 256 elements, one per thread
  integer, parameter :: repeats = 10
  integer, device :: d_a(n)
  integer :: a(n)
  integer :: rep

  do rep = 1, repeats
    call call_global<<<blocks, threads>>>(d_a)
  end do

  a = d_a
  print *, "a(1)=", a(1)
end program test
如果不注意看的話,估計看不出啥區別,請注意代碼第5行:啞元聲明由a(:)(assumed-shape)改成了a(*)(assumed-size)。看懂了吧?
profile的分析結果如下:
# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 13859dd8cdbb1678
method,gputime,cputime,occupancy
method=[ add_call_global_ ] gputime=[ 3.168 ] cputime=[ 16.332 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.883 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.376 ] cputime=[ 3.722 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.344 ] cputime=[ 3.291 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.566 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.438 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.333 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.376 ] cputime=[ 3.304 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.311 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 3.226 ] occupancy=[ 1.000 ]
method=[ memcpyDtoH ] gputime=[ 2.080 ] cputime=[ 12.897 ]
每次循環之前的數據拷貝過程沒有了。
2、把變量放置到module中(測試有問題)
我們繼續修改代碼如下:
!> Variant with the device array declared at module scope.
module add
  use cudafor
  implicit none
  integer, device :: d_a(256)
contains
  !> Kernel: each thread stores its 1-based global x index into its own
  !! element of a.
  !!
  !! Note that the kernel still receives its array through the assumed-shape
  !! dummy a; it does not reference the module variable d_a directly.
  attributes(global) subroutine call_global(a)
    integer, intent(inout) :: a(:)
    integer :: i
    ! 1-based global thread index along x.
    i = threadIdx%x + (blockIdx%x - 1)*blockDim%x
    ! Skip threads beyond the end of a, in case the launch configuration
    ! provides more threads than elements.
    if (i <= size(a)) a(i) = i
  end subroutine call_global
end module add
! Host driver: d_a is not declared here, it comes from module add.
! The kernel is launched repeatedly with d_a as its argument, then the
! array is copied to the host and its first element printed.
program test
  use add
  implicit none
  integer, parameter :: n_blocks = 2, n_threads = 128
  integer, parameter :: n_launches = 10
  integer :: a(256)
  integer :: k

  do k = 1, n_launches
    call call_global<<<n_blocks, n_threads>>>(d_a)
  end do

  a = d_a
  print *, "a(1)=", a(1)
end program test
profile的分析結果如下:
# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 13859fff450cdbde
method,gputime,cputime,occupancy
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 21.286 ]
method=[ add_call_global_ ] gputime=[ 2.112 ] cputime=[ 7.290 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 69.106 ]
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 6.707 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 54.235 ]
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 6.123 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 75.521 ]
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 6.707 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 78.729 ]
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 6.123 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.864 ] cputime=[ 67.357 ]
method=[ add_call_global_ ] gputime=[ 1.504 ] cputime=[ 5.832 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 53.944 ]
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 5.832 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 53.652 ]
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 5.540 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 76.105 ]
method=[ add_call_global_ ] gputime=[ 1.472 ] cputime=[ 6.123 ] occupancy=[ 1.000 ]
method=[ memcpyHtoD ] gputime=[ 0.832 ] cputime=[ 69.690 ]
method=[ add_call_global_ ] gputime=[ 1.536 ] cputime=[ 5.540 ] occupancy=[ 1.000 ]
method=[ memcpyDtoH ] gputime=[ 2.688 ] cputime=[ 87.477 ]
問題還是依舊。等待繼續反饋看結果。
7-31號補充:
把數據放置到module中的用法錯了:kernel應直接引用module中的device變量,而不是再把它作爲參數傳入。正確的方法如下:
!> Variant in which the kernel works on a module-scope device array.
module add
  use cudafor
  implicit none
  integer, device :: d_a(256,16)
contains
  !> Kernel: each thread computes its 1-based global (x, y) index pair
  !! (i, j) and stores i + j into d_a(i, j).
  !!
  !! The kernel takes no arguments; it references the module variable d_a
  !! directly.
  attributes(global) subroutine call_global()
    integer :: i, j
    i = threadIdx%x + (blockIdx%x - 1)*blockDim%x
    j = threadIdx%y + (blockIdx%y - 1)*blockDim%y
    ! Skip threads that fall outside d_a, in case the launch configuration
    ! covers more than 256 x 16 threads.
    if (i <= size(d_a, 1) .and. j <= size(d_a, 2)) d_a(i, j) = i + j
  end subroutine call_global
end module add
! Host driver for the argument-free kernel: d_a comes from module add.
program test
  use add
  implicit none
  integer :: a(256,16)
  integer :: i
  type(dim3) :: tblock, tgrid

  ! 2 x 8 blocks of 128 x 2 threads give 256 x 16 threads in total,
  ! matching the shape of a.
  tblock = dim3(128, 2, 1)
  tgrid = dim3(2, 8, 1)

  do i = 1, 10
    call call_global<<<tgrid, tblock>>>()
  end do

  a = d_a
  ! The label names the element actually printed (a is rank 2).
  print *, "a(1,1)=", a(1,1)
end program test
profile分析結果:
# CUDA_PROFILE_LOG_VERSION 2.0
# CUDA_DEVICE 0 GeForce GTX 750
# CUDA_CONTEXT 1
# TIMESTAMPFACTOR 1385ef0816adc600
method,gputime,cputime,occupancy
method=[ add_call_global_ ] gputime=[ 1.888 ] cputime=[ 8.164 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 5.249 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 4.957 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.472 ] cputime=[ 4.957 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.440 ] cputime=[ 4.957 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 4.665 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.440 ] cputime=[ 4.374 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.440 ] cputime=[ 4.957 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 4.665 ] occupancy=[ 1.000 ]
method=[ add_call_global_ ] gputime=[ 1.408 ] cputime=[ 4.374 ] occupancy=[ 1.000 ]
method=[ memcpyDtoH ] gputime=[ 4.768 ] cputime=[ 167.663 ]
解決了不必要的H2D的數據拷貝。