module cula_test use cudafor contains ! gpu error reporting routine subroutine check_status(status) integer status integer info integer cula_geterrorinfo info = cula_geterrorinfo() if (status .ne. 0) then if (status .eq. 7) then write(*,*) 'invalid value for parameter ', info else if (status .eq. 8) then write(*,*) 'data error (', info ,')' else if (status .eq. 9) then write(*,*) 'blas error (', info ,')' else if (status .eq. 10) then write(*,*) 'runtime error (', info ,')' else call cula_getstatusstring(status) endif stop 1 end if end subroutine ! cpu test (baseline) subroutine do_cpu_test(n,nrhs,ain,bin) ! input real,dimension(:,:) :: ain,bin ! allocations real,dimension(:,:),allocatable :: a,b,ans integer,dimension(:),allocatable :: ipiv integer n,nrhs integer c1,c2,cr,cm real norm ! back up input for reconstruction test allocate( a(n,n), b(n,nrhs), ipiv(n), ans(n,nrhs) ) a = ain b = bin ! start test call system_clock( c1, cr, cm ) print *, 'starting cpu test...' ! call lapack solver call sgesv(n,nrhs,a,n,ipiv,b,n,info) ! stop test call system_clock( count=c2 ) print *, ' runtime:', 1.e3*real(c2-c1) / real(cr), 'ms' print *, ' gflops:', (0.66*n**3.) / (real(c2-c1) / real(cr)) / (1.e9) ! check answer ans = bin; call sgemm('n','n',n,nrhs,n,1.0,ain,n,b,n,-1.0,ans,n) norm = slange('1',n,nrhs,ans,n,work) / real(n) print *, ' error:', norm print *, '' ! cleanup deallocate(a,b,ipiv,ans) end subroutine do_cpu_test ! cula test (host interface) subroutine do_cula_host_test(n,nrhs,ain,bin) ! input real,dimension(:,:) :: ain,bin ! allocations (all on host) real,dimension(:,:),allocatable :: a,b,ans integer,dimension(:),allocatable :: ipiv integer n,nrhs,status integer c1,c2,cr,cm real norm ! back up input for reconstruction test allocate( a(n,n), b(n,nrhs), ipiv(n), ans(n,nrhs) ) a = ain b = bin ! start test call system_clock( c1,cr,cm ) print *, 'starting cula (host interface) test...' ! call cula solver (host interface) status = cula_sgesv(n,nrhs,a,n,ipiv,b,n) call check_status(status) ! stop test call system_clock( count=c2 ) print *, ' runtime:', 1.e3*real(c2-c1) / real(cr), 'ms' print *, ' gflops:', (0.66*n**3.) / (real(c2-c1) / real(cr)) / (1.e9) ! check answer ans = bin; call sgemm('n','n',n,nrhs,n,1.0,ain,n,b,n,-1.0,ans,n) norm = slange('1',n,nrhs,ans,n,work) / real(n) print *, ' error:', norm print *, '' ! cleanup deallocate(a,b,ipiv,ans) end subroutine do_cula_host_test ! cula test (device interface) subroutine do_cula_device_test(n,nrhs,ain,bin) ! input real,dimension(:,:) :: ain,bin ! allocations (all on host) real,dimension(:,:),allocatable :: a,b,ans integer n,nrhs,status integer c1,c2,cr,cm real norm ! gpu memory real,device,dimension(:,:),allocatable :: a_dev,b_dev integer,device,dimension(:),allocatable :: ipiv_dev ! back up input for reconstruction test allocate( a(n,n), b(n,nrhs), ans(n,nrhs) ) a(1:n,1:n) = ain b(1:n,1:nrhs) = bin ! allocate gpu memory allocate( a_dev(n,n), b_dev(n,nrhs), ipiv_dev(n) ) ! start test call system_clock( c1,cr,cm ) print *, 'starting cula (device interface) test...' ! copy memory to gpu a_dev = a b_dev = b ! call cula solver (device interface) status = cula_device_sgesv(n,nrhs,a_dev,n,ipiv_dev,b_dev,n) ! copy answer to host b = b_dev ! stop test call system_clock( count=c2 ) print *, ' runtime:', 1.e3*real(c2-c1) / real(cr), 'ms' print *, ' gflops:', (0.66*n**3.) / (real(c2-c1) / real(cr)) / (1.e9) ! check answer ans(1:n,1:nrhs) = bin; call sgemm('n','n',n,nrhs,n,1.,ain,n,b,n,-1.,ans,n) norm = slange('1',n,nrhs,ans,n,work) / real(n) print *, ' error:', norm print *, '' ! cleanup deallocate(a,b,ans) deallocate(a_dev,b_dev,ipiv_dev) end subroutine do_cula_device_test end module cula_test ! main program program cula use cula_test real error,eps ! Host memory real,dimension(:,:),allocatable :: a, b integer n, info, i, j, status n = 10000 nrhs = 1 print *,'cula + pgfortran test (matrix solve)' print *,' array size: ', n, ' by ', n print *,' right hand sides: ', nrhs print *,'' allocate( a(n,n), b(n,nrhs) ) ! intialize a and b call random_number(a) call random_number(b) ! Make sure a() isn't singular do i=1,n a(i,i) = 10. * a(i,i) + 10. enddo ! initialize cula status = cula_initialize() call check_status(status) ! do cpu test (baseline) call do_cpu_test(n,nrhs,a,b) ! do gpu test (host interface) call do_cula_host_test(n,nrhs,a,b) ! do gpu test (device interface) call do_cula_device_test(n,nrhs,a,b) end program cula