1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
|
program matrixproduct
!> Performs a matrix-matrix multiplication using various methods
!> and compares the performance of each. The following are used:
!> 1. Basic triple-loop (iterative algorithm)
!> 2. Fortran native matmul routine
!> 3. LAPACK/BLAS library call
use omp_lib
use, intrinsic :: iso_fortran_env
implicit none
external :: dgemm !> double-precision general matrix-matrix multiplication
real(real64), allocatable, dimension(:,:) :: A, B, C
real(real64) :: start, end, loop_time, loop_alt_time, matmul_time, blas_time
integer(int32) :: n, start_num, step_num, stop_num
character(10) :: temp_in
logical :: run_loops
!> Start by taking the command-line arguments. This is useful because
!> it lets us call the program from Bash with a variable matrix size
call get_command_argument(1, temp_in)
read(temp_in,'(i10)') start_num
call get_command_argument(2, temp_in)
read(temp_in,'(i10)') stop_num
call get_command_argument(3, temp_in)
read(temp_in,'(i10)') step_num
!> The last argument is a string [yes/no] that instructs the program
!> to either run with the triple-loops or ignore them completely.
call get_command_argument(4, temp_in)
select case (trim(temp_in))
case ('yes')
run_loops=.TRUE.
case ('no')
run_loops=.FALSE.
case default
write(*,'("WARNING:",A," is not a supported argument [yes/no], defaulting to YES")') temp_in
run_loops=.TRUE.
end select
write(*,'("Compiled with ",A,A," on ",A)') COMPILER_VERSION(), COMPILER_OPTIONS()
write(*,'("Running with start=",I0,", stop=",I0,", step=",I0)') start_num, stop_num, step_num
do n = start_num, stop_num, step_num
call prep_mats(A,B,C,n)
if(run_loops) then
call cpu_time(start)
call triple_loop_mul(A,B,C,n)
call cpu_time(end)
loop_time = end-start
C = 0
call cpu_time(start)
call triple_loop_mul_alt(A,B,C,n)
call cpu_time(end)
loop_alt_time = end-start
C = 0
endif
call cpu_time(start)
C = matmul(A,B)
call cpu_time(end)
matmul_time = end-start
C = 0
call cpu_time(start)
call dgemm('N', 'N', n, n, n, 1.0_real64, A, n, B, n, 0.0_real64, C, n)
call cpu_time(end)
blas_time = end-start
deallocate(A,B,C)
if(run_loops) then
write(*,'((i5),4(e16.8))') n, loop_time, loop_alt_time, matmul_time, blas_time
else
write(*,'((i5),2(e16.8))') n, matmul_time, blas_time
endif
enddo
contains
subroutine prep_mats(A,B,C,n)
real(real64), dimension(:,:), allocatable, intent(out) :: A,B,C
integer(int32), intent(in) :: n
allocate(A(n,n))
allocate(B(n,n))
allocate(C(n,n))
call random_number(A)
call random_number(B)
C = 0
end subroutine prep_mats
subroutine triple_loop_mul(A,B,C,n)
real(real64), dimension(:,:), intent(in) :: A,B
real(real64), dimension(:,:), intent(out) :: C
integer(int32), intent(in) :: n
integer(int32) :: i, j, k
!$omp parallel do private(j,k)
row: do i = 1,n
col: do j = 1,n
sum: do k = 1,n
C(i,j) = C(i,j) + A(i,k)*B(k,j)
end do sum
end do col
end do row
!$omp end parallel do
end subroutine triple_loop_mul
subroutine triple_loop_mul_alt(A,B,C,n)
real(real64), dimension(:,:), intent(in) :: A,B
real(real64), dimension(:,:), intent(out) :: C
integer(int32), intent(in) :: n
integer(int32) :: i, j, k
!$omp parallel do private(j,k)
col: do j = 1,n
row: do i = 1,n
sum: do k = 1,n
C(i,j) = C(i,j) + A(i,k)*B(k,j)
end do sum
end do row
end do col
!$omp end parallel do
end subroutine triple_loop_mul_alt
end program matrixproduct
|