1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
|
program matrixproduct
!> MCSC-6030G Project 1. Connor Moore, 2026 <connor@hhmoore.ca>
!> Performs a matrix-matrix multiplication using various methods
!> and compares the performance of each. The following are used:
!> 1. Basic triple-loop (iterative algorithm)
!> 2. Fortran native matmul routine
!> 3. LAPACK/BLAS library call
!> The wall times for these are compared with gfortran from gnu
!> and ifx from Intel. OpenMP is used for parallel comparisons
use omp_lib !> For OpenMP parallel do loops
use, intrinsic :: iso_fortran_env !> For named datatypes, e.g. real64
implicit none !> Don't infer any types from names
external :: dgemm !> double-precision general matrix-matrix multiplication
!> A number of variables will be delcared. This includes three square matrices (allocatable),
!> a start/end time holder, variables to hold time for each "technique" (loop/loop/matmul/blas),
!> and a (char and logical) for command line arugments. The ANSI escape char is also defined.
real(real64), allocatable, dimension(:,:) :: A, B, C
real(real64) :: loop_time, loop_alt_time, matmul_time, blas_time
integer(int64) :: start, end, clockrate
integer(int32) :: n, start_num, step_num, stop_num
character(len=32) :: temp_in
character(len=*), parameter :: ESC_CHAR=achar(27) !> Pretty printing with ANSI escape sequences
logical :: run_loops
!> Start by taking the command-line arguments. This is useful because
!> it lets us call the program from Bash with a variable matrix size
call get_command_argument(1, temp_in)
read(temp_in,*) start_num
call get_command_argument(2, temp_in)
read(temp_in,*) stop_num
call get_command_argument(3, temp_in)
read(temp_in,*) step_num
!> The last argument is a string [yes/no] that instructs the program
!> to either run with the triple-loops or ignore them completely.
call get_command_argument(4, temp_in)
select case (trim(temp_in))
case ('yes')
run_loops=.TRUE.
case ('no')
run_loops=.FALSE.
case default
write(error_unit,'(A)') ESC_CHAR // &
"[31mERROR: Unsupported input (" // trim(temp_in) // &
") for loop specification [yes/no]" // ESC_CHAR // "[0m"
stop
end select
write(*,'(A)') ESC_CHAR // "[32m" // COMPILER_VERSION() // achar(10) // COMPILER_OPTIONS() // ESC_CHAR // "[0m"
write(*,'(A,I0,A,I0,A,I0)') "Running with start=", start_num, ", stop=", stop_num, ", step=", step_num
call system_clock(count_rate=clockrate)
do n = start_num, stop_num, step_num
call prep_mats(A,B,C,n)
if(run_loops) then
call system_clock(count=start)
call triple_loop_mul(A,B,C,n)
call system_clock(count=end)
loop_time = real(end-start, real64)/real(clockrate, real64)
C = 0
call system_clock(count=start)
call triple_loop_mul_alt(A,B,C,n)
call system_clock(count=end)
loop_alt_time = real(end-start, real64)/real(clockrate, real64)
C = 0
endif
call system_clock(count=start)
C = matmul(A,B)
call system_clock(count=end)
matmul_time = real(end-start, real64)/real(clockrate, real64)
C = 0
call system_clock(count=start)
call dgemm('N', 'N', n, n, n, 1.0_real64, A, n, B, n, 0.0_real64, C, n)
call system_clock(count=end)
blas_time = real(end-start, real64)/real(clockrate, real64)
deallocate(A,B,C)
if(run_loops) then
write(*,'((i5),4(e16.8))') n, loop_time, loop_alt_time, matmul_time, blas_time
else
write(*,'((i5),2(e16.8))') n, matmul_time, blas_time
endif
enddo
contains
subroutine prep_mats(A,B,C,n)
real(real64), dimension(:,:), allocatable, intent(out) :: A,B,C
integer(int32), intent(in) :: n
allocate(A(n,n))
allocate(B(n,n))
allocate(C(n,n))
call random_number(A)
call random_number(B)
C = 0
end subroutine prep_mats
subroutine triple_loop_mul(A,B,C,n)
real(real64), dimension(:,:), intent(in) :: A,B
real(real64), dimension(:,:), intent(out) :: C
integer(int32), intent(in) :: n
integer(int32) :: i, j, k
!$omp parallel do private(j,k)
row: do i = 1,n
col: do j = 1,n
sum: do k = 1,n
C(i,j) = C(i,j) + A(i,k)*B(k,j)
end do sum
end do col
end do row
!$omp end parallel do
end subroutine triple_loop_mul
subroutine triple_loop_mul_alt(A,B,C,n)
real(real64), dimension(:,:), intent(in) :: A,B
real(real64), dimension(:,:), intent(out) :: C
integer(int32), intent(in) :: n
integer(int32) :: i, j, k
!$omp parallel do private(i,k)
col: do j = 1,n
row: do i = 1,n
sum: do k = 1,n
C(i,j) = C(i,j) + A(i,k)*B(k,j)
end do sum
end do row
end do col
!$omp end parallel do
end subroutine triple_loop_mul_alt
end program matrixproduct
|