matrixproduct.f90


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

program matrixproduct

    !> MCSC-6030G Project 1. Connor Moore, 2026 <connor@hhmoore.ca>
    !> Performs a matrix-matrix multiplication using various methods
    !> and compares the performance of each. The following are used:
    !>  1. Basic triple-loop (iterative algorithm)
    !>  2. Fortran native matmul routine
    !>  3. LAPACK/BLAS library call
    !> The wall times for these are compared with gfortran from gnu
    !> and ifx from Intel. OpenMP is used for parallel comparisons
        
    use omp_lib                         !> For OpenMP parallel do loops
    use, intrinsic :: iso_fortran_env   !> For named datatypes, e.g. real64
    implicit none                       !> Don't infer any types from names
    external :: dgemm                   !> double-precision general matrix-matrix multiplication
    
    !> A number of variables will be delcared. This includes three square matrices (allocatable),
    !> a start/end time holder, variables to hold time for each "technique" (loop/loop/matmul/blas),
    !> and a (char and logical) for command line arugments. The ANSI escape char is also defined.
    real(real64), allocatable, dimension(:,:) :: A, B, C
    real(real64) :: loop_time, loop_alt_time, matmul_time, blas_time
    integer(int64) :: start, end, clockrate
    integer(int32) :: n, start_num, step_num, stop_num
    character(len=32) :: temp_in
    character(len=*), parameter :: ESC_CHAR=achar(27) !> Pretty printing with ANSI escape sequences
    logical :: run_loops
    
    !> Start by taking the command-line arguments. This is useful because
    !> it lets us call the program from Bash with a variable matrix size

    call get_command_argument(1, temp_in)
    read(temp_in,*) start_num
    call get_command_argument(2, temp_in)
    read(temp_in,*) stop_num
    call get_command_argument(3, temp_in)
    read(temp_in,*) step_num

    !> The last argument is a string [yes/no] that instructs the program
    !> to either run with the triple-loops or ignore them completely.
    call get_command_argument(4, temp_in)
    select case (trim(temp_in))
        case ('yes')
            run_loops=.TRUE.
        case ('no')
            run_loops=.FALSE.
        case default
            write(error_unit,'(A)') ESC_CHAR // & 
                "[31mERROR: Unsupported input (" // trim(temp_in) // &
                ") for loop specification [yes/no]" // ESC_CHAR // "[0m"
            stop

    end select 

    write(*,'(A)') ESC_CHAR // "[32m" // COMPILER_VERSION() // achar(10) // COMPILER_OPTIONS() // ESC_CHAR // "[0m"
    write(*,'(A,I0,A,I0,A,I0)') "Running with start=", start_num, ", stop=", stop_num, ", step=", step_num

    call system_clock(count_rate=clockrate)
    
    do n = start_num, stop_num, step_num
        call prep_mats(A,B,C,n)

        if(run_loops) then
                call system_clock(count=start)
                call triple_loop_mul(A,B,C,n)
                call system_clock(count=end)
                loop_time = real(end-start, real64)/real(clockrate, real64)
                C = 0

                call system_clock(count=start)
                call triple_loop_mul_alt(A,B,C,n)
                call system_clock(count=end)
                loop_alt_time = real(end-start, real64)/real(clockrate, real64)
                C = 0
        endif

        call system_clock(count=start)
        C = matmul(A,B)
        call system_clock(count=end)
        matmul_time = real(end-start, real64)/real(clockrate, real64)
        C = 0

        call system_clock(count=start)
        call dgemm('N', 'N', n, n, n, 1.0_real64, A, n, B, n, 0.0_real64, C, n)
        call system_clock(count=end)
        blas_time = real(end-start, real64)/real(clockrate, real64)

        deallocate(A,B,C)

        if(run_loops) then
                write(*,'((i5),4(e16.8))') n, loop_time, loop_alt_time, matmul_time, blas_time
        else
                write(*,'((i5),2(e16.8))') n, matmul_time, blas_time
        endif
    enddo

contains

    subroutine prep_mats(A,B,C,n)
        real(real64), dimension(:,:), allocatable, intent(out) :: A,B,C
        integer(int32), intent(in) :: n

        allocate(A(n,n))
        allocate(B(n,n))
        allocate(C(n,n))

        call random_number(A)
        call random_number(B)
        C = 0

    end subroutine prep_mats
    
    subroutine triple_loop_mul(A,B,C,n)
        real(real64), dimension(:,:), intent(in) :: A,B
        real(real64), dimension(:,:), intent(out) :: C
        integer(int32), intent(in) :: n
        integer(int32) :: i, j, k

        !$omp parallel do private(j,k)
        row: do i = 1,n
            col: do j = 1,n
                sum: do k = 1,n
                    C(i,j) = C(i,j) + A(i,k)*B(k,j)
                end do sum
            end do col
        end do row
        !$omp end parallel do

    end subroutine triple_loop_mul

    subroutine triple_loop_mul_alt(A,B,C,n)
        real(real64), dimension(:,:), intent(in) :: A,B
        real(real64), dimension(:,:), intent(out) :: C
        integer(int32), intent(in) :: n
        integer(int32) :: i, j, k

        !$omp parallel do private(i,k)
        col: do j = 1,n
            row: do i = 1,n
                sum: do k = 1,n
                    C(i,j) = C(i,j) + A(i,k)*B(k,j)
                end do sum
            end do row
        end do col 
        !$omp end parallel do

    end subroutine triple_loop_mul_alt

end program matrixproduct