program matrixproduct

    !> MCSC-6030G Project 1. Connor Moore, 2026 <connor@hhmoore.ca>
    !> Performs a matrix-matrix multiplication using various methods
    !> and compares the performance of each. The following are used:
    !>  1. Basic triple-loop (iterative algorithm)
    !>  2. Fortran native matmul routine
    !>  3. LAPACK/BLAS library call
    !> The wall times for these are compared with gfortran from gnu
    !> and ifx from Intel. OpenMP is used for parallel comparisons
        
    use omp_lib                         !> For OpenMP parallel do loops
    use, intrinsic :: iso_fortran_env   !> For named datatypes, e.g. real64
    implicit none                       !> Don't infer any types from names
    external :: dgemm                   !> double-precision general matrix-matrix multiplication
    
    !> A number of variables will be delcared. This includes three square matrices (allocatable),
    !> a start/end time holder, variables to hold time for each "technique" (loop/loop/matmul/blas),
    !> and a (char and logical) for command line arugments. The ANSI escape char is also defined.
    real(real64), allocatable, dimension(:,:) :: A, B, C
    real(real64) :: loop_time, loop_alt_time, matmul_time, blas_time
    integer(int64) :: start, end, clockrate
    integer(int32) :: n, start_num, step_num, stop_num
    character(len=32) :: temp_in
    character(len=*), parameter :: ESC_CHAR=achar(27) !> Pretty printing with ANSI escape sequences
    logical :: run_loops
    
    !> Start by taking the command-line arguments. This is useful because
    !> it lets us call the program from Bash with a variable matrix size

    call get_command_argument(1, temp_in)
    read(temp_in,*) start_num
    call get_command_argument(2, temp_in)
    read(temp_in,*) stop_num
    call get_command_argument(3, temp_in)
    read(temp_in,*) step_num

    !> The last argument is a string [yes/no] that instructs the program
    !> to either run with the triple-loops or ignore them completely.
    call get_command_argument(4, temp_in)
    select case (trim(temp_in))
        case ('yes')
            run_loops=.TRUE.
        case ('no')
            run_loops=.FALSE.
        case default
            write(error_unit,'(A)') ESC_CHAR // & 
                "[31mERROR: Unsupported input (" // trim(temp_in) // &
                ") for loop specification [yes/no]" // ESC_CHAR // "[0m"
            stop

    end select 

    write(*,'(A)') ESC_CHAR // "[32m" // COMPILER_VERSION() // achar(10) // COMPILER_OPTIONS() // ESC_CHAR // "[0m"
    write(*,'(A,I0,A,I0,A,I0)') "Running with start=", start_num, ", stop=", stop_num, ", step=", step_num

    call system_clock(count_rate=clockrate)
    
    do n = start_num, stop_num, step_num
        call prep_mats(A,B,C,n)

        if(run_loops) then
                call system_clock(count=start)
                call triple_loop_mul(A,B,C,n)
                call system_clock(count=end)
                loop_time = real(end-start, real64)/real(clockrate, real64)
                C = 0

                call system_clock(count=start)
                call triple_loop_mul_alt(A,B,C,n)
                call system_clock(count=end)
                loop_alt_time = real(end-start, real64)/real(clockrate, real64)
                C = 0
        endif

        call system_clock(count=start)
        C = matmul(A,B)
        call system_clock(count=end)
        matmul_time = real(end-start, real64)/real(clockrate, real64)
        C = 0

        call system_clock(count=start)
        call dgemm('N', 'N', n, n, n, 1.0_real64, A, n, B, n, 0.0_real64, C, n)
        call system_clock(count=end)
        blas_time = real(end-start, real64)/real(clockrate, real64)

        deallocate(A,B,C)

        if(run_loops) then
                write(*,'((i5),4(e16.8))') n, loop_time, loop_alt_time, matmul_time, blas_time
        else
                write(*,'((i5),2(e16.8))') n, matmul_time, blas_time
        endif
    enddo

contains

    subroutine prep_mats(A,B,C,n)
        real(real64), dimension(:,:), allocatable, intent(out) :: A,B,C
        integer(int32), intent(in) :: n

        allocate(A(n,n))
        allocate(B(n,n))
        allocate(C(n,n))

        call random_number(A)
        call random_number(B)
        C = 0

    end subroutine prep_mats
    
    subroutine triple_loop_mul(A,B,C,n)
        real(real64), dimension(:,:), intent(in) :: A,B
        real(real64), dimension(:,:), intent(out) :: C
        integer(int32), intent(in) :: n
        integer(int32) :: i, j, k

        !$omp parallel do private(j,k)
        row: do i = 1,n
            col: do j = 1,n
                sum: do k = 1,n
                    C(i,j) = C(i,j) + A(i,k)*B(k,j)
                end do sum
            end do col
        end do row
        !$omp end parallel do

    end subroutine triple_loop_mul

    subroutine triple_loop_mul_alt(A,B,C,n)
        real(real64), dimension(:,:), intent(in) :: A,B
        real(real64), dimension(:,:), intent(out) :: C
        integer(int32), intent(in) :: n
        integer(int32) :: i, j, k

        !$omp parallel do private(i,k)
        col: do j = 1,n
            row: do i = 1,n
                sum: do k = 1,n
                    C(i,j) = C(i,j) + A(i,k)*B(k,j)
                end do sum
            end do row
        end do col 
        !$omp end parallel do

    end subroutine triple_loop_mul_alt

end program matrixproduct