program matrixproduct

    !> Performs a matrix-matrix multiplication using various methods
    !> and compares the performance of each. The following are used:
    !>  1. Basic triple-loop (iterative algorithm)
    !>  2. Fortran native matmul routine
    !>  3. LAPACK/BLAS library call
        
    use :: omp_lib
    use, intrinsic :: iso_fortran_env
    implicit none
    external :: dgemm   !> double-precision general matrix-matrix multiplication
    
    real(real64), allocatable, dimension(:,:) :: A, B, C
    real(real64) :: start, end, loop_time, loop_alt_time, matmul_time, blas_time
    integer(int32) :: n, start_num, step_num, stop_num
    character(10) :: temp_in
    logical :: run_loops

    call get_command_argument(1, temp_in)
    read(temp_in,'(i10)') start_num
    call get_command_argument(2, temp_in)
    read(temp_in,'(i10)') step_num
    call get_command_argument(3, temp_in)
    read(temp_in,'(i10)') stop_num
    call get_command_argument(4, temp_in)
    select case (temp_in)
        case ('yes       ')
            run_loops=.TRUE.
        case ('no        ')
            run_loops=.FALSE.
        case default
            write(*,'(A,A,A)') "WARNING: ",temp_in," not supported argument for run_loops [yes/no]"
    end select 

    write(*,'(A,i10,i10,i10)') "Running with start, step, stop ",start_num,step_num,stop_num
    
    do n = start_num, stop_num, step_num
        call prep_mats(A,B,C,n)

        if(run_loops) then
                call cpu_time(start)
                call triple_loop_mul(A,B,C,n)
                call cpu_time(end)
                loop_time = end-start
                C = 0

                call cpu_time(start)
                call triple_loop_mul_alt(A,B,C,n)
                call cpu_time(end)
                loop_alt_time = end-start
                C = 0
        endif

        call cpu_time(start)
        C = matmul(A,B)
        call cpu_time(end)
        matmul_time = end-start
        C = 0

        call cpu_time(start)
        call dgemm('N', 'N', n, n, n, 1.0_real64, A, n, B, n, 0.0_real64, C, n)
        call cpu_time(end)
        blas_time = end-start

        deallocate(A,B,C)

        if(run_loops) then
                write(*,'((i5),4(e16.8))') n, loop_time, loop_alt_time, matmul_time, blas_time
        else
                write(*,'((i5),2(e16.8))') n, matmul_time, blas_time
        endif
    enddo

contains

    subroutine prep_mats(A,B,C,n)
        real(real64), dimension(:,:), allocatable, intent(out) :: A,B,C
        integer(int32), intent(in) :: n

        allocate(A(n,n))
        allocate(B(n,n))
        allocate(C(n,n))

        call random_number(A)
        call random_number(B)
        C = 0

    end subroutine prep_mats
    
    subroutine triple_loop_mul(A,B,C,n)
        real(real64), dimension(:,:), intent(in) :: A,B
        real(real64), dimension(:,:), intent(out) :: C
        integer(int32), intent(in) :: n
        integer(int32) :: i, j, k

        !$omp parallel do private(i,j,k)
        row: do i = 1,n
            col: do j = 1,n
                sum: do k = 1,n
                    C(i,j) = C(i,j) + A(i,k)*B(k,j)
                end do sum
            end do col
        end do row
        !$omp end parallel do

    end subroutine triple_loop_mul

    subroutine triple_loop_mul_alt(A,B,C,n)
        real(real64), dimension(:,:), intent(in) :: A,B
        real(real64), dimension(:,:), intent(out) :: C
        integer(int32), intent(in) :: n
        integer(int32) :: i, j, k

        col: do j = 1,n
            row: do i = 1,n
                sum: do k = 1,n
                    C(i,j) = C(i,j) + A(i,k)*B(k,j)
                end do sum
            end do row
        end do col 

    end subroutine triple_loop_mul_alt

end program matrixproduct