diff options
| author | Connor Moore <connor@hhmoore.ca> | 2026-01-31 00:32:06 -0500 |
|---|---|---|
| committer | Connor Moore <connor@hhmoore.ca> | 2026-01-31 00:32:06 -0500 |
| commit | c896807fd9fc5b5f2cdbec9eef717f815af3779a (patch) | |
| tree | 3267fd9331ea6337d327f407063f53a798a79bce | |
| parent | e1babc4f71ba2e3fa3139dddb6d77f1c7b5a9683 (diff) | |
Added ANSI colours for error messages. Major update for the makefile, including parametrized start/stop/step and looping over optimization flags (O0/O1/O2/O3/Ofast)
| -rw-r--r-- | Makefile | 76 | ||||
| -rw-r--r-- | matrixproduct.f90 | 62 |
2 files changed, 85 insertions, 53 deletions
@@ -1,35 +1,55 @@ -GCC=gfortran -oneAPI=ifx +GCC = gfortran +oneAPI = ifx + +# These flags will be looped over to generate different datasets +FLAGS := O0 O1 O2 O3 Ofast +# Also specify a default (note the conditional assignment operation, ?=) +OPTFLAGS ?= O3 + +GREEN := \033[0;32m +RESET := \033[0m + +S_START:= 100 +S_END := 3500 +S_STEP := 100 + +L_END := 20000 +L_STEP := 500 all: - mkdir -p bin/ - $(GCC) matrixproduct.f90 -o bin/$(GCC).serial.out -O3 -fexternal-blas -lopenblas -march=native - $(oneAPI) matrixproduct.f90 -o bin/$(oneAPI).serial.out -qmkl=sequential -O3 -heap-arrays -xHost - $(GCC) matrixproduct.f90 -o bin/$(GCC).parallel.out -O3 -fexternal-blas -lopenblas -march=native -fopenmp - $(oneAPI) matrixproduct.f90 -o bin/$(oneAPI).parallel.out -qmkl=parallel -O3 -heap-arrays -xHost -fopenmp - -tests: clean all - mkdir -p results/ - - # Serial runs - export OMP_NUM_THREADS=1 - ./bin/$(GCC).serial.out 100 3500 100 yes > results/$(GCC)_short_serial - ./bin/$(oneAPI).serial.out 100 3500 100 yes > results/$(oneAPI)_short_serial - - ./bin/$(GCC).serial.out 3500 20000 500 no > results/$(GCC)_long_serial - ./bin/$(oneAPI).serial.out 3500 20000 500 no > results/$(oneAPI)_long_serial - - # Parallel runs - export OMP_NUM_THREADS=8 - ./bin/$(GCC).parallel.out 100 3500 100 yes > results/$(GCC)_short_parallel - ./bin/$(oneAPI).parallel.out 100 3500 100 yes > results/$(oneAPI)_short_parallel - - ./bin/$(GCC).parallel.out 3500 20000 500 no > results/$(GCC)_long_parallel - ./bin/$(oneAPI).parallel.out 3500 20000 500 no > results/$(oneAPI)_long_parallel + @mkdir -p bin/ + @echo "$(GREEN)Compiling serial and parallel binaries with $(OPTFLAGS)$(RESET)" + $(GCC) matrixproduct.f90 -o bin/$(GCC).serial.out -$(OPTFLAGS) -fexternal-blas -lopenblas -march=native + $(oneAPI) matrixproduct.f90 -o bin/$(oneAPI).serial.out -qmkl=sequential -$(OPTFLAGS) -heap-arrays -xHost + $(GCC) matrixproduct.f90 -o bin/$(GCC).parallel.out -$(OPTFLAGS) -fexternal-blas -lopenblas -march=native 
-fopenmp + $(oneAPI) matrixproduct.f90 -o bin/$(oneAPI).parallel.out -qmkl=parallel -$(OPTFLAGS) -heap-arrays -xHost -fopenmp + +tests: clean + @mkdir -p results/ + @for opt in $(FLAGS); do \ + $(MAKE) all OPTFLAGS=$$opt; \ + echo "$(GREEN)Running serial short runs with $$opt$(RESET)"; \ + export OMP_NUM_THREADS=1; \ + export MKL_NUM_THREADS=1; \ + ./bin/$(GCC).serial.out $(S_START) $(S_END) $(S_STEP) yes > results/$(GCC)_short_serial_$$opt.out; \ + ./bin/$(oneAPI).serial.out $(S_START) $(S_END) $(S_STEP) yes > results/$(oneAPI)_short_serial_$$opt.out; \ + echo "$(GREEN)Running serial long runs with $$opt$(RESET)"; \ + ./bin/$(GCC).serial.out $(S_END) $(L_END) $(L_STEP) no > results/$(GCC)_long_serial_$$opt.out; \ + ./bin/$(oneAPI).serial.out $(S_END) $(L_END) $(L_STEP) no > results/$(oneAPI)_long_serial_$$opt.out; \ + echo "$(GREEN)Running parallel short runs with $$opt$(RESET)"; \ + export OMP_NUM_THREADS=8; \ + export MKL_NUM_THREADS=8; \ + ./bin/$(GCC).parallel.out $(S_START) $(S_END) $(S_STEP) yes > results/$(GCC)_short_parallel; \ + ./bin/$(oneAPI).parallel.out $(S_START) $(S_END) $(S_STEP) yes > results/$(oneAPI)_short_parallel; \ + echo "$(GREEN)Running parallel long runs with $$opt$(RESET)"; \ + ./bin/$(GCC).parallel.out $(S_END) $(L_END) $(L_STEP) no > results/$(GCC)_long_parallel; \ + ./bin/$(oneAPI).parallel.out $(S_END) $(L_END) $(L_STEP) no > results/$(oneAPI)_long_parallel; \ + done plots: gnuplot -p plots.gnu clean: - rm -rf bin/ - rm -rf results/ + @echo "$(GREEN)Cleaning bin/ and results/$(RESET)" + @rm -rf bin/ + @rm -rf results/ diff --git a/matrixproduct.f90 b/matrixproduct.f90 index d7b3d13..97417e4 100644 --- a/matrixproduct.f90 +++ b/matrixproduct.f90 @@ -1,31 +1,40 @@ program matrixproduct + !> MCSC-6030G Project 1. Connor Moore, 2026 <connor@hhmoore.ca> !> Performs a matrix-matrix multiplication using various methods !> and compares the performance of each. The following are used: !> 1. Basic triple-loop (iterative algorithm) !> 2. 
Fortran native matmul routine !> 3. LAPACK/BLAS library call + !> The wall times for these are compared with gfortran from gnu + !> and ifx from Intel. OpenMP is used for parallel comparisons - use omp_lib - use, intrinsic :: iso_fortran_env - implicit none - external :: dgemm !> double-precision general matrix-matrix multiplication + use omp_lib !> For OpenMP parallel do loops + use, intrinsic :: iso_fortran_env !> For named datatypes, e.g. real64 + implicit none !> Don't infer any types from names + external :: dgemm !> double-precision general matrix-matrix multiplication + !> A number of variables will be declared. This includes three square matrices (allocatable), + !> a start/end time holder, variables to hold time for each "technique" (loop/loop/matmul/blas), + !> and a (char and logical) for command line arguments. The ANSI escape char is also defined. real(real64), allocatable, dimension(:,:) :: A, B, C - real(real64) :: start, end, loop_time, loop_alt_time, matmul_time, blas_time + real(real64) :: loop_time, loop_alt_time, matmul_time, blas_time + integer(int64) :: start, end, clockrate integer(int32) :: n, start_num, step_num, stop_num - character(10) :: temp_in + character(len=32) :: temp_in + character(len=*), parameter :: ESC_CHAR=achar(27) !> Pretty printing with ANSI escape sequences logical :: run_loops !> Start by taking the command-line arguments. This is useful because !> it lets us call the program from Bash with a variable matrix size call get_command_argument(1, temp_in) - read(temp_in,'(i10)') start_num + read(temp_in,*) start_num call get_command_argument(2, temp_in) - read(temp_in,'(i10)') stop_num + read(temp_in,*) stop_num call get_command_argument(3, temp_in) - read(temp_in,'(i10)') step_num + read(temp_in,*) step_num + !> The last argument is a string [yes/no] that instructs the program !> to either run with the triple-loops or ignore them completely. 
call get_command_argument(4, temp_in) @@ -35,40 +44,43 @@ program matrixproduct case ('no') run_loops=.FALSE. case default - write(*,'("WARNING:",A," is not a supported argument [yes/no], defaulting to YES")') temp_in - run_loops=.TRUE. + write(error_unit,'(A)') ESC_CHAR // "[31mERROR: Unsupported input (" // trim(temp_in) // ") for loop specification [yes/no]" // ESC_CHAR // "[0m" + stop + end select - write(*,'("Compiled with ",A,A," on ",A)') COMPILER_VERSION(), COMPILER_OPTIONS() - write(*,'("Running with start=",I0,", stop=",I0,", step=",I0)') start_num, stop_num, step_num + write(*,'(A)') ESC_CHAR // "[32m" // COMPILER_VERSION() // achar(10) // COMPILER_OPTIONS() // ESC_CHAR // "[0m" + write(*,'(A,I0,A,I0,A,I0)') "Running with start=", start_num, ", stop=", stop_num, ", step=", step_num + + call system_clock(count_rate=clockrate) do n = start_num, stop_num, step_num call prep_mats(A,B,C,n) if(run_loops) then - call cpu_time(start) + call system_clock(count=start) call triple_loop_mul(A,B,C,n) - call cpu_time(end) - loop_time = end-start + call system_clock(count=end) + loop_time = real(end-start, real64)/real(clockrate, real64) C = 0 - call cpu_time(start) + call system_clock(count=start) call triple_loop_mul_alt(A,B,C,n) - call cpu_time(end) - loop_alt_time = end-start + call system_clock(count=end) + loop_alt_time = real(end-start, real64)/real(clockrate, real64) C = 0 endif - call cpu_time(start) + call system_clock(count=start) C = matmul(A,B) - call cpu_time(end) - matmul_time = end-start + call system_clock(count=end) + matmul_time = real(end-start, real64)/real(clockrate, real64) C = 0 - call cpu_time(start) + call system_clock(count=start) call dgemm('N', 'N', n, n, n, 1.0_real64, A, n, B, n, 0.0_real64, C, n) - call cpu_time(end) - blas_time = end-start + call system_clock(count=end) + blas_time = real(end-start, real64)/real(clockrate, real64) deallocate(A,B,C) |
