summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorConnor Moore <connor@hhmoore.ca>2026-01-31 00:32:06 -0500
committerConnor Moore <connor@hhmoore.ca>2026-01-31 00:32:06 -0500
commitc896807fd9fc5b5f2cdbec9eef717f815af3779a (patch)
tree3267fd9331ea6337d327f407063f53a798a79bce
parente1babc4f71ba2e3fa3139dddb6d77f1c7b5a9683 (diff)
Added ANSI colours for error messages. Major update for the makefile, including parametrized start/stop/step and looping over optimization flags (O0/O1/O2/O3/Ofast)
-rw-r--r--Makefile76
-rw-r--r--matrixproduct.f9062
2 files changed, 85 insertions, 53 deletions
diff --git a/Makefile b/Makefile
index 32449de..629cda6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,35 +1,55 @@
-GCC=gfortran
-oneAPI=ifx
+GCC = gfortran
+oneAPI = ifx
+
+# These flags will be looped over to generate different datasets
+FLAGS := O0 O1 O2 O3 Ofast
+# Also specify a default (note the conditional assignment operation, ?=)
+OPTFLAGS ?= O3
+
+GREEN := \033[0;32m
+RESET := \033[0m
+
+S_START:= 100
+S_END := 3500
+S_STEP := 100
+
+L_END := 20000
+L_STEP := 500
all:
- mkdir -p bin/
- $(GCC) matrixproduct.f90 -o bin/$(GCC).serial.out -O3 -fexternal-blas -lopenblas -march=native
- $(oneAPI) matrixproduct.f90 -o bin/$(oneAPI).serial.out -qmkl=sequential -O3 -heap-arrays -xHost
- $(GCC) matrixproduct.f90 -o bin/$(GCC).parallel.out -O3 -fexternal-blas -lopenblas -march=native -fopenmp
- $(oneAPI) matrixproduct.f90 -o bin/$(oneAPI).parallel.out -qmkl=parallel -O3 -heap-arrays -xHost -fopenmp
-
-tests: clean all
- mkdir -p results/
-
- # Serial runs
- export OMP_NUM_THREADS=1
- ./bin/$(GCC).serial.out 100 3500 100 yes > results/$(GCC)_short_serial
- ./bin/$(oneAPI).serial.out 100 3500 100 yes > results/$(oneAPI)_short_serial
-
- ./bin/$(GCC).serial.out 3500 20000 500 no > results/$(GCC)_long_serial
- ./bin/$(oneAPI).serial.out 3500 20000 500 no > results/$(oneAPI)_long_serial
-
- # Parallel runs
- export OMP_NUM_THREADS=8
- ./bin/$(GCC).parallel.out 100 3500 100 yes > results/$(GCC)_short_parallel
- ./bin/$(oneAPI).parallel.out 100 3500 100 yes > results/$(oneAPI)_short_parallel
-
- ./bin/$(GCC).parallel.out 3500 20000 500 no > results/$(GCC)_long_parallel
- ./bin/$(oneAPI).parallel.out 3500 20000 500 no > results/$(oneAPI)_long_parallel
+ @mkdir -p bin/
+ @echo "$(GREEN)Compiling serial and parallel binaries with $(OPTFLAGS)$(RESET)"
+ $(GCC) matrixproduct.f90 -o bin/$(GCC).serial.out -$(OPTFLAGS) -fexternal-blas -lopenblas -march=native
+ $(oneAPI) matrixproduct.f90 -o bin/$(oneAPI).serial.out -qmkl=sequential -$(OPTFLAGS) -heap-arrays -xHost
+ $(GCC) matrixproduct.f90 -o bin/$(GCC).parallel.out -$(OPTFLAGS) -fexternal-blas -lopenblas -march=native -fopenmp
+ $(oneAPI) matrixproduct.f90 -o bin/$(oneAPI).parallel.out -qmkl=parallel -$(OPTFLAGS) -heap-arrays -xHost -fopenmp
+
+# Benchmark driver: for every optimisation level in $(FLAGS), rebuild the
+# binaries through the `all` target and run the serial and parallel sweeps,
+# writing one results file per compiler, run type and optimisation level.
+# The loop body is joined with line continuations into one shell command, so
+# the exported thread counts stay in effect for the runs that follow them.
+# Parallel outputs carry the same _<opt>.out suffix as the serial ones so
+# that a later optimisation level does not overwrite an earlier level's data.
+# NOTE(review): plots.gnu is not visible here - confirm it reads these names.
+tests: clean
+ @mkdir -p results/
+ @for opt in $(FLAGS); do \
+ $(MAKE) all OPTFLAGS=$$opt; \
+ echo "$(GREEN)Running serial short runs with $$opt$(RESET)"; \
+ export OMP_NUM_THREADS=1; \
+ export MKL_NUM_THREADS=1; \
+ ./bin/$(GCC).serial.out $(S_START) $(S_END) $(S_STEP) yes > results/$(GCC)_short_serial_$$opt.out; \
+ ./bin/$(oneAPI).serial.out $(S_START) $(S_END) $(S_STEP) yes > results/$(oneAPI)_short_serial_$$opt.out; \
+ echo "$(GREEN)Running serial long runs with $$opt$(RESET)"; \
+ ./bin/$(GCC).serial.out $(S_END) $(L_END) $(L_STEP) no > results/$(GCC)_long_serial_$$opt.out; \
+ ./bin/$(oneAPI).serial.out $(S_END) $(L_END) $(L_STEP) no > results/$(oneAPI)_long_serial_$$opt.out; \
+ echo "$(GREEN)Running parallel short runs with $$opt$(RESET)"; \
+ export OMP_NUM_THREADS=8; \
+ export MKL_NUM_THREADS=8; \
+ ./bin/$(GCC).parallel.out $(S_START) $(S_END) $(S_STEP) yes > results/$(GCC)_short_parallel_$$opt.out; \
+ ./bin/$(oneAPI).parallel.out $(S_START) $(S_END) $(S_STEP) yes > results/$(oneAPI)_short_parallel_$$opt.out; \
+ echo "$(GREEN)Running parallel long runs with $$opt$(RESET)"; \
+ ./bin/$(GCC).parallel.out $(S_END) $(L_END) $(L_STEP) no > results/$(GCC)_long_parallel_$$opt.out; \
+ ./bin/$(oneAPI).parallel.out $(S_END) $(L_END) $(L_STEP) no > results/$(oneAPI)_long_parallel_$$opt.out; \
+ done
plots:
gnuplot -p plots.gnu
clean:
- rm -rf bin/
- rm -rf results/
+ @echo "$(GREEN)Cleaning bin/ and results/$(RESET)"
+ @rm -rf bin/
+ @rm -rf results/
diff --git a/matrixproduct.f90 b/matrixproduct.f90
index d7b3d13..97417e4 100644
--- a/matrixproduct.f90
+++ b/matrixproduct.f90
@@ -1,31 +1,40 @@
program matrixproduct
+ !> MCSC-6030G Project 1. Connor Moore, 2026 <connor@hhmoore.ca>
!> Performs a matrix-matrix multiplication using various methods
!> and compares the performance of each. The following are used:
!> 1. Basic triple-loop (iterative algorithm)
!> 2. Fortran native matmul routine
!> 3. LAPACK/BLAS library call
+ !> The wall times for these are compared using gfortran (GNU)
+ !> and ifx (Intel). OpenMP is used for the parallel comparisons
- use omp_lib
- use, intrinsic :: iso_fortran_env
- implicit none
- external :: dgemm !> double-precision general matrix-matrix multiplication
+ use omp_lib !> For OpenMP parallel do loops
+ use, intrinsic :: iso_fortran_env !> For named datatypes, e.g. real64
+ implicit none !> Don't infer any types from names
+ external :: dgemm !> double-precision general matrix-matrix multiplication
+ !> A number of variables will be declared. This includes three square matrices (allocatable),
+ !> a start/end time holder, variables to hold time for each "technique" (loop/loop/matmul/blas),
+ !> and a character buffer and a logical flag for command-line arguments. The ANSI escape char is also defined.
real(real64), allocatable, dimension(:,:) :: A, B, C
- real(real64) :: start, end, loop_time, loop_alt_time, matmul_time, blas_time
+ real(real64) :: loop_time, loop_alt_time, matmul_time, blas_time
+ integer(int64) :: start, end, clockrate
integer(int32) :: n, start_num, step_num, stop_num
- character(10) :: temp_in
+ character(len=32) :: temp_in
+ character(len=*), parameter :: ESC_CHAR=achar(27) !> Pretty printing with ANSI escape sequences
logical :: run_loops
!> Start by taking the command-line arguments. This is useful because
!> it lets us call the program from Bash with a variable matrix size
call get_command_argument(1, temp_in)
- read(temp_in,'(i10)') start_num
+ read(temp_in,*) start_num
call get_command_argument(2, temp_in)
- read(temp_in,'(i10)') stop_num
+ read(temp_in,*) stop_num
call get_command_argument(3, temp_in)
- read(temp_in,'(i10)') step_num
+ read(temp_in,*) step_num
+
!> The last argument is a string [yes/no] that instructs the program
!> to either run with the triple-loops or ignore them completely.
call get_command_argument(4, temp_in)
@@ -35,40 +44,43 @@ program matrixproduct
case ('no')
run_loops=.FALSE.
case default
- write(*,'("WARNING:",A," is not a supported argument [yes/no], defaulting to YES")') temp_in
- run_loops=.TRUE.
+ write(error_unit,'(A)') ESC_CHAR // "[31mERROR: Unsupported input (" // trim(temp_in) // ") for loop specification [yes/no]" // ESC_CHAR // "[0m"
+ stop
+
end select
- write(*,'("Compiled with ",A,A," on ",A)') COMPILER_VERSION(), COMPILER_OPTIONS()
- write(*,'("Running with start=",I0,", stop=",I0,", step=",I0)') start_num, stop_num, step_num
+ write(*,'(A)') ESC_CHAR // "[32m" // COMPILER_VERSION() // achar(10) // COMPILER_OPTIONS() // ESC_CHAR // "[0m"
+ write(*,'(A,I0,A,I0,A,I0)') "Running with start=", start_num, ", stop=", stop_num, ", step=", step_num
+
+ call system_clock(count_rate=clockrate)
do n = start_num, stop_num, step_num
call prep_mats(A,B,C,n)
if(run_loops) then
- call cpu_time(start)
+ call system_clock(count=start)
call triple_loop_mul(A,B,C,n)
- call cpu_time(end)
- loop_time = end-start
+ call system_clock(count=end)
+ loop_time = real(end-start, real64)/real(clockrate, real64)
C = 0
- call cpu_time(start)
+ call system_clock(count=start)
call triple_loop_mul_alt(A,B,C,n)
- call cpu_time(end)
- loop_alt_time = end-start
+ call system_clock(count=end)
+ loop_alt_time = real(end-start, real64)/real(clockrate, real64)
C = 0
endif
- call cpu_time(start)
+ call system_clock(count=start)
C = matmul(A,B)
- call cpu_time(end)
- matmul_time = end-start
+ call system_clock(count=end)
+ matmul_time = real(end-start, real64)/real(clockrate, real64)
C = 0
- call cpu_time(start)
+ call system_clock(count=start)
call dgemm('N', 'N', n, n, n, 1.0_real64, A, n, B, n, 0.0_real64, C, n)
- call cpu_time(end)
- blas_time = end-start
+ call system_clock(count=end)
+ blas_time = real(end-start, real64)/real(clockrate, real64)
deallocate(A,B,C)