summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorConnor Moore <connor@hhmoore.ca>2026-02-04 23:58:05 -0500
committerConnor Moore <connor@hhmoore.ca>2026-02-04 23:58:05 -0500
commitec3c5a4856629d626236d9b12e3077c46e907b8f (patch)
tree28453220c6cf837ebd1ffd7b3528d443dc08ebb5
parente771927d4b8c5b32acf28774161c5d2b0c4f32bf (diff)
Added compiler comparison to report. Restructured plots. Added
automation for concatenating results into one big csv.
-rwxr-xr-xexport_to_csv.sh21
-rw-r--r--plots.gnu43
-rw-r--r--report/figures/plots.gnu119
-rw-r--r--report/refs.bib2
-rw-r--r--report/report.tex76
5 files changed, 210 insertions, 51 deletions
diff --git a/export_to_csv.sh b/export_to_csv.sh
new file mode 100755
index 0000000..5dd01dc
--- /dev/null
+++ b/export_to_csv.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+rm -f results/results.csv
+
+for file in $(ls results/*.out); do
+ COMPILER=$(echo $file | cut -d / -f 2 | cut -d _ -f 1)
+ RUN_SIZE=$(echo $file | cut -d / -f 2 | cut -d _ -f 2)
+ RUN_PARL=$(echo $file | cut -d / -f 2 | cut -d _ -f 3)
+ RUN_FLAG=$(echo $file | cut -d / -f 2 | cut -d _ -f 4 | cut -d . -f 1)
+
+ # Get each row of result
+ tail -n +4 $file | while read -r line; do
+ CLEANED_DATA=$(echo $line | tr -s ' ', ',')
+ if [ $RUN_SIZE == "short" ]; then
+ echo $COMPILER,$RUN_FLAG,$RUN_PARL,$CLEANED_DATA >> results/results.csv
+ else
+ CLEANED_DATA=$(echo $CLEANED_DATA | awk -F, -v OFS=, '{$1 = $1 ",-1.00000000E+00,-1.00000000E+00";print}')
+ echo $COMPILER,$RUN_FLAG,$RUN_PARL,$CLEANED_DATA >> results/results.csv
+ fi
+ done
+done
diff --git a/plots.gnu b/plots.gnu
deleted file mode 100644
index 8c728e7..0000000
--- a/plots.gnu
+++ /dev/null
@@ -1,43 +0,0 @@
-set key top left
-set format y "%.1f x 10^{%L}"
-
-short_keys = "Triple-loop-row Triple-loop-col Matmul BLAS-Dgemm"
-
-set terminal x11 0 title "GCC Runs"
-set multiplot layout 2, 1
- set title "Short runs"
- set logscale x
- set logscale y
- set grid
- set xlabel("N")
- set ylabel("Time [s]")
- plot for [i=2:5] 'results/gfortran_short' every ::2 using 1:i with linespoints title word(short_keys,i-1)
-
- set title "Long runs"
- set logscale x
- set logscale y
- set grid
- set xlabel("N")
- set ylabel("Time [s]")
- plot for [i=2:3] 'results/gfortran_long' every ::2 using 1:i with linespoints title word(short_keys,i+1)
-unset multiplot
-
-
-set terminal x11 1 title "OneAPI Runs"
-set multiplot layout 2, 1
- set title "Short runs"
- set logscale x
- set logscale y
- set grid
- set xlabel("N")
- set ylabel("Time [s]")
- plot for [i=2:5] 'results/ifx_short' every ::2 using 1:i with linespoints title word(short_keys,i-1)
-
- set title "Long runs"
- set logscale x
- set logscale y
- set grid
- set xlabel("N")
- set ylabel("Time [s]")
- plot for [i=2:3] 'results/ifx_long' every ::2 using 1:i with linespoints title word(short_keys,i+1)
-unset multiplot
diff --git a/report/figures/plots.gnu b/report/figures/plots.gnu
new file mode 100644
index 0000000..ee56998
--- /dev/null
+++ b/report/figures/plots.gnu
@@ -0,0 +1,119 @@
+# Every data file read by this script is comma-separated.
+set datafile separator ","
+
+# FIGURE 1 -- serial, O3, gfortran only: wall time versus matrix size
+
+# Uncomment for an interactive preview instead of the LaTeX output:
+#set terminal x11 0 title "Fig. 1 N-scaling" persist
+set terminal cairolatex pdf size 5in,3in
+set output "f1_n_scaling.tex"
+
+# Log-log axes with powers of ten on the y tics.
+set logscale xy 10
+set format y '$10^{%T}$'
+set xrange [70:15500]
+set offsets graph 0.05, graph 0.05, graph 0.05, graph 0.05
+
+# Labels are typeset by LaTeX, hence the math-mode markup.
+set xlabel "Size [$N \\times N$]"
+set ylabel "Wall Time [s]"
+
+set grid
+set key bottom right
+
+# Non-positive timings are mapped to an undefined value so that gnuplot
+# leaves those points out of the curve.
+positive(t) = t > 0 ? t : 1/0
+serial_csv = 'gfortran_O3_serial.csv'
+
+# Column 1 is the x value; columns 2-5 are the four timed techniques.
+plot serial_csv using 1:(positive($2)) with linespoints pt 7 ps 0.2 title "Row loop", \
+     serial_csv using 1:(positive($3)) with linespoints pt 7 ps 0.2 title "Col loop", \
+     serial_csv using 1:4 with linespoints pt 7 ps 0.2 title "Matmul", \
+     serial_csv using 1:5 with linespoints pt 7 ps 0.2 title "DGEMM"
+
+
+# SERIAL, O3, COMPILER-DEPENDENT WALL TIMES (LOOPS ONLY) (FIGURE 2)
+#
+# `index 0` / `index 1` select the first / second data set of the CSV, which
+# the titles below label as GCC and IFX respectively. Column 2 is the x value;
+# columns 3 and 4 are the row-loop and column-loop timings. Non-positive
+# timings are turned into undefined values (1/0) so they are not drawn.
+
+#set terminal x11 1 title "Fig.2 Compiler" persist
+set terminal cairolatex pdf size 5in,3in
+set output "f2_compilers_scaling.tex"
+
+set logscale y 10
+set logscale x 10
+
+set xlabel "Size [$N \\times N$]"
+set ylabel "Wall Time [s]"
+
+set format y '$10^{%T}$'
+set offsets graph 0.05, graph 0.05, graph 0.05, graph 0.05
+set key bottom right
+set grid
+set xrange[70:5000]
+
+# The last element carries no trailing ", \": a dangling continuation would
+# splice the following line into this command and leave a trailing comma.
+plot 'ifx_vs_fortran_O3_serial_results.csv' index 0 using 2:($3 > 0 ? $3 : 1/0) with linespoints pt 7 ps 0.2 title "GCC: Row loop", \
+     'ifx_vs_fortran_O3_serial_results.csv' index 0 using 2:($4 > 0 ? $4 : 1/0) with linespoints pt 7 ps 0.2 title "GCC: Col loop", \
+     'ifx_vs_fortran_O3_serial_results.csv' index 1 using 2:($3 > 0 ? $3 : 1/0) with linespoints pt 7 ps 0.2 title "IFX: Row loop", \
+     'ifx_vs_fortran_O3_serial_results.csv' index 1 using 2:($4 > 0 ? $4 : 1/0) with linespoints pt 7 ps 0.2 title "IFX: Col loop"
+
+
+# SERIAL, O3, COMPILER-DEPENDENT WALL TIMES (MATMUL/BLAS ONLY) (FIGURE 3)
+#
+# `index 0` / `index 1` select the first / second data set of the CSV, which
+# the titles below label as GCC and IFX respectively. Column 2 is the x value;
+# columns 5 and 6 are the matmul and BLAS timings. Non-positive timings are
+# turned into undefined values (1/0) so they are not drawn.
+
+#set terminal x11 2 title "Fig.3 Compiler" persist
+set terminal cairolatex pdf size 5in,3in
+set output "f3_compilers_scaling.tex"
+
+set logscale y 10
+set logscale x 10
+
+set xlabel "Size [$N \\times N$]"
+set ylabel "Wall Time [s]"
+
+set format y '$10^{%T}$'
+set offsets graph 0.05, graph 0.05, graph 0.05, graph 0.05
+set key bottom right
+set grid
+set xrange[70:15500]
+
+# The last element carries no trailing ", \": a dangling continuation would
+# splice the following line into this command and leave a trailing comma.
+plot 'ifx_vs_fortran_O3_serial_results.csv' index 0 using 2:($5 > 0 ? $5 : 1/0) with linespoints pt 7 ps 0.2 title 'GCC: \texttt{matmul}', \
+     'ifx_vs_fortran_O3_serial_results.csv' index 0 using 2:($6 > 0 ? $6 : 1/0) with linespoints pt 7 ps 0.2 title 'GCC: OpenBLAS', \
+     'ifx_vs_fortran_O3_serial_results.csv' index 1 using 2:($5 > 0 ? $5 : 1/0) with linespoints pt 7 ps 0.2 title 'IFX: \texttt{matmul}', \
+     'ifx_vs_fortran_O3_serial_results.csv' index 1 using 2:($6 > 0 ? $6 : 1/0) with linespoints pt 7 ps 0.2 title 'IFX: MKL BLAS'
+
+
+# PARALLEL, O3, COMPILER-DEPENDENT WALL TIMES. LOOPS ONLY (FIGURE 4)
+#
+# `index 0` / `index 1` select the first / second data set of the CSV, which
+# the titles below label as GCC and IFX respectively. Column 2 is the x value;
+# columns 3 and 4 are the row-loop and column-loop timings. Non-positive
+# timings are turned into undefined values (1/0) so they are not drawn.
+
+set terminal cairolatex pdf size 5in,3in
+set output "f4_compilers_scaling.tex"
+
+set logscale y 10
+set logscale x 10
+
+set xlabel "Size [$N \\times N$]"
+set ylabel "Wall Time [s]"
+
+set format y '$10^{%T}$'
+set offsets graph 0.05, graph 0.05, graph 0.05, graph 0.05
+set key bottom right
+set grid
+set xrange[70:5000]
+
+# The last element carries no trailing ", \": a dangling continuation would
+# splice the following line into this command and leave a trailing comma.
+plot 'ifx_vs_gfortran_parallel_O3_results.csv' index 0 using 2:($3 > 0 ? $3 : 1/0) with linespoints pt 7 ps 0.2 title "GCC: Row loop", \
+     'ifx_vs_gfortran_parallel_O3_results.csv' index 0 using 2:($4 > 0 ? $4 : 1/0) with linespoints pt 7 ps 0.2 title "GCC: Col loop", \
+     'ifx_vs_gfortran_parallel_O3_results.csv' index 1 using 2:($3 > 0 ? $3 : 1/0) with linespoints pt 7 ps 0.2 title "IFX: Row loop", \
+     'ifx_vs_gfortran_parallel_O3_results.csv' index 1 using 2:($4 > 0 ? $4 : 1/0) with linespoints pt 7 ps 0.2 title "IFX: Col loop"
+
+
+
+# PARALLEL, O3, COMPILER-DEPENDENT WALL TIMES. MATMUL/BLAS ONLY (FIGURE 5)
+#
+# `index 0` / `index 1` select the first / second data set of the CSV, which
+# the titles below label as GCC and IFX respectively. Column 2 is the x value;
+# columns 5 and 6 are the matmul and BLAS timings. Non-positive timings are
+# turned into undefined values (1/0) so they are not drawn.
+
+#set terminal x11 4 title "Fig.5 Compiler" persist
+set terminal cairolatex pdf size 5in,3in
+set output "f5_compilers_scaling.tex"
+
+set logscale y 10
+set logscale x 10
+
+set xlabel "Size [$N \\times N$]"
+set ylabel "Wall Time [s]"
+
+set format y '$10^{%T}$'
+set offsets graph 0.05, graph 0.05, graph 0.05, graph 0.05
+set key bottom right
+set grid
+set xrange[70:15500]
+
+# The last element carries no trailing ", \": the original left a line
+# continuation (and a trailing comma) as the very last characters of the file.
+plot 'ifx_vs_gfortran_parallel_O3_results.csv' index 0 using 2:($5 > 0 ? $5 : 1/0) with linespoints pt 7 ps 0.2 title 'GCC: \texttt{matmul}', \
+     'ifx_vs_gfortran_parallel_O3_results.csv' index 0 using 2:($6 > 0 ? $6 : 1/0) with linespoints pt 7 ps 0.2 title 'GCC: OpenBLAS', \
+     'ifx_vs_gfortran_parallel_O3_results.csv' index 1 using 2:($5 > 0 ? $5 : 1/0) with linespoints pt 7 ps 0.2 title 'IFX: \texttt{matmul}', \
+     'ifx_vs_gfortran_parallel_O3_results.csv' index 1 using 2:($6 > 0 ? $6 : 1/0) with linespoints pt 7 ps 0.2 title 'IFX: MKL BLAS'
diff --git a/report/refs.bib b/report/refs.bib
index 3320ee2..a265b5a 100644
--- a/report/refs.bib
+++ b/report/refs.bib
@@ -36,7 +36,7 @@
}
@manual{GCC2024,
address = {Boston, MA},
- author = {Stallman, Richard and {Free Software Foundation}},
+ author = {Stallman, Richard Matthew and {Free Software Foundation}},
edition = {14.2.0},
organization = {Free Software Foundation},
title = {Using the GNU Compiler Collection},
diff --git a/report/report.tex b/report/report.tex
index 0b4018b..a66de1d 100644
--- a/report/report.tex
+++ b/report/report.tex
@@ -2,9 +2,11 @@
\usepackage[margin=1in]{geometry}
\usepackage{parskip}
+\usepackage{float}
\usepackage{xcolor}
\usepackage{graphicx}
+\graphicspath{{./figures/}}
\usepackage{hyperref}
\hypersetup{colorlinks=true,
@@ -24,7 +26,7 @@
\section{Introduction}
-It is widely regarded in the scientific community that computers can be useful. One notable advantage of using computers is that they can be significantly faster in producing results when compared with humans. Although the computing power of today provides great flexibility in what can be computed, there is continuing merit in trying to be as efficient as possible in writing software. \emph{High performance computing} is computing with a focus on extreme performance \cite{Robey2021}, and is the discipline that enables large-scale simulation efforts in various areas of science and engineering \cite{Curcic2020}.
+It is widely accepted in the scientific community that computers can be useful. Although the computing power of today provides great flexibility in what can be computed, there is continuing merit in trying to be as efficient as possible in writing software. \emph{High-performance computing} is computing with a focus on extreme performance \cite{Robey2021}, and it is the discipline that enables large-scale simulation efforts in various areas of science and engineering \cite{Curcic2020}.
Matrices are commonly found in computing. Because of their widespread use, it is beneficial to implement matrix operations as efficiently as possible, especially when considering prohibitively large systems. This report surveys various ways of taking a square matrix-matrix product from the perspective of minimizing wall times. The impact of different solution techniques, compilers, compiler flags, and matrix sizes are investigated. Different solutions were implemented in Fortran and programmatically timed to achieve this.
@@ -63,14 +65,70 @@ Two different Fortran compilers were tested; \texttt{gfortran} from the GNU comp
Additional flags were specified to optimize performance. When a binary is compiled, efforts are made to keep it `portable' by avoiding specific instruction sets or niche optimizations unavailable in common hardware. Because the tests are only performed locally, both compilers were instructed to compile the highest-performance binary using all available hardware tricks. In \texttt{gfortran} this involved specifying the \texttt{march=native} flag \cite{GCC2024}. On \texttt{ifx} this was performed with the \texttt{xHost} flag \cite{Intel2025}.
\section{Comparisons and Results}
-Runs were conducted parametrically and driven by a GNU Makefile.
+Runs were conducted parametrically and driven by a GNU Makefile. Wall times were measured using the \texttt{system\_clock} subroutine, which is available in both compilers. This is preferred over calling \texttt{cpu\_time} because it natively accounts for the use of parallel workers. OpenMP also provides timing routines that may be more accurate; however, the code was compiled both with and without OpenMP, making them impractical to use.
+The runs presented in the following section are a subset of the total data collected. The full dataset is provided in Appendix \ref{apx:results} of the document. All of the data was collected in serial (1 thread) or parallel (8 threads) on an 11th-generation Intel i5-11300H CPU running at 4.40 GHz. Runs were done overnight in a bare TTY session with a minimal number of background daemons running.
\subsection{Matrix Size}
-As expected, an increase in matrix size corresponded with a pro-linear increase in wall time. This was consistent for all compilers, flags, and techniques.
-
-\subsection{Compiler}
-
-\subsubsection{Optimization Flags}
+As expected, an increase in matrix size corresponded with a non-linear increase in wall time. This was consistent for all compilers, flags, and techniques. An example dataset consisting of \texttt{gfortran} runs with \texttt{O3} optimization is presented in Figure \ref{fig:n-scaling}. No triple-loop runs were conducted for values larger than $N=3500$, as they became prohibitively slow.
+
+\begin{figure}[H]
+ \centering
+ \def\svdwidth{5in}
+ \hspace*{-1.3cm}
+ \input{figures/f1_n_scaling.tex}
+ \caption{Size vs. Wall Time Scaling with \texttt{gfortran} serial \texttt{O3}.}
+ \label{fig:n-scaling}
+\end{figure}
+
+\subsection{Compilers}
+
+The compiler used had a considerable impact on the wall time of the computation. Initial comparisons were done using only serial runs for both \texttt{gfortran} and \texttt{ifx}, which are shown below in Figure \ref{fig:comp-scaling-1}.
+
+\begin{figure}[H]
+ \centering
+ \def\svdwidth{5in}
+ \hspace*{-1.3cm}
+ \input{figures/f2_compilers_scaling.tex}
+ \caption{Short Size vs. Wall Time Scaling with serial \texttt{O3}.}
+ \label{fig:comp-scaling-1}
+\end{figure}
+
+The triple-loop runs showed a gap of roughly two orders of magnitude between the GCC and OneAPI results, with OneAPI being faster. Interestingly, GCC was much closer to the OneAPI performance when using OpenBLAS, and faster when using \texttt{matmul}, as shown in Figure \ref{fig:comp-scaling-2}.
+
+\begin{figure}[H]
+ \centering
+ \def\svdwidth{5in}
+ \hspace*{-1.3cm}
+ \input{figures/f3_compilers_scaling.tex}
+ \caption{Long Size vs. Wall Time Scaling with serial \texttt{O3}.}
+ \label{fig:comp-scaling-2}
+\end{figure}
+
+A comparison was also made to see if the compilers provided similar support for parallelism using OpenMP. This was implemented by hand for the triple-loop techniques, but comes by default for OpenBLAS and MKL BLAS when compiled with the \texttt{fopenmp} flag. Eight threads were used for all runs, which was verified using \texttt{htop}. The short runs are presented below in Figure \ref{fig:comp-scaling-3}.
+
+\begin{figure}[H]
+ \centering
+ \def\svdwidth{5in}
+ \hspace*{-1.3cm}
+ \input{figures/f4_compilers_scaling.tex}
+ \caption{Short Size vs. Wall Time Scaling with parallel \texttt{O3}.}
+ \label{fig:comp-scaling-3}
+\end{figure}
+
+The scaling is markedly non-linear in the log-log plot when compared with Figure \ref{fig:comp-scaling-1}. It is likely that the balance between the overhead of setting up 8 workers and computing the product is suboptimal for such small values of $N$. That being said, there is still a clear trend of OneAPI offering superior performance for all presented $N$. The long runs show a different trend and are presented in Figure \ref{fig:comp-scaling-4}.
+
+\begin{figure}[H]
+ \centering
+ \def\svdwidth{5in}
+ \hspace*{-1.3cm}
+ \input{figures/f5_compilers_scaling.tex}
+ \caption{Long Size vs. Wall Time Scaling with parallel \texttt{O3}.}
+ \label{fig:comp-scaling-4}
+\end{figure}
+
+When performing calls to \texttt{matmul} and BLAS, GCC has higher performance until the matrix size approaches roughly $700\times700$. It is likely that part of this can be attributed to statistical noise, as the time scale is on the order of milliseconds and the \texttt{system\_clock} subroutine may be biased between the two compilers. GCC and OneAPI share similar performance for all sizes up to $12000\times12000$, with the exception of \texttt{ifx}'s \texttt{matmul}, which is consistently slower.
+
+\subsection{Compiler Flags}
\subsection{Parallelism}
@@ -91,4 +149,8 @@ The git repository is hosted at \url{https://git.hhmoore.ca/mcsc-6030g/p1-matrix
\item \texttt{plots.gnu} is a script that generates plots for the report using Gnuplot. The makefile target for plots can be run with \texttt{make plots}. This will produce plots in hybrid \texttt{.pdf} and \texttt{.tex} formats that embed cleanly in the \LaTeX document.
\end{description}
+
+\section{Tabular Results}
+\label{apx:results}
+
\end{document}