diff options
| author | Connor Moore <connor@hhmoore.ca> | 2026-02-13 20:24:50 -0500 |
|---|---|---|
| committer | Connor Moore <connor@hhmoore.ca> | 2026-02-13 20:28:21 -0500 |
| commit | 5c8bacf899769bf56c553abe6c990ba487dcee67 (patch) | |
| tree | 3147505b71ecb46aa087b11980e47235cd3d9c48 | |
| parent | 7001c889be0b8be7733af31d48caad6c63be4266 (diff) | |
| -rw-r--r-- | report/figures/f1_n_scaling.pdf | bin | 9270 -> 9270 bytes | |||
| -rw-r--r-- | report/figures/f2_compilers_scaling.pdf | bin | 7880 -> 7880 bytes | |||
| -rw-r--r-- | report/figures/f3_compilers_scaling.pdf | bin | 10329 -> 10329 bytes | |||
| -rw-r--r-- | report/figures/f4_compilers_scaling.pdf | bin | 7905 -> 7905 bytes | |||
| -rw-r--r-- | report/figures/f5_compilers_scaling.pdf | bin | 10571 -> 10571 bytes | |||
| -rw-r--r-- | report/figures/f6_optflags_scaling.pdf | bin | 7954 -> 7954 bytes | |||
| -rw-r--r-- | report/figures/f7_optflags_scaling.pdf | bin | 7835 -> 7835 bytes | |||
| -rw-r--r-- | report/figures/f8_parallel.csv | 52 | ||||
| -rw-r--r-- | report/figures/f8_parallel_speedup.pdf | bin | 0 -> 8745 bytes | |||
| -rw-r--r-- | report/figures/f8_parallel_speedup.tex | 132 | ||||
| -rw-r--r-- | report/figures/f9_parallel.csv | 35 | ||||
| -rw-r--r-- | report/figures/f9_parallel_speedup.pdf | bin | 0 -> 6758 bytes | |||
| -rw-r--r-- | report/figures/f9_parallel_speedup.tex | 138 | ||||
| -rw-r--r-- | report/figures/plots.gnu | 26 | ||||
| -rw-r--r-- | report/refs.bib | 22 | ||||
| -rw-r--r-- | report/report.pdf | bin | 275490 -> 297193 bytes | |||
| -rw-r--r-- | report/report.tex | 32 |
17 files changed, 434 insertions, 3 deletions
diff --git a/report/figures/f1_n_scaling.pdf b/report/figures/f1_n_scaling.pdf Binary files differindex e16ad62..2ebd95f 100644 --- a/report/figures/f1_n_scaling.pdf +++ b/report/figures/f1_n_scaling.pdf diff --git a/report/figures/f2_compilers_scaling.pdf b/report/figures/f2_compilers_scaling.pdf Binary files differindex bedf2a5..6d6b9f5 100644 --- a/report/figures/f2_compilers_scaling.pdf +++ b/report/figures/f2_compilers_scaling.pdf diff --git a/report/figures/f3_compilers_scaling.pdf b/report/figures/f3_compilers_scaling.pdf Binary files differindex 35d958b..bb55860 100644 --- a/report/figures/f3_compilers_scaling.pdf +++ b/report/figures/f3_compilers_scaling.pdf diff --git a/report/figures/f4_compilers_scaling.pdf b/report/figures/f4_compilers_scaling.pdf Binary files differindex 6ea6095..46a1af1 100644 --- a/report/figures/f4_compilers_scaling.pdf +++ b/report/figures/f4_compilers_scaling.pdf diff --git a/report/figures/f5_compilers_scaling.pdf b/report/figures/f5_compilers_scaling.pdf Binary files differindex af6f1df..00a331d 100644 --- a/report/figures/f5_compilers_scaling.pdf +++ b/report/figures/f5_compilers_scaling.pdf diff --git a/report/figures/f6_optflags_scaling.pdf b/report/figures/f6_optflags_scaling.pdf Binary files differindex d90f1ac..fd4e5c9 100644 --- a/report/figures/f6_optflags_scaling.pdf +++ b/report/figures/f6_optflags_scaling.pdf diff --git a/report/figures/f7_optflags_scaling.pdf b/report/figures/f7_optflags_scaling.pdf Binary files differindex 8b608b7..b78dd26 100644 --- a/report/figures/f7_optflags_scaling.pdf +++ b/report/figures/f7_optflags_scaling.pdf diff --git a/report/figures/f8_parallel.csv b/report/figures/f8_parallel.csv new file mode 100644 index 0000000..6e7ef71 --- /dev/null +++ b/report/figures/f8_parallel.csv @@ -0,0 +1,52 @@ +100,4.0876e-05,4.5092e-05,-10.3141 +200,0.000361261,0.000145145,59.8227 +300,0.00119115,0.00037482,68.5329 +400,0.00287274,0.000869634,69.7281 +500,0.00531736,0.00187232,64.7885 
+600,0.00950288,0.00391997,58.7497 +700,0.0149877,0.00771248,48.5413 +800,0.0219227,0.01357,38.1007 +900,0.0318063,0.0193472,39.1718 +1000,0.0447146,0.0218951,51.0337 +1100,0.059236,0.0292933,50.5481 +1200,0.0798487,0.0404661,49.3215 +1300,0.100643,0.045211,55.0778 +1400,0.125711,0.0539213,57.1069 +1500,0.158723,0.0714096,55.0099 +1600,0.180715,0.0867447,51.9992 +1700,0.219032,0.103163,52.9005 +1800,0.251501,0.133704,46.8376 +1900,0.299972,0.154439,48.5155 +2000,0.349973,0.175362,49.8927 +2100,0.401263,0.211944,47.1808 +2200,0.467413,0.276247,40.8987 +2300,0.527726,0.298361,43.4629 +2400,0.599157,0.375264,37.368 +2500,0.689197,0.393948,42.8396 +2600,0.787609,0.449948,42.8717 +2700,0.859734,0.560875,34.7618 +2800,0.950026,0.600975,36.7412 +2900,1.0653,0.690089,35.2212 +3000,1.20447,0.762004,36.7353 +3100,1.30617,0.932075,28.6406 +3200,1.44697,0.99685,31.1078 +3300,1.61321,1.13818,29.4463 +3400,1.69799,1.18396,30.2729 +3500,1.88768,1.34178,28.9191 +4000,2.80924,2.02962,27.752 +4500,3.99312,2.93771,26.4307 +5000,5.53452,4.03783,27.0428 +5500,7.44544,5.40327,27.4285 +6000,9.65941,6.75078,30.1119 +6500,12.091,8.36688,30.8008 +7000,15.0031,10.3283,31.1589 +7500,18.9521,12.5924,33.5567 +8000,22.6041,16.4768,27.107 +8500,26.8982,18.8672,29.857 +9000,32.2098,21.5434,33.1154 +9500,37.634,27.3042,27.4481 +10000,43.6043,29.9856,31.2325 +10500,50.7758,37.0526,27.027 +11000,58.611,39.7522,32.1762 +11500,66.3734,45.7521,31.0686 +12000,75.2866,52.7216,29.9721 diff --git a/report/figures/f8_parallel_speedup.pdf b/report/figures/f8_parallel_speedup.pdf Binary files differnew file mode 100644 index 0000000..5312407 --- /dev/null +++ b/report/figures/f8_parallel_speedup.pdf diff --git a/report/figures/f8_parallel_speedup.tex b/report/figures/f8_parallel_speedup.tex new file mode 100644 index 0000000..54e601c --- /dev/null +++ b/report/figures/f8_parallel_speedup.tex @@ -0,0 +1,132 @@ +% GNUPLOT: LaTeX picture with Postscript +\begingroup + \makeatletter + \providecommand\color[2][]{% 
+ \GenericError{(gnuplot) \space\space\space\@spaces}{% + Package color not loaded in conjunction with + terminal option `colourtext'% + }{See the gnuplot documentation for explanation.% + }{Either use 'blacktext' in gnuplot or load the package + color.sty in LaTeX.}% + \renewcommand\color[2][]{}% + }% + \providecommand\includegraphics[2][]{% + \GenericError{(gnuplot) \space\space\space\@spaces}{% + Package graphicx or graphics not loaded% + }{See the gnuplot documentation for explanation.% + }{The gnuplot epslatex terminal needs graphicx.sty or graphics.sty.}% + \renewcommand\includegraphics[2][]{}% + }% + \providecommand\rotatebox[2]{#2}% + \@ifundefined{ifGPcolor}{% + \newif\ifGPcolor + \GPcolortrue + }{}% + \@ifundefined{ifGPblacktext}{% + \newif\ifGPblacktext + \GPblacktexttrue + }{}% + % define a \g@addto@macro without @ in the name: + \let\gplgaddtomacro\g@addto@macro + % define empty templates for all commands taking text: + \gdef\gplbacktext{}% + \gdef\gplfronttext{}% + \makeatother + \ifGPblacktext + % no textcolor at all + \def\colorrgb#1{}% + \def\colorgray#1{}% + \else + % gray or color? 
+ \ifGPcolor + \def\colorrgb#1{\color[rgb]{#1}}% + \def\colorgray#1{\color[gray]{#1}}% + \expandafter\def\csname LTw\endcsname{\color{white}}% + \expandafter\def\csname LTb\endcsname{\color{black}}% + \expandafter\def\csname LTa\endcsname{\color{black}}% + \expandafter\def\csname LT0\endcsname{\color[rgb]{1,0,0}}% + \expandafter\def\csname LT1\endcsname{\color[rgb]{0,1,0}}% + \expandafter\def\csname LT2\endcsname{\color[rgb]{0,0,1}}% + \expandafter\def\csname LT3\endcsname{\color[rgb]{1,0,1}}% + \expandafter\def\csname LT4\endcsname{\color[rgb]{0,1,1}}% + \expandafter\def\csname LT5\endcsname{\color[rgb]{1,1,0}}% + \expandafter\def\csname LT6\endcsname{\color[rgb]{0,0,0}}% + \expandafter\def\csname LT7\endcsname{\color[rgb]{1,0.3,0}}% + \expandafter\def\csname LT8\endcsname{\color[rgb]{0.5,0.5,0.5}}% + \else + % gray + \def\colorrgb#1{\color{black}}% + \def\colorgray#1{\color[gray]{#1}}% + \expandafter\def\csname LTw\endcsname{\color{white}}% + \expandafter\def\csname LTb\endcsname{\color{black}}% + \expandafter\def\csname LTa\endcsname{\color{black}}% + \expandafter\def\csname LT0\endcsname{\color{black}}% + \expandafter\def\csname LT1\endcsname{\color{black}}% + \expandafter\def\csname LT2\endcsname{\color{black}}% + \expandafter\def\csname LT3\endcsname{\color{black}}% + \expandafter\def\csname LT4\endcsname{\color{black}}% + \expandafter\def\csname LT5\endcsname{\color{black}}% + \expandafter\def\csname LT6\endcsname{\color{black}}% + \expandafter\def\csname LT7\endcsname{\color{black}}% + \expandafter\def\csname LT8\endcsname{\color{black}}% + \fi + \fi + \setlength{\unitlength}{0.0500bp}% + \ifx\gptboxheight\undefined% + \newlength{\gptboxheight}% + \newlength{\gptboxwidth}% + \newsavebox{\gptboxtext}% + \fi% + \setlength{\fboxrule}{0.5pt}% + \setlength{\fboxsep}{1pt}% + \definecolor{tbcol}{rgb}{1,1,1}% +\begin{picture}(7200.00,3600.00)% + \gplgaddtomacro\gplbacktext{% + \csname LTb\endcsname%% + \put(820,865){\makebox(0,0)[r]{\strut{}$10^{-6}$}}% + \csname 
LTb\endcsname%% + \put(820,1286){\makebox(0,0)[r]{\strut{}$10^{-4}$}}% + \csname LTb\endcsname%% + \put(820,1708){\makebox(0,0)[r]{\strut{}$10^{-2}$}}% + \csname LTb\endcsname%% + \put(820,2129){\makebox(0,0)[r]{\strut{}$10^{0}$}}% + \csname LTb\endcsname%% + \put(820,2550){\makebox(0,0)[r]{\strut{}$10^{2}$}}% + \csname LTb\endcsname%% + \put(1272,450){\makebox(0,0){\strut{}$100$}}% + \csname LTb\endcsname%% + \put(3467,450){\makebox(0,0){\strut{}$1000$}}% + \csname LTb\endcsname%% + \put(5661,450){\makebox(0,0){\strut{}$10000$}}% + \csname LTb\endcsname%% + \put(6191,751){\makebox(0,0)[l]{\strut{}$-20$}}% + \csname LTb\endcsname%% + \put(6191,1133){\makebox(0,0)[l]{\strut{}$0$}}% + \csname LTb\endcsname%% + \put(6191,1516){\makebox(0,0)[l]{\strut{}$20$}}% + \csname LTb\endcsname%% + \put(6191,1899){\makebox(0,0)[l]{\strut{}$40$}}% + \csname LTb\endcsname%% + \put(6191,2282){\makebox(0,0)[l]{\strut{}$60$}}% + \csname LTb\endcsname%% + \put(6191,2665){\makebox(0,0)[l]{\strut{}$80$}}% + }% + \gplgaddtomacro\gplfronttext{% + \csname LTb\endcsname%% + \put(5593,3395){\makebox(0,0)[r]{\strut{}\texttt{gfortran}: Serial OpenBLAS}}% + \csname LTb\endcsname%% + \put(5593,3190){\makebox(0,0)[r]{\strut{}\texttt{gfortran}: Parallel OpenBLAS}}% + \csname LTb\endcsname%% + \put(5593,2986){\makebox(0,0)[r]{\strut{}Percent Difference (From Parallel to Serial)}}% + \csname LTb\endcsname%% + \put(186,1708){\rotatebox{-270.00}{\makebox(0,0){\strut{}Wall Time [s]}}}% + \csname LTb\endcsname%% + \put(6741,1708){\rotatebox{-270.00}{\makebox(0,0){\strut{}Difference [\%]}}}% + \csname LTb\endcsname%% + \put(3505,143){\makebox(0,0){\strut{}Size [$N \times N$]}}% + }% + \gplbacktext + \put(0,0){\includegraphics[width={360.00bp},height={180.00bp}]{f8_parallel_speedup}}% + \gplfronttext + \end{picture}% +\endgroup diff --git a/report/figures/f9_parallel.csv b/report/figures/f9_parallel.csv new file mode 100644 index 0000000..ff39900 --- /dev/null +++ b/report/figures/f9_parallel.csv @@ -0,0 
+1,35 @@ +100,0.000309174,0.00859484,-96.4028 +200,0.0035476,0.00494413,-28.2462 +300,0.0127426,0.0074686,70.6156 +400,0.0428153,0.0142021,201.472 +500,0.0611114,0.0305357,100.131 +600,0.108478,0.0589314,84.075 +700,0.205817,0.114039,80.4795 +800,0.528887,0.232331,127.644 +900,0.546812,0.468208,16.7883 +1000,0.78722,0.975576,-19.3072 +1100,1.07064,1.30392,-17.8907 +1200,1.45521,1.67568,-13.157 +1300,1.91523,3.07993,-37.8158 +1400,2.26769,3.36566,-32.6227 +1500,2.82255,5.85835,-51.8201 +1600,4.46515,7.13456,-37.4152 +1700,5.50338,8.98528,-38.7512 +1800,8.26159,10.9244,-24.3749 +1900,8.34089,13.8653,-39.8434 +2000,20.3664,15.7666,29.1743 +2100,24.2152,18.1782,33.2101 +2200,33.078,21.3501,54.9314 +2300,42.6655,25.0572,70.2724 +2400,54.6176,27.6821,97.303 +2500,66.1092,33.3118,98.4558 +2600,79.1405,36.582,116.337 +2700,93.8719,41.6073,125.614 +2800,110.335,48.6649,126.724 +2900,124.424,55.1282,125.699 +3000,140.337,59.958,134.059 +3100,159.299,67.0336,137.641 +3200,204.795,91.2744,124.373 +3300,197.251,82.0796,140.317 +3400,216.409,89.5602,141.635 +3500,242.147,99.0295,144.52 diff --git a/report/figures/f9_parallel_speedup.pdf b/report/figures/f9_parallel_speedup.pdf Binary files differnew file mode 100644 index 0000000..4a6b049 --- /dev/null +++ b/report/figures/f9_parallel_speedup.pdf diff --git a/report/figures/f9_parallel_speedup.tex b/report/figures/f9_parallel_speedup.tex new file mode 100644 index 0000000..cf76f76 --- /dev/null +++ b/report/figures/f9_parallel_speedup.tex @@ -0,0 +1,138 @@ +% GNUPLOT: LaTeX picture with Postscript +\begingroup + \makeatletter + \providecommand\color[2][]{% + \GenericError{(gnuplot) \space\space\space\@spaces}{% + Package color not loaded in conjunction with + terminal option `colourtext'% + }{See the gnuplot documentation for explanation.% + }{Either use 'blacktext' in gnuplot or load the package + color.sty in LaTeX.}% + \renewcommand\color[2][]{}% + }% + \providecommand\includegraphics[2][]{% + \GenericError{(gnuplot) 
\space\space\space\@spaces}{% + Package graphicx or graphics not loaded% + }{See the gnuplot documentation for explanation.% + }{The gnuplot epslatex terminal needs graphicx.sty or graphics.sty.}% + \renewcommand\includegraphics[2][]{}% + }% + \providecommand\rotatebox[2]{#2}% + \@ifundefined{ifGPcolor}{% + \newif\ifGPcolor + \GPcolortrue + }{}% + \@ifundefined{ifGPblacktext}{% + \newif\ifGPblacktext + \GPblacktexttrue + }{}% + % define a \g@addto@macro without @ in the name: + \let\gplgaddtomacro\g@addto@macro + % define empty templates for all commands taking text: + \gdef\gplbacktext{}% + \gdef\gplfronttext{}% + \makeatother + \ifGPblacktext + % no textcolor at all + \def\colorrgb#1{}% + \def\colorgray#1{}% + \else + % gray or color? + \ifGPcolor + \def\colorrgb#1{\color[rgb]{#1}}% + \def\colorgray#1{\color[gray]{#1}}% + \expandafter\def\csname LTw\endcsname{\color{white}}% + \expandafter\def\csname LTb\endcsname{\color{black}}% + \expandafter\def\csname LTa\endcsname{\color{black}}% + \expandafter\def\csname LT0\endcsname{\color[rgb]{1,0,0}}% + \expandafter\def\csname LT1\endcsname{\color[rgb]{0,1,0}}% + \expandafter\def\csname LT2\endcsname{\color[rgb]{0,0,1}}% + \expandafter\def\csname LT3\endcsname{\color[rgb]{1,0,1}}% + \expandafter\def\csname LT4\endcsname{\color[rgb]{0,1,1}}% + \expandafter\def\csname LT5\endcsname{\color[rgb]{1,1,0}}% + \expandafter\def\csname LT6\endcsname{\color[rgb]{0,0,0}}% + \expandafter\def\csname LT7\endcsname{\color[rgb]{1,0.3,0}}% + \expandafter\def\csname LT8\endcsname{\color[rgb]{0.5,0.5,0.5}}% + \else + % gray + \def\colorrgb#1{\color{black}}% + \def\colorgray#1{\color[gray]{#1}}% + \expandafter\def\csname LTw\endcsname{\color{white}}% + \expandafter\def\csname LTb\endcsname{\color{black}}% + \expandafter\def\csname LTa\endcsname{\color{black}}% + \expandafter\def\csname LT0\endcsname{\color{black}}% + \expandafter\def\csname LT1\endcsname{\color{black}}% + \expandafter\def\csname LT2\endcsname{\color{black}}% + 
\expandafter\def\csname LT3\endcsname{\color{black}}% + \expandafter\def\csname LT4\endcsname{\color{black}}% + \expandafter\def\csname LT5\endcsname{\color{black}}% + \expandafter\def\csname LT6\endcsname{\color{black}}% + \expandafter\def\csname LT7\endcsname{\color{black}}% + \expandafter\def\csname LT8\endcsname{\color{black}}% + \fi + \fi + \setlength{\unitlength}{0.0500bp}% + \ifx\gptboxheight\undefined% + \newlength{\gptboxheight}% + \newlength{\gptboxwidth}% + \newsavebox{\gptboxtext}% + \fi% + \setlength{\fboxrule}{0.5pt}% + \setlength{\fboxsep}{1pt}% + \definecolor{tbcol}{rgb}{1,1,1}% +\begin{picture}(7200.00,3600.00)% + \gplgaddtomacro\gplbacktext{% + \csname LTb\endcsname%% + \put(820,846){\makebox(0,0)[r]{\strut{}$10^{-6}$}}% + \csname LTb\endcsname%% + \put(820,1229){\makebox(0,0)[r]{\strut{}$10^{-4}$}}% + \csname LTb\endcsname%% + \put(820,1612){\makebox(0,0)[r]{\strut{}$10^{-2}$}}% + \csname LTb\endcsname%% + \put(820,1995){\makebox(0,0)[r]{\strut{}$10^{0}$}}% + \csname LTb\endcsname%% + \put(820,2377){\makebox(0,0)[r]{\strut{}$10^{2}$}}% + \csname LTb\endcsname%% + \put(820,2760){\makebox(0,0)[r]{\strut{}$10^{4}$}}% + \csname LTb\endcsname%% + \put(1768,450){\makebox(0,0){\strut{}$100$}}% + \csname LTb\endcsname%% + \put(3867,450){\makebox(0,0){\strut{}$1000$}}% + \csname LTb\endcsname%% + \put(5967,450){\makebox(0,0){\strut{}$10000$}}% + \csname LTb\endcsname%% + \put(6079,795){\makebox(0,0)[l]{\strut{}$-100$}}% + \csname LTb\endcsname%% + \put(6079,1076){\makebox(0,0)[l]{\strut{}$-50$}}% + \csname LTb\endcsname%% + \put(6079,1357){\makebox(0,0)[l]{\strut{}$0$}}% + \csname LTb\endcsname%% + \put(6079,1637){\makebox(0,0)[l]{\strut{}$50$}}% + \csname LTb\endcsname%% + \put(6079,1918){\makebox(0,0)[l]{\strut{}$100$}}% + \csname LTb\endcsname%% + \put(6079,2199){\makebox(0,0)[l]{\strut{}$150$}}% + \csname LTb\endcsname%% + \put(6079,2480){\makebox(0,0)[l]{\strut{}$200$}}% + \csname LTb\endcsname%% + \put(6079,2760){\makebox(0,0)[l]{\strut{}$250$}}% + 
}% + \gplgaddtomacro\gplfronttext{% + \csname LTb\endcsname%% + \put(5537,3395){\makebox(0,0)[r]{\strut{}\texttt{gfortran}: Serial Loop}}% + \csname LTb\endcsname%% + \put(5537,3190){\makebox(0,0)[r]{\strut{}\texttt{gfortran}: Parallel Loop}}% + \csname LTb\endcsname%% + \put(5537,2986){\makebox(0,0)[r]{\strut{}Percent Difference (From Parallel to Serial)}}% + \csname LTb\endcsname%% + \put(186,1708){\rotatebox{-270.00}{\makebox(0,0){\strut{}Wall Time [s]}}}% + \csname LTb\endcsname%% + \put(6741,1708){\rotatebox{-270.00}{\makebox(0,0){\strut{}Difference [\%]}}}% + \csname LTb\endcsname%% + \put(3449,143){\makebox(0,0){\strut{}Size [$N \times N$]}}% + }% + \gplbacktext + \put(0,0){\includegraphics[width={360.00bp},height={180.00bp}]{f9_parallel_speedup}}% + \gplfronttext + \end{picture}% +\endgroup diff --git a/report/figures/plots.gnu b/report/figures/plots.gnu index 3342372..03d2bd3 100644 --- a/report/figures/plots.gnu +++ b/report/figures/plots.gnu @@ -95,3 +95,29 @@ plot "ifx_rowloop_optflags.csv" using 1:2 with linespoints title "\\texttt{ifx}: "ifx_rowloop_optflags.csv" using 1:5 with linespoints title "\\texttt{ifx}: Ofast", \ 1e-11*x**3 with line lc rgb 'black' dt 2 notitle +# === FIGURE 8: PARALLEL SPEEDUP BLAS === +set output "f8_parallel_speedup.tex" +set xrange[70:15500] +set ytics nomirror +set y2tics +set y2label "Difference [\\%]" +set y2range [-25:85] + +plot "f8_parallel.csv" using 1:2 with linespoints title "\\texttt{gfortran}: Serial OpenBLAS", \ + "f8_parallel.csv" using 1:3 with linespoints title "\\texttt{gfortran}: Parallel OpenBLAS", \ + "f8_parallel.csv" using 1:4 axes x1y2 with linespoints dt 4 title "Percent Difference (From Parallel to Serial)", \ + 1e-11*x**3 with line lc rgb 'black' dt 2 notitle + + +# === FIGURE 9: PARALLEL SPEEDUP LOOPS === +set output "f9_parallel_speedup.tex" +set xrange[40:10000] +set ytics nomirror +set y2tics +set y2label "Difference [\\%]" +set y2range [-125:250] + +plot "f9_parallel.csv" using 1:2 with 
linespoints title "\\texttt{gfortran}: Serial Loop", \ + "f9_parallel.csv" using 1:3 with linespoints title "\\texttt{gfortran}: Parallel Loop", \ + "f9_parallel.csv" using 1:4 axes x1y2 with linespoints dt 4 title "Percent Difference (From Parallel to Serial)", \ + 1e-10*x**3 with line lc rgb 'black' dt 2 notitle diff --git a/report/refs.bib b/report/refs.bib index 8014a9f..8cc5067 100644 --- a/report/refs.bib +++ b/report/refs.bib @@ -40,10 +40,30 @@ } @manual{GCC2024, address = {Boston, MA}, - author = {Stallman, Richard Matthew and {Free Software Foundation}}, + author = {Stallman, Richard Matthew and {Free Software Foundation Contributors}}, edition = {14.2.0}, note = {\href{https://gcc.gnu.org/onlinedocs/gcc-14.2.0/gcc/}{Available online.}}, organization = {Free Software Foundation}, title = {Using the GNU Compiler Collection}, year = {2024} } +@online{IntelWebsitei511300, + address = {Santa Clara, CA}, + author = {{Intel Corporation}}, + note = {\href{https://www.intel.com/content/www/us/en/products/sku/196656/intel-core-i511300h-processor-8m-cache-up-to-4-40-ghz-with-ipu/specifications.html}{Available Online.} Accessed Feb. 10th, 2026.}, + title = {{Intel Core i5-11300H Processor}} +} +@article{Hill_2008, + author = {Hill, Mark D. 
and Marty, Michael R.}, + doi = {10.1109/mc.2008.209}, + issn = {0018-9162}, + journal = {Computer}, + month = jul, + number = {7}, + pages = {33{\textendash}38}, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + title = {{Amdahl{\textquoteright}s Law in the Multicore Era}}, + url = {http://dx.doi.org/10.1109/MC.2008.209}, + volume = {41}, + year = {2008} +} diff --git a/report/report.pdf b/report/report.pdf Binary files differindex 1430065..d08bf23 100644 --- a/report/report.pdf +++ b/report/report.pdf diff --git a/report/report.tex b/report/report.tex index 03eec31..003d01b 100644 --- a/report/report.tex +++ b/report/report.tex @@ -86,6 +86,7 @@ Runs were conducted parametrically and driven by a GNU Makefile. Results were ev The runs presented in the following section are a subset of the total data collected. The full dataset is provided in Appendix \ref{apx:results} of the document. All of the data was collected in serial (1 thread) or parallel (8 threads) on an 11th generation Intel i5-11300H CPU running at 4.40 GHz. Runs were done overnight on a bare TTY session with a minimal amount of background daemons running. \subsection{Matrix Size} +\label{sec:matrix-size} As expected, an increase in matrix size corresponded with a non-linear increase in wall time. Specifically, it tended to $\mathcal{O}(n^3)$, which is the theoretical complexity of the product discussed in Section 2. This was consistent for all compilers, flags, and techniques. An example dataset consisting of \texttt{gfortran} runs with \texttt{O3} optimization is presented in Figure \ref{fig:n-scaling}. No runs were conducted using triple-loops for values larger than $N=3500$ as it became prohibitively slow. \begin{figure}[H] @@ -176,7 +177,35 @@ There is a clear speedup seen by using \texttt{Ofast} compared to other flags up Unlike \texttt{gfortran}, the \texttt{ifx} compiler has a clear difference between \texttt{O1} and the remaining flags for all values of $N$ shown. 
The remaining flags are all similar in their performance, barring some small fluctuations. This implies that \texttt{O2} has the largest performance gain, and the others don't impact the wall time as much. -The Intel OneAPI documentation has a page (helpfully titled ``O'') that discusses what the different optimization flags imply \cite{Intel2025}. The \texttt{O2} flag is specifically the lowest-level of optimization that supports auto vectorization of loops, which would explain the large discontinuity in wall time between the flags. The i5-11300H processor supports an extended instruction set including AVX-512, a 512-bit extension of the Advanced Vector Extensions (AVX) instruction set. This allows for SIMD processing of up to 8 \texttt{real64} floating-point numbers simultaneously. It is likely that the use of lanes explains a large part of the 10$\times$ decrease in wall time; other reasons could include loop unrolling or the use of a fused multiply-add (FMA) instruction, both of which are possible with \texttt{O2}-level optimization. +The Intel OneAPI documentation has a page (helpfully titled ``O'') that discusses what the different optimization flags imply \cite{Intel2025}. The \texttt{O2} flag is specifically the lowest level of optimization that supports auto-vectorization of loops, which would explain the large discontinuity in wall time between the flags. The i5-11300H processor supports an extended instruction set including AVX-512, a 512-bit extension of the Advanced Vector Extensions (AVX) instruction set \cite{IntelWebsitei511300}. This allows for SIMD processing of up to 8 \texttt{real64} floating-point numbers simultaneously. It is likely that the use of lanes explains a large part of the 10$\times$ decrease in wall time; other reasons could include loop unrolling or the use of a fused multiply-add (FMA) instruction, both of which are possible with \texttt{O2}-level optimization. 
+\subsection{Parallelization} + +For every test run in serial, another ran in parallel. Presently only a binary comparison is made without variation in the number of threads. Results are presented below for the \texttt{gfortran} runs compiled with the \texttt{Ofast} flag. The general trend is consistent for the runs using a call to OpenBLAS, shown in Figure \ref{fig:parallel-blas}. + +\begin{figure}[H] + \centering + \def\svdwidth{5in} + \hspace*{0.1cm} + \input{figures/f8_parallel_speedup.tex} + \caption{Serial and parallel ($n=8$) \texttt{gfortran} performance when calling OpenBLAS. The parallel runs show a consistent decrease in run time, taking anywhere from $\sim$30\% to $\sim$60\% less wall time than serial depending on the size of the matrix. No plateau representing the overhead costs is observed.} + \label{fig:parallel-blas} +\end{figure} + +As expected, there is a tangible decrease in wall time when comparing serial to parallel results. The green dashed line presents the percent difference between parallel and serial; for example, a value of 30\% means the parallel run has a wall time 30\% smaller than the serial run. Worth noting is that there is a distinct lack of a `plateau' at lower $N$ that could be expected from having to provision the parallel workers. Instead, the threaded runs had consistently lower wall times $\forall N \ne 100$. Also worth discussing is that the greatest difference achieved is roughly 60\% in the lower-$N$ portion, but only $\sim$30\% as $N$ increases to 3500. It is counterintuitive that, as $N$ grows into the low thousands, the `gains' from computing in parallel stagnate after decreasing by almost half. + +As mentioned earlier, the use of multiple threads was confirmed independently of the testing code via an external resource monitor, so it should not explain the discrepancy. Further, the times recorded were independent of any non-parallelizable activities, such as initializing random numbers or writing results to disk. 
Paradigms such as Amdahl's law show that the speedup of a program depends heavily on how much is readily `parallelizable' \cite{Hill_2008}, which could explain the poor performance had the \emph{entire} program been timed. It is likely that whatever solution technique was used in OpenBLAS does not have perfect scaling between wall time and the number of threads employed. More variation was seen in the wall times for a triple-loop, shown below in Figure \ref{fig:parallel-loops}. + +\begin{figure}[H] + \centering + \def\svdwidth{5in} + \hspace*{0.1cm} + \input{figures/f9_parallel_speedup.tex} + \caption{Serial and parallel ($n=8$) \texttt{gfortran} performance when comparing triple-loops. Parallel shows a consistent advantage at larger $N$ with serial taking up to $\sim$150\% longer to run after $N=3300$. Uncertainty is shown with smaller $N$, likely as a result of comparing already low wall times.} + \label{fig:parallel-loops} +\end{figure} + +Similar variations in the wall time difference are expected at lower $N$, as they are in the range of milliseconds. As $N$ exceeds 3000, however, there is a consistent difference of over 100\% observed. This does not make the loop approach inherently `better' than OpenBLAS, but it does show that it seems to benefit more from parallelization. Indeed, the significant difference in wall times between the two in serial (Section \ref{sec:matrix-size}, Figure \ref{fig:n-scaling}) again points to a different solution technique being utilized that is not as receptive to parallelization. While not conclusive of any direct measure of wall time versus thread count, both plots showed improvements from employing parallel processing to matrix-matrix multiplication. \bibliographystyle{ieeetr} \bibliography{refs.bib} @@ -196,7 +225,6 @@ The git repository is hosted at \url{https://git.hhmoore.ca/mcsc-6030g/p1-matrix \item \texttt{plots.gnu} is a script that generates plots for the report using Gnuplot. 
The makefile target for plots can be run with \texttt{make plots}. This will produce plots in hybrid \texttt{.pdf} and \texttt{.tex} formats that embed cleanly in the \LaTeX{} document. \end{description} -\clearpage \section{Tabular Results} \label{apx:results} \begin{longtable}{llcccccc} |
