% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Conference paper in the Euro-Par 2024 proceedings (LNCS vol. 14802).
% Fields after "organization" (cin, cid, pnm, pid, typ, UT) are non-standard
% repository metadata; standard bibliography styles ignore them.
@inproceedings{Nassyr:1038510,
  author       = {Nassyr, Stepan and Pleiter, Dirk},
  title        = {Exploring Processor Micro-architectures Optimised for
                  {BLAS3} Micro-kernels},
  volume       = {14802},
  address      = {Cham},
  publisher    = {Springer Nature Switzerland},
  reportid     = {FZJ-2025-01495},
  isbn         = {978-3-031-69765-4},
  series       = {Lecture Notes in Computer Science},
  pages        = {47--61},
  year         = {2024},
  comment      = {Euro-Par 2024: Parallel Processing},
  booktitle    = {Euro-Par 2024: Parallel Processing},
  abstract     = {Dense matrix-matrix operations are relevant for a broad
                  range of numerical applications, e.g. for implementing deep
                  neural networks. Past research has led to a good
                  understanding of how these operations can be mapped in a
                  generic manner on typical processor architectures with
                  multiple cache levels such that near-optimal performance can
                  be reached. However, while commonly used micro-architectures
                  are typically suitable for such operations, their
                  architectural parameters need to be suitably tuned. The
                  performance of highly optimised implementations of these
                  operations relies on micro-kernels that are often
                  handwritten. Given the increased variety of instruction set
                  architectures and SIMD instruction extensions, this becomes
                  challenging. In this paper, we present and implement a
                  methodology for an exhaustive exploration of a processor
                  core micro-architecture design space based on gem5
                  simulations. Furthermore, we present a tool for generating
                  efficiently vectorised code leveraging Arm’s SVE and
                  RISC-V’s RVV instructions. It enables automatisation of
                  the generation of micro-kernels and, therefore, the
                  generation of a large range of such kernels. The results
                  provide insights both, to micro-architecture architects as
                  well as micro-kernel developers. The assembler generator is
                  open-sourced and the simulation data is available as
                  supplementary material.},
  month        = aug,
  date         = {2024-08-26},
  organization = {30th European Conference on Parallel
                  and Distributed Processing, Madrid
                  (Spain), 26 Aug 2024 - 30 Aug 2024},
  cin          = {JSC},
  cid          = {I:(DE-Juel1)JSC-20090406},
  pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                  and Research Groups (POF4-511) / 5122 - Future Computing
                  \& Big Data Systems (POF4-512) / PhD no Grant - Doktorand
                  ohne besondere Förderung (PHD-NO-GRANT-20170405) / EPI SGA2
                  (16ME0507K)},
  pid          = {G:(DE-HGF)POF4-5112 / G:(DE-HGF)POF4-5122 /
                  G:(DE-Juel1)PHD-NO-GRANT-20170405 / G:(BMBF)16ME0507K},
  typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
  UT           = {WOS:001308370400004},
  doi          = {10.1007/978-3-031-69766-1_4},
  url          = {https://juser.fz-juelich.de/record/1038510},
}