% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference paper in the Euro-Par 2024 proceedings (LNCS volume 14802).
% The numeric part of the citation key matches the record number in `url`.
% Notes for maintainers:
%   * Only the acronym in `title` is brace-protected; the bibliography style
%     decides the casing of the remaining words.
%   * `month` uses the predefined macro (unbraced) so that styles can
%     abbreviate or localise it.
%   * reportid, comment, cin, cid, pnm, pid, typ and UT are not standard
%     BibTeX fields; standard styles ignore unknown fields, so they serve as
%     repository metadata only.
@INPROCEEDINGS{Nassyr:1038510,
      author       = {Nassyr, Stepan and Pleiter, Dirk},
      title        = {Exploring Processor Micro-architectures Optimised for
                      {BLAS3} Micro-kernels},
      volume       = {14802},
      address      = {Cham},
      publisher    = {Springer Nature Switzerland},
      reportid     = {FZJ-2025-01495},
      isbn         = {978-3-031-69765-4},
      series       = {Lecture Notes in Computer Science},
      pages        = {47--61},
      year         = {2024},
      comment      = {Euro-Par 2024: Parallel Processing},
      booktitle    = {Euro-Par 2024: Parallel Processing},
      abstract     = {Dense matrix-matrix operations are relevant for a broad
                      range of numerical applications, e.g. for implementing deep
                      neural networks. Past research has led to a good
                      understanding of how these operations can be mapped in a
                      generic manner on typical processor architectures with
                      multiple cache levels such that near-optimal performance can
                      be reached. However, while commonly used micro-architectures
                      are typically suitable for such operations, their
                      architectural parameters need to be suitably tuned. The
                      performance of highly optimised implementations of these
                      operations relies on micro-kernels that are often
                      handwritten. Given the increased variety of instruction set
                      architectures and SIMD instruction extensions, this becomes
                      challenging. In this paper, we present and implement a
                      methodology for an exhaustive exploration of a processor
                      core micro-architecture design space based on gem5
                      simulations. Furthermore, we present a tool for generating
                      efficiently vectorised code leveraging Arm’s SVE and
                      RISC-V’s RVV instructions. It enables automatisation of
                      the generation of micro-kernels and, therefore, the
                      generation of a large range of such kernels. The results
                      provide insights both, to micro-architecture architects as
                      well as micro-kernel developers. The assembler generator is
                      open-sourced and the simulation data is available as
                      supplementary material.},
      month        = aug,
      date         = {2024-08-26},
      organization = {30th European Conference on Parallel and Distributed
                      Processing, Madrid (Spain), 26 Aug 2024 - 30 Aug 2024},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511) / 5122 - Future Computing
                      \& Big Data Systems (POF4-512) / PhD no Grant - Doktorand
                      ohne besondere Förderung (PHD-NO-GRANT-20170405) / EPI SGA2
                      (16ME0507K)},
      pid          = {G:(DE-HGF)POF4-5112 / G:(DE-HGF)POF4-5122 /
                      G:(DE-Juel1)PHD-NO-GRANT-20170405 / G:(BMBF)16ME0507K},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
      UT           = {WOS:001308370400004},
      doi          = {10.1007/978-3-031-69766-1_4},
      url          = {https://juser.fz-juelich.de/record/1038510},
}