% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@INPROCEEDINGS{Breuer:1043552,
      author       = {Breuer, Thomas and Guimaraes, Filipe and Himmels, Carina
                      and Frings, Wolfgang and Paschoulas, Chrysovalantis and
                      Göbbert, Jens Henrik},
      title        = {{T}he {A}rt of {P}rocess {P}inning: {T}urning {C}haos into
                      {C}ore {H}armony},
      reportid     = {FZJ-2025-02926},
      year         = {2025},
      note         = {This poster was awarded second prize in the Best Research
                      Poster category.},
      abstract     = {High-Performance Computing (HPC) centres face growing
                      challenges as user numbers and application diversity
                      increase, requiring systems to manage a wide range of
                      workflows. While users prioritise scientific output over
                      specific configurations, administrators strive to maintain
                      fully utilised systems with optimised jobs, avoiding
                      resource waste. However, no single default environment can
                      address the diverse needs of users and applications due to
                      the complex landscape of unique use cases. Process pinning -
                      the binding of tasks and threads to specific CPU cores - is
                      a vital yet often overlooked optimisation that significantly
                      improves job performance. This technique benefits both
                      CPU-intensive and GPU-enabled jobs. Proper pinning prevents
                      process migration, ensures efficient memory access, and
                      enables faster communication, improving system performance
                      by simply adjusting workload manager parameters (e.g.,
                      Slurm) without altering code. Metrics from various
                      applications and benchmarks show that suboptimal pinning can
                      drastically reduce performance, with production scenarios
                      likely impacted even more. Achieving optimal process pinning
                      is challenging due to three interrelated factors:
                      - System side: Application layers and libraries (e.g., MPI,
                        OpenMP, Slurm) interact with hardware architectures,
                        affecting task and thread placement. Updates to these
                        components can disrupt the expected pinning behaviour.
                      - User side: Users must consider system architecture and
                        configuration options, such as how to split processes and
                        threads or distribute them across cores. Even with the
                        same core usage pattern, distribution can vary based on
                        workload manager options (e.g., Slurm `cpu-bind` and
                        `distribution` values). Portability across systems is not
                        guaranteed, often leading to suboptimal performance.
                      - Operator side: Administrators and support staff must
                        monitor systems to ensure effective resource utilisation
                        and address issues proactively. Identifying problematic
                        jobs is difficult due to the variety of job
                        characteristics, with inefficiencies often hidden in core
                        usage patterns.
                      We developed tools and processes based on investigations
                      across diverse HPC systems to address these challenges.
                      These solutions enhance overall system throughput by
                      identifying binding errors, guiding users in optimisation,
                      and monitoring core usage. Our solutions include:
                      - A workflow that validates pinning distributions by running
                        automated test jobs, periodically or manually, via the
                        GitLab-CI framework. Results are compared to expected
                        outputs, with summaries generated and the full comparison
                        displayed on the provider-targeted part of the JuPin
                        pinning tool (https://go.fzj.de/pinning). Tests help HPC
                        providers address issues pre-production, update
                        documentation, and notify users of changes.
                      - A user-targeted interactive visualisation in JuPin enables
                        users to test pinning options, visualise task
                        distributions, and generate Slurm-compatible commands.
                        Though focused on Slurm, it can be adapted for other
                        workload managers.
                      - LLview (https://go.fzj.de/llview), an open-source
                        monitoring and operational data analytics tool, has been
                        extended to monitor core usage patterns, providing
                        statistics and aggregated computing times. This helps
                        identify inefficiencies and intervene proactively.
                      JuPin and LLview collectively improve node utilisation,
                      reduce waste, and simplify achieving optimal pinning. These
                      advancements translate to delivering more results in less
                      time. We published JuPin as open-source software on GitHub
                      in May 2025 (https://github.com/FZJ-JSC/jupin). In
                      conclusion, resolving pinning challenges is critical for
                      optimising HPC systems. Our tools establish a foundation for
                      scaling operations, including preparations for the JUPITER
                      exascale supercomputer.},
      month        = {Jun},
      date         = {2025-06-10},
      organization = {ISC High Performance 2025, Hamburg
                      (Germany), 10 Jun 2025 - 14 Jun 2025},
      subtyp       = {After Call},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511) / 5121 - Supercomputing $\&$
                      Big Data Facilities (POF4-512) / BMBF 01 1H1 6013, NRW 325
                      – 8.03 – 133340 - SiVeGCS (DB001492) / ATMLAO - ATML
                      Application Optimization and User Service Tools (ATMLAO)},
      pid          = {G:(DE-HGF)POF4-5112 / G:(DE-HGF)POF4-5121 /
                      G:(DE-Juel-1)DB001492 / G:(DE-Juel-1)ATMLAO},
      typ          = {PUB:(DE-HGF)24},
      doi          = {10.34734/FZJ-2025-02926},
      url          = {https://juser.fz-juelich.de/record/1043552},
}