% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@INPROCEEDINGS{Breuer:1043552,
author = {Breuer, Thomas and Guimaraes, Filipe and Himmels, Carina
and Frings, Wolfgang and Paschoulas, Chrysovalantis and
Göbbert, Jens Henrik},
title = {{T}he {A}rt of {P}rocess {P}inning: {T}urning {C}haos into
{C}ore {H}armony},
reportid = {FZJ-2025-02926},
year = {2025},
note = {This poster was awarded second prize in the Best Research
Poster category.},
abstract = {High-Performance Computing (HPC) centres face growing
challenges as user numbers and application diversity
increase, requiring systems to manage a wide range of
workflows. While users prioritise scientific output over
specific configurations, administrators strive to keep
systems fully utilised and jobs optimised, avoiding
resource waste. However, no single default environment can
address the diverse needs of users and applications, given
the complex landscape of unique use cases. Process pinning -
the binding of tasks and threads to specific CPU cores - is
a vital yet often overlooked optimisation that significantly
improves job performance. This technique benefits both
CPU-intensive and GPU-enabled jobs. Proper pinning prevents
process migration, ensures efficient memory access, and
enables faster communication, improving system performance
simply by adjusting workload manager (e.g., Slurm)
parameters, without altering code. Metrics from various
applications and benchmarks show that suboptimal pinning can
drastically reduce performance, with production scenarios
likely impacted even more. Achieving optimal process pinning
is challenging due to three interrelated factors:
- System side: Application layers and libraries (e.g., MPI,
OpenMP, Slurm) interact with hardware architectures,
affecting task and thread placement. Updates to these
components can disrupt the expected pinning behaviour.
- User side: Users must consider the system architecture and
configuration options, such as how to split processes and
threads or distribute them across cores. Even with the same
core usage pattern, the distribution can vary with workload
manager options (e.g., Slurm `cpu-bind` and `distribution`
values). Portability across systems is not guaranteed, often
leading to suboptimal performance.
- Operator side: Administrators and support staff must
monitor systems to ensure effective resource utilisation and
address issues proactively. Identifying problematic jobs is
difficult due to the variety of their characteristics, with
inefficiencies often hidden in core usage patterns.
We developed tools and processes, based on investigations
across diverse HPC systems, to address these challenges.
These solutions enhance overall system throughput by
identifying binding errors, guiding users in optimisation,
and monitoring core usage. Our solutions include:
- A workflow that validates pinning distributions by running
automated test jobs, periodically or manually, via the
GitLab-CI framework. Results are compared to expected
outputs, with summaries generated and full comparisons
displayed on the provider-targeted part of the JuPin pinning
tool (https://go.fzj.de/pinning). These tests help HPC
providers address issues before production, update
documentation, and notify users of changes.
- An interactive, user-targeted visualisation in JuPin that
enables users to test pinning options, visualise task
distributions, and generate Slurm-compatible commands.
Though focused on Slurm, it can be adapted to other workload
managers.
- LLview (https://go.fzj.de/llview), an open-source
monitoring and operational data analytics tool, which has
been extended to monitor core usage patterns, providing
statistics and aggregated computing times. This helps
operators identify inefficiencies and intervene proactively.
JuPin and LLview collectively improve node utilisation,
reduce waste, and simplify achieving optimal pinning. These
advancements translate into delivering more results in less
time. We published JuPin as open-source software on GitHub
in May 2025 (https://github.com/FZJ-JSC/jupin). In
conclusion, resolving pinning challenges is critical for
optimising HPC systems. Our tools establish a foundation for
scaling operations, including preparations for the JUPITER
exascale supercomputer.},
month = {Jun},
date = {2025-06-10},
organization = {ISC High Performance 2025, Hamburg
(Germany), 10 Jun 2025 - 14 Jun 2025},
subtyp = {After Call},
cin = {JSC},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
and Research Groups (POF4-511) / 5121 - Supercomputing \&
Big Data Facilities (POF4-512) / BMBF 01 1H1 6013, NRW 325
– 8.03 – 133340 - SiVeGCS (DB001492) / ATMLAO - ATML
Application Optimization and User Service Tools (ATMLAO)},
pid = {G:(DE-HGF)POF4-5112 / G:(DE-HGF)POF4-5121 /
G:(DE-Juel-1)DB001492 / G:(DE-Juel-1)ATMLAO},
typ = {PUB:(DE-HGF)24},
doi = {10.34734/FZJ-2025-02926},
url = {https://juser.fz-juelich.de/record/1043552},
}
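% NOTE: The abstract above refers to Slurm's `cpu-bind` and `distribution`
% options. As a minimal, hedged sketch of the kind of pinning adjustment the
% poster describes (the 4-task x 8-thread geometry and the application name
% `./app` are hypothetical illustrations, not taken from the poster):
%
%   export OMP_NUM_THREADS=8
%   srun --ntasks=4 --cpus-per-task=8 \
%        --cpu-bind=verbose,cores \
%        --distribution=block:block \
%        ./app
%
% Here `--cpu-bind=verbose` makes Slurm print the applied binding masks, so
% the resulting placement can be compared with the one visualised by JuPin
% (https://go.fzj.de/pinning).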