% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@inproceedings{Orland:1052361,
  author       = {Orland, Fabian and Hilgers, Tom and Hübenthal, Fabian and
                  Sarma, Rakesh and Lintermann, Andreas and Terboven,
                  Christian},
  title        = {Hybrid Inference Optimization for {AI}-Enhanced
                  Turbulent Boundary Layer Simulation on
                  Heterogeneous Systems},
  address      = {New York, NY, USA},
  publisher    = {ACM},
  reportid     = {FZJ-2026-00960},
  pages        = {165--176},
  year         = {2026},
  comment      = {Proceedings of the Supercomputing Asia and International
                  Conference on High Performance Computing in Asia Pacific
                  Region Workshops},
  booktitle    = {Proceedings of the Supercomputing Asia
                  and International Conference on High
                  Performance Computing in Asia Pacific
                  Region Workshops},
  abstract     = {Active drag reduction (ADR) using spanwise traveling
                  surface waves is a promising approach to reduce drag of
                  airplanes by manipulating the turbulent boundary layer (TBL)
                  around an airfoil, which directly translates into power
                  savings and lower emission of greenhouse gases harming the
                  environment. However, no analytical solution is known to
                  determine the optimal actuation parameters of these surface
                  waves based on given flow conditions. Data-driven deep
                  learning (DL) techniques from artificial intelligence (AI)
                  are a promising alternative approach, but their training
                  requires a huge amount of high-fidelity data from
                  computationally expensive computational fluid dynamics (CFD)
                  simulations. Previous works proposed a TBL-Transformer
                  architecture for the expensive time-marching of turbulent
                  flow fields and coupled it with a finite volume solver from
                  the multi-physics PDE solver framework m-AIA to accelerate
                  the generation of TBL data. To accelerate the
                  computationally expensive inference of the TBL-Transformer,
                  the AIxeleratorService library was used to offload the
                  inference task to GPUs. While this approach significantly
                  accelerates the inference task, it leaves the CPU resources
                  allocated by the solver unutilized during inference. To
                  fully exploit modern heterogeneous computer systems, we
                  introduce a hybrid inference method based on a hybrid work
                  distribution model and implement it into the
                  AIxeleratorService library. Moreover, we present a formal
                  model to derive the optimal hybrid work distribution. To
                  evaluate the computational performance and scalability of
                  hybrid inference, we benchmark the coupled m-AIA solver from
                  previous work on a heterogeneous HPC system comprising Intel
                  Sapphire Rapids CPUs and NVIDIA H100 GPUs. Our results show
                  that hybrid inference achieves a performance speedup that
                  grows as the ratio of allocated CPU cores to GPU devices
                  increases. We further demonstrate that the runtime
                  improvement by hybrid inference also increases the energy
                  efficiency of the coupled solver application. Finally, we
                  highlight that the theoretical hybrid work distribution
                  derived from our formal model yields near optimal results in
                  practice.},
  month        = jan,
  date         = {2026-01-26},
  organization = {SCA/HPCAsia 2026 Workshops:
                  Supercomputing Asia and International
                  Conference on High Performance
                  Computing in Asia Pacific Region
                  Workshops, Osaka (Japan), 26 Jan 2026 -
                  29 Jan 2026},
  cin          = {JSC},
  cid          = {I:(DE-Juel1)JSC-20090406},
  pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                  (SDLs) and Research Groups (POF4-511) / SDLFSE - SDL Fluids
                  \& Solids Engineering (SDLFSE) / RAISE - Research on AI-
                  and Simulation-Based Engineering at Exascale (951733)},
  pid          = {G:(DE-HGF)POF4-5111 / G:(DE-Juel-1)SDLFSE /
                  G:(EU-Grant)951733},
  typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
  doi          = {10.1145/3784828.3785255},
  url          = {https://juser.fz-juelich.de/record/1052361},
}