% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article in Concurrency and Computation: Practice and Experience,
% vol. 30, no. 24 (2018).  The work is identified by an article number
% (e4905) rather than a page range, so `pages` holds that number alone.
% The fields reportid, cin, ddc, cid, pnm, pid, typ and UT are not standard
% BibTeX fields and are ignored by standard styles; presumably they are
% repository metadata from the exporting database (see `url`) -- keep them
% unless the export source is confirmed to be irrelevant.
@article{Davidovi:852392,
      author       = {Davidović, Davor and Fabregat-Traver, Diego and
                      Höhnerbach, Markus and Di Napoli, Edoardo},
      title        = {Accelerating the computation of {FLAPW} methods on
                      heterogeneous architectures},
      journal      = {Concurrency and Computation: Practice and Experience},
      volume       = {30},
      number       = {24},
      issn         = {1532-0626},
      address      = {Chichester},
      publisher    = {Wiley},
      reportid     = {FZJ-2018-05355},
      pages        = {e4905},
      year         = {2018},
      abstract     = {Legacy codes in computational science and engineering have
                      been very successful in providing essential functionality to
                      researchers. However, they are not capable of exploiting the
                      massive parallelism provided by emerging heterogeneous
                      architectures. The lack of portable performance and
                      scalability puts them at high risk, i.e., either they evolve
                      or they are destined to be executed on older platforms and
                      small clusters. One example of a legacy code which would
                      heavily benefit from a modern redesign is FLEUR, a software
                      for electronic structure calculations. In previous work, the
                      computational bottleneck of FLEUR was partially reengineered
                      to have a modular design that relies on standard building
                      blocks, namely, BLAS and LAPACK libraries. In this paper, we
                      demonstrate how the initial redesign enables the portability
                      to heterogeneous architectures. More specifically, we study
                      different approaches to port the code to architectures
                      consisting of multi-core CPUs equipped with one or more
                      coprocessors such as Nvidia GPUs and Intel Xeon Phis. Our
                      final code attains over $70\%$ of the architectures' peak
                      performance, and outperforms Nvidia's and Intel's libraries.
                      On JURECA, the large tier-0 cluster where FLEUR is often
                      executed, the code takes advantage of the full power of the
                      computing nodes, attaining $5\times$ speedup over the sole
                      use of the CPUs.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511) / Simulation and Data Laboratory Quantum
                      Materials (SDLQM) (SDLQM)},
      pid          = {G:(DE-HGF)POF3-511 / G:(DE-Juel1)SDLQM},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000450236200021},
      doi          = {10.1002/cpe.4905},
      url          = {https://juser.fz-juelich.de/record/852392},
}