% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
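%
% As a usage sketch only (the file name "references.bib" and the surrounding
% document are assumptions for illustration, not part of this record), such a
% UTF-8 encoded .bib file can be processed with biblatex and the biber backend
% roughly as follows:
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   PEPC scales to the full JUGENE system \cite{Winkel:17919}.
%   \printbibliography
%   \end{document}
%
% Compile with pdflatex, then biber, then pdflatex again.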

@ARTICLE{Winkel:17919,
      author       = {Winkel, M. and Speck, R. and Hübner, H. and Arnold, L. and
                      Krause, R. and Gibbon, P.},
      title        = {{A} massively parallel, multi-disciplinary {B}arnes-{H}ut
                      tree code for extreme-scale {N}-body simulations},
      journal      = {Computer Physics Communications},
      volume       = {183},
      issn         = {0010-4655},
      address      = {Amsterdam},
      publisher    = {North Holland Publ. Co.},
      reportid     = {PreJuSER-17919},
      pages        = {880 - 889},
      year         = {2012},
      note         = {The authors gratefully acknowledge the helpful support by
                      Jülich Supercomputing Centre and the JSC staff, especially
                      M. Stephan and J. Docter. This work was supported in part by
                      the Alliance Program of the Helmholtz Association
                      (HA216/EMMI), the BMBF project ScaFaCoS and the EU TEXT
                      project, as well as additional computing time via the VSR
                      project JZAM04. R.S. and R.K. would like to thank the Swiss
                      Platform for High-Performance and High-Productivity
                      Computing (HP2C) for funding and support.},
      abstract     = {The efficient parallelization of fast multipole-based
                      algorithms for the N-body problem is one of the most
                      challenging topics in high performance scientific computing.
                      The emergence of non-local, irregular communication patterns
                      generated by these algorithms can easily create an
                      insurmountable bottleneck on supercomputers with hundreds of
                      thousands of cores. To overcome this obstacle we have
                      developed an innovative parallelization strategy for
                      Barnes-Hut tree codes on present and upcoming HPC multicore
                      architectures. This scheme, based on a combined MPI-Pthreads
                      approach, permits an efficient overlap of computation and
                      data exchange. We highlight the capabilities of this method
                      on the full IBM Blue Gene/P system JUGENE at Jülich
                      Supercomputing Centre and demonstrate scaling across 294,912
                      cores with up to 2,048,000,000 particles. Applying our
                      implementation PEPC to laser-plasma interaction and vortex
                      particle methods close to the continuum limit, we
                      demonstrate its potential for ground-breaking advances in
                      large-scale particle simulations. (C) 2011 Elsevier B.V. All
                      rights reserved.},
      keywords     = {J (WoSType)},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {Scientific Computing (FUEK411) / 411 - Computational
                      Science and Mathematical Methods (POF2-411)},
      pid          = {G:(DE-Juel1)FUEK411 / G:(DE-HGF)POF2-411},
      shelfmark    = {Computer Science, Interdisciplinary Applications / Physics,
                      Mathematical},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000301028700004},
      doi          = {10.1016/j.cpc.2011.12.013},
      url          = {https://juser.fz-juelich.de/record/17919},
}
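
% The abstract above mentions overlapping computation and data exchange via a
% combined MPI-Pthreads scheme.  The following is only a generic, hypothetical
% C sketch of that general idea (a ring shift driven by one helper thread); it
% is not taken from PEPC and assumes an MPI library providing
% MPI_THREAD_MULTIPLE.
%
%   #include <mpi.h>
%   #include <pthread.h>
%   #include <stdio.h>
%
%   #define N 1024
%   static double local_data[N], remote_data[N];
%
%   /* Helper thread: drive a non-blocking ring exchange (send to the next
%      rank, receive from the previous one) while the main thread computes. */
%   static void *comm_thread(void *arg)
%   {
%       int rank, size;
%       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
%       MPI_Comm_size(MPI_COMM_WORLD, &size);
%       int next = (rank + 1) % size;
%       int prev = (rank - 1 + size) % size;
%       MPI_Request req[2];
%       MPI_Isend(local_data,  N, MPI_DOUBLE, next, 0, MPI_COMM_WORLD, &req[0]);
%       MPI_Irecv(remote_data, N, MPI_DOUBLE, prev, 0, MPI_COMM_WORLD, &req[1]);
%       MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
%       return NULL;
%   }
%
%   int main(int argc, char **argv)
%   {
%       int provided;
%       MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
%       if (provided < MPI_THREAD_MULTIPLE)
%           MPI_Abort(MPI_COMM_WORLD, 1);
%       for (int i = 0; i < N; ++i) local_data[i] = (double)i;
%
%       pthread_t t;
%       pthread_create(&t, NULL, comm_thread, NULL);
%
%       /* Overlap: compute on data that is already local while the exchange
%          started by the helper thread is in flight. */
%       double sum = 0.0;
%       for (int i = 0; i < N; ++i) sum += local_data[i] * local_data[i];
%
%       pthread_join(t, NULL);                 /* remote data now usable */
%       for (int i = 0; i < N; ++i) sum += remote_data[i];
%
%       printf("partial result: %g\n", sum);
%       MPI_Finalize();
%       return 0;
%   }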