% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{AlvarezMallon:281229,
      author       = {Alvarez Mallon, Damian and Taboada, Guillermo L. and
                      Koesterke, Lars},
      title        = {{MPI} and {UPC} broadcast, scatter and gather algorithms in
                      {Xeon Phi}},
      journal      = {Concurrency and Computation: Practice and Experience},
      volume       = {28},
      number       = {8},
      issn         = {1532-0626},
      address      = {Chichester},
      publisher    = {Wiley},
      reportid     = {FZJ-2016-00928},
      pages        = {2322--2340},
      year         = {2016},
      abstract     = {Accelerators have revolutionised the high performance
                      computing (HPC) community. Despite their advantages, their
                      very specific programming models and limited communication
                      capabilities have kept them in a supporting role of the main
                      processors. With the introduction of Xeon Phi, this is no
                      longer true, as it can be programmed as the main processor
                      and has direct access to the InfiniBand network adapter.
                      Collective operations play a key role in many HPC
                      applications. Therefore, studying its behaviour in the
                      context of manycore coprocessors has great importance. This
                      work analyses the performance of different algorithms for
                      broadcast, scatter and gather, in a large-scale Xeon Phi
                      supercomputer. The algorithms evaluated are those available
                      in the reference message passing interface (MPI)
                      implementation for Xeon Phi (Intel MPI), the default
                      algorithm in an optimised MPI implementation (MVAPICH2-MIC),
                      and a new set of algorithms, developed by the authors of
                      this work, designed with modern processors and new
                      communication features in mind. The latter are implemented
                      in Unified Parallel C (UPC), a partitioned global address
                      space language, leveraging one-sided communications,
                      hierarchical trees and message pipelining. This study scales
                      the experiments to 15360 cores in the Stampede supercomputer
                      and compares the results to Xeon and hybrid Xeon + Xeon Phi
                      experiments, with up to 19456 cores.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {513 - Supercomputer Facility (POF3-513)},
      pid          = {G:(DE-HGF)POF3-513},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000376263300002},
      doi          = {10.1002/cpe.3552},
      url          = {https://juser.fz-juelich.de/record/281229},
}