% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article: "Scalable PGAS collective operations in NUMA clusters",
% Cluster computing, volume 17, number 4 (2014).
% The fields reportid, cin, ddc, cid, pnm, pid, typ and UT are not standard
% BibTeX fields; standard styles ignore unknown fields, so they only carry
% repository metadata (presumably from the institutional export that the url
% field points to -- confirm before removing any of them).
@ARTICLE{AlvarezMallon:172874,
      author       = {Alvarez Mallon, Damian and Taboada, Guillermo L. and
                      Teijeiro, Carlos and González-Domínguez, Jorge and Gómez,
                      Andrés and Wibecan, Brian},
      title        = {Scalable {PGAS} collective operations in {NUMA} clusters},
      journal      = {Cluster computing},
      volume       = {17},
      number       = {4},
      issn         = {1573-7543},
      address      = {Dordrecht [u.a.]},
      publisher    = {Springer Science + Business Media B.V},
      reportid     = {FZJ-2014-06308},
      pages        = {1473--1495},
      year         = {2014},
      abstract     = {The increasing number of cores per processor is turning
                      manycore-based systems in pervasive. This involves dealing
                      with multiple levels of memory in non uniform memory access
                      (NUMA) systems and processor cores hierarchies, accessible
                      via complex interconnects in order to dispatch the
                      increasing amount of data required by the processing
                      elements. The key for efficient and scalable provision of
                      data is the use of collective communication operations that
                      minimize the impact of bottlenecks. Leveraging one sided
                      communications becomes more important in these systems, to
                      avoid unnecessary synchronization between pairs of processes
                      in collective operations implemented in terms of two sided
                      point to point functions. This work proposes a series of
                      algorithms that provide a good performance and scalability
                      in collective operations, based on the use of hierarchical
                      trees, overlapping one-sided communications, message
                      pipelining and the available NUMA binding features. An
                      implementation has been developed for Unified Parallel C, a
                      Partitioned Global Address Space language, which presents a
                      shared memory view across the nodes for programmability,
                      while keeping private memory regions for performance. The
                      performance evaluation of the proposed implementation,
                      conducted on five representative systems (JuRoPA, JUDGE,
                      Finis Terrae, SVG and Superdome), has shown generally good
                      performance and scalability, even outperforming MPI in some
                      cases, which confirms the suitability of the developed
                      algorithms for manycore architectures.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {41G - Supercomputer Facility (POF2-41G21)},
      pid          = {G:(DE-HGF)POF2-41G21},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000345077400027},
      doi          = {10.1007/s10586-014-0377-9},
      url          = {https://juser.fz-juelich.de/record/172874},
}