% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference paper in the Springer volume "Tools for High Performance
% Computing 2017" (published 2019), presented at the 11th International
% Workshop on Parallel Tools for High Performance Computing.
%
% Notes for maintainers (text outside an entry is ignored by BibTeX/Biber):
%  - The citation key is left exactly as exported so existing \cite commands
%    keep resolving.
%  - eventdate holds the workshop dates; these were previously stored in
%    "date", which biblatex would have preferred over year = 2019.
%  - NOTE(review): month is kept from the export and looks like the workshop
%    month rather than the publication month -- confirm before relying on it.
%  - NOTE(review): the exported abstract read "out of , the"; the dropped
%    word was restored as "KNL" (used later in the abstract) -- confirm
%    against the publisher's abstract.
%  - cin, cid, pnm, pid, typ and reportid are repository bookkeeping fields
%    that standard styles ignore; they are kept verbatim.
@INPROCEEDINGS{Schltter:861601,
      author       = {Schlütter, Marc and Feld, Christian and Saviankou, Pavel
                      and Knobloch, Michael and Hermanns, Marc-André and Mohr,
                      Bernd},
      title        = {{SCIPHI} {Score-P} and {Cube} Extensions for {Intel}
                      {Phi}},
      booktitle    = {Tools for High Performance Computing
                      2017},
      publisher    = {Springer International Publishing},
      address      = {Cham},
      pages        = {85--104},
      year         = {2019},
      month        = sep,
      eventdate    = {2017-09-11/2017-09-12},
      organization = {11th International Workshop on
                      Parallel Tools for High Performance
                      Computing, Dresden (Germany), 11 Sep
                      2017 - 12 Sep 2017},
      isbn         = {978-3-030-11987-4},
      doi          = {10.1007/978-3-030-11987-4_6},
      url          = {https://juser.fz-juelich.de/record/861601},
      reportid     = {FZJ-2019-02051},
      comment      = {Tools for High Performance Computing 2017},
      abstract     = {The Knights Landing processors offers unique features with
                      regards to memory hierarchy and vectorization capabilities.
                      To improve tool support within these two areas, we present
                      extensions to the Score-P measurement infrastructure and the
                      Cube report explorer. With the Knights Landing edition,
                      Intel introduced a new memory architecture, utilizing two
                      types of memory, MCDRAM and DDR4 SDRAM. To assist the user
                      in the decision where to place data structures, we introduce
                      a MCDRAM candidate metric to the Cube report explorer. In
                      addition we track all MCDRAM allocations through the
                      hbwmalloc interface, providing memory metrics like leaked
                      memory or the high-water mark on a per-region basis, as
                      already known for the ubiquitous malloc/free. A Score-P
                      metric plugin that records memory statistics via numastat on
                      a per process level enables a timeline analysis using the
                      Vampir toolset. To get the best performance out of KNL, the
                      large vector processing units need to be utilized
                      effectively. The ratio between computation and data access
                      and the vector processing unit (VPU) intensity are
                      introduced as metrics to identify vectorization candidates
                      on a per-region basis. The Portable Hardware Locality
                      (hwloc) Broquedis et al. (hwloc: a generic framework for
                      managing hardware affinities in hpc applications, 2010 [2])
                      library allows us to visualize the distribution of the
                      KNL-specific performance metrics within the Cube report
                      explorer, taking the hardware topology consisting of
                      processor tiles and cores into account.},
      cin          = {JSC / JARA-HPC},
      cid          = {I:(DE-Juel1)JSC-20090406 / $I:(DE-82)080012_20140620$},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511) / ATMLPP - ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF3-511 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
}