% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article record: Pronold et al., Parallel Computing 113 (2022),
% article number 102952. The fields reportid, cin, ddc, cid, pnm, pid, typ
% and UT are not standard BibTeX fields; standard styles silently ignore
% them (presumably repository bookkeeping from the export -- confirm).
% Underscores in text fields are written as \_ so they stay valid LaTeX
% if a style ever prints them.
@ARTICLE{Pronold:908930,
      author       = {Pronold, J. and Jordan, J. and Wylie, B. J. N. and
                      Kitayama, Itaru and Diesmann, M. and Kunkel, Susanne},
      title        = {Routing brain traffic through the von {Neumann}
                      bottleneck: {Efficient} cache usage in spiking neural
                      network simulation code on general purpose computers},
      journal      = {Parallel Computing},
      volume       = {113},
      issn         = {0167-8191},
      address      = {Amsterdam [u.a.]},
      publisher    = {North-Holland, Elsevier Science},
      reportid     = {FZJ-2022-02910},
      pages        = {102952},
      year         = {2022},
      abstract     = {Simulation is a third pillar next to experiment and theory
                      in the study of complex dynamic systems such as biological
                      neural networks. Contemporary brain-scale networks
                      correspond to directed random graphs of a few million nodes,
                      each with an in-degree and out-degree of several thousands
                      of edges, where nodes and edges correspond to the
                      fundamental biological units, neurons and synapses,
                      respectively. The activity in neuronal networks is also
                      sparse. Each neuron occasionally transmits a brief signal,
                      called spike, via its outgoing synapses to the corresponding
                      target neurons. In distributed computing these targets are
                      scattered across thousands of parallel processes. The
                      spatial and temporal sparsity represents an inherent
                      bottleneck for simulations on conventional computers:
                      irregular memory-access patterns cause poor cache
                      utilization. Using an established neuronal network
                      simulation code as a reference implementation, we
                      investigate how common techniques to recover cache
                      performance such as software-induced prefetching and
                      software pipelining can benefit a real-world application.
                      The algorithmic changes reduce simulation time by up to
                      50\%. The study exemplifies that many-core systems
                      assigned with an intrinsically parallel computational
                      problem can alleviate the von Neumann bottleneck of
                      conventional computer architectures.},
      cin          = {INM-6 / IAS-6 / INM-10},
      ddc          = {620},
      cid          = {I:(DE-Juel1)INM-6-20090406 / I:(DE-Juel1)IAS-6-20130828 /
                      I:(DE-Juel1)INM-10-20170113},
      pnm          = {5234 - Emerging NC Architectures (POF4-523) / HBP SGA2 -
                      Human Brain Project Specific Grant Agreement 2 (785907) /
                      HBP SGA3 - Human Brain Project Specific Grant Agreement 3
                      (945539) / DEEP-EST - DEEP - Extreme Scale Technologies
                      (754304) / ACA - Advanced Computing Architectures (SO-092) /
                      GRK 2416: MultiSenses-MultiScales: Novel approaches to
                      decipher neural processing in multisensory integration
                      (368482240) / Open-Access-Publikationskosten
                      Forschungszentrum Jülich (OAPKFZJ) (491111487) / PhD no
                      Grant - Doktorand ohne besondere Förderung
                      (PHD-NO-GRANT-20170405) / BTN-Peta - The Next-Generation
                      Integrated Simulation of Living Matter (BTN-Peta-2008-2012)
                      / Brain-Scale Simulations (jinb33\_20220812) / ATMLPP -
                      ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF4-5234 / G:(EU-Grant)785907 /
                      G:(EU-Grant)945539 / G:(EU-Grant)754304 / G:(DE-HGF)SO-092 /
                      G:(GEPRIS)368482240 / G:(GEPRIS)491111487 /
                      G:(DE-Juel1)PHD-NO-GRANT-20170405 /
                      G:(DE-Juel1)BTN-Peta-2008-2012 /
                      G:(DE-Juel1)jinb33\_20220812 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000857033800002},
      doi          = {10.1016/j.parco.2022.102952},
      url          = {https://juser.fz-juelich.de/record/908930},
}