% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Pronold:906624,
      author       = {Pronold, Jari and Jordan, Jakob and Wylie, Brian J. N. and
                      Kitayama, Itaru and Diesmann, Markus and Kunkel, Susanne},
      title        = {{R}outing brain traffic through the von {N}eumann
                      bottleneck: {E}fficient cache usage in spiking neural
                      network simulation code on general purpose computers},
      publisher    = {arXiv},
      reportid     = {FZJ-2022-01560},
      year         = {2021},
      abstract     = {Simulation is a third pillar next to experiment and theory
                      in the study of complex dynamic systems such as biological
                      neural networks. Contemporary brain-scale networks
                      correspond to directed graphs of a few million nodes, each
                      with an in-degree and out-degree of several thousand
                      edges, where nodes and edges correspond to the fundamental
                      biological units, neurons and synapses, respectively. When
                      considering a random graph, each node's edges are
                      distributed across thousands of parallel processes. The
                      activity in neuronal networks is also sparse. Each neuron
                      occasionally transmits a brief signal, called a spike, via its
                      outgoing synapses to the corresponding target neurons. This
                      spatial and temporal sparsity represents an inherent
                      bottleneck for simulations on conventional computers:
                      Fundamentally irregular memory-access patterns cause poor
                      cache utilization. Using an established neuronal network
                      simulation code as a reference implementation, we
                      investigate how common techniques to recover cache
                      performance such as software-induced prefetching and
                      software pipelining can benefit a real-world application.
                      The algorithmic changes reduce simulation time by up to
                      $50\%$. The study exemplifies that many-core systems
                      assigned an intrinsically parallel computational
                      problem can overcome the von Neumann bottleneck of
                      conventional computer architectures.},
      keywords     = {Distributed, Parallel, and Cluster Computing (cs.DC)
                      (Other) / FOS: Computer and information sciences (Other)},
      cin          = {INM-6 / IAS-6 / INM-10},
      cid          = {I:(DE-Juel1)INM-6-20090406 / I:(DE-Juel1)IAS-6-20130828 /
                      I:(DE-Juel1)INM-10-20170113},
      pnm          = {5234 - Emerging NC Architectures (POF4-523) / HBP SGA2 -
                      Human Brain Project Specific Grant Agreement 2 (785907) /
                      HBP SGA3 - Human Brain Project Specific Grant Agreement 3
                      (945539) / DEEP-EST - DEEP - Extreme Scale Technologies
                      (754304) / ACA - Advanced Computing Architectures (SO-092) /
                      GRK 2416: MultiSenses-MultiScales: Novel approaches to
                      decipher neural processing in multisensory integration
                      (368482240) / ATMLPP - ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF4-5234 / G:(EU-Grant)785907 /
                      G:(EU-Grant)945539 / G:(EU-Grant)754304 / G:(DE-HGF)SO-092 /
                      G:(GEPRIS)368482240 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)25},
      doi          = {10.48550/ARXIV.2109.12855},
      url          = {https://juser.fz-juelich.de/record/906624},
}
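
% The abstract above names software-induced prefetching as one technique to
% mitigate the irregular memory-access patterns of spike delivery. The C++
% sketch below is NOT taken from the cited paper or from the NEST code base;
% the function and variable names (deliver_spike, targets, synapse_data) are
% illustrative assumptions. It only shows the general idea: while the synapse
% data for the current target index is processed, the data for the next index
% is prefetched with the GCC/Clang builtin __builtin_prefetch to hide part of
% the memory latency caused by the random order of target indices.
%
% #include <cstddef>
% #include <vector>
%
% // Deliver one spike to all synapses of a source neuron. The indices in
% // `targets` point into `synapse_data` in an essentially random order,
% // which is the irregular access pattern described in the abstract.
% void deliver_spike(const std::vector<std::size_t>& targets,
%                    std::vector<double>& synapse_data, double weight)
% {
%     for (std::size_t i = 0; i < targets.size(); ++i) {
%         if (i + 1 < targets.size()) {
%             // Hint the hardware to start loading the next target's data
%             // (second argument 1 = prefetch for write, third argument 1 =
%             // low-to-moderate temporal locality).
%             __builtin_prefetch(&synapse_data[targets[i + 1]], 1, 1);
%         }
%         synapse_data[targets[i]] += weight;  // actual spike delivery
%     }
% }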