% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@article{Pronold:906547,
      author       = {Pronold, Jari and Jordan, Jakob and Wylie, Brian J. N. and
                      Kitayama, Itaru and Diesmann, Markus and Kunkel, Susanne},
      title        = {Routing Brain Traffic Through the {Von Neumann}
                      Bottleneck: Parallel Sorting and Refactoring},
      journal      = {Frontiers in Neuroinformatics},
      volume       = {15},
      issn         = {1662-5196},
      address      = {Lausanne},
      publisher    = {Frontiers Research Foundation},
      reportid     = {FZJ-2022-01510},
      pages        = {785068},
      year         = {2022},
      abstract     = {Generic simulation code for spiking neuronal networks
                      spends the major part of the time in the phase where spikes
                      have arrived at a compute node and need to be delivered to
                      their target neurons. These spikes were emitted over the
                      last interval between communication steps by source neurons
                      distributed across many compute nodes and are inherently
                      irregular and unsorted with respect to their targets. For
                      finding those targets, the spikes need to be dispatched to a
                      three-dimensional data structure with decisions on target
                      thread and synapse type to be made on the way. With growing
                      network size, a compute node receives spikes from an
                      increasing number of different source neurons until in the
                      limit each synapse on the compute node has a unique source.
                      Here, we show analytically how this sparsity emerges over
                      the practically relevant range of network sizes from a
                      hundred thousand to a billion neurons. By profiling a
                      production code we investigate opportunities for algorithmic
                      changes to avoid indirections and branching. Every thread
                      hosts an equal share of the neurons on a compute node. In
                      the original algorithm, all threads search through all
                      spikes to pick out the relevant ones. With increasing
                      network size, the fraction of hits remains invariant but the
                      absolute number of rejections grows. Our new alternative
                      algorithm equally divides the spikes among the threads and
                      immediately sorts them in parallel according to target
                      thread and synapse type. After this, every thread completes
                      delivery solely of the section of spikes for its own
                      neurons. Independent of the number of threads, all spikes
                      are looked at only two times. The new algorithm halves the
                      number of instructions in spike delivery which leads to a
                      reduction of simulation time of up to 40\%. Thus, spike
                      delivery is a fully parallelizable process with a single
                      synchronization point and thereby well suited for many-core
                      systems. Our analysis indicates that further progress
                      requires a reduction of the latency that the instructions
                      experience in accessing memory. The study provides the
                      foundation for the exploration of methods of latency hiding
                      like software pipelining and software-induced prefetching.},
      cin          = {INM-6 / IAS-6 / INM-10},
      ddc          = {610},
      cid          = {I:(DE-Juel1)INM-6-20090406 / I:(DE-Juel1)IAS-6-20130828 /
                      I:(DE-Juel1)INM-10-20170113},
      pnm          = {5234 - Emerging NC Architectures (POF4-523) / HBP SGA2 -
                      Human Brain Project Specific Grant Agreement 2 (785907) /
                      HBP SGA3 - Human Brain Project Specific Grant Agreement 3
                      (945539) / DEEP-EST - DEEP - Extreme Scale Technologies
                      (754304) / ACA - Advanced Computing Architectures (SO-092) /
                      GRK 2416:  MultiSenses-MultiScales: Novel approaches to
                      decipher neural processing in multisensory integration
                      (368482240) / ATMLPP - ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF4-5234 / G:(EU-Grant)785907 /
                      G:(EU-Grant)945539 / G:(EU-Grant)754304 / G:(DE-HGF)SO-092 /
                      G:(GEPRIS)368482240 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {35300490},
      UT           = {WOS:000773679000001},
      doi          = {10.3389/fninf.2021.785068},
      url          = {https://juser.fz-juelich.de/record/906547},
}