% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Pronold:906547,
author = {Pronold, Jari and Jordan, Jakob and Wylie, Brian J. N. and
Kitayama, Itaru and Diesmann, Markus and Kunkel, Susanne},
title = {{R}outing {B}rain {T}raffic {T}hrough the {V}on {N}eumann
{B}ottleneck: {P}arallel {S}orting and {R}efactoring},
journal = {Frontiers in Neuroinformatics},
volume = {15},
issn = {1662-5196},
address = {Lausanne},
publisher = {Frontiers Research Foundation},
reportid = {FZJ-2022-01510},
pages = {785068},
year = {2022},
abstract = {Generic simulation code for spiking neuronal networks
spends the major part of the time in the phase where spikes
have arrived at a compute node and need to be delivered to
their target neurons. These spikes were emitted over the
last interval between communication steps by source neurons
distributed across many compute nodes and are inherently
irregular and unsorted with respect to their targets. For
finding those targets, the spikes need to be dispatched to a
three-dimensional data structure with decisions on target
thread and synapse type to be made on the way. With growing
network size, a compute node receives spikes from an
increasing number of different source neurons until in the
limit each synapse on the compute node has a unique source.
Here, we show analytically how this sparsity emerges over
the practically relevant range of network sizes from a
hundred thousand to a billion neurons. By profiling a
production code we investigate opportunities for algorithmic
changes to avoid indirections and branching. Every thread
hosts an equal share of the neurons on a compute node. In
the original algorithm, all threads search through all
spikes to pick out the relevant ones. With increasing
network size, the fraction of hits remains invariant but the
absolute number of rejections grows. Our new alternative
algorithm equally divides the spikes among the threads and
immediately sorts them in parallel according to target
thread and synapse type. After this, every thread completes
delivery solely of the section of spikes for its own
neurons. Independent of the number of threads, all spikes
are looked at only two times. The new algorithm halves the
number of instructions in spike delivery which leads to a
reduction of simulation time of up to $40\,\%$. Thus, spike
delivery is a fully parallelizable process with a single
synchronization point and thereby well suited for many-core
systems. Our analysis indicates that further progress
requires a reduction of the latency that the instructions
experience in accessing memory. The study provides the
foundation for the exploration of methods of latency hiding
like software pipelining and software-induced prefetching.},
cin = {INM-6 / IAS-6 / INM-10},
ddc = {610},
cid = {I:(DE-Juel1)INM-6-20090406 / I:(DE-Juel1)IAS-6-20130828 /
I:(DE-Juel1)INM-10-20170113},
pnm = {5234 - Emerging NC Architectures (POF4-523) / HBP SGA2 -
Human Brain Project Specific Grant Agreement 2 (785907) /
HBP SGA3 - Human Brain Project Specific Grant Agreement 3
(945539) / DEEP-EST - DEEP - Extreme Scale Technologies
(754304) / ACA - Advanced Computing Architectures (SO-092) /
GRK 2416: MultiSenses-MultiScales: Novel approaches to
decipher neural processing in multisensory integration
(368482240) / ATMLPP - ATML Parallel Performance (ATMLPP)},
pid = {G:(DE-HGF)POF4-5234 / G:(EU-Grant)785907 /
G:(EU-Grant)945539 / G:(EU-Grant)754304 / G:(DE-HGF)SO-092 /
G:(GEPRIS)368482240 / G:(DE-Juel-1)ATMLPP},
typ = {PUB:(DE-HGF)16},
pubmed = {35300490},
UT = {WOS:000773679000001},
doi = {10.3389/fninf.2021.785068},
url = {https://juser.fz-juelich.de/record/906547},
}
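% The abstract above outlines a two-pass spike-delivery scheme: each thread
% first sorts an equal slice of the receive buffer in parallel, keyed on
% target thread and synapse type, and after a single synchronization point
% delivers only the contiguous section of spikes addressed to its own
% neurons. The following is a minimal C++/OpenMP sketch of that idea, kept
% as a comment so this file remains valid BibTeX. It is not the NEST
% implementation; all identifiers (SpikeEvent, deliver_to_local_targets,
% ByTargetThread, ...) are hypothetical.
%
%   // Minimal sketch of sort-then-deliver spike exchange (hypothetical,
%   // not NEST code). Compile with, e.g., g++ -std=c++11 -fopenmp.
%   #include <algorithm>
%   #include <cstddef>
%   #include <cstdlib>
%   #include <vector>
%   #include <omp.h>
%
%   struct SpikeEvent {
%     int target_thread; // thread hosting the target neuron
%     int synapse_type;  // index into the per-type connection structures
%     int source_gid;    // global id of the emitting neuron
%   };
%
%   // Comparator usable by std::equal_range to locate the section of a
%   // sorted slice that belongs to one target thread.
%   struct ByTargetThread {
%     bool operator()(const SpikeEvent& e, int t) const { return e.target_thread < t; }
%     bool operator()(int t, const SpikeEvent& e) const { return t < e.target_thread; }
%   };
%
%   // Hypothetical stand-in for dispatching one spike into the
%   // three-dimensional connection data structure.
%   void deliver_to_local_targets(const SpikeEvent&) {}
%
%   void deliver_spikes(std::vector<SpikeEvent>& buffer, int num_threads) {
%     // Slice boundaries: spikes are divided equally among the threads.
%     std::vector<std::size_t> begin(num_threads + 1);
%     for (int t = 0; t <= num_threads; ++t)
%       begin[t] = buffer.size() * t / num_threads;
%
%   #pragma omp parallel num_threads(num_threads)
%     {
%       const int tid = omp_get_thread_num();
%       // Pass 1: sort this thread's slice by (target thread, synapse type).
%       std::sort(buffer.begin() + begin[tid], buffer.begin() + begin[tid + 1],
%                 [](const SpikeEvent& a, const SpikeEvent& b) {
%                   if (a.target_thread != b.target_thread)
%                     return a.target_thread < b.target_thread;
%                   return a.synapse_type < b.synapse_type;
%                 });
%
%   #pragma omp barrier // the single synchronization point
%
%       // Pass 2: in every slice, deliver only the contiguous section of
%       // spikes addressed to this thread's neurons; no rejections occur.
%       for (int s = 0; s < num_threads; ++s) {
%         auto range = std::equal_range(buffer.begin() + begin[s],
%                                       buffer.begin() + begin[s + 1],
%                                       tid, ByTargetThread{});
%         for (auto it = range.first; it != range.second; ++it)
%           deliver_to_local_targets(*it);
%       }
%     }
%   }
%
%   int main() {
%     const int num_threads = 4;
%     std::vector<SpikeEvent> buffer(1000);
%     for (auto& ev : buffer) // fill with unsorted spikes, as after exchange
%       ev = { std::rand() % num_threads, std::rand() % 2, std::rand() % 100000 };
%     deliver_spikes(buffer, num_threads);
%   }
%
% Sorting makes each thread's spikes a contiguous run within every slice, so
% pass 2 touches only hits; together with pass 1 this is why the abstract can
% state that every spike is looked at only two times.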