% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Invited seminar talk by Edoardo Di Napoli (Columbia University, 5 Mar 2013),
% exported from the record given in the url field.
% - month uses the predefined "mar" macro so styles can localise/abbreviate it.
% - year/month are kept alongside the ISO date field so the entry works with
%   both classic BibTeX styles and biblatex/Biber.
% - Only the proper-noun words of the title are brace-protected (whole words).
% - reportid, subtyp, cin, cid, pnm, pid and typ are repository-specific fields
%   that standard styles ignore; they are preserved verbatim.
% NOTE(review): inproceedings entries normally require a booktitle field, which
% this record does not provide; the event is described in organization instead.
% Confirm whether a booktitle should be added or a different entry type used.
@inproceedings{DiNapoli:150536,
      author       = {Di Napoli, Edoardo},
      title        = {Improving the performance of applied science numerical
                      simulations: an application to {Density} {Functional}
                      {Theory}},
      reportid     = {FZJ-2014-00590},
      year         = {2013},
      abstract     = {In the early days of numerical simulations, advances were
                      based on the ingenuity of pioneer scientists writing codes
                      for relatively simple machines. Nowadays the investigation
                      of large physical systems requires scaling simulations up to
                      massively parallel computers whose optimal usage can often
                      be challenging. On the one hand the algorithmic structure of
                      many legacy codes can be a limiting factor to their
                      portability on large supercomputers. More importantly in
                      many cases algorithmic libraries are used as black boxes and
                      no information coming from the physics of the specific
                      application is exploited to improve the overall performance
                      of the simulation. What is needed is a more
                      interdisciplinary approach where the tools of scientific
                      computing and knowledge extracted from the specific
                      application are merged together in a new computational
                      paradigm. One of the most promising new paradigms borrows
                      from the "inverse problem" concept and, by reversing the
                      logical arrow going from mathematical modeling to numerical
                      simulations, extracts from the latter specific information
                      that can be used to modify the algorithm. The resulting
                      methodology, named "reverse simulation", produces an
                      algorithm variant specifically tailored to the scientific
                      application. Additionally such a variant can be optimally
                      implemented for multiple parallel computing architectures.
                      To demonstrate its applicability I will exemplify the
                      workings of reverse simulation on a computational method
                      widely used in the framework of Density Functional Theory
                      (DFT): the Full-potential Linearized Augmented Plane Wave
                      (FLAPW) method. FLAPW provides the means to solve a
                      high-dimensional quantum mechanical problem by representing
                      it as a non-linear generalized eigenvalue problem which is
                      solved self-consistently through a series of successive
                      outer-iteration cycles. By applying the principles of
                      reverse simulation it can be shown that eigenvectors of
                      successive eigenproblems become progressively more collinear
                      to each other as the outer-iteration index increases. This
                      result suggests that one could use eigenvectors, computed at
                      a certain outer-iteration, as approximate solutions to
                      improve the performance of the eigensolver at the next
                      iteration. In order to maximally exploit the approximate
                      solution, we developed a subspace iteration method augmented
                      with an optimized Chebyshev polynomial accelerator together
                      with an efficient locking mechanism (ChFSI). The resulting
                      eigensolver was implemented in C language and can be
                      parallelized for both shared and distributed architectures.
                      Numerical tests show that, when the eigensolver is
                      preconditioned with approximate solutions instead of random
                      vectors, it achieves up to a 5X speedup. Moreover ChFSI
                      takes great advantage of computational resources by
                      obtaining levels of efficiency up to $80\%$ of the
                      theoretical peak performance. In particular, by making
                      better use of massively parallel architectures, the
                      distributed memory version will allow users of the FLAPW
                      method to simulate larger physical systems than are
                      currently accessible.},
      month        = mar,
      date         = {2013-03-05},
      organization = {Seminar at Columbia University, New
                      York (United States), 5 Mar 2013},
      subtyp       = {Invited},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {411 - Computational Science and Mathematical Methods
                      (POF2-411) / Simulation and Data Laboratory Quantum
                      Materials (SDLQM) (SDLQM)},
      pid          = {G:(DE-HGF)POF2-411 / G:(DE-Juel1)SDLQM},
      typ          = {PUB:(DE-HGF)31},
      url          = {https://juser.fz-juelich.de/record/150536},
}