% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@INPROCEEDINGS{Betancourt:875320,
author = {Betancourt, Clara and Schröder, Sabine and Hagemeier,
Björn and Schultz, Martin},
title = {{P}erformance analysis and optimization of a {TB}yte-scale
atmospheric observation database},
reportid = {FZJ-2020-01951},
year = {2020},
abstract = {<p>The Tropospheric Ozone Assessment Report (TOAR) created
one of the $world\’s$ largest databases for
near-surface air quality measurements. More than 150 users
from 35 countries have accessed TOAR data via a graphical
web interface (https://join.fz-juelich.de) or a REST API
(https://join.fz-juelich.de/services/rest/surfacedata/) and
downloaded station information and aggregated statistics of
ozone and associated variables. All statistics are
calculated online from the hourly data that are stored in
the database to allow for maximum user flexibility (it is
possible, for example, to specify the minimum data capture
criterion that shall be used in the aggregation). Thus, it
is of paramount importance to measure and, if necessary,
optimize the performance of the database and of the web
services, which are connected to it. In this work, two
aspects of the TOAR database service infrastructure are
investigated: Performance enhancements by database tuning
and the implementation of flux-based ozone metrics, which
$\–$ unlike the already existing concentration based
metrics $\–$ require meteorological data and embedded
modeling.</p><p>The TOAR database is a PostgreSQL V10
relational database hosted on a virtual machine, connected
to the JOIN web server. In the current set-up the web
services trigger SQL queries and the resulting raw data are
transferred on demand to the JOIN server and processed
locally to derive the requested statistical quantities. We
tested the following measures to increase the database
performance: optimal definition of indices, server-side
programming in PL/pgSQL and PL/Python, on-line aggregation
to avoid transfer of large data, and query enhancement by
the explain-analyze tool of PostgreSQL. Through a
combination of the above mentioned techniques, the
performance of JOIN can be improved in a range of 20 - 70
$\%.</p><p>Flux-based$ ozone metrics are necessary for an
accurate quantification of ozone damage on vegetation. In
contrast to the already available concentration based
metrics of ozone, they require the input of meteorological
and soil data, as well as a consistent parametrization of
vegetation growing seasons and the inclusion of a stomatal
flux model. Embedding this model with the TOAR database will
make a global assessment of stomatal ozone fluxes possible
for the first time ever. This requires new query patterns,
which need to merge several variables onto a consistent time
axis, as well as more elaborate calculations, which are
presently coded in FORTRAN.</p><p>The presentation will
present the results from the performance tuning and discuss
the pros and cons of various ways how the ozone flux
calculations can be implemented.</p>},
month = {May},
date = {2020-05-04},
organization = {EGU2020: Sharing Geoscience Online, ,
4 May 2020 - 8 May 2020},
cin = {JSC},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {512 - Data-Intensive Science and Federated Computing
(POF3-512) / IntelliAQ - Artificial Intelligence for Air
Quality (787576) / PhD no Grant - Doktorand ohne besondere
Förderung (PHD-NO-GRANT-20170405) / Earth System Data
Exploration (ESDE)},
pid = {G:(DE-HGF)POF3-512 / G:(EU-Grant)787576 /
G:(DE-Juel1)PHD-NO-GRANT-20170405 / G:(DE-Juel-1)ESDE},
typ = {PUB:(DE-HGF)1},
doi = {10.5194/egusphere-egu2020-13637},
url = {https://juser.fz-juelich.de/record/875320},
}