# Thomas Viehmann tv@lernapparat.de
# Dieser Code und die Verfahren werden in https://lernapparat.de/rki-nowcasting/ dokumentiert.
# Bitte beachten Sie die Hinweise dort zu den Bedingungen der Nutzung dieses Codes und der Berechnungsresultate.
import pandas
if 'get_ipython' in dir():
    # Running under IPython/Jupyter: remember that and enable inline plots.
    # The magic is invoked programmatically ('%matplotlib inline' would be a
    # syntax error when this file is executed as a plain Python script).
    INTERACTIVE = True
    get_ipython().run_line_magic('matplotlib', 'inline')
else:
    INTERACTIVE = False
from matplotlib import pyplot
import numpy
import datetime
import math
import matplotlib
import scipy.stats
#import seaborn
import pathlib
#seaborn.set()
import os
import shutil
import itertools
import ot
import tqdm
import time
def reverse_cumprod(x):
    """Cumulative product taken from the end of *x* toward the start."""
    flipped = x[::-1]
    return flipped.cumprod()[::-1]
def reverse_cumsum(x):
    """Cumulative sum taken from the end of *x* toward the start."""
    flipped = x[::-1]
    return flipped.cumsum()[::-1]
# Register converters so matplotlib can plot pandas datetime/period values
# directly (silences the deprecation warning about implicit registration).
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
def smooth(s, size=4):
    """Trailing moving average of width *size* over the Series *s*.

    The first ``size - 1`` entries are partial sums over the implicit zero
    padding (i.e. scaled down), matching the original behavior; index and
    name of *s* are preserved.

    Fixes: the original sliced ``[:-(size-1)]`` which for ``size == 1``
    becomes ``[:0]`` and raised a length-mismatch ValueError.
    """
    kernel = numpy.full((size,), 1 / size)
    conv = numpy.convolve(s, kernel)
    if size > 1:
        # drop the trailing ramp so the result has the same length as s
        conv = conv[:-(size - 1)]
    return pandas.Series(conv, index=s.index, name=s.name)
# Two-letter abbreviations for the German federal states, plus "D" for the
# country as a whole, keyed by full name.
bl_abbr = {
    "Deutschland": "D",
    "Baden-Württemberg": "BW",
    "Bayern": "BY",
    "Berlin": "BE",
    "Brandenburg": "BB",
    "Bremen": "HB",
    "Hamburg": "HH",
    "Hessen": "HE",
    "Mecklenburg-Vorpommern": "MV",
    "Niedersachsen": "NI",
    "Nordrhein-Westfalen": "NW",
    "Rheinland-Pfalz": "RP",
    "Saarland": "SL",
    "Sachsen": "SN",
    "Sachsen-Anhalt": "ST",
    "Schleswig-Holstein": "SH",
    "Thüringen": "TH",
}
# Directory that generated outputs are written below.
p_output = pathlib.Path('./outputs/')
def load_df(date, augmented=0):
    """Load one day's RKI Covid-19 dump from data/ and normalize it.

    Parameters:
        date: pandas Timestamp, the Datenstand (data-status day) to load;
            asserted to equal the Datenstand of every row in the file.
        augmented: file variant to read:
            0 = unaugmented raw dump,
            1 = with many ue_* columns,
            2 = with fractional cases (Uebermittlungsdatum format).

    Returns the normalized DataFrame (missing Refdatum/IstErkrankungsbeginn
    filled in, date columns parsed, counts as float, known district-naming
    glitches of mid-April 2020 repaired, and ue_* columns initialized when
    the frame does not carry transmission-date information yet).
    """
    # 0 = unaugmented
    # 1 = with many columns
    # 2 = with fractional cases
    dstr = date.date().isoformat()
    if augmented == 0:
        fn = f'data/rki_covid19.{dstr}.csv'
    elif augmented == 1:
        fn = f'data/rki_covid19.viele_daten.{dstr}.csv'
    elif augmented == 2:
        fn = f'data/rki_covid19.uedatum.{dstr}.csv'
    else:
        raise Exception("invalid value for augmented", augmented)
    adf = pandas.read_csv(fn)
    # Datenstand comes as 'DD.MM.YYYY, ...'; rebuild it as ISO and parse.
    adf['Datenstand'] = pandas.to_datetime(adf.Datenstand.apply(lambda s: '-'.join(s.split(',')[0].split('.')[::-1])))
    assert (date==adf['Datenstand'].min()) and (date==adf['Datenstand'].max())
    # Missing Refdatum: substitute the Meldedatum.
    adf.loc[adf.Refdatum.isna(), 'Refdatum'] = adf.loc[adf.Refdatum.isna(), 'Meldedatum']
    # Missing IstErkrankungsbeginn: infer it from Refdatum != Meldedatum.
    adf.loc[adf.IstErkrankungsbeginn.isna(), 'IstErkrankungsbeginn'] = (
        adf.loc[adf.IstErkrankungsbeginn.isna(), 'Refdatum'] != adf.loc[adf.IstErkrankungsbeginn.isna(), 'Meldedatum']
    ).astype(int) # misses Melde=Ref, but we cannot do better
    adf['AnzahlGenesen'] = adf.AnzahlGenesen.fillna(0)
    adf['NeuGenesen'] = adf.NeuGenesen.fillna(-9).astype(int)
    # Some dumps call the id column ObjectId instead of FID.
    if 'ObjectId' in adf.columns:
        adf.rename(columns={'ObjectId': 'FID'}, inplace=True)
    for k in ('Meldedatum', 'Datenstand', 'Refdatum', 'Uebermittlungsdatum'):
        if k in adf.columns:
            adf[k] = pandas.to_datetime(adf[k])
    for k in ('AnzahlFall', 'AnzahlTodesfall', 'AnzahlGenesen'):
        adf[k] = adf[k].astype(float)
    # Repair the LK Göttingen renaming/renumbering glitch of 2020-04-13..15.
    if adf.Datenstand.max()==pandas.to_datetime('2020-04-14') or adf.Datenstand.max()==pandas.to_datetime('2020-04-13'): # fix
        adf.loc[adf.Landkreis == 'LK Göttingen (alt)', 'Landkreis'] = 'LK Göttingen'
    if adf.Datenstand.max()>=pandas.to_datetime('2020-04-13') and adf.Datenstand.max()<=pandas.to_datetime('2020-04-15'): # fix
        adf.loc[adf.Landkreis == 'LK Göttingen', 'IdLandkreis'] = 3159
    # Data published with Datenstand X was transmitted up to the day before.
    uebermittlungsdatum = adf.Datenstand.max() - pandas.Timedelta(days=1)
    if 'ue_unbekannt' not in adf.columns and 'Uebermittlungsdatum' not in adf.columns:
        # Bootstrap the ue_* columns: everything not new has an unknown
        # transmission date; new records were transmitted yesterday.
        adf['ue_unbekannt'] = (adf.NeuerFall < 1) * adf.AnzahlFall.clip(lower=0)
        adf['ue_'+uebermittlungsdatum.date().isoformat().replace('-', '_')] = (adf.NeuerFall == 1) * adf.AnzahlFall
    if 'Uebermittlungsdatum' not in adf.columns:
        # Make sure one ue_* column exists per day up to the transmission day.
        for d in pandas.date_range('01-01-2020', uebermittlungsdatum, freq='D'):
            c = 'ue_'+d.date().isoformat().replace('-', '_')
            if c not in adf.columns:
                adf[c] = 0 # we don't want the substitute here: (adf.Meldedatum == d) * adf.AnzahlFall
    return adf
# Distance function between grouping keys -- it should remain a metric!
bl_map = None
def key_dist(k1, k2, gr=None):
    """Ground cost between two case keys for the optimal-transport matching.

    Keys are tuples (Landkreis, Meldedatum, Altersgruppe, Geschlecht,
    IstErkrankungsbeginn, Refdatum).  On the first call with two different
    Landkreise, *gr* must be a frame (indexed by Landkreis among others)
    whose 'Bundesland' values are cached into the module-level `bl_map`.
    """
    global bl_map

    def unknown_aware(a, b):
        # Categorical mismatch cost: 'unbekannt' is halfway to everything.
        return 1 if a == 'unbekannt' or b == 'unbekannt' else 2

    dist = 0
    lk1, lk2 = k1[0], k2[0]
    if lk1 != lk2:  # Landkreis
        if bl_map is None:
            bl_map = gr["Bundesland"].groupby(["Landkreis"]).first()
        # A different federal state is much farther than a sibling district.
        dist += 30 if bl_map[lk1] != bl_map[lk2] else 5
    if k1[1] != k2[1]:  # Meldedatum
        dist += 8
    if k1[2] != k2[2]:  # age band; maybe make neighbours closer? --> must stay a metric though
        dist += unknown_aware(k1[2], k2[2])
    if k1[3] != k2[3]:  # sex
        dist += unknown_aware(k1[3], k2[3])
    if k1[4] != k2[4]:  # IstErkrankungsbeginn
        dist += 0.5
    elif k1[4] == 1 and k1[5] != k2[5]:  # have disease onset, different Refdatum
        # include the date difference? would have to stay a metric together
        # with IstErkrankungsbeginn == 0 ...
        dist += 1
    return dist
def save_df(df):
    """Write an augmented frame to data/.

    The file name is derived from the frame's format (wide ue_* columns vs.
    long Uebermittlungsdatum format) and its Datenstand day; raises for
    frames in neither format.
    """
    dstr = df.Datenstand.max().date().isoformat()
    if 'ue_unbekannt' in df.columns:
        fn = f'data/rki_covid19.viele_daten.{dstr}.csv'
    elif 'Uebermittlungsdatum' in df.columns:
        fn = f'data/rki_covid19.uedatum.{dstr}.csv'
    else:
        raise Exception("Unbekanntes Format")
    df.to_csv(fn, index=False)
def group_df(df):
    """Aggregate a raw per-record RKI frame into one row per case key.

    AnzahlFall is split into state-transition columns based on the flag
    triple (NeuerFall, NeuerTodesfall, NeuGenesen), where 'a' denotes active
    (neither dead nor recovered), 'g' recovered, 't' dead, '_neu' newly
    reported, '_gel' deleted.  The result is summed per detailed key and
    finally indexed by (Landkreis, Meldedatum, Altersgruppe, Geschlecht,
    IstErkrankungsbeginn, Refdatum); an assert guarantees that this shorter
    key is just as unique, so nothing is merged away.
    """
    df = df.copy()
    # Keep the signed original counts, split into positive/negative parts.
    for what in ('Fall', 'Todesfall', 'Genesen'):
        df['Anzahl' + what + 'OrigPos'] = df['Anzahl' + what].clip(lower=0)
    for what in ('Fall', 'Todesfall', 'Genesen'):
        df['Anzahl' + what + 'OrigNeg'] = df['Anzahl' + what].clip(upper=0)
    # name -> (sign, NeuerFall, NeuerTodesfall, NeuGenesen); -9 = "no flag".
    # Order matters: it fixes the column order of the returned frame.
    transitions = [
        ('a_neu',   1,  1, -9, -9),
        ('g_neu',   1,  1, -9,  1),
        ('t_neu',   1,  1,  1, -9),
        ('a_gel',  -1, -1, -9, -9),
        ('g_gel',  -1, -1, -9, -1),
        ('t_gel',  -1, -1, -1, -9),
        ('a_to_g',  1,  0, -9,  1),
        ('a_to_t',  1,  0,  1, -9),
        ('g_to_a',  1,  0, -9, -1),
        ('g_to_t',  1,  0,  1, -1),
        ('t_to_a',  1,  0, -1, -9),
        ('t_to_g',  1,  0, -1,  1),
        ('a_to_a',  1,  0, -9, -9),
        ('t_to_t',  1,  0,  0, -9),
        ('g_to_g',  1,  0, -9,  0),
    ]
    for name, sign, nf, ntf, ng in transitions:
        mask = (df.NeuerFall == nf) & (df.NeuerTodesfall == ntf) & (df.NeuGenesen == ng)
        df[name] = sign * df.AnzahlFall * mask
    # Per-state totals: today's count and the previous publication's count.
    df['a'] = df.a_to_a + df.g_to_a + df.t_to_a + df.a_neu
    df['a_prev'] = df.a_to_a + df.a_to_g + df.a_to_t + df.a_gel
    df['g'] = df.a_to_g + df.g_to_g + df.t_to_g + df.g_neu
    df['g_prev'] = df.g_to_a + df.g_to_g + df.g_to_t + df.g_gel
    df['t'] = df.a_to_t + df.g_to_t + df.t_to_t + df.t_neu
    df['t_prev'] = df.t_to_a + df.t_to_g + df.t_to_t + df.t_gel
    # The flags and raw counts are fully encoded above; drop them.
    df = df.drop(['Altersgruppe2', 'NeuerFall', 'NeuGenesen', 'NeuerTodesfall',
                  'AnzahlFall', 'AnzahlTodesfall', 'AnzahlGenesen'], axis=1)
    df_grouped = df.groupby(['Datenstand', 'IdBundesland', 'IdLandkreis', 'Altersgruppe', 'Geschlecht',
                             'Landkreis', 'Meldedatum', 'IstErkrankungsbeginn', 'Refdatum', 'Bundesland']).sum().reset_index()
    df_group = df_grouped.groupby(['Landkreis', 'Meldedatum', 'Altersgruppe', 'Geschlecht',
                                   'IstErkrankungsbeginn', 'Refdatum']).first()
    print(len(df_grouped), len(df_group))
    assert len(df_grouped) == len(df_group) # we don't want to lose things here
    return df_group
eps = 1e-6  # tolerance when comparing sums of (possibly fractional) case counts
def do_matching(bdf_group, adf_group):
    """Match the grouped case counts of two consecutive days.

    As called from augment_df, `bdf_group` is the OLD day's grouped frame
    (its a/g/t columns are matched) and `adf_group` the NEW day's frame
    restricted to previously known records (its a_prev/g_prev/t_prev columns
    are matched); both come from group_df.

    Matching runs in two passes:
      1. identical keys ('idmatch_*' columns),
      2. a fallback that clears the disease-onset part of the key, i.e.
         IstErkrankungsbeginn -> 0.0 and Refdatum -> Meldedatum
         ('delmatch_*' columns).
    Whatever is left stays in the 'unmatched_*' columns for the caller to
    resolve (augment_df uses optimal transport for that).

    Returns the pair (bdf_group, adf_group), both reindexed to the union of
    the two key sets and augmented with the bookkeeping columns above.
    Asserts that the totals of both sides agree to within `eps`.
    """
    # Align both frames on the union of their keys so rows correspond 1:1.
    idx = adf_group.index.union(bdf_group.index)
    adf_group = adf_group.reindex(idx, fill_value=0)
    bdf_group = bdf_group.reindex(idx, fill_value=0)
    def smin(a, b):
        # Elementwise minimum of two index-aligned Series.
        return pandas.concat((a, b), axis=1).min(axis=1)
    # Running totals of what still needs a partner; modified in place below.
    adf_group['unmatched_a_prev'] = adf_group.a_prev.copy() # .copy needed? we will in-place modify
    bdf_group['unmatched_a_next'] = bdf_group.a.copy()
    adf_group['unmatched_t_prev'] = adf_group.t_prev.copy()
    bdf_group['unmatched_t_next'] = bdf_group.t.copy()
    adf_group['unmatched_g_prev'] = adf_group.g_prev.copy()
    bdf_group['unmatched_g_next'] = bdf_group.g.copy()
    print("consistency", adf_group.unmatched_a_prev.sum(), bdf_group.unmatched_a_next.sum(),
          adf_group.unmatched_t_prev.sum(), bdf_group.unmatched_t_next.sum(),
          adf_group.unmatched_g_prev.sum(), bdf_group.unmatched_g_next.sum()
          )
    # Day-over-day totals must agree, otherwise a full matching cannot exist.
    assert numpy.abs(adf_group.unmatched_a_prev.sum() - bdf_group.unmatched_a_next.sum()) < eps
    assert numpy.abs(adf_group.unmatched_t_prev.sum() - bdf_group.unmatched_t_next.sum()) < eps
    assert numpy.abs(adf_group.unmatched_g_prev.sum() - bdf_group.unmatched_g_next.sum()) < eps
    # Pass 1: match as much as possible under identical keys.
    matching_a = smin(adf_group.unmatched_a_prev, bdf_group.unmatched_a_next)
    matching_t = smin(adf_group.unmatched_t_prev, bdf_group.unmatched_t_next)
    matching_g = smin(adf_group.unmatched_g_prev, bdf_group.unmatched_g_next)
    adf_group['idmatch_a_prev'] = matching_a
    adf_group['idmatch_t_prev'] = matching_t
    adf_group['idmatch_g_prev'] = matching_g
    bdf_group['idmatch_a_next'] = matching_a
    bdf_group['idmatch_t_next'] = matching_t
    bdf_group['idmatch_g_next'] = matching_g
    adf_group['delmatch_a_prev'] = 0
    adf_group['delmatch_t_prev'] = 0
    adf_group['delmatch_g_prev'] = 0
    bdf_group['delmatch_a_next'] = 0
    bdf_group['delmatch_t_next'] = 0
    bdf_group['delmatch_g_next'] = 0
    adf_group.loc[:, 'unmatched_a_prev'] -= matching_a
    bdf_group.loc[:, 'unmatched_a_next'] -= matching_a
    adf_group.loc[:, 'unmatched_t_prev'] -= matching_t
    bdf_group.loc[:, 'unmatched_t_next'] -= matching_t
    adf_group.loc[:, 'unmatched_g_prev'] -= matching_g
    bdf_group.loc[:, 'unmatched_g_next'] -= matching_g
    remaining_adf = adf_group[ (adf_group.unmatched_a_prev != 0)
                              | (adf_group.unmatched_t_prev != 0)
                              | (adf_group.unmatched_g_prev != 0)]
    remaining_bdf = bdf_group[ (bdf_group.unmatched_a_next != 0)
                              | (bdf_group.unmatched_t_next != 0)
                              | (bdf_group.unmatched_g_next != 0)]
    assert numpy.abs(remaining_adf.unmatched_a_prev.sum() - remaining_bdf.unmatched_a_next.sum()) < eps
    assert numpy.abs(remaining_adf.unmatched_t_prev.sum() - remaining_bdf.unmatched_t_next.sum()) < eps
    assert numpy.abs(remaining_adf.unmatched_g_prev.sum() - remaining_bdf.unmatched_g_next.sum()) < eps
    print(len(remaining_adf), len(remaining_bdf),
          remaining_adf.unmatched_a_prev.sum(),
          remaining_bdf.unmatched_a_next.sum(),
          remaining_adf.unmatched_t_prev.sum(),
          remaining_bdf.unmatched_t_next.sum(),
          remaining_adf.unmatched_g_prev.sum(),
          remaining_bdf.unmatched_g_next.sum(),)
    # Pass 2: newly recorded disease onset date -- retry the leftovers with
    # the onset information cleared from the key.
    it = list(remaining_adf.index)
    if len(it) > 1000:
        it = tqdm.tqdm(it)  # progress bar only for large leftovers
    for key in it:
        ar = adf_group.loc[key]
        key2 = key[:4]+(0.0,)+key[1:2] # fallback: without disease onset (Refdatum := Meldedatum)
        try:
            br = bdf_group.loc[key2]
        except KeyError as _:
            br = None
        if br is not None:
            matching_a = min(ar.unmatched_a_prev, br.unmatched_a_next)
            matching_t = min(ar.unmatched_t_prev, br.unmatched_t_next)
            matching_g = min(ar.unmatched_g_prev, br.unmatched_g_next)
            adf_group.loc[key, 'delmatch_a_prev'] = matching_a
            adf_group.loc[key, 'delmatch_t_prev'] = matching_t
            adf_group.loc[key, 'delmatch_g_prev'] = matching_g
            bdf_group.loc[key2, 'delmatch_a_next'] = matching_a
            bdf_group.loc[key2, 'delmatch_t_next'] = matching_t
            bdf_group.loc[key2, 'delmatch_g_next'] = matching_g
            adf_group.loc[key, 'unmatched_a_prev'] -= matching_a
            bdf_group.loc[key2, 'unmatched_a_next'] -= matching_a
            adf_group.loc[key, 'unmatched_t_prev'] -= matching_t
            bdf_group.loc[key2, 'unmatched_t_next'] -= matching_t
            adf_group.loc[key, 'unmatched_g_prev'] -= matching_g
            bdf_group.loc[key2, 'unmatched_g_next'] -= matching_g
    # Recompute leftovers and re-check consistency after pass 2.
    remaining_adf = adf_group[ (adf_group.unmatched_a_prev != 0)
                              | (adf_group.unmatched_t_prev != 0)
                              | (adf_group.unmatched_g_prev != 0)]
    remaining_bdf = bdf_group[ (bdf_group.unmatched_a_next != 0)
                              | (bdf_group.unmatched_t_next != 0)
                              | (bdf_group.unmatched_g_next != 0)]
    assert numpy.abs(remaining_adf.unmatched_a_prev.sum() - remaining_bdf.unmatched_a_next.sum()) < eps
    assert numpy.abs(remaining_adf.unmatched_t_prev.sum() - remaining_bdf.unmatched_t_next.sum()) < eps
    assert numpy.abs(remaining_adf.unmatched_g_prev.sum() - remaining_bdf.unmatched_g_next.sum()) < eps
    print(len(remaining_adf), len(remaining_bdf),
          remaining_adf.unmatched_a_prev.sum(),
          remaining_bdf.unmatched_a_next.sum(),
          remaining_adf.unmatched_t_prev.sum(),
          remaining_bdf.unmatched_t_next.sum(),
          remaining_adf.unmatched_g_prev.sum(),
          remaining_bdf.unmatched_g_next.sum(),)
    return bdf_group, adf_group
# Maps each transition column name back to the RKI flag triple
# (NeuerFall, NeuerTodesfall, NeuGenesen) plus the sign factors applied to
# (AnzahlFall, AnzahlTodesfall, AnzahlGenesen) when reconstructing records.
# NOTE(review): under the (FactorAF, FATF, FAG) ordering that g_neu/t_neu
# and the *_to_* entries follow, the last two factors of 'g_gel' and 't_gel'
# look swapped (g_gel carries FATF=-1 instead of FAG=-1 and vice versa) --
# values kept as-is, verify against the consumer of this table.
map_to_neuerkey = {
    #          NeuerF  NTF   NG  FacAF FATF  FAG
    'a_neu':  (   1,   -9,  -9,    1,    0,    0),
    'g_neu':  (   1,   -9,   1,    1,    0,    1),
    't_neu':  (   1,    1,  -9,    1,    1,    0),
    'a_gel':  (  -1,   -9,  -9,   -1,    0,    0),
    'g_gel':  (  -1,   -9,  -1,   -1,   -1,    0),
    't_gel':  (  -1,   -1,  -9,   -1,    0,   -1),
    'a_to_a': (   0,   -9,  -9,    1,    0,    0),
    'a_to_g': (   0,   -9,   1,    1,    0,    1),
    'a_to_t': (   0,    1,  -9,    1,    1,    0),
    't_to_a': (   0,   -1,  -9,    1,   -1,    0),
    't_to_g': (   0,   -1,   1,    1,   -1,    1),
    't_to_t': (   0,    0,  -9,    1,    1,    0),
    'g_to_a': (   0,   -9,  -1,    1,    0,   -1),
    'g_to_g': (   0,   -9,   0,    1,    0,    1),
    'g_to_t': (   0,    1,  -1,    1,    1,   -1),
}
if 0:
    # Disabled interactive sanity check (depends on `old_dfs` assigned during
    # the script run below): verifies that within each
    # (NeuerFall, NeuerTodesfall, NeuGenesen) flag group all Anzahl* values
    # share one sign, i.e. abs() of the group sums equals the sums of abs().
    assert (old_dfs.groupby(["NeuerFall", "NeuerTodesfall", "NeuGenesen"])
            [["AnzahlFall", "AnzahlTodesfall", "AnzahlGenesen"]]
            .sum().abs() ==
            old_dfs[["AnzahlFall", "AnzahlTodesfall", "AnzahlGenesen", "NeuerFall", "NeuerTodesfall", "NeuGenesen"]]
            .apply(lambda x: x.abs() if x.name.startswith('Anzahl') else x, axis=0)
            .groupby(["NeuerFall", "NeuerTodesfall", "NeuGenesen"]).sum()).all().all()
    # Interactive inspection of the total absolute case flow per flag group.
    old_dfs.groupby(["NeuerFall", "NeuerTodesfall", "NeuGenesen"])[["AnzahlFall"]].sum().abs().sum()
def augment_df(old_df, new_df):
    """Carry the ue_* (transmission-date) columns from old_df onto new_df.

    The part of new_df that was already known the day before
    (NeuerFall <= 0) is matched against old_df via do_matching; old_df's
    ue_* counts are then distributed proportionally onto the matched rows.
    Leftovers that neither identical-key nor onset-cleared matching could
    pair are resolved by optimal transport (ot.emd) with key_dist as ground
    cost.  Returns new_df with populated ue_* columns; an internal assert
    checks that each row's ue_* values sum to its total case count.
    """
    new_old_df = new_df[new_df.NeuerFall <= 0]  # records already present yesterday
    old_gr = group_df(old_df)
    new_gr = group_df(new_old_df)
    old_match, new_match = do_matching(old_gr, new_gr)
    ue_columns = [c for c in old_gr.columns if c.startswith('ue_')]
    old_ue_sum = old_match.loc[:, ue_columns].sum(1)
    # Identically-keyed matches: distribute the old row's ue_* counts in
    # proportion to the matched amount, rescaled from prev-state to
    # current-state counts (the "- *_gel" terms add back deletions).
    for c in ue_columns:
        new_match[c] = ((new_match.idmatch_a_prev + new_match.idmatch_g_prev + new_match.idmatch_t_prev)
                        * ((new_match.a_prev + new_match.g_prev + new_match.t_prev
                            - new_match.a_gel - new_match.g_gel - new_match.t_gel)
                           / (new_match.a_prev + new_match.g_prev + new_match.t_prev).clip(lower=1e-9))
                        * old_match[c] / old_ue_sum.clip(lower=1e-9))
    # Onset-fallback matches ('delmatch'): same distribution, but the old
    # partner row is addressed by the onset-cleared key.
    delmatches = new_match[(new_match.delmatch_a_prev > 0) | (new_match.delmatch_g_prev > 0) | (new_match.delmatch_t_prev > 0)]
    # better with join...
    for key, ar in delmatches.iterrows():
        bkey = key[:4]+(0.0,)+key[1:2] # fallback: key without disease onset
        br = old_match.loc[bkey]
        new_match.loc[key, ue_columns] += ((ar.delmatch_a_prev + ar.delmatch_g_prev + ar.delmatch_t_prev)
                                           * (ar.a_prev + ar.g_prev + ar.t_prev - ar.a_gel - ar.g_gel - ar.t_gel)
                                           / max(ar.a_prev + ar.g_prev + ar.t_prev, 1e-9)
                                           * br[ue_columns] / max(old_ue_sum.loc[bkey], 1e-9))
    # Kept for interactive debugging after a run (unused in this function).
    old_dfs = old_df
    new_dfs = new_df.copy()
    adf_group = new_match
    bdf_group = old_match
    # Whatever the two matching passes could not pair...
    remaining_adf = adf_group[ (adf_group.unmatched_a_prev != 0)
                              | (adf_group.unmatched_t_prev != 0)
                              | (adf_group.unmatched_g_prev != 0)]
    remaining_bdf = bdf_group[ (bdf_group.unmatched_a_next != 0)
                              | (bdf_group.unmatched_t_next != 0)
                              | (bdf_group.unmatched_g_next != 0)]
    # ...is paired by optimal transport with pairwise key distances as cost.
    dists = numpy.zeros((len(remaining_adf), len(remaining_bdf)))
    for i, k1 in enumerate(remaining_adf.index):
        for j, k2 in enumerate(remaining_bdf.index):
            dists[i, j] = key_dist(k1, k2, adf_group)
    # One transport plan per state (active / recovered / dead); ot.emd works
    # on probability vectors, so normalize in and rescale out.
    if remaining_adf.unmatched_a_prev.sum() > 1e-9:
        p1_a = remaining_adf.unmatched_a_prev.values/remaining_adf.unmatched_a_prev.sum()
        p2_a = remaining_bdf.unmatched_a_next.values/remaining_bdf.unmatched_a_next.sum()
        mapping_a = ot.emd(p1_a, p2_a, dists)
        mapping_a *= remaining_adf.unmatched_a_prev.sum()
    else:
        mapping_a = numpy.zeros_like(dists)
        p1_a = mapping_a[:, 0]
        p2_a = mapping_a[0, :]
    if remaining_adf.unmatched_g_prev.sum() > 1e-9:
        p1_g = remaining_adf.unmatched_g_prev.values/remaining_adf.unmatched_g_prev.sum()
        p2_g = remaining_bdf.unmatched_g_next.values/remaining_bdf.unmatched_g_next.sum()
        mapping_g = ot.emd(p1_g, p2_g, dists)
        mapping_g *= remaining_adf.unmatched_g_prev.sum()
    else:
        mapping_g = numpy.zeros_like(dists)
        p1_g = mapping_g[:, 0]
        p2_g = mapping_g[0, :]
    if remaining_adf.unmatched_t_prev.sum() > 1e-9:
        p1_t = remaining_adf.unmatched_t_prev.values/remaining_adf.unmatched_t_prev.sum()
        p2_t = remaining_bdf.unmatched_t_next.values/remaining_bdf.unmatched_t_next.sum()
        mapping_t = ot.emd(p1_t, p2_t, dists)
        mapping_t *= remaining_adf.unmatched_t_prev.sum()
    else:
        mapping_t = numpy.zeros_like(dists)
        p1_t = mapping_t[:, 0]
        p2_t = mapping_t[0, :]
    mapping = mapping_a + mapping_g + mapping_t
    for idx, (key, ar) in enumerate(remaining_adf.iterrows()):
        for bidx, (bkey, br) in enumerate(remaining_bdf.iterrows()):
            # as the mapping is sparse, it is somewhat wasteful, we could just pick nonzeros...
            if mapping[idx, bidx] > 1e-9:
                ### really we should compare the Meldedatum here!!!
                new_match.loc[key, ue_columns] += (mapping[idx, bidx]
                                                  * (ar.a_prev + ar.g_prev + ar.t_prev - ar.a_gel - ar.g_gel - ar.t_gel)
                                                  / max(ar.a_prev + ar.g_prev + ar.t_prev, 1e-9)
                                                  * br[ue_columns] / max(old_ue_sum.loc[bkey], 1e-9))
    # Every row's ue_* columns must now account for its full a+g+t count.
    broken = new_match.index[(new_match[ue_columns].sum(axis=1) - (new_match.a + new_match.g + new_match.t)).abs()>1e-9]
    assert len(broken) == 0
    # Transfer the grouped ue_* values back onto the ungrouped records and
    # rescale them to each record's own (positive) case count.
    new_old_df_with_uevz = new_old_df.drop(columns=ue_columns).join(new_match[ue_columns], new_match.index.names)
    factor = new_old_df_with_uevz.AnzahlFall.clip(lower=0) / new_old_df_with_uevz[ue_columns].sum(axis=1).clip(lower=1e-9)
    for c in ue_columns:
        new_old_df_with_uevz[c] *= factor
    # Genuinely new records (NeuerFall == 1) are appended unchanged.
    new_new_df_with_uevz = new_df[new_df.NeuerFall == 1].copy()
    new_complete_with_uevz = pandas.concat((new_old_df_with_uevz, new_new_df_with_uevz), axis=0, sort=True)
    print(new_complete_with_uevz[new_complete_with_uevz.NeuerFall>=0].AnzahlFall.sum(), new_complete_with_uevz[new_complete_with_uevz.NeuerFall>=0][ue_columns].sum().sum())
    return new_complete_with_uevz
def ergaenze_uedatum(df):
    """Expand the wide ue_* columns into one row per transmission date.

    For each 'ue_<date>' column (date encoded as YYYY_MM_DD), the rows with
    a positive share in that column are duplicated with 'Uebermittlungsdatum'
    set to that date and 'IstUebermittlung' = 1; for columns without a
    parseable date (e.g. 'ue_unbekannt') the Meldedatum is used and
    'IstUebermittlung' = 0.  The Anzahl* counts are scaled to the column's
    share of AnzahlFall, a 'delay' column (days from Refdatum to
    Uebermittlungsdatum) is added, and the ue_* columns are dropped.

    Fixes: the original tested ``c.startswith('ue_2020')``, so any column for
    a date outside 2020 would silently have been treated as date-less; the
    date is now parsed from the column name for any year.
    """
    detailed_df = []
    for c in df.columns:
        if not c.startswith('ue_'):
            continue
        # Only rows that actually carry a share of this transmission date.
        part = df[(df.NeuerFall >= 0) & (df[c] > 0)].copy()
        # Parse the date out of the column name; non-date columns such as
        # 'ue_unbekannt' fall back to the Meldedatum.
        try:
            ue_date = pandas.to_datetime(c[3:].replace('_', '-'))
        except (ValueError, TypeError):
            ue_date = None
        if ue_date is not None:
            part['Uebermittlungsdatum'] = ue_date
            part['IstUebermittlung'] = 1
        else:
            part['Uebermittlungsdatum'] = part.Meldedatum
            part['IstUebermittlung'] = 0
        # Scale every count to this column's share of the case total.
        factor = df[c] / df.AnzahlFall
        part['AnzahlFall'] *= factor
        part['AnzahlTodesfall'] *= factor
        part['AnzahlGenesen'] *= factor
        detailed_df.append(part)
    detailed_df = pandas.concat(detailed_df)
    detailed_df['delay'] = (detailed_df.Uebermittlungsdatum - detailed_df.Refdatum).dt.days
    detailed_df.drop(columns=[c for c in detailed_df.columns if c.startswith('ue_')], inplace=True)
    return detailed_df
if 0:
    # Disabled: the initial bootstrap run that built the augmented files
    # day by day from 2020-04-29 through 2020-05-30.
    date0 = pandas.to_datetime('2020-04-29')
    date1 = date0 + pandas.Timedelta(days=1)
    new_new_df = load_df(date0, augmented=False)
    for date1 in pandas.date_range(date0 + pandas.Timedelta(days=1), '2020-05-30'):
        old_df = new_new_df
        new_df = load_df(date1, augmented=False)
        new_new_df = augment_df(old_df, new_df)
        print(date1, new_new_df.ue_unbekannt.sum())
        save_df(new_new_df)
        df_mit_uedatum = ergaenze_uedatum(new_new_df)
        save_df(df_mit_uedatum)
# Daily incremental update: augment yesterday's enriched file (augmented=1)
# with today's raw dump (augmented=0) and write out both result formats.
date0 = pandas.to_datetime('2020-06-04')
date1 = date0 + pandas.Timedelta(days=1)
old_df = load_df(date0, augmented=1)
new_df = load_df(date1, augmented=0)
new_new_df = augment_df(old_df, new_df)
print(date1, new_new_df.ue_unbekannt.sum())
save_df(new_new_df)
df_mit_uedatum = ergaenze_uedatum(new_new_df)
save_df(df_mit_uedatum)
# Pasted notebook output of the run above (kept for reference, commented out
# so the file remains valid Python):
# 144475 144475 144613 144613 consistency 6340.0 6340.0 8581.0 8581.0 167843.0 167843.0 620 579 98.0 98.0 24.0 24.0 522.0 522.0 119 117 12.0 12.0 5.0 5.0 111.0 111.0 183271.0 182695.99999999988 2020-06-05 00:00:00 154565.10445833244
# old_df.AnzahlFall.clip(lower=0).sum(), new_df[new_df.NeuerFall<1].AnzahlFall.abs().clip(lower=0).sum()
# (181815.0, 181815.0)