Refining the RKI_COVID19.csv

Thomas Viehmann [email protected]

This code and the procedures are documented at https://lernapparat.de/rki-nowcasting/.

Please see the notes there on the terms of use for this code and for the computed results.

In [1]:
import pandas
if 'get_ipython' in dir():
    INTERACTIVE = True
    %matplotlib inline
else:
    INTERACTIVE = False
from matplotlib import pyplot
import numpy
import datetime
import math
import matplotlib
import scipy.stats
#import seaborn
import pathlib
#seaborn.set()
import os
import shutil
import itertools
import ot
import tqdm
import time

def reverse_cumprod(x):
    # cumulative product running from the end of the series back to the start
    return x[::-1].cumprod()[::-1]
def reverse_cumsum(x):
    # cumulative sum running from the end of the series back to the start
    return x[::-1].cumsum()[::-1]

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

def smooth(s, size=4):
    # trailing moving average over `size` values (zero-padded before the
    # start of the series), aligned with the original index
    return pandas.Series(numpy.convolve(s, numpy.full((size,), 1/size))[:-(size-1)], index=s.index, name=s.name)
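# e.g. (made-up numbers): smooth(pandas.Series([0., 0., 4., 4.]), size=4)
# gives 0.0, 0.0, 1.0, 2.0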

bl_abbr = {"Deutschland": "D", "Baden-Württemberg": "BW", "Bayern": "BY", "Berlin": "BE", "Brandenburg": "BB", "Bremen": "HB", "Hamburg": "HH",
           "Hessen": "HE", "Mecklenburg-Vorpommern": "MV", "Niedersachsen": "NI", "Nordrhein-Westfalen": "NW",
           "Rheinland-Pfalz": "RP", "Saarland": "SL", "Sachsen": "SN", "Sachsen-Anhalt": "ST", "Schleswig-Holstein": "SH",
           "Thüringen": "TH"}

p_output = pathlib.Path('./outputs/')
In [54]:
def load_df(date, augmented=0):
    # 0 = unaugmented
    # 1 = with many columns
    # 2 = with fractional cases
    dstr = date.date().isoformat()
    if augmented == 0:
        fn = f'data/rki_covid19.{dstr}.csv'
    elif augmented == 1:
        fn = f'data/rki_covid19.viele_daten.{dstr}.csv'
    elif augmented == 2:
        fn = f'data/rki_covid19.uedatum.{dstr}.csv'
    else:
        raise Exception("invalid value for augmented", augmented)
    adf = pandas.read_csv(fn)
    adf['Datenstand'] = pandas.to_datetime(adf.Datenstand.apply(lambda s: '-'.join(s.split(',')[0].split('.')[::-1])))
    assert (date==adf['Datenstand'].min()) and (date==adf['Datenstand'].max())
    adf.loc[adf.Refdatum.isna(), 'Refdatum'] = adf.loc[adf.Refdatum.isna(), 'Meldedatum']
    adf.loc[adf.IstErkrankungsbeginn.isna(), 'IstErkrankungsbeginn'] = (
        adf.loc[adf.IstErkrankungsbeginn.isna(), 'Refdatum'] != adf.loc[adf.IstErkrankungsbeginn.isna(), 'Meldedatum']
        ).astype(int) # misses cases where Refdatum happens to equal Meldedatum, but we cannot do better

    adf['AnzahlGenesen'] = adf.AnzahlGenesen.fillna(0)
    adf['NeuGenesen'] = adf.NeuGenesen.fillna(-9).astype(int)
    if 'ObjectId' in adf.columns:
        adf.rename(columns={'ObjectId': 'FID'}, inplace=True)
    for k in ('Meldedatum', 'Datenstand', 'Refdatum', 'Uebermittlungsdatum'):
        if k in adf.columns:
            adf[k] = pandas.to_datetime(adf[k])
    for k in ('AnzahlFall', 'AnzahlTodesfall', 'AnzahlGenesen'):
        adf[k] = adf[k].astype(float)

    if adf.Datenstand.max()==pandas.to_datetime('2020-04-14') or adf.Datenstand.max()==pandas.to_datetime('2020-04-13'): # fix Landkreis renamed in these data versions
        adf.loc[adf.Landkreis == 'LK Göttingen (alt)', 'Landkreis'] = 'LK Göttingen'
    if adf.Datenstand.max()>=pandas.to_datetime('2020-04-13') and adf.Datenstand.max()<=pandas.to_datetime('2020-04-15'): # fix inconsistent IdLandkreis in these data versions
        adf.loc[adf.Landkreis == 'LK Göttingen', 'IdLandkreis'] = 3159

    uebermittlungsdatum = adf.Datenstand.max() - pandas.Timedelta(days=1)
    if 'ue_unbekannt' not in adf.columns and 'Uebermittlungsdatum' not in adf.columns:
        adf['ue_unbekannt'] = (adf.NeuerFall < 1) * adf.AnzahlFall.clip(lower=0)
        adf['ue_'+uebermittlungsdatum.date().isoformat().replace('-', '_')] = (adf.NeuerFall == 1) * adf.AnzahlFall
    if 'Uebermittlungsdatum' not in adf.columns:
        for d in pandas.date_range('2020-01-01', uebermittlungsdatum, freq='D'):
            c = 'ue_'+d.date().isoformat().replace('-', '_')
            if c not in adf.columns:
                adf[c] = 0 # we don't want the substitute here: (adf.Meldedatum == d) * adf.AnzahlFall
    return adf
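
The per-transmission-date columns added above follow the naming pattern ue_YYYY_MM_DD, with ue_unbekannt collecting counts not (yet) attributed to a specific transmission date. A minimal sketch of the convention (the date here is arbitrary):

In [ ]:
d = pandas.Timestamp('2020-05-01')
'ue_' + d.date().isoformat().replace('-', '_')  # -> 'ue_2020_05_01'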
In [3]:
# Distance function between keys; this should be a metric!
bl_map = None
def key_dist(k1, k2, gr=None):
    global bl_map
    d = 0
    if k1[0] != k2[0]: # LK
        if bl_map is None:
            bl_map = gr["Bundesland"].groupby(["Landkreis"]).first()
        if bl_map[k1[0]] != bl_map[k2[0]]:
            d += 30
        else:
            d += 5
    if k1[1] != k2[1]: # Meldedatum
        d += 8
    if k1[2] != k2[2]: # Altersgruppe; maybe treat neighbouring bands as closer? --> it has to stay a metric, though
        if k1[2] == 'unbekannt' or k2[2] == 'unbekannt':
            d += 1
        else:
            d += 2
    if k1[3] != k2[3]: # Geschlecht
        if k1[3] == 'unbekannt' or k2[3] == 'unbekannt':
            d += 1
        else:
            d += 2
    if k1[4] != k2[4]: # Erkrankungsbeginn
        d += 0.5
    elif k1[4] == 1 and k1[5] != k2[5]: # has a disease onset date, but a different Refdatum
        d += 1  # include the distance between the dates? but together with IstErkrankungsbeginn==0 it has to stay a metric...
    return d
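
A quick sanity check, with hypothetical keys rather than real data, that key_dist is symmetric and satisfies the triangle inequality, as the comment above demands (the key layout matches the group_df index below):

In [ ]:
# key layout: (Landkreis, Meldedatum, Altersgruppe, Geschlecht, IstErkrankungsbeginn, Refdatum)
k_a = ('LK Musterstadt', '2020-04-01', 'A35-A59', 'M', 1.0, '2020-03-28')
k_b = ('LK Musterstadt', '2020-04-01', 'A35-A59', 'W', 1.0, '2020-03-28')
k_c = ('LK Musterstadt', '2020-04-02', 'unbekannt', 'W', 0.0, '2020-04-02')
for x, y in itertools.permutations((k_a, k_b, k_c), 2):
    assert key_dist(x, y) == key_dist(y, x)  # symmetry
    for z in (k_a, k_b, k_c):
        assert key_dist(x, y) <= key_dist(x, z) + key_dist(z, y)  # triangle inequality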
In [49]:
def save_df(df):
    date = df.Datenstand.max()
    dstr = date.date().isoformat()
    if 'ue_unbekannt' in df.columns:
        fn = f'data/rki_covid19.viele_daten.{dstr}.csv'
    elif 'Uebermittlungsdatum' in df.columns:
        fn = f'data/rki_covid19.uedatum.{dstr}.csv'
    else:
        raise Exception("Unbekanntes Format")
    df.to_csv(fn, index=False)
In [5]:
def group_df(df):
    df = df.copy()
    df['AnzahlFallOrigPos'] = df.AnzahlFall.clip(lower=0)
    df['AnzahlTodesfallOrigPos'] = df.AnzahlTodesfall.clip(lower=0)
    df['AnzahlGenesenOrigPos'] = df.AnzahlGenesen.clip(lower=0)
    df['AnzahlFallOrigNeg'] = df.AnzahlFall.clip(upper=0)
    df['AnzahlTodesfallOrigNeg'] = df.AnzahlTodesfall.clip(upper=0)
    df['AnzahlGenesenOrigNeg'] = df.AnzahlGenesen.clip(upper=0)

    # 'a' are active cases (i.e. excluding deaths / recovered)
    df['a_neu'] =  df.AnzahlFall * (df.NeuerFall ==  1) * (df.NeuerTodesfall == -9) * (df.NeuGenesen == -9)
    df['g_neu'] =  df.AnzahlFall * (df.NeuerFall ==  1) * (df.NeuerTodesfall == -9) * (df.NeuGenesen ==  1)
    df['t_neu'] =  df.AnzahlFall * (df.NeuerFall ==  1) * (df.NeuerTodesfall ==  1) * (df.NeuGenesen == -9)
    df['a_gel'] = -df.AnzahlFall * (df.NeuerFall == -1) * (df.NeuerTodesfall == -9) * (df.NeuGenesen == -9)
    df['g_gel'] = -df.AnzahlFall * (df.NeuerFall == -1) * (df.NeuerTodesfall == -9) * (df.NeuGenesen == -1)
    df['t_gel'] = -df.AnzahlFall * (df.NeuerFall == -1) * (df.NeuerTodesfall == -1) * (df.NeuGenesen == -9)

    df['a_to_g'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall == -9) * (df.NeuGenesen ==  1)
    df['a_to_t'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall ==  1) * (df.NeuGenesen == -9)
    df['g_to_a'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall == -9) * (df.NeuGenesen == -1)
    df['g_to_t'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall ==  1) * (df.NeuGenesen == -1)
    df['t_to_a'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall == -1) * (df.NeuGenesen == -9)
    df['t_to_g'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall == -1) * (df.NeuGenesen ==  1)

    df['a_to_a'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall == -9) * (df.NeuGenesen ==  -9)
    df['t_to_t'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall ==  0) * (df.NeuGenesen ==  -9)
    df['g_to_g'] = df.AnzahlFall * (df.NeuerFall ==  0) * (df.NeuerTodesfall == -9) * (df.NeuGenesen ==   0)

    df['a']       = df.a_to_a + df.g_to_a + df.t_to_a + df.a_neu
    df['a_prev']  = df.a_to_a + df.a_to_g + df.a_to_t + df.a_gel
    df['g']       = df.a_to_g + df.g_to_g + df.t_to_g + df.g_neu
    df['g_prev']  = df.g_to_a + df.g_to_g + df.g_to_t + df.g_gel
    df['t']       = df.a_to_t + df.g_to_t + df.t_to_t + df.t_neu
    df['t_prev']  = df.t_to_a + df.t_to_g + df.t_to_t + df.t_gel

    df = df.drop(['Altersgruppe2', 'NeuerFall', 'NeuGenesen', 'NeuerTodesfall', 'AnzahlFall', 'AnzahlTodesfall', 'AnzahlGenesen'], axis=1)
    
    df_grouped = df.groupby(['Datenstand','IdBundesland','IdLandkreis', 'Altersgruppe', 'Geschlecht', 
                         'Landkreis', 'Meldedatum', 'IstErkrankungsbeginn', 'Refdatum', 'Bundesland']).sum().reset_index()
    df_group = df_grouped.groupby(['Landkreis', 'Meldedatum', 'Altersgruppe', 'Geschlecht', 
                                      'IstErkrankungsbeginn', 'Refdatum']).first()
    print(len(df_grouped), len(df_group))
    assert len(df_grouped) == len(df_group) # we don't want to lose things here

    return df_group
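
To see how the flag encoding works, here is a tiny made-up example (not RKI data): NeuerFall == 1 marks a newly reported case, NeuerFall == 0 a case already present in the previous publication, and NeuerTodesfall / NeuGenesen encode the death / recovery transitions that group_df picks apart above.

In [ ]:
demo = pandas.DataFrame({
    'NeuerFall':      [ 1,  0,  0],
    'NeuerTodesfall': [-9, -9,  1],
    'NeuGenesen':     [-9,  1, -9],
    'AnzahlFall':     [ 3,  2,  1],
})
# row 0: 3 newly reported, still active cases       -> a_neu
# row 1: 2 previously reported cases, now recovered -> a_to_g
# row 2: 1 previously reported case, now deceased   -> a_to_t
demo['a_neu']  = demo.AnzahlFall * (demo.NeuerFall == 1) * (demo.NeuerTodesfall == -9) * (demo.NeuGenesen == -9)
demo['a_to_g'] = demo.AnzahlFall * (demo.NeuerFall == 0) * (demo.NeuerTodesfall == -9) * (demo.NeuGenesen == 1)
demo['a_to_t'] = demo.AnzahlFall * (demo.NeuerFall == 0) * (demo.NeuerTodesfall == 1) * (demo.NeuGenesen == -9)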
In [6]:
eps = 1e-6
def do_matching(bdf_group, adf_group):
    idx = adf_group.index.union(bdf_group.index)
    adf_group = adf_group.reindex(idx, fill_value=0)
    bdf_group = bdf_group.reindex(idx, fill_value=0)

    def smin(a,b):
        return pandas.concat((a, b), axis=1).min(axis=1)

    adf_group['unmatched_a_prev'] = adf_group.a_prev.copy() # is .copy() needed? we modify these in place below
    bdf_group['unmatched_a_next'] = bdf_group.a.copy()
    adf_group['unmatched_t_prev'] = adf_group.t_prev.copy()
    bdf_group['unmatched_t_next'] = bdf_group.t.copy()
    adf_group['unmatched_g_prev'] = adf_group.g_prev.copy()
    bdf_group['unmatched_g_next'] = bdf_group.g.copy()

    print("consistency", adf_group.unmatched_a_prev.sum(), bdf_group.unmatched_a_next.sum(),
            adf_group.unmatched_t_prev.sum(), bdf_group.unmatched_t_next.sum(),
            adf_group.unmatched_g_prev.sum(), bdf_group.unmatched_g_next.sum()
         )
    assert numpy.abs(adf_group.unmatched_a_prev.sum() - bdf_group.unmatched_a_next.sum()) < eps
    assert numpy.abs(adf_group.unmatched_t_prev.sum() - bdf_group.unmatched_t_next.sum()) < eps
    assert numpy.abs(adf_group.unmatched_g_prev.sum() - bdf_group.unmatched_g_next.sum()) < eps

    matching_a = smin(adf_group.unmatched_a_prev, bdf_group.unmatched_a_next)
    matching_t = smin(adf_group.unmatched_t_prev, bdf_group.unmatched_t_next)
    matching_g = smin(adf_group.unmatched_g_prev, bdf_group.unmatched_g_next)

    adf_group['idmatch_a_prev'] = matching_a
    adf_group['idmatch_t_prev'] = matching_t
    adf_group['idmatch_g_prev'] = matching_g
    bdf_group['idmatch_a_next'] = matching_a
    bdf_group['idmatch_t_next'] = matching_t
    bdf_group['idmatch_g_next'] = matching_g
    adf_group['delmatch_a_prev'] = 0
    adf_group['delmatch_t_prev'] = 0
    adf_group['delmatch_g_prev'] = 0
    bdf_group['delmatch_a_next'] = 0
    bdf_group['delmatch_t_next'] = 0
    bdf_group['delmatch_g_next'] = 0


    adf_group.loc[:, 'unmatched_a_prev'] -= matching_a
    bdf_group.loc[:, 'unmatched_a_next'] -= matching_a
    adf_group.loc[:, 'unmatched_t_prev'] -= matching_t
    bdf_group.loc[:, 'unmatched_t_next'] -= matching_t
    adf_group.loc[:, 'unmatched_g_prev'] -= matching_g
    bdf_group.loc[:, 'unmatched_g_next'] -= matching_g

    remaining_adf = adf_group[  (adf_group.unmatched_a_prev != 0)
                              | (adf_group.unmatched_t_prev != 0)
                              | (adf_group.unmatched_g_prev != 0)]
    remaining_bdf = bdf_group[   (bdf_group.unmatched_a_next != 0)
                               | (bdf_group.unmatched_t_next != 0)
                               | (bdf_group.unmatched_g_next != 0)]

    assert numpy.abs(remaining_adf.unmatched_a_prev.sum() - remaining_bdf.unmatched_a_next.sum()) < eps
    assert numpy.abs(remaining_adf.unmatched_t_prev.sum() - remaining_bdf.unmatched_t_next.sum()) < eps
    assert numpy.abs(remaining_adf.unmatched_g_prev.sum() - remaining_bdf.unmatched_g_next.sum()) < eps
    print(len(remaining_adf), len(remaining_bdf), 
          remaining_adf.unmatched_a_prev.sum(),
          remaining_bdf.unmatched_a_next.sum(),
          remaining_adf.unmatched_t_prev.sum(),
          remaining_bdf.unmatched_t_next.sum(),
          remaining_adf.unmatched_g_prev.sum(),
          remaining_bdf.unmatched_g_next.sum(),)

    # newly added disease onset dates: fall back to the matching row without one
    it = list(remaining_adf.index)
    if len(it) > 1000:
        it = tqdm.tqdm(it)
    for key in it:
        ar = adf_group.loc[key]
        key2 = key[:4]+(0.0,)+key[1:2] # fallback: without disease onset date (Refdatum = Meldedatum)
        try:
            br = bdf_group.loc[key2]
        except KeyError:
            br = None
        if br is not None:    
            matching_a = min(ar.unmatched_a_prev, br.unmatched_a_next)
            matching_t = min(ar.unmatched_t_prev, br.unmatched_t_next)
            matching_g = min(ar.unmatched_g_prev, br.unmatched_g_next)

            adf_group.loc[key,  'delmatch_a_prev'] = matching_a
            adf_group.loc[key,  'delmatch_t_prev'] = matching_t
            adf_group.loc[key,  'delmatch_g_prev'] = matching_g
            bdf_group.loc[key2, 'delmatch_a_next'] = matching_a
            bdf_group.loc[key2, 'delmatch_t_next'] = matching_t
            bdf_group.loc[key2, 'delmatch_g_next'] = matching_g

            adf_group.loc[key,  'unmatched_a_prev'] -= matching_a
            bdf_group.loc[key2, 'unmatched_a_next'] -= matching_a
            adf_group.loc[key,  'unmatched_t_prev'] -= matching_t
            bdf_group.loc[key2, 'unmatched_t_next'] -= matching_t
            adf_group.loc[key,  'unmatched_g_prev'] -= matching_g
            bdf_group.loc[key2, 'unmatched_g_next'] -= matching_g

    remaining_adf = adf_group[  (adf_group.unmatched_a_prev != 0)
                              | (adf_group.unmatched_t_prev != 0)
                              | (adf_group.unmatched_g_prev != 0)]
    remaining_bdf = bdf_group[   (bdf_group.unmatched_a_next != 0)
                               | (bdf_group.unmatched_t_next != 0)
                               | (bdf_group.unmatched_g_next != 0)]

    assert numpy.abs(remaining_adf.unmatched_a_prev.sum() - remaining_bdf.unmatched_a_next.sum()) < eps
    assert numpy.abs(remaining_adf.unmatched_t_prev.sum() - remaining_bdf.unmatched_t_next.sum()) < eps
    assert numpy.abs(remaining_adf.unmatched_g_prev.sum() - remaining_bdf.unmatched_g_next.sum()) < eps
    print(len(remaining_adf), len(remaining_bdf), 
          remaining_adf.unmatched_a_prev.sum(),
          remaining_bdf.unmatched_a_next.sum(),
          remaining_adf.unmatched_t_prev.sum(),
          remaining_bdf.unmatched_t_next.sum(),
          remaining_adf.unmatched_g_prev.sum(),
          remaining_bdf.unmatched_g_next.sum(),)
    return bdf_group, adf_group
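
The first matching stage above pairs identical keys simply by taking the elementwise minimum of the unmatched masses on both sides; a toy illustration with made-up numbers:

In [ ]:
prev = pandas.Series([3.0, 1.0, 0.0], index=['k1', 'k2', 'k3'])
nxt  = pandas.Series([2.0, 1.0, 1.0], index=['k1', 'k2', 'k3'])
pandas.concat((prev, nxt), axis=1).min(axis=1)  # matched per key: 2.0, 1.0, 0.0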
In [7]:
# tuple layout: (NeuerFall, NeuerTodesfall, NeuGenesen, factor for AnzahlFall, factor for AnzahlTodesfall, factor for AnzahlGenesen)
map_to_neuerkey = {
    'a_neu':  ( 1, -9, -9,  1,  0,  0),
    'g_neu':  ( 1, -9,  1,  1,  0,  1),
    't_neu':  ( 1,  1, -9,  1,  1,  0),
    'a_gel':  (-1, -9, -9, -1,  0,  0),
    'g_gel':  (-1, -9, -1, -1, -1,  0),
    't_gel':  (-1, -1, -9, -1,  0, -1),
    'a_to_a': ( 0, -9, -9,  1,  0,  0),
    'a_to_g': ( 0, -9,  1,  1,  0,  1),
    'a_to_t': ( 0,  1, -9,  1,  1,  0),
    't_to_a': ( 0, -1, -9,  1, -1,  0),
    't_to_g': ( 0, -1,  1,  1, -1,  1),
    't_to_t': ( 0,  0, -9,  1,  1,  0),
    'g_to_a': ( 0, -9, -1,  1,  0, -1),
    'g_to_g': ( 0, -9,  0,  1,  0,  1),
    'g_to_t': ( 0,  1, -1,  1,  1, -1),
}
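
As a sketch (the helper below is hypothetical, not part of the pipeline), map_to_neuerkey can be used to turn one of the grouped transition columns back into RKI-style flag and count fields:

In [ ]:
def neuerkey_row(column, count):
    # hypothetical helper: reconstruct flag/count fields for `count` cases
    # falling into the given transition column
    nf, ntf, ng, f_fall, f_todesfall, f_genesen = map_to_neuerkey[column]
    return {'NeuerFall': nf, 'NeuerTodesfall': ntf, 'NeuGenesen': ng,
            'AnzahlFall': f_fall * count, 'AnzahlTodesfall': f_todesfall * count,
            'AnzahlGenesen': f_genesen * count}

neuerkey_row('a_to_g', 3.0)  # 3 existing cases newly marked as recovered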
In [8]:
if 0:  # disabled sanity check (needs an old_dfs frame): within each flag group, the Anzahl values must not mix signs
    assert (old_dfs.groupby(["NeuerFall", "NeuerTodesfall", "NeuGenesen"])
                    [["AnzahlFall", "AnzahlTodesfall", "AnzahlGenesen"]]
                    .sum().abs() == 
            old_dfs[["AnzahlFall", "AnzahlTodesfall", "AnzahlGenesen", "NeuerFall", "NeuerTodesfall", "NeuGenesen"]]
                     .apply(lambda x: x.abs() if x.name.startswith('Anzahl') else x, axis=0)
                    .groupby(["NeuerFall", "NeuerTodesfall", "NeuGenesen"]).sum()).all().all()
    old_dfs.groupby(["NeuerFall", "NeuerTodesfall", "NeuGenesen"])[["AnzahlFall"]].sum().abs().sum()
In [ ]:
def augment_df(old_df, new_df):
    new_old_df = new_df[new_df.NeuerFall <= 0]
    old_gr = group_df(old_df)
    new_gr = group_df(new_old_df)
    old_match, new_match = do_matching(old_gr, new_gr)
    ue_columns = [c for c in old_gr.columns if c.startswith('ue_')]
    old_ue_sum = old_match.loc[:, ue_columns].sum(1)

    for c in ue_columns:
        new_match[c] = ((new_match.idmatch_a_prev + new_match.idmatch_g_prev + new_match.idmatch_t_prev)
                         * ((new_match.a_prev + new_match.g_prev + new_match.t_prev
                             - new_match.a_gel - new_match.g_gel - new_match.t_gel)
                           / (new_match.a_prev + new_match.g_prev + new_match.t_prev).clip(lower=1e-9))
                           * old_match[c] / old_ue_sum.clip(lower=1e-9))

    delmatches = new_match[(new_match.delmatch_a_prev > 0) | (new_match.delmatch_g_prev > 0) | (new_match.delmatch_t_prev > 0)]
    # better with join...
    for key, ar in delmatches.iterrows():
        bkey = key[:4]+(0.0,)+key[1:2] # fallback: without disease onset date
        br = old_match.loc[bkey]
        new_match.loc[key, ue_columns] += ((ar.delmatch_a_prev + ar.delmatch_g_prev + ar.delmatch_t_prev)
                                      * (ar.a_prev + ar.g_prev + ar.t_prev - ar.a_gel - ar.g_gel - ar.t_gel)
                                       / max(ar.a_prev + ar.g_prev + ar.t_prev, 1e-9)
                                       * br[ue_columns] / max(old_ue_sum.loc[bkey], 1e-9))

    old_dfs = old_df
    new_dfs = new_df.copy()
    adf_group = new_match
    bdf_group = old_match

    remaining_adf = adf_group[  (adf_group.unmatched_a_prev != 0)
                          | (adf_group.unmatched_t_prev != 0)
                          | (adf_group.unmatched_g_prev != 0)]
    remaining_bdf = bdf_group[   (bdf_group.unmatched_a_next != 0)
                           | (bdf_group.unmatched_t_next != 0)
                           | (bdf_group.unmatched_g_next != 0)]

    dists = numpy.zeros((len(remaining_adf), len(remaining_bdf)))
    for i, k1 in enumerate(remaining_adf.index):
        for j, k2 in enumerate(remaining_bdf.index):
            dists[i, j] = key_dist(k1, k2, adf_group)


    def ot_mapping(unmatched_prev, unmatched_next):
        # distribute the remaining unmatched mass by optimal transport:
        # normalize both sides to probabilities for ot.emd, then scale the
        # transport plan back to absolute case counts
        total = unmatched_prev.sum()
        if total <= 1e-9:
            return numpy.zeros_like(dists)
        return ot.emd(unmatched_prev.values / total,
                      unmatched_next.values / unmatched_next.sum(),
                      dists) * total

    mapping_a = ot_mapping(remaining_adf.unmatched_a_prev, remaining_bdf.unmatched_a_next)
    mapping_g = ot_mapping(remaining_adf.unmatched_g_prev, remaining_bdf.unmatched_g_next)
    mapping_t = ot_mapping(remaining_adf.unmatched_t_prev, remaining_bdf.unmatched_t_next)

    mapping = mapping_a + mapping_g + mapping_t

    for idx, (key, ar) in enumerate(remaining_adf.iterrows()):
        for bidx, (bkey, br) in enumerate(remaining_bdf.iterrows()):
            # as the mapping is sparse, it is somewhat wasteful, we could just pick nonzeros...
            if mapping[idx, bidx] > 1e-9:
                ### really we should be comparing Meldedatum here!!!
                new_match.loc[key, ue_columns] += (mapping[idx, bidx]
                                              * (ar.a_prev + ar.g_prev + ar.t_prev - ar.a_gel - ar.g_gel - ar.t_gel)
                                               / max(ar.a_prev + ar.g_prev + ar.t_prev, 1e-9)
                                               * br[ue_columns] / max(old_ue_sum.loc[bkey], 1e-9))


    broken = new_match.index[(new_match[ue_columns].sum(axis=1) - (new_match.a + new_match.g + new_match.t)).abs()>1e-9]

    assert len(broken) == 0

    new_old_df_with_uevz = new_old_df.drop(columns=ue_columns).join(new_match[ue_columns], new_match.index.names)

    factor = new_old_df_with_uevz.AnzahlFall.clip(lower=0) / new_old_df_with_uevz[ue_columns].sum(axis=1).clip(lower=1e-9)
    for c in ue_columns:
        new_old_df_with_uevz[c] *= factor



    new_new_df_with_uevz = new_df[new_df.NeuerFall == 1].copy()
    new_complete_with_uevz = pandas.concat((new_old_df_with_uevz, new_new_df_with_uevz), axis=0, sort=True)
    print(new_complete_with_uevz[new_complete_with_uevz.NeuerFall>=0].AnzahlFall.sum(), new_complete_with_uevz[new_complete_with_uevz.NeuerFall>=0][ue_columns].sum().sum())
    return new_complete_with_uevz
In [57]:
def ergaenze_uedatum(df):
    detailed_df = [] 
    for c in df.columns:
        if c.startswith('ue_'):
            part = df[(df.NeuerFall >= 0) & (df[c]>0)].copy() 
            if c.startswith('ue_2020'):
                part['Uebermittlungsdatum'] = pandas.to_datetime(c[3:].replace('_', '-'))
                part['IstUebermittlung'] = 1
            else:
                # print(c)
                part['Uebermittlungsdatum'] = part.Meldedatum
                part['IstUebermittlung'] = 0
            factor = df[c] / df.AnzahlFall 
            part['AnzahlFall'] *= factor 
            part['AnzahlTodesfall'] *= factor 
            part['AnzahlGenesen'] *= factor 
            detailed_df.append(part) 
    detailed_df = pandas.concat(detailed_df)
    detailed_df['delay'] = (detailed_df.Uebermittlungsdatum - detailed_df.Refdatum).dt.days
    detailed_df.drop(columns=[c for c in detailed_df.columns if c.startswith('ue_')], inplace=True)
    return detailed_df
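
A hypothetical downstream use, not part of this notebook: the per-row delays in the returned frame could be aggregated into an empirical reporting-delay distribution, weighted by the (fractional) case counts.

In [ ]:
detailed = ergaenze_uedatum(load_df(pandas.to_datetime('2020-06-05'), augmented=1))
delay_dist = detailed[detailed.IstUebermittlung == 1].groupby('delay').AnzahlFall.sum()
delay_dist /= delay_dist.sum()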

For the initial bulk production run

In [11]:
if 0:
    date0 = pandas.to_datetime('2020-04-29')
    new_new_df = load_df(date0, augmented=0)
    for date1 in pandas.date_range(date0 + pandas.Timedelta(days=1), '2020-05-30'):
        old_df = new_new_df
        new_df = load_df(date1, augmented=0)
        new_new_df = augment_df(old_df, new_df)
        print(date1, new_new_df.ue_unbekannt.sum())
        save_df(new_new_df)
        df_mit_uedatum = ergaenze_uedatum(new_new_df)
        save_df(df_mit_uedatum)

From one day to the next

In [55]:
date0 = pandas.to_datetime('2020-06-04')
date1 = date0 + pandas.Timedelta(days=1)
old_df = load_df(date0, augmented=1)
new_df = load_df(date1, augmented=0)
new_new_df = augment_df(old_df, new_df)
print(date1, new_new_df.ue_unbekannt.sum())
save_df(new_new_df)
df_mit_uedatum = ergaenze_uedatum(new_new_df)
save_df(df_mit_uedatum)
144475 144475
144613 144613
consistency 6340.0 6340.0 8581.0 8581.0 167843.0 167843.0
620 579 98.0 98.0 24.0 24.0 522.0 522.0
119 117 12.0 12.0 5.0 5.0 111.0 111.0
183271.0 182695.99999999988
2020-06-05 00:00:00 154565.10445833244
In [105]:
old_df.AnzahlFall.clip(lower=0).sum(), new_df[new_df.NeuerFall<1].AnzahlFall.abs().clip(lower=0).sum()
Out[105]:
(181815.0, 181815.0)