Prepare example input in DataFrame¶
Please look at country codes here:- https://www.lifetable.de/cgi-bin/country_codes.php
[2]:
df = pd.read_csv("data/covid-cedc-quot.csv", delimiter=";")
list(df)
[2]:
['reg', 'cl_age90', 'Dc_Elec_Covid_cum', 'jour']
[3]:
df["jour"].head()
[3]:
0 2020-03-06
1 2020-03-07
2 2020-03-08
3 2020-03-09
4 2020-03-10
Name: jour, dtype: object
[4]:
df.columns = ["reg", "age", "n_deaths", "date"]
df.drop(df.loc[df.age == 0].index, inplace=True)
df
[4]:
| reg | age | n_deaths | date | |
|---|---|---|---|---|
| 73 | 1 | 9 | 0 | 2020-03-06 |
| 74 | 1 | 9 | 0 | 2020-03-07 |
| 75 | 1 | 9 | 0 | 2020-03-08 |
| 76 | 1 | 9 | 0 | 2020-03-09 |
| 77 | 1 | 9 | 0 | 2020-03-10 |
| ... | ... | ... | ... | ... |
| 16055 | 94 | 90 | 0 | 2020-05-13 |
| 16056 | 94 | 90 | 0 | 2020-05-14 |
| 16057 | 94 | 90 | 0 | 2020-05-15 |
| 16058 | 94 | 90 | 0 | 2020-05-16 |
| 16059 | 94 | 90 | 0 | 2020-05-17 |
14600 rows × 4 columns
[5]:
df2 = df.groupby(["date", "age"]).agg({"n_deaths": sum})
df2
/tmp/ipykernel_3020/1108104031.py:1: FutureWarning: The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
df2 = df.groupby(["date", "age"]).agg({"n_deaths": sum})
[5]:
| n_deaths | ||
|---|---|---|
| date | age | |
| 2020-03-06 | 9 | 0 |
| 19 | 0 | |
| 29 | 0 | |
| 39 | 0 | |
| 49 | 0 | |
| ... | ... | ... |
| 2020-05-17 | 59 | 400 |
| 69 | 1021 | |
| 79 | 2060 | |
| 89 | 3966 | |
| 90 | 2645 |
730 rows × 1 columns
[6]:
df2.unstack().plot(kind="area", stacked=True, figsize=(12, 8))
[6]:
<Axes: xlabel='date'>
[7]:
df2.unstack().plot(logy=True, figsize=(12, 8))
[7]:
<Axes: xlabel='date'>
[8]:
df2.reset_index(inplace=True)
df2
[8]:
| date | age | n_deaths | |
|---|---|---|---|
| 0 | 2020-03-06 | 9 | 0 |
| 1 | 2020-03-06 | 19 | 0 |
| 2 | 2020-03-06 | 29 | 0 |
| 3 | 2020-03-06 | 39 | 0 |
| 4 | 2020-03-06 | 49 | 0 |
| ... | ... | ... | ... |
| 725 | 2020-05-17 | 59 | 400 |
| 726 | 2020-05-17 | 69 | 1021 |
| 727 | 2020-05-17 | 79 | 2060 |
| 728 | 2020-05-17 | 89 | 3966 |
| 729 | 2020-05-17 | 90 | 2645 |
730 rows × 3 columns
[9]:
df2["year"] = 2020
df2["country"] = "FRA"
df2["sex"] = "M"
[10]:
df2
[10]:
| date | age | n_deaths | year | country | sex | |
|---|---|---|---|---|---|---|
| 0 | 2020-03-06 | 9 | 0 | 2020 | FRA | M |
| 1 | 2020-03-06 | 19 | 0 | 2020 | FRA | M |
| 2 | 2020-03-06 | 29 | 0 | 2020 | FRA | M |
| 3 | 2020-03-06 | 39 | 0 | 2020 | FRA | M |
| 4 | 2020-03-06 | 49 | 0 | 2020 | FRA | M |
| ... | ... | ... | ... | ... | ... | ... |
| 725 | 2020-05-17 | 59 | 400 | 2020 | FRA | M |
| 726 | 2020-05-17 | 69 | 1021 | 2020 | FRA | M |
| 727 | 2020-05-17 | 79 | 2060 | 2020 | FRA | M |
| 728 | 2020-05-17 | 89 | 3966 | 2020 | FRA | M |
| 729 | 2020-05-17 | 90 | 2645 | 2020 | FRA | M |
730 rows × 6 columns
[11]:
fr_ldf = lost_years_who(df2, {"age": "age", "country": "country", "sex": "sex", "year": "year"})
fr_ldf.head()
[11]:
| date | age | n_deaths | year | country | sex | who_age | who_country | who_sex | who_year | who_life_expectancy | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-03-06 | 9 | 0 | 2020 | FRA | M | 1 | FRA | MLE | 2020 | 79.064192 |
| 1 | 2020-03-06 | 19 | 0 | 2020 | FRA | M | 1 | FRA | MLE | 2020 | 79.064192 |
| 2 | 2020-03-06 | 29 | 0 | 2020 | FRA | M | 1 | FRA | MLE | 2020 | 79.064192 |
| 3 | 2020-03-06 | 39 | 0 | 2020 | FRA | M | 1 | FRA | MLE | 2020 | 79.064192 |
| 4 | 2020-03-06 | 49 | 0 | 2020 | FRA | M | 1 | FRA | MLE | 2020 | 79.064192 |
[12]:
gldf = fr_ldf.groupby(["date", "age", "who_life_expectancy"]).agg({"n_deaths": sum})
gldf
/tmp/ipykernel_3020/4129139556.py:1: FutureWarning: The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
gldf = fr_ldf.groupby(["date", "age", "who_life_expectancy"]).agg({"n_deaths": sum})
[12]:
| n_deaths | |||
|---|---|---|---|
| date | age | who_life_expectancy | |
| 2020-03-06 | 9 | 79.064192 | 0 |
| 19 | 79.064192 | 0 | |
| 29 | 79.064192 | 0 | |
| 39 | 79.064192 | 0 | |
| 49 | 79.064192 | 0 | |
| ... | ... | ... | ... |
| 2020-05-17 | 59 | 79.064192 | 400 |
| 69 | 79.064192 | 1021 | |
| 79 | 79.064192 | 2060 | |
| 89 | 79.064192 | 3966 | |
| 90 | 79.064192 | 2645 |
730 rows × 1 columns
[13]:
gldf.reset_index(inplace=True)
gldf
[13]:
| date | age | who_life_expectancy | n_deaths | |
|---|---|---|---|---|
| 0 | 2020-03-06 | 9 | 79.064192 | 0 |
| 1 | 2020-03-06 | 19 | 79.064192 | 0 |
| 2 | 2020-03-06 | 29 | 79.064192 | 0 |
| 3 | 2020-03-06 | 39 | 79.064192 | 0 |
| 4 | 2020-03-06 | 49 | 79.064192 | 0 |
| ... | ... | ... | ... | ... |
| 725 | 2020-05-17 | 59 | 79.064192 | 400 |
| 726 | 2020-05-17 | 69 | 79.064192 | 1021 |
| 727 | 2020-05-17 | 79 | 79.064192 | 2060 |
| 728 | 2020-05-17 | 89 | 79.064192 | 3966 |
| 729 | 2020-05-17 | 90 | 79.064192 | 2645 |
730 rows × 4 columns
[14]:
gldf["years_lost"] = gldf["who_life_expectancy"] * gldf["n_deaths"]
gldf
[14]:
| date | age | who_life_expectancy | n_deaths | years_lost | |
|---|---|---|---|---|---|
| 0 | 2020-03-06 | 9 | 79.064192 | 0 | 0.000000 |
| 1 | 2020-03-06 | 19 | 79.064192 | 0 | 0.000000 |
| 2 | 2020-03-06 | 29 | 79.064192 | 0 | 0.000000 |
| 3 | 2020-03-06 | 39 | 79.064192 | 0 | 0.000000 |
| 4 | 2020-03-06 | 49 | 79.064192 | 0 | 0.000000 |
| ... | ... | ... | ... | ... | ... |
| 725 | 2020-05-17 | 59 | 79.064192 | 400 | 31625.676724 |
| 726 | 2020-05-17 | 69 | 79.064192 | 1021 | 80724.539838 |
| 727 | 2020-05-17 | 79 | 79.064192 | 2060 | 162872.235129 |
| 728 | 2020-05-17 | 89 | 79.064192 | 3966 | 313568.584718 |
| 729 | 2020-05-17 | 90 | 79.064192 | 2645 | 209124.787337 |
730 rows × 5 columns
[15]:
gldf2 = gldf.groupby(["date"]).agg({"years_lost": sum})
gldf2
/tmp/ipykernel_3020/3749646989.py:1: FutureWarning: The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
gldf2 = gldf.groupby(["date"]).agg({"years_lost": sum})
[15]:
| years_lost | |
|---|---|
| date | |
| 2020-03-06 | 79.064192 |
| 2020-03-07 | 316.256767 |
| 2020-03-08 | 553.449343 |
| 2020-03-09 | 790.641918 |
| 2020-03-10 | 948.770302 |
| ... | ... |
| 2020-05-13 | 795702.026376 |
| 2020-05-14 | 799259.915007 |
| 2020-05-15 | 803687.509749 |
| 2020-05-16 | 807798.847723 |
| 2020-05-17 | 810724.222820 |
73 rows × 1 columns
[16]:
gldf2 = gldf2.join(fr_ldf.groupby(["date"]).agg({"n_deaths": sum}))
gldf2["avg_years_lost"] = gldf2.years_lost / gldf2.n_deaths
gldf2
/tmp/ipykernel_3020/4090051171.py:1: FutureWarning: The provided callable <built-in function sum> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
gldf2 = gldf2.join(fr_ldf.groupby(["date"]).agg({"n_deaths": sum}))
[16]:
| years_lost | n_deaths | avg_years_lost | |
|---|---|---|---|
| date | |||
| 2020-03-06 | 79.064192 | 1 | 79.064192 |
| 2020-03-07 | 316.256767 | 4 | 79.064192 |
| 2020-03-08 | 553.449343 | 7 | 79.064192 |
| 2020-03-09 | 790.641918 | 10 | 79.064192 |
| 2020-03-10 | 948.770302 | 12 | 79.064192 |
| ... | ... | ... | ... |
| 2020-05-13 | 795702.026376 | 10064 | 79.064192 |
| 2020-05-14 | 799259.915007 | 10109 | 79.064192 |
| 2020-05-15 | 803687.509749 | 10165 | 79.064192 |
| 2020-05-16 | 807798.847723 | 10217 | 79.064192 |
| 2020-05-17 | 810724.222820 | 10254 | 79.064192 |
73 rows × 3 columns
[17]:
g = sns.relplot(x="date", y="avg_years_lost", kind="line", data=gldf2.reset_index())
g.fig.autofmt_xdate()
g.fig.set_figwidth(16)
g.fig.set_figheight(6)
for ax in g.axes.flatten():
ax.xaxis.set_major_locator(ticker.MultipleLocator(10))
[ ]: