In [280]:

import os
import shutil
import datetime
import pandas as pd
import requests

import numpy as np

Data Links¶

In [281]:

census_url='http://censusindia.gov.in/2011census/C-series/c-14/DDW-0000C-14.xls'
raw_data_covid='https://api.covid19india.org/raw_data.json'

Processing Census Data¶

In [282]:

censusDS = pd.read_excel(census_url,skiprows=range(0, 1), skipinitialspace=True)
censusDS.head(10)

Out[282]:

	Table	State	Distt.	Area Name	Age-group	Total	Unnamed: 6	Unnamed: 7	Rural	Unnamed: 9	Unnamed: 10	Urban	Unnamed: 12	Unnamed: 13
0	Name	Code	Code	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	NaN	NaN	NaN	NaN	NaN	Persons	Males	Females	Persons	Males	Females	Persons	Males	Females
2	NaN	NaN	NaN	NaN	1	2	3	4	5	6	7	8	9	10
3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	C4114	00	000	India	All ages	1210854977	623270258	587584719	833748852	427781058	405967794	377106125	195489200	181616925
6	C4114	00	000	India	0-4	112806778	58632074	54174704	82986660	43036377	39950283	29820118	15595697	14224421
7	C4114	00	000	India	5-9	126928126	66300466	60627660	93807612	48825259	44982353	33120514	17475207	15645307
8	C4114	00	000	India	10-14	132709212	69418835	63290377	96804494	50488158	46316336	35904718	18930677	16974041
9	C4114	00	000	India	15-19	120526449	63982396	56544053	83902472	44570557	39331915	36623977	19411839	17212138

In [283]:

# Clean the Headers
censusDS.columns = [c.strip().lower().replace(' ','_') for c in censusDS.columns]

# We are interested only in total India population
censusDF = pd.DataFrame(censusDS[censusDS['area_name'] == 'India'][['age-group','total']])

# Rename the column for better verbosity
censusDF.rename(columns={'total': 'total_population'}, inplace=True)

# Remove the all-age group summary data
censusDF = censusDF[censusDF['age-group'] !='All ages']

# Correcting the age column 5-9 for helping in grouping
censusDF.loc[(censusDF['age-group'] == '5-9'),'age-group']='05-9'

# Strip the age-group by keeping only the first character. So 1 denoter 10-19, 2 denote 20-29 and so on
censusDF['age-group'] = censusDF['age-group'].str[:1]

# Group by the Age-Group and Sum
censusAvgDF = censusDF.groupby(['age-group']).agg('sum')

# Calculate the Percentage for each age group
censusAvgDF['population_percentage']=(censusAvgDF['total_population']/censusAvgDF['total_population'].sum())*100

censusAvgDF

Out[283]:

	total_population	population_percentage
age-group
0	239734904	19.798812
1	253235661	20.913789
2	212838187	17.577513
3	173735635	14.348179
4	134756439	11.129032
5	88215309	7.285374
6	64118690	5.295324
7	28441345	2.348865
8	11289005	0.932317
A	4489802	0.370796

Processing Covid Data¶

In [284]:

# Download the covid data
r=requests.get(raw_data_covid)
o=r.json()
df_orig = pd.DataFrame(o['raw_data'])
df_orig.loc[:, 'datekey'] = df_orig.dateannounced.apply(lambda x: datetime.datetime.strptime(x, '%d/%m/%Y').strftime("%Y-%m-%d"))

# Set the empty age as 555 to denote the unknown
df_orig['agebracket'].replace('', '55', inplace=True)

# if there are any range value set in Age Bracket, keep the start age. For eg if the age is '28-35', then keep 28 as the age
df_clean_covid = df_orig.apply(lambda x: x['agebracket'][:2] if '-' in x.agebracket else x,axis=1)

print('Shape of patient records', df_clean_covid.shape)

Shape of patient records (27891, 21)

In [275]:

# We are only interested in Deceased patients
df_deceased = df_clean_covid[df_clean_covid['currentstatus'] == 'Deceased']
print ('Shape after filter deceased patient records', df_deceased.shape)

# Bucket them into age groups
df_deceased['agebracket'] = df_deceased['agebracket'].astype(float)
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 150, 1000]
bins = pd.cut(df_deceased['agebracket'], age_bins)

# Group age wise and get the summary
deceased_agewise = df_deceased.groupby([bins,'currentstatus'])['agebracket'].agg(['count'])

# Rename the column for better verbosity
deceased_agewise.rename(columns={'count': 'total_deceased'}, inplace=True)

deceased_agewise

Shape after filter deceased patient records (46, 21)

c:\users\sanal\appdata\local\programs\python\python37\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys

Out[275]:

		total_deceased
agebracket	currentstatus
(0, 10]	Deceased	1
(10, 20]	Deceased	0
(20, 30]	Deceased	1
(30, 40]	Deceased	2
(40, 50]	Deceased	8
(50, 60]	Deceased	7
(60, 70]	Deceased	17
(70, 80]	Deceased	9
(80, 150]	Deceased	1
(150, 1000]	Deceased	0

In [285]:

# Combine the sensus and covid data
censusAvgDF.index = deceased_agewise.index

data = pd.concat([deceased_agewise,censusAvgDF], axis=1)

In [286]:

data

Out[286]:

		total_deceased	total_population	population_percentage
agebracket	currentstatus
(0, 10]	Deceased	1	239734904	19.798812
(10, 20]	Deceased	0	253235661	20.913789
(20, 30]	Deceased	1	212838187	17.577513
(30, 40]	Deceased	2	173735635	14.348179
(40, 50]	Deceased	8	134756439	11.129032
(50, 60]	Deceased	7	88215309	7.285374
(60, 70]	Deceased	17	64118690	5.295324
(70, 80]	Deceased	9	28441345	2.348865
(80, 150]	Deceased	1	11289005	0.932317
(150, 1000]	Deceased	0	4489802	0.370796

In [ ]: