import os
import shutil
import datetime
import pandas as pd
import requests
import numpy as np
census_url='http://censusindia.gov.in/2011census/C-series/c-14/DDW-0000C-14.xls'
raw_data_covid='https://api.covid19india.org/raw_data.json'
censusDS = pd.read_excel(census_url,skiprows=range(0, 1), skipinitialspace=True)
censusDS.head(10)
Table | State | Distt. | Area Name | Age-group | Total | Unnamed: 6 | Unnamed: 7 | Rural | Unnamed: 9 | Unnamed: 10 | Urban | Unnamed: 12 | Unnamed: 13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Name | Code | Code | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | NaN | NaN | NaN | NaN | Persons | Males | Females | Persons | Males | Females | Persons | Males | Females |
2 | NaN | NaN | NaN | NaN | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | C4114 | 00 | 000 | India | All ages | 1210854977 | 623270258 | 587584719 | 833748852 | 427781058 | 405967794 | 377106125 | 195489200 | 181616925 |
6 | C4114 | 00 | 000 | India | 0-4 | 112806778 | 58632074 | 54174704 | 82986660 | 43036377 | 39950283 | 29820118 | 15595697 | 14224421 |
7 | C4114 | 00 | 000 | India | 5-9 | 126928126 | 66300466 | 60627660 | 93807612 | 48825259 | 44982353 | 33120514 | 17475207 | 15645307 |
8 | C4114 | 00 | 000 | India | 10-14 | 132709212 | 69418835 | 63290377 | 96804494 | 50488158 | 46316336 | 35904718 | 18930677 | 16974041 |
9 | C4114 | 00 | 000 | India | 15-19 | 120526449 | 63982396 | 56544053 | 83902472 | 44570557 | 39331915 | 36623977 | 19411839 | 17212138 |
# Clean the Headers
censusDS.columns = [c.strip().lower().replace(' ','_') for c in censusDS.columns]
# We are interested only in total India population
censusDF = pd.DataFrame(censusDS[censusDS['area_name'] == 'India'][['age-group','total']])
# Rename the column for better verbosity
censusDF.rename(columns={'total': 'total_population'}, inplace=True)
# Remove the all-age group summary data
censusDF = censusDF[censusDF['age-group'] !='All ages']
# Correcting the age column 5-9 for helping in grouping
censusDF.loc[(censusDF['age-group'] == '5-9'),'age-group']='05-9'
# Strip the age-group by keeping only the first character. So 1 denoter 10-19, 2 denote 20-29 and so on
censusDF['age-group'] = censusDF['age-group'].str[:1]
# Group by the Age-Group and Sum
censusAvgDF = censusDF.groupby(['age-group']).agg('sum')
# Calculate the Percentage for each age group
censusAvgDF['population_percentage']=(censusAvgDF['total_population']/censusAvgDF['total_population'].sum())*100
censusAvgDF
total_population | population_percentage | |
---|---|---|
age-group | ||
0 | 239734904 | 19.798812 |
1 | 253235661 | 20.913789 |
2 | 212838187 | 17.577513 |
3 | 173735635 | 14.348179 |
4 | 134756439 | 11.129032 |
5 | 88215309 | 7.285374 |
6 | 64118690 | 5.295324 |
7 | 28441345 | 2.348865 |
8 | 11289005 | 0.932317 |
A | 4489802 | 0.370796 |
# Download the covid data
r=requests.get(raw_data_covid)
o=r.json()
df_orig = pd.DataFrame(o['raw_data'])
df_orig.loc[:, 'datekey'] = df_orig.dateannounced.apply(lambda x: datetime.datetime.strptime(x, '%d/%m/%Y').strftime("%Y-%m-%d"))
# Set the empty age as 555 to denote the unknown
df_orig['agebracket'].replace('', '55', inplace=True)
# if there are any range value set in Age Bracket, keep the start age. For eg if the age is '28-35', then keep 28 as the age
df_clean_covid = df_orig.apply(lambda x: x['agebracket'][:2] if '-' in x.agebracket else x,axis=1)
print('Shape of patient records', df_clean_covid.shape)
Shape of patient records (27891, 21)
# We are only interested in Deceased patients
df_deceased = df_clean_covid[df_clean_covid['currentstatus'] == 'Deceased']
print ('Shape after filter deceased patient records', df_deceased.shape)
# Bucket them into age groups
df_deceased['agebracket'] = df_deceased['agebracket'].astype(float)
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 150, 1000]
bins = pd.cut(df_deceased['agebracket'], age_bins)
# Group age wise and get the summary
deceased_agewise = df_deceased.groupby([bins,'currentstatus'])['agebracket'].agg(['count'])
# Rename the column for better verbosity
deceased_agewise.rename(columns={'count': 'total_deceased'}, inplace=True)
deceased_agewise
Shape after filter deceased patient records (46, 21)
c:\users\sanal\appdata\local\programs\python\python37\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy import sys
total_deceased | ||
---|---|---|
agebracket | currentstatus | |
(0, 10] | Deceased | 1 |
(10, 20] | Deceased | 0 |
(20, 30] | Deceased | 1 |
(30, 40] | Deceased | 2 |
(40, 50] | Deceased | 8 |
(50, 60] | Deceased | 7 |
(60, 70] | Deceased | 17 |
(70, 80] | Deceased | 9 |
(80, 150] | Deceased | 1 |
(150, 1000] | Deceased | 0 |
# Combine the sensus and covid data
censusAvgDF.index = deceased_agewise.index
data = pd.concat([deceased_agewise,censusAvgDF], axis=1)
data
total_deceased | total_population | population_percentage | ||
---|---|---|---|---|
agebracket | currentstatus | |||
(0, 10] | Deceased | 1 | 239734904 | 19.798812 |
(10, 20] | Deceased | 0 | 253235661 | 20.913789 |
(20, 30] | Deceased | 1 | 212838187 | 17.577513 |
(30, 40] | Deceased | 2 | 173735635 | 14.348179 |
(40, 50] | Deceased | 8 | 134756439 | 11.129032 |
(50, 60] | Deceased | 7 | 88215309 | 7.285374 |
(60, 70] | Deceased | 17 | 64118690 | 5.295324 |
(70, 80] | Deceased | 9 | 28441345 | 2.348865 |
(80, 150] | Deceased | 1 | 11289005 | 0.932317 |
(150, 1000] | Deceased | 0 | 4489802 | 0.370796 |