import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Reseed NumPy's global generator. No seed value is passed, so the generator is
# reinitialised from fresh entropy and the data below differs on every run.
np.random.seed()

# Build a 51-row frame with two columns:
#   "ages" - random integers drawn from [10, 85)
#   "sex"  - one of "M", "F", "U", drawn uniformly (custom weights left disabled)
random_ages = np.random.randint(low=10, high=85, size=51)
random_sex = np.random.choice(a=["M", "F", "U"], size=51)  # p=(0.3, 0.4, 0.3)
df = pd.DataFrame({"ages": random_ages, "sex": random_sex})

# Print the column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 51 entries, 0 to 50 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ages 51 non-null int32 1 sex 51 non-null object dtypes: int32(1), object(1) memory usage: 740.0+ bytes
# Bare expression: in a notebook cell this renders the whole frame as a table.
df
ages | sex | |
---|---|---|
0 | 27 | F |
1 | 48 | U |
2 | 67 | F |
3 | 38 | U |
4 | 36 | M |
5 | 28 | M |
6 | 61 | F |
7 | 33 | F |
8 | 37 | U |
9 | 16 | M |
10 | 48 | U |
11 | 26 | F |
12 | 35 | M |
13 | 38 | U |
14 | 76 | F |
15 | 84 | M |
16 | 81 | F |
17 | 59 | F |
18 | 82 | F |
19 | 33 | F |
20 | 20 | M |
21 | 51 | U |
22 | 43 | F |
23 | 15 | F |
24 | 54 | M |
25 | 77 | U |
26 | 61 | U |
27 | 75 | F |
28 | 81 | M |
29 | 78 | M |
30 | 72 | F |
31 | 36 | M |
32 | 39 | U |
33 | 27 | U |
34 | 75 | M |
35 | 27 | U |
36 | 42 | U |
37 | 55 | U |
38 | 83 | F |
39 | 54 | F |
40 | 72 | F |
41 | 26 | U |
42 | 19 | F |
43 | 49 | U |
44 | 76 | U |
45 | 12 | U |
46 | 23 | F |
47 | 33 | U |
48 | 21 | M |
49 | 31 | F |
50 | 38 | F |
# define function to create custom intervals
def age_intervals(age):
    """Return the age-group label for a single age.

    Every upper bound is inclusive, so the groups tile the number line
    without gaps:

        age <= 17        -> "17 or below"
        17 < age <= 34   -> "17 - 34"
        34 < age <= 51   -> "34 - 51"
        51 < age <= 71   -> "51 - 71"
        age > 71         -> "71+"

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str
        One of the five labels listed above.
    """
    # Guard clauses in ascending order: each test only needs the upper bound
    # because all smaller ages have already returned.
    if age <= 17:
        return "17 or below"
    if age <= 34:
        return "17 - 34"
    # Age 51 belongs here; it previously matched no branch and was
    # mislabelled as "71+".
    if age <= 51:
        return "34 - 51"
    if age <= 71:
        return "51 - 71"
    return "71+"
# Label every row with its age group by running age_intervals over "ages".
df["age_group"] = df["ages"].map(age_intervals)

# Number of rows that landed in each age group.
df["age_group"].value_counts()
17 - 34 14 71+ 14 34 - 51 13 51 - 71 7 17 or below 3 Name: age_group, dtype: int64
# Count the rows for every (sex, age_group) pair. "sex" and "age_group" stay as
# ordinary columns, the count is stored in "ages", and rows are ordered by
# "age_group".
df_grouped = (
    df.groupby(["sex", "age_group"], dropna=False, as_index=False)
    .agg(ages=("ages", np.size))
    .sort_values("age_group")
)
# The data is generated randomly, so some (sex, age_group) pairs may be missing,
# which leaves the per-sex count arrays with different lengths. If that happens,
# append zero-count rows for the missing pairs, for example:
# df_grouped.loc[len(df_grouped)] = ["M", "71+", 0]
# df_grouped.loc[len(df_grouped)] = ["U", "17 or below", 0]
# df_grouped.sort_values(["sex", "ages"])
# Grouped bar chart via seaborn: one cluster per age group, one bar per sex.
# The bar height is the number of rows in the group (estimator = np.size).
fig, ax = plt.subplots(figsize=(12, 5))
# NOTE(review): newer seaborn releases deprecate `ci` in favour of `errorbar`;
# confirm against the installed version before changing it.
barplot_options = dict(
    x="age_group",
    y="ages",
    hue="sex",
    estimator=np.size,
    ci=0,
)
sns.barplot(data=df, **barplot_options)
plt.show()
# Age-group labels for the x axis, ordered by the youngest age found in each
# group so the clusters run from youngest to oldest.
labels = df.groupby("age_group")["ages"].min().sort_values().index.to_numpy()
print("labels:", labels)

# One row per age group (in label order) and one column per sex. Pivoting and
# reindexing guarantees that every count lines up with its tick label, and that
# a (sex, age_group) pair missing from the random data becomes 0 instead of
# shortening one of the arrays.
counts = (
    df_grouped.pivot(index="age_group", columns="sex", values="ages")
    .reindex(index=labels, columns=["M", "F", "U"])
    .fillna(0)
    .astype(int)
)
# no. of male patients for each age_group
men_count = counts["M"].to_numpy()
# no. of female patients for each age_group
women_count = counts["F"].to_numpy()
# no. of unknown gender patients for each age_group
unknown_count = counts["U"].to_numpy()
print(f"Men count = {men_count}, Women count = {women_count}, Unknown count = {unknown_count}")

x = np.arange(len(labels))  # the label locations
width = 0.3  # the width of the bars

fig, ax = plt.subplots(figsize=(12, 5))
# Three bars per cluster: shifted left, centred, shifted right.
rects1 = ax.bar(x=x - width, height=men_count, width=width, label='Men')
rects2 = ax.bar(x=x, height=women_count, width=width, label='Women')
rects3 = ax.bar(x=x + width, height=unknown_count, width=width, label='Unknown')

# Axis texts describe what is plotted: patient counts per age group and sex.
ax.set_ylabel('Number of patients')
ax.set_title('Patients by age group and sex')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(bbox_to_anchor=(1.15, 1))

# Print the count above every bar.
ax.bar_label(rects1, padding=3)
ax.bar_label(rects2, padding=3)
ax.bar_label(rects3, padding=3)

fig.tight_layout()
plt.show()
labels: ['17 - 34' '34 - 51' '51 - 71' '17 or below' '71+'] Men count = [3 1 3 1 4], Women count = [7 1 2 4 7], Unknown count = [4 1 8 2 3]
# Demonstrates that Series.isin only matches whole values, never substrings.
filter_list = ['myo', 'peri', 'aortic', 'heart', 'artery', 'veins']
print("filter_list:", filter_list, "\n")

dummy_df = pd.DataFrame(
    {
        "symptom1": ['myo', 'peri', 'aortic', 'heart'],
        "symptom2": ['patient has MYO', 'Peri conditions', 'aorticregion', 'heart disease'],
    }
)

# "symptom1" holds exact entries of filter_list, so every row matches.
# "symptom2" only contains those words inside longer text, so no row matches.
for column in ("symptom1", "symptom2"):
    print(dummy_df[column].isin(filter_list))
filter_list: ['myo', 'peri', 'aortic', 'heart', 'artery', 'veins'] 0 True 1 True 2 True 3 True Name: symptom1, dtype: bool 0 False 1 False 2 False 3 False Name: symptom2, dtype: bool
# width = 0.3
# babies = covid_vaers_cardio[(covid_vaers_cardio.AGE_YRS < 7)]
# kids = covid_vaers_cardio[(7 <= covid_vaers_cardio.AGE_YRS ) & (covid_vaers_cardio.AGE_YRS < 13)]
# teens = covid_vaers_cardio[(13 <= covid_vaers_cardio.AGE_YRS ) & (covid_vaers_cardio.AGE_YRS < 19)]
# young_adults = covid_vaers_cardio[(19 <= covid_vaers_cardio.AGE_YRS) & (covid_vaers_cardio.AGE_YRS < 30)]
# adults = covid_vaers_cardio[(30 <= covid_vaers_cardio.AGE_YRS) & (covid_vaers_cardio.AGE_YRS < 40)]
# older_adults = covid_vaers_cardio[(40 <= covid_vaers_cardio.AGE_YRS) & (covid_vaers_cardio.AGE_YRS < 50)]
# halfway_adults = covid_vaers_cardio[(50 <= covid_vaers_cardio.AGE_YRS) & (covid_vaers_cardio.AGE_YRS < 60)]
# retirement_adults = covid_vaers_cardio[(60 <= covid_vaers_cardio.AGE_YRS) & (covid_vaers_cardio.AGE_YRS < 70)]
# oldies = covid_vaers_cardio[(covid_vaers_cardio.AGE_YRS >= 70)]
# x = ['babies', 'kids', 'teens', 'young_adults', 'adults', 'older_adults', 'halfway_adults', 'retirement_adults', 'oldies']
# Male = covid_vaers_cardio[(covid_vaers_cardio.SEX == 'M')]
# Female = covid_vaers_cardio[(covid_vaers_cardio.SEX == 'F')]
# Unknown = covid_vaers_cardio[(covid_vaers_cardio.SEX == 'U')]
# bar1 = np.arange(len(x))
# bar2 = [i+width for i in bar1]
# bar3 = [i+width for i in bar2]
# # bar1, bar2, bar3
# plt.bar(bar1, Male, width, label = 'Male')
# plt.bar(bar2, Female, width, label = 'Female')
# plt.bar(bar3, Unknown, width, label = 'Unknown')
# plt.xlabel("Age")
# plt.ylabel("Total Number of Adverse Events")
# plt.title("How Cardiovascular Symptoms are Distributed among the COVID19 Vaccinated by Age and Sex")
# plt.xticks(bar1+width, x)
# plt.legend()
# plt.show()