import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Build a one-column DataFrame ("ages") holding 50 random integers.
# np.random.randint excludes the `high` bound, so the values range from 10 to 84.
df = pd.DataFrame(data={"ages": np.random.randint(10, 85, size=50)})
# Print a summary of the frame (index range, column dtype, non-null count, memory).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50 entries, 0 to 49 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ages 50 non-null int32 dtypes: int32(1) memory usage: 328.0 bytes
# Bare expression: when run as a notebook cell this renders the whole DataFrame as the cell output.
df
| | ages |
---|---|
0 | 50 |
1 | 73 |
2 | 21 |
3 | 61 |
4 | 57 |
5 | 75 |
6 | 82 |
7 | 47 |
8 | 69 |
9 | 19 |
10 | 27 |
11 | 37 |
12 | 16 |
13 | 36 |
14 | 81 |
15 | 44 |
16 | 58 |
17 | 45 |
18 | 35 |
19 | 41 |
20 | 30 |
21 | 62 |
22 | 34 |
23 | 44 |
24 | 17 |
25 | 50 |
26 | 82 |
27 | 50 |
28 | 21 |
29 | 65 |
30 | 30 |
31 | 37 |
32 | 23 |
33 | 63 |
34 | 32 |
35 | 68 |
36 | 35 |
37 | 24 |
38 | 77 |
39 | 13 |
40 | 11 |
41 | 61 |
42 | 64 |
43 | 42 |
44 | 13 |
45 | 58 |
46 | 83 |
47 | 77 |
48 | 36 |
49 | 49 |
The `ages` column will be divided into 10 equal-width intervals (bins).
# Histogram of the raw ages split into 10 bins (any integer bin count can be used).
ages = df["ages"]
plt.hist(ages, bins=10)
plt.show()

# Bar chart with one bar per distinct age; each bar's height is how often that age occurs.
age_counts = ages.value_counts()
plt.bar(x=age_counts.index, height=age_counts.values)
plt.show()
# Count how many ages fall into each of 10 bins, ordered by interval.
temp_series = df["ages"].value_counts(bins=10).sort_index()
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(x=temp_series.index.astype(str), height=temp_series.values)
# Pin the tick positions first, before assigning labels to them.
ax.set_xticks(ax.get_xticks())
# Label each bar with the upper bound of its interval, prefixed with 'xyz'.
# The bound is read from Interval.right instead of slicing the interval's text
# at fixed offsets, which is only correct when the bound prints as 4 characters.
ax.set_xticklabels([f"xyz {interval.right}" for interval in temp_series.index])
plt.show()
# define function to create custom intervals
def age_intervals(age):
    """Return the label of the custom age bracket that *age* falls into.

    Brackets are closed on the right:
    ``<= 17`` -> "17 or below", ``(17, 34]`` -> "17 - 34",
    ``(34, 51]`` -> "34 - 51", ``(51, 71]`` -> "51 - 71", otherwise "71+".

    A value for which every comparison is false (e.g. NaN) also yields "71+".
    """
    # Each check only runs if the previous ones failed, so the lower bound of
    # every bracket is implied. Inclusive upper bounds keep the brackets
    # gap-free (age 51 belongs to "34 - 51", not to the fallback "71+").
    if age <= 17:
        return "17 or below"
    if age <= 34:
        return "17 - 34"
    if age <= 51:
        return "34 - 51"
    if age <= 71:
        return "51 - 71"
    return "71+"
# Tag every row with its custom age bracket and store it in a new "age_group" column.
df["age_group"] = df["ages"].map(age_intervals)
# Tally how many rows landed in each bracket.
df["age_group"].value_counts()
34 - 51 16 51 - 71 11 17 - 34 10 71+ 8 17 or below 5 Name: age_group, dtype: int64
# Histogram over the age_group labels, i.e. a count of rows per group.
# Rows are ordered by age first -- presumably so the groups appear in ascending
# age order along the x-axis (verify against the rendered plot).
groups_by_age = df.sort_values(by="ages")["age_group"]
plt.hist(x=groups_by_age)
plt.show()