Import Libraries¶

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Set up the data¶

In [2]:

# create a dataframe with one column "ages" with (50) randomly generated integers between 10 and 85
df = pd.DataFrame({"ages" : np.random.randint(low = 10, high = 85, size = 50)})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   ages    50 non-null     int32
dtypes: int32(1)
memory usage: 328.0 bytes

Dataframe¶

In [3]:

df

Out[3]:

	ages
0	50
1	73
2	21
3	61
4	57
5	75
6	82
7	47
8	69
9	19
10	27
11	37
12	16
13	36
14	81
15	44
16	58
17	45
18	35
19	41
20	30
21	62
22	34
23	44
24	17
25	50
26	82
27	50
28	21
29	65
30	30
31	37
32	23
33	63
34	32
35	68
36	35
37	24
38	77
39	13
40	11
41	61
42	64
43	42
44	13
45	58
46	83
47	77
48	36
49	49

Option 1. Create a histogram of ages with bins set to 10¶

The age column will be divided into 10 intervals

In [4]:

plt.hist(df["ages"], bins = 10) # you can customize bins using any integer
plt.show()

Option 2A. Series.value_counts() without bins argument¶

In [5]:

plt.bar(x = df["ages"].value_counts().index,
        height = df["ages"].value_counts().values)
plt.show()

Option 2B. Series.value_counts() with bins argument and custom xticklabels defined¶

In [ ]:

temp_series = df["ages"].value_counts(bins = 10).sort_index()

fig, ax = plt.subplots(figsize = (15, 5))

ax.bar(x = temp_series.index.astype(str),
       height = temp_series.values)

# set the ticks first before setting the labels
ax.set_xticks(ax.get_xticks())

# set the labels as the upper bound of each interval prefixed with 'xyz'
ax.set_xticklabels(["xyz " + str(each)[-5 : -1] for each in temp_series.index])

plt.show()

Option 3. Divide age column in custom intervals¶

In [7]:

# define function to create custom intervals
def age_intervals(age):
    if age <= 17:
        return "17 or below"
    elif 17 < age <= 34:
        return "17 - 34"
    elif 34 < age < 51:
        return "34 - 51"
    elif 51 < age <= 71:
        return "51 - 71"
    else:
        return "71+"

# create new column with customized intervals    
df["age_group"] = df["ages"].apply(age_intervals)

df["age_group"].value_counts()

Out[7]:

34 - 51        16
51 - 71        11
17 - 34        10
71+             8
17 or below     5
Name: age_group, dtype: int64

In [8]:

# use a plt.hist on the age_group column which by default calculates the count of each group
plt.hist(x = df.sort_values(by = "ages")["age_group"])
plt.show()

	ages
0	50
1	73
2	21
3	61
4	57
5	75
6	82
7	47
8	69
9	19
10	27
11	37
12	16
13	36
14	81
15	44
16	58
17	45
18	35
19	41
20	30
21	62
22	34
23	44
24	17
25	50
26	82
27	50
28	21
29	65
30	30
31	37
32	23
33	63
34	32
35	68
36	35
37	24
38	77
39	13
40	11
41	61
42	64
43	42
44	13
45	58
46	83
47	77
48	36
49	49

	ages
0	50
1	73
2	21
3	61
4	57
5	75
6	82
7	47
8	69
9	19
10	27
11	37
12	16
13	36
14	81
15	44
16	58
17	45
18	35
19	41
20	30
21	62
22	34
23	44
24	17
25	50
26	82
27	50
28	21
29	65
30	30
31	37
32	23
33	63
34	32
35	68
36	35
37	24
38	77
39	13
40	11
41	61
42	64
43	42
44	13
45	58
46	83
47	77
48	36
49	49

	ages
0	50
1	73
2	21
3	61
4	57
5	75
6	82
7	47
8	69
9	19
10	27
11	37
12	16
13	36
14	81
15	44
16	58
17	45
18	35
19	41
20	30
21	62
22	34
23	44
24	17
25	50
26	82
27	50
28	21
29	65
30	30
31	37
32	23
33	63
34	32
35	68
36	35
37	24
38	77
39	13
40	11
41	61
42	64
43	42
44	13
45	58
46	83
47	77
48	36
49	49