import pandas as pd
import jenkspy
Create an example dataframe
sales = {
'account': [
'Jones Inc', 'Alpha Co', 'Blue Inc', 'Super Star Inc', 'Wamo',
'Next Gen', 'Giga Co', 'IniTech', 'Beta LLC'
],
'Total': [1500, 2100, 50, 20, 75, 1100, 950, 1300, 1400]
}
df = pd.DataFrame(sales)
df.sort_values(by='Total')
account | Total | |
---|---|---|
3 | Super Star Inc | 20 |
2 | Blue Inc | 50 |
4 | Wamo | 75 |
6 | Giga Co | 950 |
5 | Next Gen | 1100 |
7 | IniTech | 1300 |
8 | Beta LLC | 1400 |
0 | Jones Inc | 1500 |
1 | Alpha Co | 2100 |
Try cutting the data using qcut
df['quantile'] = pd.qcut(df['Total'], q=2, labels=['bucket_1', 'bucket_2'])
df.sort_values(by='Total')
account | Total | quantile | |
---|---|---|---|
3 | Super Star Inc | 20 | bucket_1 |
2 | Blue Inc | 50 | bucket_1 |
4 | Wamo | 75 | bucket_1 |
6 | Giga Co | 950 | bucket_1 |
5 | Next Gen | 1100 | bucket_1 |
7 | IniTech | 1300 | bucket_2 |
8 | Beta LLC | 1400 | bucket_2 |
0 | Jones Inc | 1500 | bucket_2 |
1 | Alpha Co | 2100 | bucket_2 |
Compare with using cut
df['cut_bins'] = pd.cut(df['Total'],
bins=2,
labels=['bucket_1', 'bucket_2'])
df.sort_values(by='Total')
account | Total | quantile | cut_bins | |
---|---|---|---|---|
3 | Super Star Inc | 20 | bucket_1 | bucket_1 |
2 | Blue Inc | 50 | bucket_1 | bucket_1 |
4 | Wamo | 75 | bucket_1 | bucket_1 |
6 | Giga Co | 950 | bucket_1 | bucket_1 |
5 | Next Gen | 1100 | bucket_1 | bucket_2 |
7 | IniTech | 1300 | bucket_2 | bucket_2 |
8 | Beta LLC | 1400 | bucket_2 | bucket_2 |
0 | Jones Inc | 1500 | bucket_2 | bucket_2 |
1 | Alpha Co | 2100 | bucket_2 | bucket_2 |
Show how jenkspy works
breaks = jenkspy.jenks_breaks(df['Total'], nb_class=2)
print(breaks)
[20.0, 75.0, 2100.0]
df['cut_jenks'] = pd.cut(df['Total'],
bins=breaks,
labels=['bucket_1', 'bucket_2'])
df.sort_values(by='Total')
account | Total | quantile | cut_bins | cut_jenks | |
---|---|---|---|---|---|
3 | Super Star Inc | 20 | bucket_1 | bucket_1 | NaN |
2 | Blue Inc | 50 | bucket_1 | bucket_1 | bucket_1 |
4 | Wamo | 75 | bucket_1 | bucket_1 | bucket_1 |
6 | Giga Co | 950 | bucket_1 | bucket_1 | bucket_2 |
5 | Next Gen | 1100 | bucket_1 | bucket_2 | bucket_2 |
7 | IniTech | 1300 | bucket_2 | bucket_2 | bucket_2 |
8 | Beta LLC | 1400 | bucket_2 | bucket_2 | bucket_2 |
0 | Jones Inc | 1500 | bucket_2 | bucket_2 | bucket_2 |
1 | Alpha Co | 2100 | bucket_2 | bucket_2 | bucket_2 |
Fix the NaN by using include_lowest
df['cut_jenksv2'] = pd.cut(df['Total'],
bins=breaks,
labels=['bucket_1', 'bucket_2'],
include_lowest=True)
df.sort_values(by='Total')
account | Total | quantile | cut_bins | cut_jenks | cut_jenksv2 | |
---|---|---|---|---|---|---|
3 | Super Star Inc | 20 | bucket_1 | bucket_1 | NaN | bucket_1 |
2 | Blue Inc | 50 | bucket_1 | bucket_1 | bucket_1 | bucket_1 |
4 | Wamo | 75 | bucket_1 | bucket_1 | bucket_1 | bucket_1 |
6 | Giga Co | 950 | bucket_1 | bucket_1 | bucket_2 | bucket_2 |
5 | Next Gen | 1100 | bucket_1 | bucket_2 | bucket_2 | bucket_2 |
7 | IniTech | 1300 | bucket_2 | bucket_2 | bucket_2 | bucket_2 |
8 | Beta LLC | 1400 | bucket_2 | bucket_2 | bucket_2 | bucket_2 |
0 | Jones Inc | 1500 | bucket_2 | bucket_2 | bucket_2 | bucket_2 |
1 | Alpha Co | 2100 | bucket_2 | bucket_2 | bucket_2 | bucket_2 |
Try some other examples
df['quantilev2'] = pd.qcut(
df['Total'], q=4, labels=['bucket_1', 'bucket_2', 'bucket_3', 'bucket_4'])
df['cut_jenksv3'] = pd.cut(
df['Total'],
bins=jenkspy.jenks_breaks(df['Total'], nb_class=4),
labels=['bucket_1', 'bucket_2', 'bucket_3', 'bucket_4'],
include_lowest=True)
df.sort_values(by='Total')
account | Total | quantile | cut_bins | cut_jenks | cut_jenksv2 | quantilev2 | cut_jenksv3 | |
---|---|---|---|---|---|---|---|---|
3 | Super Star Inc | 20 | bucket_1 | bucket_1 | NaN | bucket_1 | bucket_1 | bucket_1 |
2 | Blue Inc | 50 | bucket_1 | bucket_1 | bucket_1 | bucket_1 | bucket_1 | bucket_1 |
4 | Wamo | 75 | bucket_1 | bucket_1 | bucket_1 | bucket_1 | bucket_1 | bucket_1 |
6 | Giga Co | 950 | bucket_1 | bucket_1 | bucket_2 | bucket_2 | bucket_2 | bucket_2 |
5 | Next Gen | 1100 | bucket_1 | bucket_2 | bucket_2 | bucket_2 | bucket_2 | bucket_2 |
7 | IniTech | 1300 | bucket_2 | bucket_2 | bucket_2 | bucket_2 | bucket_3 | bucket_3 |
8 | Beta LLC | 1400 | bucket_2 | bucket_2 | bucket_2 | bucket_2 | bucket_3 | bucket_3 |
0 | Jones Inc | 1500 | bucket_2 | bucket_2 | bucket_2 | bucket_2 | bucket_4 | bucket_3 |
1 | Alpha Co | 2100 | bucket_2 | bucket_2 | bucket_2 | bucket_2 | bucket_4 | bucket_4 |