#!/usr/bin/env python # coding: utf-8 # ## Binning Data with Pandas cut and qcut # # This notebook accompanies the article posted on [pbpython.com](http://pbpython.com/pandas-qcut-cut.html) # In[1]: import pandas as pd import numpy as np import seaborn as sns # In[2]: sns.set_style('whitegrid') # In[3]: raw_df = pd.read_excel('https://github.com/chris1610/pbpython/blob/master/data/2018_Sales_Total_v2.xlsx?raw=true') # In[4]: df = raw_df.groupby(['account number', 'name'])['ext price'].sum().reset_index() # In[5]: df.head() # A histogram is an example of binning data and showing the visual representation of the data distribution # ## Introducing qcut # In[6]: df['ext price'].plot(kind='hist') # Describe shows how data can be cut by percentiles # In[7]: df['ext price'].describe() # Here is an example of using [qcut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html) # In[8]: pd.qcut(df['ext price'], q=4) # Assign the results of the values back to the original dataframe # In[9]: df['quantile_ex_1'] = pd.qcut(df['ext price'], q=4) df['quantile_ex_2'] = pd.qcut(df['ext price'], q=10, precision=0) # In[10]: df.head() # Look at the distribution # In[11]: df['quantile_ex_1'].value_counts() # In[12]: df['quantile_ex_2'].value_counts() # In[13]: bin_labels_5 = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond'] df['quantile_ex_3'] = pd.qcut(df['ext price'], q=[0, .2, .4, .6, .8, 1], labels=bin_labels_5) df.head() # In[14]: df['quantile_ex_3'].value_counts() # In[15]: results, bin_edges = pd.qcut(df['ext price'], q=[0, .2, .4, .6, .8, 1], labels=bin_labels_5, retbins=True) results_table = pd.DataFrame(zip(bin_edges, bin_labels_5), columns=['Threshold', 'Tier']) # In[16]: results_table # In[17]: df.describe(include='category') # We can pass the percentiles to use to describe # In[18]: df.describe(percentiles=[0, 1/3, 2/3, 1]) # labels=False will return integers for each bin # In[19]: df['quantile_ex_4'] = pd.qcut(df['ext price'], q=[0, .2, .4, .6, .8, 1], labels=False) df.head() # ## cut # Remove the added columns to make the examples shorter # In[20]: df = df.drop(columns = ['quantile_ex_1','quantile_ex_2', 'quantile_ex_3', 'quantile_ex_4']) # In[21]: pd.cut(df['ext price'], bins=4) # In[22]: pd.cut(df['ext price'], bins=4).value_counts() # In[23]: cut_labels_4 = ['silver', 'gold', 'platinum', 'diamond'] cut_bins = [0, 70000, 100000, 130000, 200000] df['cut_ex1'] = pd.cut(df['ext price'], bins=cut_bins, labels=cut_labels_4) # In[24]: df.head() # We can use nump.linspace to define the ranges # In[25]: np.linspace(0, 200000, 9) # In[26]: pd.cut(df['ext price'], bins=np.linspace(0, 200000, 9)) # numpy arange is another option # In[27]: np.arange(0, 200000, 10000) # In[28]: pd.interval_range(start=0, freq=10000, end=200000, closed='left') # In[29]: interval_range = pd.interval_range(start=0, freq=10000, end=200000) df['cut_ex2'] = pd.cut(df['ext price'], bins=interval_range, labels=[1,2,3]) df.head()