#!/usr/bin/env python
# coding: utf-8

# ## Binning Data with Pandas cut and qcut
# 
# This notebook accompanies the article posted on [pbpython.com](http://pbpython.com/pandas-qcut-cut.html)

# In[1]:


import pandas as pd
import numpy as np
import seaborn as sns


# In[2]:


sns.set_style('whitegrid')


# In[3]:


raw_df = pd.read_excel('https://github.com/chris1610/pbpython/blob/master/data/2018_Sales_Total_v2.xlsx?raw=true')


# In[4]:


df = raw_df.groupby(['account number', 'name'])['ext price'].sum().reset_index()


# In[5]:


df.head()


# A histogram is an example of binning data and showing the visual representation of the data distribution

# ## Introducing qcut

# In[6]:


df['ext price'].plot(kind='hist')


# Describe shows how data can be cut by percentiles

# In[7]:


df['ext price'].describe()


# Here is an example of using [qcut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html)

# In[8]:


pd.qcut(df['ext price'], q=4)


# Assign the results of the values back to the original dataframe

# In[9]:


df['quantile_ex_1'] = pd.qcut(df['ext price'], q=4)
df['quantile_ex_2'] = pd.qcut(df['ext price'], q=10, precision=0)


# In[10]:


df.head()


# Look at the distribution

# In[11]:


df['quantile_ex_1'].value_counts()


# In[12]:


df['quantile_ex_2'].value_counts()


# In[13]:


bin_labels_5 = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond']
df['quantile_ex_3'] = pd.qcut(df['ext price'],
                              q=[0, .2, .4, .6, .8, 1],
                              labels=bin_labels_5)
df.head()


# In[14]:


df['quantile_ex_3'].value_counts()


# In[15]:


results, bin_edges = pd.qcut(df['ext price'],
                             q=[0, .2, .4, .6, .8, 1],
                             labels=bin_labels_5,
                             retbins=True)

results_table = pd.DataFrame(zip(bin_edges, bin_labels_5),
                             columns=['Threshold', 'Tier'])


# In[16]:


results_table


# In[17]:


df.describe(include='category')


# We can pass the percentiles to use to describe

# In[18]:


df.describe(percentiles=[0, 1/3, 2/3, 1])


# labels=False will return integers for each bin

# In[19]:


df['quantile_ex_4'] = pd.qcut(df['ext price'],
                            q=[0, .2, .4, .6, .8, 1],
                            labels=False)
df.head()


# ## cut

# Remove the added columns to make the examples shorter

# In[20]:


df = df.drop(columns = ['quantile_ex_1','quantile_ex_2', 'quantile_ex_3', 'quantile_ex_4'])


# In[21]:


pd.cut(df['ext price'], bins=4)


# In[22]:


pd.cut(df['ext price'], bins=4).value_counts()


# In[23]:


cut_labels_4 = ['silver', 'gold', 'platinum', 'diamond']
cut_bins = [0, 70000, 100000, 130000, 200000]
df['cut_ex1'] = pd.cut(df['ext price'], bins=cut_bins, labels=cut_labels_4)


# In[24]:


df.head()


# We can use nump.linspace to define the ranges

# In[25]:


np.linspace(0, 200000, 9)


# In[26]:


pd.cut(df['ext price'], bins=np.linspace(0, 200000, 9))


# numpy arange is another option

# In[27]:


np.arange(0, 200000, 10000)


# In[28]:


pd.interval_range(start=0, freq=10000, end=200000, closed='left')


# In[29]:


interval_range = pd.interval_range(start=0, freq=10000, end=200000)
df['cut_ex2'] = pd.cut(df['ext price'], bins=interval_range, labels=[1,2,3])
df.head()