This notebook presents data wrangling techniques that are commonly used in practice: generating and describing a sample transaction dataset, then cleaning, transforming, encoding, scaling, and summarizing it.
# import required libraries
import random
import datetime
import numpy as np
import pandas as pd
from random import randrange
from sklearn import preprocessing
from IPython.display import display
pd.options.mode.chained_assignment = None
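The dataset generation below draws from both Python's random module and NumPy's global random state, so every run produces different data. If reproducible output is wanted, both generators can be seeded first (an optional step, not part of the original flow; 42 is an arbitrary choice):
# optional: seed both RNGs so repeated runs generate identical sample data
random.seed(42)
np.random.seed(42)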
def _random_date(start, date_count):
    """This generator yields random dates based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Yields:
        random dates offset from start by 0 to 41 days
    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count -= 1
def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
                         (assumed to be a multiple of 10)
    Returns:
        a pandas dataframe
    """
    # sentinels
    startDate = datetime.datetime(2016, 1, 1, 13)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000
    # base list of attributes
    # User ID and Product ID are wrapped in np.array so the elementwise
    # comparisons below broadcast; User Type uses object dtype so np.nan
    # survives assignment; an explicit date format avoids day/month ambiguity
    data_dict = {
        'Serial No': np.arange(row_count) + serial_number_sentinel,
        'Date': np.random.permutation(pd.to_datetime([x.strftime("%d-%m-%Y")
                                                      for x in _random_date(startDate,
                                                                            row_count)],
                                                     format="%d-%m-%Y").date),
        'User ID': np.array(np.random.permutation(np.random.randint(0,
                                                                    row_count,
                                                                    size=int(row_count/10)) + user_id_sentinel).tolist() * 10),
        'Product ID': np.array(np.random.permutation(np.random.randint(0,
                                                                       row_count,
                                                                       size=int(row_count/10)) + product_id_sentinel).tolist() * 10),
        'Quantity Purchased': np.random.permutation(np.random.randint(1,
                                                                      42,
                                                                      size=row_count)),
        'Price': np.round(np.abs(np.random.randn(row_count) + 1) * price_sentinel,
                          decimals=2),
        'User Type': np.random.permutation([chr(random.randrange(97, 97 + 3 + 1))
                                            for i in range(row_count)]).astype(object)
    }
    # introduce missing values at randomly chosen positions
    for index in range(int(np.sqrt(row_count))):
        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101
    # create data frame
    df = pd.DataFrame(data_dict)
    return df
def describe_dataframe(df=pd.DataFrame()):
    """This function generates descriptive stats of a dataframe
    Args:
        df (dataframe): the dataframe to be analyzed
    Returns:
        None
    """
    print("\n\n")
    print("*"*30)
    print("About the Data")
    print("*"*30)
    print("Number of rows::", df.shape[0])
    print("Number of columns::", df.shape[1])
    print("\n")
    print("Column Names::", df.columns.values.tolist())
    print("\n")
    print("Column Data Types::\n", df.dtypes)
    print("\n")
    print("Columns with Missing Values::", df.columns[df.isnull().any()].tolist())
    print("\n")
    print("Number of rows with Missing Values::", df.isnull().any(axis=1).sum())
    print("\n")
    print("Sample Indices with missing data::", df[df.isnull().any(axis=1)].index.tolist()[0:5])
    print("\n")
    print("General Stats::")
    print(df.info())
    print("\n")
    print("Summary Stats::")
    print(df.describe())
    print("\n")
    print("Dataframe Sample Rows::")
    display(df.head(5))
def cleanup_column_names(df, rename_dict={}, do_inplace=True):
    """This function renames columns of a pandas dataframe
    It converts column names to snake case if rename_dict is not passed.
    Args:
        df (dataframe): dataframe whose columns are to be renamed
        rename_dict (dict): keys represent old column names and values point to
                            newer ones
        do_inplace (bool): flag to update existing dataframe or return a new one
    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise
    """
    if not rename_dict:
        return df.rename(columns={col: col.lower().replace(' ', '_')
                                  for col in df.columns.values.tolist()},
                         inplace=do_inplace)
    else:
        return df.rename(columns=rename_dict, inplace=do_inplace)
def expand_user_type(u_type):
    """This function maps user types to user classes
    Args:
        u_type (str): user type value
    Returns:
        (str) user_class value
    """
    if u_type in ['a', 'b']:
        return 'new'
    elif u_type == 'c':
        return 'existing'
    elif u_type == 'd':
        return 'loyal_existing'
    else:
        return 'error'
df = generate_sample_data(row_count=1000)
describe_dataframe(df)
print("Dataframe columns:\n{}".format(df.columns.tolist()))
cleanup_column_names(df)
print("Dataframe columns:\n{}".format(df.columns.tolist()))
display(df.sort_values(['serial_no', 'price'],
ascending=[True, False]).head())
display(df[['serial_no','date','user_id','user_type',
'product_id','quantity_purchased','price']].head())
Using Column Index
# print 10 values from column at index 3
print(df.iloc[:,3].values[0:10])
Using Column Name
# print 10 values of quantity purchased
print(df.quantity_purchased.values[0:10])
Using Column Datatype
# print 10 values of columns with data type float
print(df.select_dtypes(include=['float64']).values[:10,0])
Select specific rows
display(df.iloc[[10,501,20]])
Exclude Specific Row Indices
display(df.drop([0,24,51], axis=0).head())
Conditional Filtering
display(df[df.quantity_purchased>25].head())
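Multiple conditions can be combined with the element-wise operators & and |, with each condition parenthesized. A minimal sketch using columns already in this dataframe:
# rows where a type 'a' user purchased more than 25 units
display(df[(df.quantity_purchased > 25) & (df.user_type == 'a')].head())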
Offset from top of the dataframe
display(df[100:].head())
Offset from bottom of the dataframe
display(df[-10:].head())
Convert the date column to the datetime type
df['date'] = pd.to_datetime(df.date)
# compare dtypes of the original df with this one
print(df.dtypes)
Map : Create a derived attribute using map
df['user_class'] = df['user_type'].map(expand_user_type)
display(df.tail())
Apply: Using apply to get attribute ranges
display(df.select_dtypes(include=[np.number]).apply(lambda x:
x.max()- x.min()))
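apply also works row-wise with axis=1, which is handy for combining several columns. A sketch computing a per-transaction total (the line_total name is illustrative, not from the original notebook):
# row-wise apply: price * quantity per transaction (NaN where price is missing)
line_total = df.apply(lambda row: row['price'] * row['quantity_purchased'],
                      axis=1)
display(line_total.head())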
Applymap: Extract week from date
df['purchase_week'] = df[['date']].applymap(lambda dt: dt.week
                                            if not pd.isnull(dt)
                                            else 0)
display(df.head())
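Since date is already a datetime64 column, the same attribute can be derived without element-wise applymap via the vectorized .dt accessor; a sketch assuming pandas 1.1+, where isocalendar() is available:
# vectorized alternative: ISO week number, with NaT rows mapped to 0
display(df['date'].dt.isocalendar().week.fillna(0).head())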
Drop Rows with missing dates
df_dropped = df.dropna(subset=['date'])
display(df_dropped.head())
Fill Missing Price values with mean price
df_dropped['price'] = df_dropped['price'].fillna(value=np.round(df.price.mean(),
                                                                decimals=2))
Fill Missing user_type values with value from previous row (forward fill)
df_dropped['user_type'] = df_dropped['user_type'].ffill()
Fill Missing user_type values with value from next row (backward fill)
df_dropped['user_type'] = df_dropped['user_type'].bfill()
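For numeric columns, interpolation is a third option besides constant and directional fills; the sketch below assumes linear interpolation between neighboring rows is meaningful, which may not hold for transaction data:
# alternative for numeric gaps: linear interpolation on a fresh copy
df_interp = df.dropna(subset=['date']).copy()
df_interp['price'] = df_interp['price'].interpolate(method='linear')
display(df_interp.head())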
Drop Duplicate serial_no rows
# sample duplicates
display(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head())
print("Shape of df={}".format(df_dropped.shape))
df_dropped.drop_duplicates(subset=['serial_no'],inplace=True)
# updated dataframe
display(df_dropped.head())
print("Shape of df={}".format(df_dropped.shape))
Remove rows that have fewer than 3 non-missing attribute values
display(df.dropna(thresh=3).head())
print("Shape of df={}".format(df.dropna(thresh=3).shape))
One Hot Encoding using get_dummies()
display(pd.get_dummies(df,columns=['user_type']).head())
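For linear models it is common to drop one dummy level to avoid perfect collinearity; get_dummies supports this directly through its drop_first parameter:
# k-1 dummy columns per encoded attribute instead of k
display(pd.get_dummies(df, columns=['user_type'], drop_first=True).head())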
Label Mapping
type_map = {'a': 0, 'b': 1, 'c': 2, 'd': 3, np.nan: -1}
df['encoded_user_type'] = df.user_type.map(type_map)
display(df.tail())
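scikit-learn's LabelEncoder (available through the preprocessing module imported above) builds the integer mapping automatically, though it cannot handle NaN, hence the dropna in this sketch; the le_user_type column name is illustrative:
# sketch: LabelEncoder assigns integer labels to the non-missing user types
le = preprocessing.LabelEncoder()
valid = df['user_type'].dropna()
df.loc[valid.index, 'le_user_type'] = le.fit_transform(valid)
display(df[['user_type', 'le_user_type']].tail())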
Random sampling from the dataframe
display(df.sample(frac=0.2, replace=True, random_state=42).head())
Normalize price values using Min-Max Scaler
df_normalized = df.dropna().copy()
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df_normalized['price'].values.reshape(-1,1))
df_normalized['price'] = np_scaled.ravel()
display(df_normalized.head())
Normalize quantity purchased values using Robust Scaler
df_normalized = df.dropna().copy()
robust_scaler = preprocessing.RobustScaler()
rs_scaled = robust_scaler.fit_transform(df_normalized['quantity_purchased'].values.reshape(-1,1))
df_normalized['quantity_purchased'] = rs_scaled.ravel()
display(df_normalized.head())
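StandardScaler (zero mean, unit variance) rounds out the common scaling options and follows the same fit/transform pattern:
# z-score scaling of price on a fresh copy; ravel() flattens the (n, 1) output
df_normalized = df.dropna().copy()
std_scaler = preprocessing.StandardScaler()
df_normalized['price'] = std_scaler.fit_transform(df_normalized[['price']]).ravel()
display(df_normalized.head())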
Condition based aggregation
print("Mean price of items purchased by user_type=a :: {}".format(df['price'][df['user_type']=='a'].mean()))
Condition based counts
print(df['purchase_week'].value_counts())
Group By certain attributes
print(df.groupby(['user_class'])['quantity_purchased'].sum())
Group By with different aggregate functions
display(df.groupby(['user_class'])['quantity_purchased'].agg(['sum',
                                                              'mean',
                                                              'count']))
Group by specific aggregate functions for each attribute
display(df.groupby(['user_class','user_type']).agg({'price': 'mean',
                                                    'quantity_purchased': 'max'}))
Group by with multiple aggregations per attribute
display(df.groupby(['user_class','user_type']).agg(
            total_price=('price', 'sum'),
            mean_price=('price', 'mean'),
            std_price=('price', 'std'),
            count=('price', 'count'),
            total_quantity=('quantity_purchased', 'sum')))
Pivot table: mean price across date and user_type
display(df.pivot_table(index='date', columns='user_type',
                       values='price', aggfunc='mean'))
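pivot_table also accepts fill_value and margins, which make the report easier to read; a sketch on the same pivot:
# same pivot with missing cells filled and an 'All' row/column of means added
display(df.pivot_table(index='date', columns='user_type', values='price',
                       aggfunc='mean', fill_value=0, margins=True))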
Stack the dataframe into a long-format series
print(df.stack())
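pd.melt performs a related wide-to-long reshape while keeping chosen identifier columns; a short sketch:
# wide-to-long: one row per (serial_no, date, variable) combination
display(pd.melt(df, id_vars=['serial_no', 'date'],
                value_vars=['price', 'quantity_purchased']).head())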