Data Wrangling

This notebook presents different data wrangling techniques used commonly

In [1]:
# import required libraries
import random
import datetime 
import numpy as np
import pandas as pd
from random import randrange
from sklearn import preprocessing

from IPython.display import display

pd.options.mode.chained_assignment = None

Utilities

In [2]:
def _random_date(start,date_count):
    """This function generates a random date based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Returns:
        list of random dates

    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count-=1


def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
    Returns:
        a pandas dataframe

    """
    
    # sentinels
    startDate = datetime.datetime(2016, 1, 1,13)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000
    
    
    # base list of attributes
    data_dict = {
    'Serial No': np.arange(row_count)+serial_number_sentinel,
    'Date': np.random.permutation(pd.to_datetime([x.strftime("%d-%m-%Y") 
                                                    for x in _random_date(startDate,
                                                                          row_count)]).date
                                  ),
    'User ID': np.random.permutation(np.random.randint(0,
                                                       row_count,
                                                       size=int(row_count/10)) + user_id_sentinel).tolist()*10,
    'Product ID': np.random.permutation(np.random.randint(0,
                                                          row_count,
                                                          size=int(row_count/10))+ product_id_sentinel).tolist()*10 ,
    'Quantity Purchased': np.random.permutation(np.random.randint(1,
                                                                  42,
                                                                  size=row_count)),
    'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel,
                      decimals=2),
    'User Type':np.random.permutation([chr(random.randrange(97, 97 + 3 + 1)) 
                                            for i in range(row_count)])
    }
    
    # introduce missing values
    for index in range(int(np.sqrt(row_count))): 
        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101
        
    
    # create data frame
    df = pd.DataFrame(data_dict)
    
    return df
    

def describe_dataframe(df=pd.DataFrame()):
    """This function generates descriptive stats of a dataframe
    Args:
        df (dataframe): the dataframe to be analyzed
    Returns:
        None

    """
    print("\n\n")
    print("*"*30)
    print("About the Data")
    print("*"*30)
    
    print("Number of rows::",df.shape[0])
    print("Number of columns::",df.shape[1])
    print("\n")
    
    print("Column Names::",df.columns.values.tolist())
    print("\n")
    
    print("Column Data Types::\n",df.dtypes)
    print("\n")
    
    print("Columns with Missing Values::",df.columns[df.isnull().any()].tolist())
    print("\n")
    
    print("Number of rows with Missing Values::",len(pd.isnull(df).any(1).nonzero()[0].tolist()))
    print("\n")
    
    print("Sample Indices with missing data::",pd.isnull(df).any(1).nonzero()[0].tolist()[0:5])
    print("\n")
    
    print("General Stats::")
    print(df.info())
    print("\n")
    
    print("Summary Stats::")
    print(df.describe())
    print("\n")
    
    print("Dataframe Sample Rows::")
    display(df.head(5))
    
def cleanup_column_names(df,rename_dict={},do_inplace=True):
    """This function renames columns of a pandas dataframe
       It converts column names to snake case if rename_dict is not passed. 
    Args:
        rename_dict (dict): keys represent old column names and values point to 
                            newer ones
        do_inplace (bool): flag to update existing dataframe or return a new one
    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise

    """
    if not rename_dict:
        return df.rename(columns={col: col.lower().replace(' ','_') 
                    for col in df.columns.values.tolist()}, 
                  inplace=do_inplace)
    else:
        return df.rename(columns=rename_dict,inplace=do_inplace)

def expand_user_type(u_type):
    """This function maps user types to user classes
    Args:
        u_type (str): user type value
    Returns:
        (str) user_class value

    """
    if u_type in ['a','b']:
        return 'new'
    elif u_type == 'c':
        return 'existing'
    elif u_type == 'd':
        return 'loyal_existing'
    else:
        return 'error'

Generate a Sample Dataset

In [3]:
df = generate_sample_data(row_count=1000)

Describe the Dataset

In [4]:
describe_dataframe(df)

******************************
About the Data
******************************
Number of rows:: 1000
Number of columns:: 7


Column Names:: ['Date', 'Price', 'Product ID', 'Quantity Purchased', 'Serial No', 'User ID', 'User Type']


Column Data Types::
 Date                   object
Price                 float64
Product ID              int64
Quantity Purchased      int32
Serial No               int32
User ID                 int64
User Type              object
dtype: object


Columns with Missing Values:: ['Date', 'Price']


Number of rows with Missing Values:: 61


Sample Indices with missing data:: [1, 2, 3, 9, 11]


General Stats::
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Date                  969 non-null object
Price                 969 non-null float64
Product ID            1000 non-null int64
Quantity Purchased    1000 non-null int32
Serial No             1000 non-null int32
User ID               1000 non-null int64
User Type             1000 non-null object
dtypes: float64(1), int32(2), int64(2), object(2)
memory usage: 47.0+ KB
None


Summary Stats::
             Price  Product ID  Quantity Purchased    Serial No      User ID
count   969.000000  1000.00000         1000.000000  1000.000000  1000.000000
mean   2468.147967   618.90100           21.063000  1454.554000  5478.014000
std    1657.607501   274.46151           12.170092   385.901616   340.779522
min       2.600000     0.00000            1.000000    -1.000000  -101.000000
25%    1086.990000   382.75000           10.000000  1227.750000  5245.000000
50%    2248.270000   628.00000           21.000000  1483.500000  5412.000000
75%    3543.000000   838.75000           32.000000  1744.250000  5744.500000
max    8493.210000  1099.00000           41.000000  1999.000000  5992.000000


Dataframe Sample Rows::
Date Price Product ID Quantity Purchased Serial No User ID User Type
0 2016-01-23 1395.65 0 3 1000 -101 n
1 NaN 1352.99 906 19 1001 5632 n
2 NaN 3997.32 625 21 1002 5240 n
3 NaN 3681.48 865 35 1003 5557 n
4 2016-01-27 3850.22 929 3 1004 5489 n

Rename Columns

In [5]:
print("Dataframe columns:\n{}".format(df.columns.tolist()))
Dataframe columns:
['Date', 'Price', 'Product ID', 'Quantity Purchased', 'Serial No', 'User ID', 'User Type']
In [6]:
cleanup_column_names(df)
In [7]:
print("Dataframe columns:\n{}".format(df.columns.tolist()))
Dataframe columns:
['date', 'price', 'product_id', 'quantity_purchased', 'serial_no', 'user_id', 'user_type']

Sort Rows on defined attributes

In [8]:
display(df.sort_values(['serial_no', 'price'], 
                         ascending=[True, False]).head())
date price product_id quantity_purchased serial_no user_id user_type
502 2016-03-02 6168.66 625 22 -1 5240 b
727 2016-01-28 5483.55 445 11 -1 5016 c
680 2016-01-13 5163.72 185 41 -1 5679 d
599 2016-01-01 4903.91 551 2 -1 5688 d
75 2016-07-02 4584.97 534 12 -1 5351 a

Rearrange Columns in a Dataframe

In [9]:
display(df[['serial_no','date','user_id','user_type',
              'product_id','quantity_purchased','price']].head())
serial_no date user_id user_type product_id quantity_purchased price
0 1000 2016-01-23 -101 n 0 3 1395.65
1 1001 NaN 5632 n 906 19 1352.99
2 1002 NaN 5240 n 625 21 3997.32
3 1003 NaN 5557 n 865 35 3681.48
4 1004 2016-01-27 5489 n 929 3 3850.22

Filtering Columns

Using Column Index

In [10]:
# print 10 values from column at index 3
print(df.iloc[:,3].values[0:10])
[ 3 19 21 35  3  6 36 14 25 32]

Using Column Name

In [11]:
# print 10 values of quantity purchased
print(df.quantity_purchased.values[0:10])
[ 3 19 21 35  3  6 36 14 25 32]

Using Column Datatype

In [12]:
# print 10 values of columns with data type float
print(df.select_dtypes(include=['float64']).values[:10,0])
[ 1395.65  1352.99  3997.32  3681.48  3850.22   786.27  2725.81  4857.7
  2884.57  3138.58]

Filtering Rows

Select specific rows

In [13]:
display(df.iloc[[10,501,20]])
date price product_id quantity_purchased serial_no user_id user_type
10 2016-04-02 3027.86 856 24 1010 5381 n
501 2016-01-21 2017.56 906 5 1501 5632 d
20 2016-03-02 1920.21 172 30 1020 5865 n

Exclude Specific Row indices

In [14]:
display(df.drop([0,24,51], axis=0).head())
date price product_id quantity_purchased serial_no user_id user_type
1 NaN 1352.99 906 19 1001 5632 n
2 NaN 3997.32 625 21 1002 5240 n
3 NaN 3681.48 865 35 1003 5557 n
4 2016-01-27 3850.22 929 3 1004 5489 n
5 2016-01-28 786.27 300 6 1005 5262 n

Conditional Filtering

In [15]:
display(df[df.quantity_purchased>25].head())
date price product_id quantity_purchased serial_no user_id user_type
3 NaN 3681.48 865 35 1003 5557 n
6 2016-01-22 2725.81 572 36 1006 5661 n
9 NaN 3138.58 556 32 1009 5332 n
11 NaN 2780.03 829 37 -1 5307 n
12 2016-01-18 4192.76 1099 27 1012 5824 n

Offset from top of the dataframe

In [16]:
display(df[100:].head())
date price product_id quantity_purchased serial_no user_id user_type
100 2016-07-01 3151.10 379 38 -1 5405 c
101 2016-01-25 235.77 906 19 1101 5632 d
102 2016-01-28 429.29 625 33 1102 5240 d
103 2016-01-31 6877.38 865 35 1103 5557 c
104 2016-10-01 1895.89 929 29 1104 5489 c

Offset from bottom of the dataframe

In [17]:
display(df[-10:].head())
date price product_id quantity_purchased serial_no user_id user_type
990 2016-01-13 3366.48 611 16 1990 5039 d
991 2016-10-02 398.64 775 24 1991 5496 a
992 2016-07-01 4910.83 743 24 1992 5245 d
993 2016-11-02 1172.38 300 12 1993 5233 b
994 2016-01-17 1528.26 754 13 1994 5112 c

TypeCasting/Data Type Conversion

In [18]:
df['date'] = pd.to_datetime(df.date)
# compare dtypes of the original df with this one
print(df.dtypes)
date                  datetime64[ns]
price                        float64
product_id                     int64
quantity_purchased             int32
serial_no                      int32
user_id                        int64
user_type                     object
dtype: object

Apply/Map Usage

Map : Create a derived attribute using map

In [19]:
df['user_class'] = df['user_type'].map(expand_user_type)
display(df.tail())
date price product_id quantity_purchased serial_no user_id user_type user_class
995 2016-01-19 404.66 713 40 1995 5976 c existing
996 2016-02-01 236.63 808 40 1996 5950 b new
997 2016-08-01 3413.14 965 33 1997 5264 b new
998 2016-10-02 4820.05 678 36 1998 5955 b new
999 2016-01-15 3906.33 551 3 1999 5688 a new

Apply: Using apply to get attribute ranges

In [20]:
display(df.select_dtypes(include=[np.number]).apply(lambda x: 
                                                        x.max()- x.min()))
price                 8490.61
product_id            1099.00
quantity_purchased      40.00
serial_no             2000.00
user_id               6093.00
dtype: float64

Applymap: Extract week from date

In [21]:
df['purchase_week'] = df[['date']].applymap(lambda dt:dt.week 
                                                if not pd.isnull(dt.week) 
                                                else 0)
In [22]:
display(df.head())
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week
0 2016-01-23 1395.65 0 3 1000 -101 n error 3
1 NaT 1352.99 906 19 1001 5632 n error 0
2 NaT 3997.32 625 21 1002 5240 n error 0
3 NaT 3681.48 865 35 1003 5557 n error 0
4 2016-01-27 3850.22 929 3 1004 5489 n error 4

Missing Values

Drop Rows with missing dates

In [23]:
df_dropped = df.dropna(subset=['date'])
display(df_dropped.head())
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week
0 2016-01-23 1395.65 0 3 1000 -101 n error 3
4 2016-01-27 3850.22 929 3 1004 5489 n error 4
5 2016-01-28 786.27 300 6 1005 5262 n error 4
6 2016-01-22 2725.81 572 36 1006 5661 n error 3
7 2016-10-01 4857.70 1011 14 1007 5412 n error 39

Fill Missing Price values with mean price

In [24]:
df_dropped['price'].fillna(value=np.round(df.price.mean(),decimals=2),
                                inplace=True)

Fill Missing user_type values with value from previous row (forward fill)

In [25]:
df_dropped['user_type'].fillna(method='ffill',inplace=True)

Fill Missing user_type values with value from next row (backward fill)

In [26]:
df_dropped['user_type'].fillna(method='bfill',inplace=True)

Duplicates

Drop Duplicate serial_no rows

In [27]:
# sample duplicates
display(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head())
print("Shape of df={}".format(df_dropped.shape))
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week
75 2016-07-02 4584.97 534 12 -1 5351 a new 26
97 2016-01-01 743.37 965 3 -1 5264 c existing 53
100 2016-07-01 3151.10 379 38 -1 5405 c existing 26
114 2016-05-01 337.44 736 6 -1 5443 d loyal_existing 17
145 2016-04-02 682.72 994 26 -1 5412 a new 13
Shape of df=(969, 9)
In [28]:
df_dropped.drop_duplicates(subset=['serial_no'],inplace=True)
In [29]:
# updated dataframe
display(df_dropped.head())
print("Shape of df={}".format(df_dropped.shape))
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week
0 2016-01-23 1395.65 0 3 1000 -101 n error 3
4 2016-01-27 3850.22 929 3 1004 5489 n error 4
5 2016-01-28 786.27 300 6 1005 5262 n error 4
6 2016-01-22 2725.81 572 36 1006 5661 n error 3
7 2016-10-01 4857.70 1011 14 1007 5412 n error 39
Shape of df=(940, 9)

Remove rows which have less than 3 attributes with non-missing data

In [30]:
display(df.dropna(thresh=3).head())
print("Shape of df={}".format(df.dropna(thresh=3).shape))
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week
0 2016-01-23 1395.65 0 3 1000 -101 n error 3
1 NaT 1352.99 906 19 1001 5632 n error 0
2 NaT 3997.32 625 21 1002 5240 n error 0
3 NaT 3681.48 865 35 1003 5557 n error 0
4 2016-01-27 3850.22 929 3 1004 5489 n error 4
Shape of df=(1000, 9)

Encode Categoricals

One Hot Encoding using get_dummies()

In [31]:
display(pd.get_dummies(df,columns=['user_type']).head())
date price product_id quantity_purchased serial_no user_id user_class purchase_week user_type_a user_type_b user_type_c user_type_d user_type_n
0 2016-01-23 1395.65 0 3 1000 -101 error 3 0 0 0 0 1
1 NaT 1352.99 906 19 1001 5632 error 0 0 0 0 0 1
2 NaT 3997.32 625 21 1002 5240 error 0 0 0 0 0 1
3 NaT 3681.48 865 35 1003 5557 error 0 0 0 0 0 1
4 2016-01-27 3850.22 929 3 1004 5489 error 4 0 0 0 0 1

Label Mapping

In [32]:
type_map={'a':0,'b':1,'c':2,'d':3,np.NAN:-1}
df['encoded_user_type'] = df.user_type.map(type_map)
display((df.tail()))
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week encoded_user_type
995 2016-01-19 404.66 713 40 1995 5976 c existing 3 2.0
996 2016-02-01 236.63 808 40 1996 5950 b new 5 1.0
997 2016-08-01 3413.14 965 33 1997 5264 b new 31 1.0
998 2016-10-02 4820.05 678 36 1998 5955 b new 39 1.0
999 2016-01-15 3906.33 551 3 1999 5688 a new 2 0.0

Random Sampling data from DataFrame

In [33]:
display(df.sample(frac=0.2, replace=True, random_state=42).head())
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week encoded_user_type
102 2016-01-28 429.29 625 33 1102 5240 d loyal_existing 4 3.0
435 2016-01-25 1068.73 1067 41 1435 5943 b new 4 1.0
860 2016-08-02 5952.59 320 39 1860 5024 c existing 31 2.0
270 2016-12-01 1158.08 405 25 1270 5759 c existing 48 2.0
106 2016-08-02 2207.99 572 41 1106 5661 b new 31 1.0

Normalizing Numeric Values

Normalize price values using Min-Max Scaler

In [34]:
df_normalized = df.dropna().copy()
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df_normalized['price'].values.reshape(-1,1))
df_normalized['price'] = np_scaled.reshape(-1,1)
In [35]:
display(df_normalized.head())
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week encoded_user_type
23 2016-01-31 0.080591 805 12 1023 5042 b new 4 1.0
32 2016-01-23 0.092868 800 36 1032 5946 d loyal_existing 3 3.0
33 2016-05-01 0.102266 538 5 1033 5078 d loyal_existing 17 3.0
34 2016-08-02 0.080187 1069 31 1034 5202 d loyal_existing 31 3.0
35 2016-01-01 0.171362 1067 38 1035 5943 a new 53 0.0

Normalize quantity purchased values using Robust Scaler

In [36]:
df_normalized = df.dropna().copy()
robust_scaler = preprocessing.RobustScaler()
rs_scaled = robust_scaler.fit_transform(df_normalized['quantity_purchased'].values.reshape(-1,1))
df_normalized['quantity_purchased'] = rs_scaled.reshape(-1,1)
In [37]:
display(df_normalized.head())
date price product_id quantity_purchased serial_no user_id user_type user_class purchase_week encoded_user_type
23 2016-01-31 686.87 805 -0.428571 1023 5042 b new 4 1.0
32 2016-01-23 791.11 800 0.714286 1032 5946 d loyal_existing 3 3.0
33 2016-05-01 870.90 538 -0.761905 1033 5078 d loyal_existing 17 3.0
34 2016-08-02 683.44 1069 0.476190 1034 5202 d loyal_existing 31 3.0
35 2016-01-01 1457.57 1067 0.809524 1035 5943 a new 53 0.0

Data Summarization

Condition based aggregation

In [38]:
print("Mean price of items purchased by user_type=a :: {}".format(df['price'][df['user_type']=='a'].mean()))
Mean price of items purchased by user_type=a :: 2441.0280995475105

Condtion based counts

In [39]:
print(df['purchase_week'].value_counts())
3     172
4     166
2     101
39     53
35     50
26     50
22     46
9      46
53     45
13     45
31     41
44     39
5      39
0      31
17     27
48     27
18     22
Name: purchase_week, dtype: int64

Group By

Group By certain attributes

In [40]:
print(df.groupby(['user_class'])['quantity_purchased'].sum())
user_class
error              565
existing          5299
loyal_existing    5211
new               9988
Name: quantity_purchased, dtype: int32

Group By with different aggregate functions

In [41]:
display(df.groupby(['user_class'])['quantity_purchased'].agg([np.sum,
                                                                np.mean,
                                                                np.count_nonzero]))
sum mean count_nonzero
user_class
error 565 18.225806 31
existing 5299 20.699219 256
loyal_existing 5211 21.533058 242
new 9988 21.205945 471

Group by specific aggregate functions for each attribute

In [42]:
display(df.groupby(['user_class','user_type']).agg({'price':np.mean,
                                                        'quantity_purchased':np.max}))
quantity_purchased price
user_class user_type
error n 40 2355.328710
existing c 41 2502.277358
loyal_existing d 41 2349.236695
new a 41 2441.028100
b 41 2592.225064

Group by with multiple agg for each attribute

In [43]:
display(df.groupby(['user_class','user_type']).agg({'price':{
                                                                'total_price':np.sum,
                                                                'mean_price':np.mean,
                                                                'variance_price':np.std,
                                                                'count':np.count_nonzero},
                                                   'quantity_purchased':np.sum}))  
C:\Anaconda2\envs\python3\lib\site-packages\pandas\core\groupby.py:4036: FutureWarning: using a dict with renaming is deprecated and will be removed in a future version
  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
quantity_purchased price
sum count total_price variance_price mean_price
user_class user_type
error n 565 31.0 73015.19 1286.420840 2355.328710
existing c 5299 256.0 615560.23 1692.473899 2502.277358
loyal_existing d 5211 242.0 554419.86 1689.976272 2349.236695
new a 4752 229.0 539467.21 1530.875030 2441.028100
b 5236 242.0 609172.89 1746.481212 2592.225064

Pivot Tables

In [44]:
display(df.pivot_table(index='date', columns='user_type', 
                         values='price',aggfunc=np.mean))
user_type a b c d n
date
2016-01-01 1764.427143 2783.268889 1372.035556 2394.892000 NaN
2016-01-02 2783.710000 1978.700000 2249.120000 2899.566667 NaN
2016-01-13 2693.746000 2589.390000 3011.610000 2391.913750 NaN
2016-01-14 2456.080000 1274.623333 2336.740000 2341.664286 NaN
2016-01-15 2784.635000 4452.075000 2389.252500 2019.024000 NaN
2016-01-16 1871.776667 4390.910000 1474.106000 3959.580000 NaN
2016-01-17 1838.467500 2715.795000 2396.167500 1604.277500 NaN
2016-01-18 2287.111667 4001.793750 2302.727500 2558.880000 4192.760000
2016-01-19 2666.602500 1599.908333 1752.883333 1199.605556 NaN
2016-01-20 2931.550000 3002.483333 1593.802857 2323.136000 NaN
2016-01-21 2294.050000 2676.760000 1100.090000 2637.204167 NaN
2016-01-22 1982.850000 1257.962857 3056.168750 2241.121667 2267.086667
2016-01-23 2074.580000 2288.303333 3044.710000 2620.335000 1395.650000
2016-01-24 2643.150000 2974.468571 1296.088000 2374.154000 NaN
2016-01-25 2280.662857 2126.724000 2518.366000 1378.290000 NaN
2016-01-26 2856.388333 2502.004000 2032.230000 856.983333 NaN
2016-01-27 1861.821667 2942.900000 1988.678000 NaN 2465.666667
2016-01-28 2218.346667 4202.705000 2899.943333 2481.795000 786.270000
2016-01-29 2249.405000 2308.105714 3600.764444 2495.716667 NaN
2016-01-30 2340.668889 2407.977143 772.580000 2442.558571 NaN
2016-01-31 1747.980000 2723.865714 2928.438000 2564.803333 NaN
2016-02-01 4419.866667 2494.590000 3073.573750 2686.650000 NaN
2016-02-02 1486.748000 2444.455000 3577.707778 1073.430000 NaN
2016-03-01 2279.590000 4372.235714 2468.440000 2574.437143 NaN
2016-03-02 1490.204000 3080.872000 2099.906667 2161.867143 1920.210000
2016-04-01 2951.452857 1038.900000 2455.441429 2187.444286 NaN
2016-04-02 1507.856000 2675.234000 3138.163333 2199.732000 3027.860000
2016-05-01 3483.363333 2249.283333 2710.720000 2948.008333 NaN
2016-05-02 1992.621667 2404.948000 2845.672500 2339.355000 NaN
2016-06-01 2154.700000 2748.571250 1891.890000 2777.006667 NaN
2016-06-02 1924.160000 1321.549000 2408.695556 2147.680000 NaN
2016-07-01 2718.868333 2836.845714 3742.868333 3811.535556 538.690000
2016-07-02 3776.060000 4228.195000 2421.845000 2798.548000 3965.180000
2016-08-01 3618.547500 3413.140000 1708.800000 2434.700000 NaN
2016-08-02 2846.970000 2320.396250 3671.488182 1544.168000 NaN
2016-09-01 3104.240000 3417.488571 4045.373333 2343.676667 NaN
2016-09-02 1455.944000 2181.548889 2336.960000 2016.385000 3851.250000
2016-10-01 3133.875000 1264.810000 2009.400000 2793.730000 4184.955000
2016-10-02 1684.794000 1824.796250 2691.698000 1785.662000 NaN
2016-11-01 2104.942500 NaN 2520.680000 2366.991429 NaN
2016-11-02 4805.934000 2694.271667 1942.766667 1376.770000 2545.870000
2016-12-01 2400.422000 3680.301250 1640.072857 2402.953333 NaN

Stack a Dataframe

In [45]:
print(df.stack())
0    date                  2016-01-23 00:00:00
     price                             1395.65
     product_id                              0
     quantity_purchased                      3
     serial_no                            1000
     user_id                              -101
     user_type                               n
     user_class                          error
     purchase_week                           3
1    price                             1352.99
     product_id                            906
     quantity_purchased                     19
     serial_no                            1001
     user_id                              5632
     user_type                               n
     user_class                          error
     purchase_week                           0
2    price                             3997.32
     product_id                            625
     quantity_purchased                     21
     serial_no                            1002
     user_id                              5240
     user_type                               n
     user_class                          error
     purchase_week                           0
3    price                             3681.48
     product_id                            865
     quantity_purchased                     35
     serial_no                            1003
     user_id                              5557
                                  ...         
997  date                  2016-08-01 00:00:00
     price                             3413.14
     product_id                            965
     quantity_purchased                     33
     serial_no                            1997
     user_id                              5264
     user_type                               b
     user_class                            new
     purchase_week                          31
     encoded_user_type                       1
998  date                  2016-10-02 00:00:00
     price                             4820.05
     product_id                            678
     quantity_purchased                     36
     serial_no                            1998
     user_id                              5955
     user_type                               b
     user_class                            new
     purchase_week                          39
     encoded_user_type                       1
999  date                  2016-01-15 00:00:00
     price                             3906.33
     product_id                            551
     quantity_purchased                      3
     serial_no                            1999
     user_id                              5688
     user_type                               a
     user_class                            new
     purchase_week                           2
     encoded_user_type                       0
Length: 9907, dtype: object