In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
vstable = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store.csv", index_col=0)

vstable.shape
Out[2]:
(40, 6)
In [3]:
vstable.head(10)
Out[3]:
Gender Income Age Rentals AvgPerVisit Genre
ID
1 M 45000 25 27 2.5 Action
2 F 54000 33 12 3.4 Drama
3 F 32000 20 42 1.6 Comedy
4 F 59000 70 16 4.2 Drama
5 M 37000 35 25 3.2 Action
6 M 18000 20 33 1.7 Action
7 F 29000 45 19 3.8 Drama
8 M 74000 25 31 2.4 Action
9 M 38000 21 18 2.1 Comedy
10 F 65000 40 21 3.3 Drama
In [4]:
vstable.columns
Out[4]:
Index(['Gender', 'Income', 'Age', 'Rentals', 'AvgPerVisit', 'Genre'], dtype='object')
In [5]:
vstable.dtypes
Out[5]:
Gender          object
Income           int64
Age              int64
Rentals          int64
AvgPerVisit    float64
Genre           object
dtype: object

Now we can convert columns to the appropriate type as necessary:

In [6]:
vstable["Income"] = vstable["Income"].astype(float)
vstable.dtypes
Out[6]:
Gender          object
Income         float64
Age              int64
Rentals          int64
AvgPerVisit    float64
Genre           object
dtype: object
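As a side note (a minimal sketch on a made-up Series, not part of the Video Store data), pd.to_numeric is useful when a column arrives as strings; with errors="coerce", unparseable entries become NaN instead of raising an error:

# Sketch: parse strings to numbers; bad values become NaN
raw = pd.Series(["45000", "54000", "n/a"])
pd.to_numeric(raw, errors="coerce")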
In [7]:
vstable.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 1 to 40
Data columns (total 6 columns):
Gender         40 non-null object
Income         40 non-null float64
Age            40 non-null int64
Rentals        40 non-null int64
AvgPerVisit    40 non-null float64
Genre          40 non-null object
dtypes: float64(2), int64(2), object(2)
memory usage: 2.2+ KB
In [8]:
vstable.describe()
Out[8]:
Income Age Rentals AvgPerVisit
count 40.000000 40.000000 40.000000 40.000000
mean 41500.000000 31.500000 26.175000 2.792500
std 22925.744123 12.752074 9.594035 0.833401
min 1000.000000 15.000000 11.000000 1.200000
25% 24750.000000 21.000000 19.000000 2.200000
50% 41000.000000 30.000000 25.000000 2.800000
75% 57500.000000 36.500000 32.250000 3.325000
max 89000.000000 70.000000 48.000000 4.600000
In [9]:
min_sal = vstable["Income"].min()
max_sal = vstable["Income"].max()
print(min_sal, max_sal)
1000.0 89000.0
In [10]:
vstable.describe(include="all")
Out[10]:
Gender Income Age Rentals AvgPerVisit Genre
count 40 40.000000 40.000000 40.000000 40.000000 40
unique 2 NaN NaN NaN NaN 3
top M NaN NaN NaN NaN Action
freq 21 NaN NaN NaN NaN 15
mean NaN 41500.000000 31.500000 26.175000 2.792500 NaN
std NaN 22925.744123 12.752074 9.594035 0.833401 NaN
min NaN 1000.000000 15.000000 11.000000 1.200000 NaN
25% NaN 24750.000000 21.000000 19.000000 2.200000 NaN
50% NaN 41000.000000 30.000000 25.000000 2.800000 NaN
75% NaN 57500.000000 36.500000 32.250000 3.325000 NaN
max NaN 89000.000000 70.000000 48.000000 4.600000 NaN
In [11]:
vstable[["Income", "Age"]].describe()
Out[11]:
Income Age
count 40.000000 40.000000
mean 41500.000000 31.500000
std 22925.744123 12.752074
min 1000.000000 15.000000
25% 24750.000000 21.000000
50% 41000.000000 30.000000
75% 57500.000000 36.500000
max 89000.000000 70.000000

We can perform data transformations, such as normalization, by applying the operation directly to a pandas Series:

In [12]:
norm_sal = (vstable["Income"] - min_sal) / (max_sal-min_sal)
norm_sal.head(10)
Out[12]:
ID
1     0.500000
2     0.602273
3     0.352273
4     0.659091
5     0.409091
6     0.193182
7     0.318182
8     0.829545
9     0.420455
10    0.727273
Name: Income, dtype: float64

Z-Score Standardization on Age

In [13]:
age_z = (vstable["Age"] - vstable["Age"].mean()) / vstable["Age"].std()
age_z.head(5)
Out[13]:
ID
1   -0.509721
2    0.117628
3   -0.901814
4    3.019117
5    0.274465
Name: Age, dtype: float64

New columns can be added to the dataframe as needed:

In [14]:
vstable["Age-Std"] = age_z
vstable.head()
Out[14]:
Gender Income Age Rentals AvgPerVisit Genre Age-Std
ID
1 M 45000.0 25 27 2.5 Action -0.509721
2 F 54000.0 33 12 3.4 Drama 0.117628
3 F 32000.0 20 42 1.6 Comedy -0.901814
4 F 59000.0 70 16 4.2 Drama 3.019117
5 M 37000.0 35 25 3.2 Action 0.274465

Discretization with Pandas

In [15]:
# qcut discretizes a variable into equal-sized buckets based on rank or on sample quantiles.

inc_bins = pd.qcut(vstable.Income, 3)
inc_bins.head(10)
Out[15]:
ID
1     (29000.0, 49000.0]
2     (49000.0, 89000.0]
3     (29000.0, 49000.0]
4     (49000.0, 89000.0]
5     (29000.0, 49000.0]
6     (999.999, 29000.0]
7     (999.999, 29000.0]
8     (49000.0, 89000.0]
9     (29000.0, 49000.0]
10    (49000.0, 89000.0]
Name: Income, dtype: category
Categories (3, interval[float64]): [(999.999, 29000.0] < (29000.0, 49000.0] < (49000.0, 89000.0]]
In [16]:
# We can specify an array of quantiles for discretization, together with labels for the bins

inc_bins = pd.qcut(vstable.Income, [0, .33, .66, 1], labels=["low", "mid", "high"])
inc_bins.head(10)
Out[16]:
ID
1      mid
2     high
3      mid
4     high
5      mid
6      low
7      low
8     high
9      mid
10    high
Name: Income, dtype: category
Categories (3, object): [low < mid < high]
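For comparison, pd.cut bins by equal-width intervals over the value range rather than by quantiles; a minimal sketch (not run in the original notebook):

# Sketch: three equal-width Income bins with the same labels
inc_cut = pd.cut(vstable.Income, 3, labels=["low", "mid", "high"])
inc_cut.head()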
In [17]:
vstable["inc-bins"] = inc_bins
vstable.head(10)
Out[17]:
Gender Income Age Rentals AvgPerVisit Genre Age-Std inc-bins
ID
1 M 45000.0 25 27 2.5 Action -0.509721 mid
2 F 54000.0 33 12 3.4 Drama 0.117628 high
3 F 32000.0 20 42 1.6 Comedy -0.901814 mid
4 F 59000.0 70 16 4.2 Drama 3.019117 high
5 M 37000.0 35 25 3.2 Action 0.274465 mid
6 M 18000.0 20 33 1.7 Action -0.901814 low
7 F 29000.0 45 19 3.8 Drama 1.058651 low
8 M 74000.0 25 31 2.4 Action -0.509721 high
9 M 38000.0 21 18 2.1 Comedy -0.823395 mid
10 F 65000.0 40 21 3.3 Drama 0.666558 high
In [18]:
# We can also drop columns from the dataframe

vstable.drop(columns=['Age-Std','inc-bins'], inplace=True)
vstable.head()
Out[18]:
Gender Income Age Rentals AvgPerVisit Genre
ID
1 M 45000.0 25 27 2.5 Action
2 F 54000.0 33 12 3.4 Drama
3 F 32000.0 20 42 1.6 Comedy
4 F 59000.0 70 16 4.2 Drama
5 M 37000.0 35 25 3.2 Action
In [19]:
vs_numeric = vstable[["Age","Income","Rentals","AvgPerVisit"]]
vs_num_std = (vs_numeric - vs_numeric.mean()) / vs_numeric.std()
vs_num_std.head(10)
Out[19]:
Age Income Rentals AvgPerVisit
ID
1 -0.509721 0.152667 0.085991 -0.350971
2 0.117628 0.545239 -1.477480 0.728941
3 -0.901814 -0.414381 1.649462 -1.430883
4 3.019117 0.763334 -1.060555 1.688862
5 0.274465 -0.196286 -0.122472 0.488960
6 -0.901814 -1.025049 0.711379 -1.310893
7 1.058651 -0.545239 -0.747860 1.208901
8 -0.509721 1.417620 0.502917 -0.470962
9 -0.823395 -0.152667 -0.852092 -0.830932
10 0.666558 1.025049 -0.539398 0.608950
In [20]:
zscore = lambda x: (x - x.mean()) / x.std()
vs_num_std = vs_numeric.apply(zscore)
vs_num_std.head()
Out[20]:
Age Income Rentals AvgPerVisit
ID
1 -0.509721 0.152667 0.085991 -0.350971
2 0.117628 0.545239 -1.477480 0.728941
3 -0.901814 -0.414381 1.649462 -1.430883
4 3.019117 0.763334 -1.060555 1.688862
5 0.274465 -0.196286 -0.122472 0.488960
In [21]:
# Instead of separating the numeric attributes, we can condition the standardization function on the data types

zscore = lambda x: ((x - x.mean()) / x.std()) if (x.dtypes==np.float64 or x.dtypes==np.int64) else x
vs_std = vstable.copy()
vs_std.apply(zscore).head()
Out[21]:
Gender Income Age Rentals AvgPerVisit Genre
ID
1 M 0.152667 -0.509721 0.085991 -0.350971 Action
2 F 0.545239 0.117628 -1.477480 0.728941 Drama
3 F -0.414381 -0.901814 1.649462 -1.430883 Comedy
4 F 0.763334 3.019117 -1.060555 1.688862 Drama
5 M -0.196286 0.274465 -0.122472 0.488960 Action
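An equivalent approach (a sketch, not part of the original notebook) is to select the numeric columns with select_dtypes and standardize only those:

# Sketch: standardize only the numeric columns on a copy of the dataframe
num_cols = vstable.select_dtypes(include=[np.number]).columns
vs_std2 = vstable.copy()
vs_std2[num_cols] = vs_std2[num_cols].apply(lambda x: (x - x.mean()) / x.std())
vs_std2.head()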

Grouping and aggregating data

In [22]:
vstable.groupby("Gender").mean()
Out[22]:
Income Age Rentals AvgPerVisit
Gender
F 40631.578947 33.631579 27.684211 2.968421
M 42285.714286 29.571429 24.809524 2.633333
In [23]:
vstable.groupby("Genre").mean()
Out[23]:
Income Age Rentals AvgPerVisit
Genre
Action 32666.666667 24.066667 29.933333 2.466667
Comedy 45000.000000 31.916667 25.666667 2.641667
Drama 48461.538462 39.692308 22.307692 3.307692
In [24]:
vstable.groupby("Genre").describe().T
Out[24]:
Genre Action Comedy Drama
Age count 15.000000 12.000000 13.000000
mean 24.066667 31.916667 39.692308
std 6.374802 14.650215 11.933040
min 16.000000 15.000000 22.000000
25% 19.000000 20.750000 33.000000
50% 25.000000 27.500000 36.000000
75% 27.000000 46.000000 45.000000
max 35.000000 56.000000 70.000000
AvgPerVisit count 15.000000 12.000000 13.000000
mean 2.466667 2.641667 3.307692
std 0.776132 0.967150 0.504086
min 1.400000 1.200000 2.300000
25% 1.950000 1.975000 3.100000
50% 2.400000 2.600000 3.300000
75% 2.800000 3.300000 3.600000
max 4.600000 4.100000 4.200000
Income count 15.000000 12.000000 13.000000
mean 32666.666667 45000.000000 48461.538462
std 21562.754484 29073.574381 15119.608596
min 6000.000000 1000.000000 25000.000000
25% 17000.000000 27750.000000 41000.000000
50% 26000.000000 43500.000000 47000.000000
75% 43000.000000 68000.000000 59000.000000
max 74000.000000 89000.000000 79000.000000
Rentals count 15.000000 12.000000 13.000000
mean 29.933333 25.666667 22.307692
std 7.591976 10.662878 9.672854
min 17.000000 12.000000 11.000000
25% 25.500000 17.250000 16.000000
50% 29.000000 23.500000 21.000000
75% 35.000000 34.500000 24.000000
max 43.000000 42.000000 48.000000
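When only specific statistics are needed, groupby results can be narrowed with agg; a minimal sketch:

# Sketch: per-genre mean Rentals, and mean and max Income
vstable.groupby("Genre").agg({"Rentals": "mean", "Income": ["mean", "max"]})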
In [25]:
vstable["Income"].plot(kind="hist", bins=6)
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x24fb262ba20>
In [26]:
vstable["Genre"].value_counts().plot(kind='bar')
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x24fb26ef748>
In [27]:
temp1 = vstable["Genre"].value_counts()/vstable["Genre"].count()
temp2 = vstable["Gender"].value_counts()/vstable["Gender"].count()
temp2

fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_xlabel('Genre')
ax1.set_ylabel('Percentage')
ax1.set_title("Genre Distribution")
temp1.plot(kind='bar', grid = True)

ax2 = fig.add_subplot(122)
ax2.set_xlabel('Gender')
ax2.set_ylabel('Percentage')
ax2.set_title("Gender Distribution")
temp2.plot(kind='bar', grid=True)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x24fb277c860>
In [28]:
vstable.plot(x="Income", y="Age", kind="scatter")
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x24fb2742ba8>
In [29]:
vstable.plot(x="Income", y="Age", kind="scatter", alpha=0.8, s=vstable["Rentals"]*5, c="AvgPerVisit", cmap=plt.get_cmap("jet"), colorbar=True, figsize=(10,7))
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x24fb27cbd30>
In [30]:
vstable.groupby(["Genre","Gender"])["Gender"].count()
Out[30]:
Genre   Gender
Action  F          5
        M         10
Comedy  F          6
        M          6
Drama   F          8
        M          5
Name: Gender, dtype: int64
In [31]:
gg = pd.crosstab(vstable["Genre"], vstable["Gender"])
gg
Out[31]:
Gender F M
Genre
Action 5 10
Comedy 6 6
Drama 8 5
In [32]:
plt.show(gg.plot(kind="bar"))
In [33]:
gg["percent_female"] = gg["F"]/(gg["F"]+gg["M"])
gg
Out[33]:
Gender F M percent_female
Genre
Action 5 10 0.333333
Comedy 6 6 0.500000
Drama 8 5 0.615385
In [34]:
plt.show(gg["percent_female"].plot(kind="bar"))

Suppose that we would like to find all "good customers", i.e., those with a Rentals value of 30 or more:

In [35]:
good_cust = vstable[vstable.Rentals>=30]
good_cust
Out[35]:
Gender Income Age Rentals AvgPerVisit Genre
ID
3 F 32000.0 20 42 1.6 Comedy
6 M 18000.0 20 33 1.7 Action
8 M 74000.0 25 31 2.4 Action
11 F 41000.0 22 48 2.3 Drama
15 M 68000.0 30 36 2.7 Comedy
18 F 6000.0 16 39 1.8 Action
19 F 24000.0 25 41 3.1 Comedy
23 F 2000.0 15 30 2.5 Comedy
26 F 56000.0 35 40 2.6 Action
27 F 62000.0 47 32 3.6 Drama
29 F 15000.0 18 37 2.1 Action
35 M 74000.0 29 43 4.6 Action
36 F 29000.0 21 34 2.3 Comedy
40 M 17000.0 19 32 1.8 Action
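The same boolean filter can also be written with the query method; a minimal sketch:

# Sketch: query expresses the filter as a string expression
good_cust_q = vstable.query("Rentals >= 30")
good_cust_q.shape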
In [36]:
print("Good Customers:")
good_cust.describe()
Good Customers:
Out[36]:
Income Age Rentals AvgPerVisit
count 14.000000 14.000000 14.000000 14.000000
mean 37000.000000 24.428571 37.000000 2.507143
std 25404.421178 8.599770 5.349335 0.818502
min 2000.000000 15.000000 30.000000 1.600000
25% 17250.000000 19.250000 32.250000 1.875000
50% 30500.000000 21.500000 36.500000 2.350000
75% 60500.000000 28.000000 40.750000 2.675000
max 74000.000000 47.000000 48.000000 4.600000
In [37]:
print("All Customers:")
vstable.describe()
All Customers:
Out[37]:
Income Age Rentals AvgPerVisit
count 40.000000 40.000000 40.000000 40.000000
mean 41500.000000 31.500000 26.175000 2.792500
std 22925.744123 12.752074 9.594035 0.833401
min 1000.000000 15.000000 11.000000 1.200000
25% 24750.000000 21.000000 19.000000 2.200000
50% 41000.000000 30.000000 25.000000 2.800000
75% 57500.000000 36.500000 32.250000 3.325000
max 89000.000000 70.000000 48.000000 4.600000

Creating dummy variables and converting to standard spreadsheet format (all numeric attributes)

In [38]:
gender_bin = pd.get_dummies(vstable["Gender"], prefix="Gender")
gender_bin.head()
Out[38]:
Gender_F Gender_M
ID
1 0 1
2 1 0
3 1 0
4 1 0
5 0 1
In [39]:
vs_ssf = pd.get_dummies(vstable)
vs_ssf.head(10)
Out[39]:
Income Age Rentals AvgPerVisit Gender_F Gender_M Genre_Action Genre_Comedy Genre_Drama
ID
1 45000.0 25 27 2.5 0 1 1 0 0
2 54000.0 33 12 3.4 1 0 0 0 1
3 32000.0 20 42 1.6 1 0 0 1 0
4 59000.0 70 16 4.2 1 0 0 0 1
5 37000.0 35 25 3.2 0 1 1 0 0
6 18000.0 20 33 1.7 0 1 1 0 0
7 29000.0 45 19 3.8 1 0 0 0 1
8 74000.0 25 31 2.4 0 1 1 0 0
9 38000.0 21 18 2.1 0 1 0 1 0
10 65000.0 40 21 3.3 1 0 0 0 1
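Note that the two Gender dummies are perfectly (negatively) correlated, as the correlation matrix below confirms. If that redundancy is a concern, get_dummies can drop one level per categorical variable; a minimal sketch:

# Sketch: keep k-1 dummy columns per categorical variable
vs_ssf_reduced = pd.get_dummies(vstable, drop_first=True)
vs_ssf_reduced.columns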
In [40]:
vs_ssf.describe()
Out[40]:
Income Age Rentals AvgPerVisit Gender_F Gender_M Genre_Action Genre_Comedy Genre_Drama
count 40.000000 40.000000 40.000000 40.000000 40.000000 40.000000 40.00000 40.000000 40.000000
mean 41500.000000 31.500000 26.175000 2.792500 0.475000 0.525000 0.37500 0.300000 0.325000
std 22925.744123 12.752074 9.594035 0.833401 0.505736 0.505736 0.49029 0.464095 0.474342
min 1000.000000 15.000000 11.000000 1.200000 0.000000 0.000000 0.00000 0.000000 0.000000
25% 24750.000000 21.000000 19.000000 2.200000 0.000000 0.000000 0.00000 0.000000 0.000000
50% 41000.000000 30.000000 25.000000 2.800000 0.000000 1.000000 0.00000 0.000000 0.000000
75% 57500.000000 36.500000 32.250000 3.325000 1.000000 1.000000 1.00000 1.000000 1.000000
max 89000.000000 70.000000 48.000000 4.600000 1.000000 1.000000 1.00000 1.000000 1.000000
In [41]:
# Min-Max normalization performed on the full numeric data set

vs_norm = (vs_ssf - vs_ssf.min()) / (vs_ssf.max()-vs_ssf.min())
vs_norm.head(10)
Out[41]:
Income Age Rentals AvgPerVisit Gender_F Gender_M Genre_Action Genre_Comedy Genre_Drama
ID
1 0.500000 0.181818 0.432432 0.382353 0.0 1.0 1.0 0.0 0.0
2 0.602273 0.327273 0.027027 0.647059 1.0 0.0 0.0 0.0 1.0
3 0.352273 0.090909 0.837838 0.117647 1.0 0.0 0.0 1.0 0.0
4 0.659091 1.000000 0.135135 0.882353 1.0 0.0 0.0 0.0 1.0
5 0.409091 0.363636 0.378378 0.588235 0.0 1.0 1.0 0.0 0.0
6 0.193182 0.090909 0.594595 0.147059 0.0 1.0 1.0 0.0 0.0
7 0.318182 0.545455 0.216216 0.764706 1.0 0.0 0.0 0.0 1.0
8 0.829545 0.181818 0.540541 0.352941 0.0 1.0 1.0 0.0 0.0
9 0.420455 0.109091 0.189189 0.264706 0.0 1.0 0.0 1.0 0.0
10 0.727273 0.454545 0.270270 0.617647 1.0 0.0 0.0 0.0 1.0
In [45]:
# After converting to all numeric attributes, we can perform correlation analysis on the variables

corr_matrix = vs_ssf.corr()
corr_matrix
Out[45]:
Income Age Rentals AvgPerVisit Gender_F Gender_M Genre_Action Genre_Comedy Genre_Drama
Income 1.000000 0.613769 -0.262472 0.468565 -0.036490 0.036490 -0.302256 0.101217 0.213388
Age 0.613769 1.000000 -0.547113 0.629107 0.161022 -0.161022 -0.457274 0.021663 0.451453
Rentals -0.262472 -0.547113 1.000000 -0.206353 0.151535 -0.151535 0.307303 -0.035128 -0.283266
AvgPerVisit 0.468565 0.629107 -0.206353 1.000000 0.203343 -0.203343 -0.306701 -0.119992 0.434413
Gender_F -0.036490 0.161022 0.151535 0.203343 1.000000 -1.000000 -0.219744 0.032774 0.195067
Gender_M 0.036490 -0.161022 -0.151535 -0.203343 -1.000000 1.000000 0.219744 -0.032774 -0.195067
Genre_Action -0.302256 -0.457274 0.307303 -0.306701 -0.219744 0.219744 1.000000 -0.507093 -0.537484
Genre_Comedy 0.101217 0.021663 -0.035128 -0.119992 0.032774 -0.032774 -0.507093 1.000000 -0.454257
Genre_Drama 0.213388 0.451453 -0.283266 0.434413 0.195067 -0.195067 -0.537484 -0.454257 1.000000
In [46]:
corr_matrix["Rentals"].sort_values(ascending=False)
Out[46]:
Rentals         1.000000
Genre_Action    0.307303
Gender_F        0.151535
Genre_Comedy   -0.035128
Gender_M       -0.151535
AvgPerVisit    -0.206353
Income         -0.262472
Genre_Drama    -0.283266
Age            -0.547113
Name: Rentals, dtype: float64
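A quick way to visualize the correlation matrix is a heatmap; a sketch using matplotlib directly:

# Sketch: heatmap of the correlation matrix with labeled ticks
plt.matshow(corr_matrix, cmap=plt.get_cmap("coolwarm"))
plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns, rotation=90)
plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
plt.colorbar()
plt.show()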

The new table can be written to a file using the to_csv method:

In [43]:
vs_norm.to_csv("Video_Store_Numeric.csv", float_format="%1.2f")