import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
vstable = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store.csv", index_col=0)
vstable.shape
(40, 6)
vstable.head(10)
Gender | Income | Age | Rentals | AvgPerVisit | Genre | |
---|---|---|---|---|---|---|
ID | ||||||
1 | M | 45000 | 25 | 27 | 2.5 | Action |
2 | F | 54000 | 33 | 12 | 3.4 | Drama |
3 | F | 32000 | 20 | 42 | 1.6 | Comedy |
4 | F | 59000 | 70 | 16 | 4.2 | Drama |
5 | M | 37000 | 35 | 25 | 3.2 | Action |
6 | M | 18000 | 20 | 33 | 1.7 | Action |
7 | F | 29000 | 45 | 19 | 3.8 | Drama |
8 | M | 74000 | 25 | 31 | 2.4 | Action |
9 | M | 38000 | 21 | 18 | 2.1 | Comedy |
10 | F | 65000 | 40 | 21 | 3.3 | Drama |
vstable.columns
Index([u'Gender', u'Income', u'Age', u'Rentals', u'AvgPerVisit', u'Genre'], dtype='object')
vstable.dtypes
Gender object Income int64 Age int64 Rentals int64 AvgPerVisit float64 Genre object dtype: object
vstable["Income"] = vstable["Income"].astype(float)
vstable.dtypes
Gender object Income float64 Age int64 Rentals int64 AvgPerVisit float64 Genre object dtype: object
vstable.describe()
Income | Age | Rentals | AvgPerVisit | |
---|---|---|---|---|
count | 40.000000 | 40.000000 | 40.000000 | 40.000000 |
mean | 41500.000000 | 31.500000 | 26.175000 | 2.792500 |
std | 22925.744123 | 12.752074 | 9.594035 | 0.833401 |
min | 1000.000000 | 15.000000 | 11.000000 | 1.200000 |
25% | 24750.000000 | 21.000000 | 19.000000 | 2.200000 |
50% | 41000.000000 | 30.000000 | 25.000000 | 2.800000 |
75% | 57500.000000 | 36.500000 | 32.250000 | 3.325000 |
max | 89000.000000 | 70.000000 | 48.000000 | 4.600000 |
min_sal = vstable["Income"].min()
max_sal = vstable["Income"].max()
print min_sal, max_sal
1000.0 89000.0
vstable.describe(include="all")
Gender | Income | Age | Rentals | AvgPerVisit | Genre | |
---|---|---|---|---|---|---|
count | 40 | 40.000000 | 40.000000 | 40.000000 | 40.000000 | 40 |
unique | 2 | NaN | NaN | NaN | NaN | 3 |
top | M | NaN | NaN | NaN | NaN | Action |
freq | 21 | NaN | NaN | NaN | NaN | 15 |
mean | NaN | 41500.000000 | 31.500000 | 26.175000 | 2.792500 | NaN |
std | NaN | 22925.744123 | 12.752074 | 9.594035 | 0.833401 | NaN |
min | NaN | 1000.000000 | 15.000000 | 11.000000 | 1.200000 | NaN |
25% | NaN | 24750.000000 | 21.000000 | 19.000000 | 2.200000 | NaN |
50% | NaN | 41000.000000 | 30.000000 | 25.000000 | 2.800000 | NaN |
75% | NaN | 57500.000000 | 36.500000 | 32.250000 | 3.325000 | NaN |
max | NaN | 89000.000000 | 70.000000 | 48.000000 | 4.600000 | NaN |
norm_sal = (vstable["Income"] - min_sal) / (max_sal-min_sal)
print norm_sal.head(10)
ID 1 0.500000 2 0.602273 3 0.352273 4 0.659091 5 0.409091 6 0.193182 7 0.318182 8 0.829545 9 0.420455 10 0.727273 Name: Income, dtype: float64
age_z = (vstable["Age"] - vstable["Age"].mean()) / vstable["Age"].std()
age_z.head(5)
ID 1 -0.509721 2 0.117628 3 -0.901814 4 3.019117 5 0.274465 Name: Age, dtype: float64
vstable["Age-Std"] = age_z
vstable.head()
Gender | Income | Age | Rentals | AvgPerVisit | Genre | Age-Std | |
---|---|---|---|---|---|---|---|
ID | |||||||
1 | M | 45000.0 | 25 | 27 | 2.5 | Action | -0.509721 |
2 | F | 54000.0 | 33 | 12 | 3.4 | Drama | 0.117628 |
3 | F | 32000.0 | 20 | 42 | 1.6 | Comedy | -0.901814 |
4 | F | 59000.0 | 70 | 16 | 4.2 | Drama | 3.019117 |
5 | M | 37000.0 | 35 | 25 | 3.2 | Action | 0.274465 |
inc_bins = pd.qcut(vstable.Income, 3)
inc_bins.head(10)
ID 1 (29000, 49000] 2 (49000, 89000] 3 (29000, 49000] 4 (49000, 89000] 5 (29000, 49000] 6 [1000, 29000] 7 [1000, 29000] 8 (49000, 89000] 9 (29000, 49000] 10 (49000, 89000] Name: Income, dtype: category Categories (3, object): [[1000, 29000] < (29000, 49000] < (49000, 89000]]
inc_bins = pd.qcut(vstable.Income, [0, .33, .66, 1], labels=["low", "mid", "high"])
inc_bins
ID 1 mid 2 high 3 mid 4 high 5 mid 6 low 7 low 8 high 9 mid 10 high 11 mid 12 low 13 high 14 mid 15 high 16 low 17 mid 18 low 19 low 20 low 21 mid 22 low 23 low 24 high 25 low 26 high 27 high 28 high 29 low 30 mid 31 high 32 mid 33 low 34 low 35 high 36 low 37 high 38 mid 39 high 40 low Name: Income, dtype: category Categories (3, object): [low < mid < high]
vs2 = pd.concat([vstable,inc_bins], axis=1)
vs2
Gender | Income | Age | Rentals | AvgPerVisit | Genre | Age-Std | Income | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1 | M | 45000.0 | 25 | 27 | 2.5 | Action | -0.509721 | mid |
2 | F | 54000.0 | 33 | 12 | 3.4 | Drama | 0.117628 | high |
3 | F | 32000.0 | 20 | 42 | 1.6 | Comedy | -0.901814 | mid |
4 | F | 59000.0 | 70 | 16 | 4.2 | Drama | 3.019117 | high |
5 | M | 37000.0 | 35 | 25 | 3.2 | Action | 0.274465 | mid |
6 | M | 18000.0 | 20 | 33 | 1.7 | Action | -0.901814 | low |
7 | F | 29000.0 | 45 | 19 | 3.8 | Drama | 1.058651 | low |
8 | M | 74000.0 | 25 | 31 | 2.4 | Action | -0.509721 | high |
9 | M | 38000.0 | 21 | 18 | 2.1 | Comedy | -0.823395 | mid |
10 | F | 65000.0 | 40 | 21 | 3.3 | Drama | 0.666558 | high |
11 | F | 41000.0 | 22 | 48 | 2.3 | Drama | -0.744977 | mid |
12 | F | 26000.0 | 22 | 29 | 2.9 | Action | -0.744977 | low |
13 | M | 83000.0 | 46 | 14 | 3.6 | Comedy | 1.137070 | high |
14 | M | 45000.0 | 36 | 24 | 2.7 | Drama | 0.352884 | mid |
15 | M | 68000.0 | 30 | 36 | 2.7 | Comedy | -0.117628 | high |
16 | M | 17000.0 | 19 | 26 | 2.2 | Action | -0.980233 | low |
17 | M | 36000.0 | 35 | 28 | 3.5 | Drama | 0.274465 | mid |
18 | F | 6000.0 | 16 | 39 | 1.8 | Action | -1.215489 | low |
19 | F | 24000.0 | 25 | 41 | 3.1 | Comedy | -0.509721 | low |
20 | M | 12000.0 | 16 | 23 | 2.2 | Action | -1.215489 | low |
21 | F | 47000.0 | 52 | 11 | 3.1 | Drama | 1.607582 | mid |
22 | M | 25000.0 | 33 | 16 | 2.9 | Drama | 0.117628 | low |
23 | F | 2000.0 | 15 | 30 | 2.5 | Comedy | -1.293907 | low |
24 | F | 79000.0 | 35 | 22 | 3.8 | Drama | 0.274465 | high |
25 | M | 1000.0 | 16 | 25 | 1.4 | Comedy | -1.215489 | low |
26 | F | 56000.0 | 35 | 40 | 2.6 | Action | 0.274465 | high |
27 | F | 62000.0 | 47 | 32 | 3.6 | Drama | 1.215489 | high |
28 | M | 57000.0 | 52 | 22 | 4.1 | Comedy | 1.607582 | high |
29 | F | 15000.0 | 18 | 37 | 2.1 | Action | -1.058651 | low |
30 | M | 41000.0 | 25 | 17 | 1.4 | Action | -0.509721 | mid |
31 | F | 49000.0 | 56 | 15 | 3.2 | Comedy | 1.921256 | high |
32 | M | 47000.0 | 30 | 21 | 3.1 | Drama | -0.117628 | mid |
33 | M | 23000.0 | 25 | 28 | 2.7 | Action | -0.509721 | low |
34 | F | 29000.0 | 32 | 19 | 2.9 | Action | 0.039209 | low |
35 | M | 74000.0 | 29 | 43 | 4.6 | Action | -0.196047 | high |
36 | F | 29000.0 | 21 | 34 | 2.3 | Comedy | -0.823395 | low |
37 | M | 89000.0 | 46 | 12 | 1.2 | Comedy | 1.137070 | high |
38 | M | 41000.0 | 38 | 20 | 3.3 | Drama | 0.509721 | mid |
39 | F | 68000.0 | 35 | 19 | 3.9 | Comedy | 0.274465 | high |
40 | M | 17000.0 | 19 | 32 | 1.8 | Action | -0.980233 | low |
vstable["inc-bins"] = inc_bins
vstable
Gender | Income | Age | Rentals | AvgPerVisit | Genre | Age-Std | inc-bins | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1 | M | 45000.0 | 25 | 27 | 2.5 | Action | -0.509721 | mid |
2 | F | 54000.0 | 33 | 12 | 3.4 | Drama | 0.117628 | high |
3 | F | 32000.0 | 20 | 42 | 1.6 | Comedy | -0.901814 | mid |
4 | F | 59000.0 | 70 | 16 | 4.2 | Drama | 3.019117 | high |
5 | M | 37000.0 | 35 | 25 | 3.2 | Action | 0.274465 | mid |
6 | M | 18000.0 | 20 | 33 | 1.7 | Action | -0.901814 | low |
7 | F | 29000.0 | 45 | 19 | 3.8 | Drama | 1.058651 | low |
8 | M | 74000.0 | 25 | 31 | 2.4 | Action | -0.509721 | high |
9 | M | 38000.0 | 21 | 18 | 2.1 | Comedy | -0.823395 | mid |
10 | F | 65000.0 | 40 | 21 | 3.3 | Drama | 0.666558 | high |
11 | F | 41000.0 | 22 | 48 | 2.3 | Drama | -0.744977 | mid |
12 | F | 26000.0 | 22 | 29 | 2.9 | Action | -0.744977 | low |
13 | M | 83000.0 | 46 | 14 | 3.6 | Comedy | 1.137070 | high |
14 | M | 45000.0 | 36 | 24 | 2.7 | Drama | 0.352884 | mid |
15 | M | 68000.0 | 30 | 36 | 2.7 | Comedy | -0.117628 | high |
16 | M | 17000.0 | 19 | 26 | 2.2 | Action | -0.980233 | low |
17 | M | 36000.0 | 35 | 28 | 3.5 | Drama | 0.274465 | mid |
18 | F | 6000.0 | 16 | 39 | 1.8 | Action | -1.215489 | low |
19 | F | 24000.0 | 25 | 41 | 3.1 | Comedy | -0.509721 | low |
20 | M | 12000.0 | 16 | 23 | 2.2 | Action | -1.215489 | low |
21 | F | 47000.0 | 52 | 11 | 3.1 | Drama | 1.607582 | mid |
22 | M | 25000.0 | 33 | 16 | 2.9 | Drama | 0.117628 | low |
23 | F | 2000.0 | 15 | 30 | 2.5 | Comedy | -1.293907 | low |
24 | F | 79000.0 | 35 | 22 | 3.8 | Drama | 0.274465 | high |
25 | M | 1000.0 | 16 | 25 | 1.4 | Comedy | -1.215489 | low |
26 | F | 56000.0 | 35 | 40 | 2.6 | Action | 0.274465 | high |
27 | F | 62000.0 | 47 | 32 | 3.6 | Drama | 1.215489 | high |
28 | M | 57000.0 | 52 | 22 | 4.1 | Comedy | 1.607582 | high |
29 | F | 15000.0 | 18 | 37 | 2.1 | Action | -1.058651 | low |
30 | M | 41000.0 | 25 | 17 | 1.4 | Action | -0.509721 | mid |
31 | F | 49000.0 | 56 | 15 | 3.2 | Comedy | 1.921256 | high |
32 | M | 47000.0 | 30 | 21 | 3.1 | Drama | -0.117628 | mid |
33 | M | 23000.0 | 25 | 28 | 2.7 | Action | -0.509721 | low |
34 | F | 29000.0 | 32 | 19 | 2.9 | Action | 0.039209 | low |
35 | M | 74000.0 | 29 | 43 | 4.6 | Action | -0.196047 | high |
36 | F | 29000.0 | 21 | 34 | 2.3 | Comedy | -0.823395 | low |
37 | M | 89000.0 | 46 | 12 | 1.2 | Comedy | 1.137070 | high |
38 | M | 41000.0 | 38 | 20 | 3.3 | Drama | 0.509721 | mid |
39 | F | 68000.0 | 35 | 19 | 3.9 | Comedy | 0.274465 | high |
40 | M | 17000.0 | 19 | 32 | 1.8 | Action | -0.980233 | low |
vs_numeric = vstable[["Age","Income","Rentals","AvgPerVisit"]]
vs_num_std = (vs_numeric - vs_numeric.mean()) / vs_numeric.std()
vs_num_std.head(10)
Age | Income | Rentals | AvgPerVisit | |
---|---|---|---|---|
ID | ||||
1 | -0.509721 | 0.152667 | 0.085991 | -0.350971 |
2 | 0.117628 | 0.545239 | -1.477480 | 0.728941 |
3 | -0.901814 | -0.414381 | 1.649462 | -1.430883 |
4 | 3.019117 | 0.763334 | -1.060555 | 1.688862 |
5 | 0.274465 | -0.196286 | -0.122472 | 0.488960 |
6 | -0.901814 | -1.025049 | 0.711379 | -1.310893 |
7 | 1.058651 | -0.545239 | -0.747860 | 1.208901 |
8 | -0.509721 | 1.417620 | 0.502917 | -0.470962 |
9 | -0.823395 | -0.152667 | -0.852092 | -0.830932 |
10 | 0.666558 | 1.025049 | -0.539398 | 0.608950 |
zscore = lambda x: (x - x.mean()) / x.std()
vs_num_std = vs_numeric.apply(zscore)
vs_num_std.head()
Age | Income | Rentals | AvgPerVisit | |
---|---|---|---|---|
ID | ||||
1 | -0.509721 | 0.152667 | 0.085991 | -0.350971 |
2 | 0.117628 | 0.545239 | -1.477480 | 0.728941 |
3 | -0.901814 | -0.414381 | 1.649462 | -1.430883 |
4 | 3.019117 | 0.763334 | -1.060555 | 1.688862 |
5 | 0.274465 | -0.196286 | -0.122472 | 0.488960 |
zscore = lambda x: ((x - x.mean()) / x.std()) if (x.dtypes==np.float64 or x.dtypes==np.int64) else x
vs_std = vstable.copy()
vs_std.apply(zscore).head()
Gender | Income | Age | Rentals | AvgPerVisit | Genre | Age-Std | inc-bins | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
1 | M | 0.152667 | -0.509721 | 0.085991 | -0.350971 | Action | -0.509721 | mid |
2 | F | 0.545239 | 0.117628 | -1.477480 | 0.728941 | Drama | 0.117628 | high |
3 | F | -0.414381 | -0.901814 | 1.649462 | -1.430883 | Comedy | -0.901814 | mid |
4 | F | 0.763334 | 3.019117 | -1.060555 | 1.688862 | Drama | 3.019117 | high |
5 | M | -0.196286 | 0.274465 | -0.122472 | 0.488960 | Action | 0.274465 | mid |
vstable.groupby("Gender").mean()
Income | Age | Rentals | AvgPerVisit | Age-Std | |
---|---|---|---|---|---|
Gender | |||||
F | 40631.578947 | 33.631579 | 27.684211 | 2.968421 | 0.167155 |
M | 42285.714286 | 29.571429 | 24.809524 | 2.633333 | -0.151236 |
vstable.groupby("Genre").mean()
Income | Age | Rentals | AvgPerVisit | Age-Std | |
---|---|---|---|---|---|
Genre | |||||
Action | 32666.666667 | 24.066667 | 29.933333 | 2.466667 | -0.582912 |
Comedy | 45000.000000 | 31.916667 | 25.666667 | 2.641667 | 0.032674 |
Drama | 48461.538462 | 39.692308 | 22.307692 | 3.307692 | 0.642429 |
vstable.groupby("Genre").describe()
Age | Age-Std | AvgPerVisit | Income | Rentals | ||
---|---|---|---|---|---|---|
Genre | ||||||
Action | count | 15.000000 | 15.000000 | 15.000000 | 15.000000 | 15.000000 |
mean | 24.066667 | -0.582912 | 2.466667 | 32666.666667 | 29.933333 | |
std | 6.374802 | 0.499903 | 0.776132 | 21562.754484 | 7.591976 | |
min | 16.000000 | -1.215489 | 1.400000 | 6000.000000 | 17.000000 | |
25% | 19.000000 | -0.980233 | 1.950000 | 17000.000000 | 25.500000 | |
50% | 25.000000 | -0.509721 | 2.400000 | 26000.000000 | 29.000000 | |
75% | 27.000000 | -0.352884 | 2.800000 | 43000.000000 | 35.000000 | |
max | 35.000000 | 0.274465 | 4.600000 | 74000.000000 | 43.000000 | |
Comedy | count | 12.000000 | 12.000000 | 12.000000 | 12.000000 | 12.000000 |
mean | 31.916667 | 0.032674 | 2.641667 | 45000.000000 | 25.666667 | |
std | 14.650215 | 1.148850 | 0.967150 | 29073.574381 | 10.662878 | |
min | 15.000000 | -1.293907 | 1.200000 | 1000.000000 | 12.000000 | |
25% | 20.750000 | -0.843000 | 1.975000 | 27750.000000 | 17.250000 | |
50% | 27.500000 | -0.313674 | 2.600000 | 43500.000000 | 23.500000 | |
75% | 46.000000 | 1.137070 | 3.300000 | 68000.000000 | 34.500000 | |
max | 56.000000 | 1.921256 | 4.100000 | 89000.000000 | 42.000000 | |
Drama | count | 13.000000 | 13.000000 | 13.000000 | 13.000000 | 13.000000 |
mean | 39.692308 | 0.642429 | 3.307692 | 48461.538462 | 22.307692 | |
std | 11.933040 | 0.935772 | 0.504086 | 15119.608596 | 9.672854 | |
min | 22.000000 | -0.744977 | 2.300000 | 25000.000000 | 11.000000 | |
25% | 33.000000 | 0.117628 | 3.100000 | 41000.000000 | 16.000000 | |
50% | 36.000000 | 0.352884 | 3.300000 | 47000.000000 | 21.000000 | |
75% | 45.000000 | 1.058651 | 3.600000 | 59000.000000 | 24.000000 | |
max | 70.000000 | 3.019117 | 4.200000 | 79000.000000 | 48.000000 |
vstable["Income"].plot(kind="hist", bins=6)
<matplotlib.axes._subplots.AxesSubplot at 0x9eb0c50>
vstable["Genre"].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xa142470>
temp1 = vstable["Genre"].value_counts()/vstable["Genre"].count()
temp2 = vstable["Gender"].value_counts()/vstable["Gender"].count()
temp2
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_xlabel('Genre')
ax1.set_ylabel('Percentage')
ax1.set_title("Genre Distribution")
temp1.plot(kind='bar', grid = True)
ax1 = fig.add_subplot(122)
ax1.set_xlabel('Gender')
ax1.set_ylabel('Percentage')
ax1.set_title("Gender Distribution")
temp2.plot(kind='bar', grid = True)
<matplotlib.axes._subplots.AxesSubplot at 0xa487860>
vstable.plot(x="Income", y="Age", kind="scatter")
<matplotlib.axes._subplots.AxesSubplot at 0xa2e6908>
vstable.groupby(["Genre","Gender"])["Gender"].count()
Genre Gender Action F 5 M 10 Comedy F 6 M 6 Drama F 8 M 5 Name: Gender, dtype: int64
gg = pd.crosstab(vstable["Genre"], vstable["Gender"])
gg
Gender | F | M |
---|---|---|
Genre | ||
Action | 5 | 10 |
Comedy | 6 | 6 |
Drama | 8 | 5 |
plt.show(gg.plot(kind="bar"))
gg["percent_female"] = gg["F"]/(gg["F"]+gg["M"])
gg
Gender | F | M | percent_female |
---|---|---|---|
Genre | |||
Action | 5 | 10 | 0.333333 |
Comedy | 6 | 6 | 0.500000 |
Drama | 8 | 5 | 0.615385 |
plt.show(gg["percent_female"].plot(kind="bar"))
good_cust = vstable[vstable.Rentals>=30]
good_cust
Gender | Income | Age | Rentals | AvgPerVisit | Genre | Age-Std | inc-bins | |
---|---|---|---|---|---|---|---|---|
ID | ||||||||
3 | F | 32000.0 | 20 | 42 | 1.6 | Comedy | -0.901814 | mid |
6 | M | 18000.0 | 20 | 33 | 1.7 | Action | -0.901814 | low |
8 | M | 74000.0 | 25 | 31 | 2.4 | Action | -0.509721 | high |
11 | F | 41000.0 | 22 | 48 | 2.3 | Drama | -0.744977 | mid |
15 | M | 68000.0 | 30 | 36 | 2.7 | Comedy | -0.117628 | high |
18 | F | 6000.0 | 16 | 39 | 1.8 | Action | -1.215489 | low |
19 | F | 24000.0 | 25 | 41 | 3.1 | Comedy | -0.509721 | low |
23 | F | 2000.0 | 15 | 30 | 2.5 | Comedy | -1.293907 | low |
26 | F | 56000.0 | 35 | 40 | 2.6 | Action | 0.274465 | high |
27 | F | 62000.0 | 47 | 32 | 3.6 | Drama | 1.215489 | high |
29 | F | 15000.0 | 18 | 37 | 2.1 | Action | -1.058651 | low |
35 | M | 74000.0 | 29 | 43 | 4.6 | Action | -0.196047 | high |
36 | F | 29000.0 | 21 | 34 | 2.3 | Comedy | -0.823395 | low |
40 | M | 17000.0 | 19 | 32 | 1.8 | Action | -0.980233 | low |
print "Good Customers:\n", good_cust.describe()
print "\n All Customers:\n", vstable.describe()
Good Customers: Income Age Rentals AvgPerVisit Age-Std count 14.000000 14.000000 14.000000 14.000000 14.000000 mean 37000.000000 24.428571 37.000000 2.507143 -0.554532 std 25404.421178 8.599770 5.349335 0.818502 0.674382 min 2000.000000 15.000000 30.000000 1.600000 -1.293907 25% 17250.000000 19.250000 32.250000 1.875000 -0.960628 50% 30500.000000 21.500000 36.500000 2.350000 -0.784186 75% 60500.000000 28.000000 40.750000 2.675000 -0.274465 max 74000.000000 47.000000 48.000000 4.600000 1.215489 All Customers: Income Age Rentals AvgPerVisit Age-Std count 40.000000 40.000000 40.000000 40.000000 4.000000e+01 mean 41500.000000 31.500000 26.175000 2.792500 -1.110223e-17 std 22925.744123 12.752074 9.594035 0.833401 1.000000e+00 min 1000.000000 15.000000 11.000000 1.200000 -1.293907e+00 25% 24750.000000 21.000000 19.000000 2.200000 -8.233955e-01 50% 41000.000000 30.000000 25.000000 2.800000 -1.176279e-01 75% 57500.000000 36.500000 32.250000 3.325000 3.920931e-01 max 89000.000000 70.000000 48.000000 4.600000 3.019117e+00
gender_bin = pd.get_dummies(vstable["Gender"], prefix="Gender")
gender_bin.head()
Gender_F | Gender_M | |
---|---|---|
ID | ||
1 | 0.0 | 1.0 |
2 | 1.0 | 0.0 |
3 | 1.0 | 0.0 |
4 | 1.0 | 0.0 |
5 | 0.0 | 1.0 |
vs_ssf = pd.get_dummies(vstable)
vs_ssf.head(10)
Income | Age | Rentals | AvgPerVisit | Age-Std | Gender_F | Gender_M | Genre_Action | Genre_Comedy | Genre_Drama | inc-bins_low | inc-bins_mid | inc-bins_high | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ID | |||||||||||||
1 | 45000.0 | 25 | 27 | 2.5 | -0.509721 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
2 | 54000.0 | 33 | 12 | 3.4 | 0.117628 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
3 | 32000.0 | 20 | 42 | 1.6 | -0.901814 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
4 | 59000.0 | 70 | 16 | 4.2 | 3.019117 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
5 | 37000.0 | 35 | 25 | 3.2 | 0.274465 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
6 | 18000.0 | 20 | 33 | 1.7 | -0.901814 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
7 | 29000.0 | 45 | 19 | 3.8 | 1.058651 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
8 | 74000.0 | 25 | 31 | 2.4 | -0.509721 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
9 | 38000.0 | 21 | 18 | 2.1 | -0.823395 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
10 | 65000.0 | 40 | 21 | 3.3 | 0.666558 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
vs_ssf.describe(include="all")
Income | Age | Rentals | AvgPerVisit | Age-Std | Gender_F | Gender_M | Genre_Action | Genre_Comedy | Genre_Drama | inc-bins_low | inc-bins_mid | inc-bins_high | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 40.000000 | 40.000000 | 40.000000 | 40.000000 | 4.000000e+01 | 40.000000 | 40.000000 | 40.00000 | 40.000000 | 40.000000 | 40.00000 | 40.000000 | 40.000000 |
mean | 41500.000000 | 31.500000 | 26.175000 | 2.792500 | -1.110223e-17 | 0.475000 | 0.525000 | 0.37500 | 0.300000 | 0.325000 | 0.37500 | 0.275000 | 0.350000 |
std | 22925.744123 | 12.752074 | 9.594035 | 0.833401 | 1.000000e+00 | 0.505736 | 0.505736 | 0.49029 | 0.464095 | 0.474342 | 0.49029 | 0.452203 | 0.483046 |
min | 1000.000000 | 15.000000 | 11.000000 | 1.200000 | -1.293907e+00 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
25% | 24750.000000 | 21.000000 | 19.000000 | 2.200000 | -8.233955e-01 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
50% | 41000.000000 | 30.000000 | 25.000000 | 2.800000 | -1.176279e-01 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
75% | 57500.000000 | 36.500000 | 32.250000 | 3.325000 | 3.920931e-01 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 |
max | 89000.000000 | 70.000000 | 48.000000 | 4.600000 | 3.019117e+00 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 |
vs_norm = (vs_ssf - vs_ssf.min()) / (vs_ssf.max()-vs_ssf.min())
vs_norm.head(10)
Income | Age | Rentals | AvgPerVisit | Age-Std | Gender_F | Gender_M | Genre_Action | Genre_Comedy | Genre_Drama | inc-bins_low | inc-bins_mid | inc-bins_high | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ID | |||||||||||||
1 | 0.500000 | 0.181818 | 0.432432 | 0.382353 | 0.181818 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
2 | 0.602273 | 0.327273 | 0.027027 | 0.647059 | 0.327273 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
3 | 0.352273 | 0.090909 | 0.837838 | 0.117647 | 0.090909 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
4 | 0.659091 | 1.000000 | 0.135135 | 0.882353 | 1.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
5 | 0.409091 | 0.363636 | 0.378378 | 0.588235 | 0.363636 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
6 | 0.193182 | 0.090909 | 0.594595 | 0.147059 | 0.090909 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
7 | 0.318182 | 0.545455 | 0.216216 | 0.764706 | 0.545455 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
8 | 0.829545 | 0.181818 | 0.540541 | 0.352941 | 0.181818 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
9 | 0.420455 | 0.109091 | 0.189189 | 0.264706 | 0.109091 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
10 | 0.727273 | 0.454545 | 0.270270 | 0.617647 | 0.454545 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
vs_ssf.corr()
Income | Age | Rentals | AvgPerVisit | Age-Std | Gender_F | Gender_M | Genre_Action | Genre_Comedy | Genre_Drama | inc-bins_low | inc-bins_mid | inc-bins_high | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Income | 1.000000 | 0.613769 | -0.262472 | 0.468565 | 0.613769 | -0.036490 | 0.036490 | -0.302256 | 0.101217 | 0.213388 | -0.797271 | -0.016077 | 0.824278 |
Age | 0.613769 | 1.000000 | -0.547113 | 0.629107 | 1.000000 | 0.161022 | -0.161022 | -0.457274 | 0.021663 | 0.451453 | -0.535195 | -0.033349 | 0.574441 |
Rentals | -0.262472 | -0.547113 | 1.000000 | -0.206353 | -0.547113 | 0.151535 | -0.151535 | 0.307303 | -0.035128 | -0.283266 | 0.209184 | -0.040928 | -0.174007 |
AvgPerVisit | 0.468565 | 0.629107 | -0.206353 | 1.000000 | 0.629107 | 0.203343 | -0.203343 | -0.306701 | -0.119992 | 0.434413 | -0.350627 | -0.130462 | 0.478017 |
Age-Std | 0.613769 | 1.000000 | -0.547113 | 0.629107 | 1.000000 | 0.161022 | -0.161022 | -0.457274 | 0.021663 | 0.451453 | -0.535195 | -0.033349 | 0.574441 |
Gender_F | -0.036490 | 0.161022 | 0.151535 | 0.203343 | 0.161022 | 1.000000 | -1.000000 | -0.219744 | 0.032774 | 0.195067 | 0.090483 | -0.249464 | 0.141696 |
Gender_M | 0.036490 | -0.161022 | -0.151535 | -0.203343 | -0.161022 | -1.000000 | 1.000000 | 0.219744 | -0.032774 | -0.195067 | -0.090483 | 0.249464 | -0.141696 |
Genre_Action | -0.302256 | -0.457274 | 0.307303 | -0.306701 | -0.457274 | -0.219744 | 0.219744 | 1.000000 | -0.507093 | -0.537484 | 0.360000 | -0.130107 | -0.243599 |
Genre_Comedy | 0.101217 | 0.021663 | -0.035128 | -0.119992 | 0.021663 | 0.032774 | -0.032774 | -0.507093 | 1.000000 | -0.454257 | -0.056344 | -0.158832 | 0.205879 |
Genre_Drama | 0.213388 | 0.451453 | -0.283266 | 0.434413 | 0.451453 | 0.195067 | -0.195067 | -0.537484 | -0.454257 | 1.000000 | -0.316978 | 0.289883 | 0.050358 |
inc-bins_low | -0.797271 | -0.535195 | 0.209184 | -0.350627 | -0.535195 | 0.090483 | -0.090483 | 0.360000 | -0.056344 | -0.316978 | 1.000000 | -0.477060 | -0.568399 |
inc-bins_mid | -0.016077 | -0.033349 | -0.040928 | -0.130462 | -0.033349 | -0.249464 | 0.249464 | -0.130107 | -0.158832 | 0.289883 | -0.477060 | 1.000000 | -0.451934 |
inc-bins_high | 0.824278 | 0.574441 | -0.174007 | 0.478017 | 0.574441 | 0.141696 | -0.141696 | -0.243599 | 0.205879 | 0.050358 | -0.568399 | -0.451934 | 1.000000 |
vs_norm.to_csv("Video_Store_Numeric.csv", float_format="%1.2f")