In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [5]:
vstable = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store.csv", index_col=0)

vstable.shape
Out[5]:
(40, 6)
In [6]:
vstable.head(10)
Out[6]:
Gender Income Age Rentals AvgPerVisit Genre
ID
1 M 45000 25 27 2.5 Action
2 F 54000 33 12 3.4 Drama
3 F 32000 20 42 1.6 Comedy
4 F 59000 70 16 4.2 Drama
5 M 37000 35 25 3.2 Action
6 M 18000 20 33 1.7 Action
7 F 29000 45 19 3.8 Drama
8 M 74000 25 31 2.4 Action
9 M 38000 21 18 2.1 Comedy
10 F 65000 40 21 3.3 Drama
In [7]:
vstable.columns
Out[7]:
Index([u'Gender', u'Income', u'Age', u'Rentals', u'AvgPerVisit', u'Genre'], dtype='object')
In [8]:
vstable.dtypes
Out[8]:
Gender          object
Income           int64
Age              int64
Rentals          int64
AvgPerVisit    float64
Genre           object
dtype: object

Now we can convert columns to the appropriate type as necessary:

In [9]:
vstable["Income"] = vstable["Income"].astype(float)
vstable.dtypes
Out[9]:
Gender          object
Income         float64
Age              int64
Rentals          int64
AvgPerVisit    float64
Genre           object
dtype: object
In [10]:
vstable.describe()
Out[10]:
Income Age Rentals AvgPerVisit
count 40.000000 40.000000 40.000000 40.000000
mean 41500.000000 31.500000 26.175000 2.792500
std 22925.744123 12.752074 9.594035 0.833401
min 1000.000000 15.000000 11.000000 1.200000
25% 24750.000000 21.000000 19.000000 2.200000
50% 41000.000000 30.000000 25.000000 2.800000
75% 57500.000000 36.500000 32.250000 3.325000
max 89000.000000 70.000000 48.000000 4.600000
In [11]:
min_sal = vstable["Income"].min()
max_sal = vstable["Income"].max()
print min_sal, max_sal
1000.0 89000.0
In [12]:
vstable.describe(include="all")
Out[12]:
Gender Income Age Rentals AvgPerVisit Genre
count 40 40.000000 40.000000 40.000000 40.000000 40
unique 2 NaN NaN NaN NaN 3
top M NaN NaN NaN NaN Action
freq 21 NaN NaN NaN NaN 15
mean NaN 41500.000000 31.500000 26.175000 2.792500 NaN
std NaN 22925.744123 12.752074 9.594035 0.833401 NaN
min NaN 1000.000000 15.000000 11.000000 1.200000 NaN
25% NaN 24750.000000 21.000000 19.000000 2.200000 NaN
50% NaN 41000.000000 30.000000 25.000000 2.800000 NaN
75% NaN 57500.000000 36.500000 32.250000 3.325000 NaN
max NaN 89000.000000 70.000000 48.000000 4.600000 NaN

We can perform data transformations such as normalization by directly applying the operation to the Pandas Series:

In [13]:
norm_sal = (vstable["Income"] - min_sal) / (max_sal-min_sal)
print norm_sal.head(10)
ID
1     0.500000
2     0.602273
3     0.352273
4     0.659091
5     0.409091
6     0.193182
7     0.318182
8     0.829545
9     0.420455
10    0.727273
Name: Income, dtype: float64

Z-Score Standardization on Age

In [14]:
age_z = (vstable["Age"] - vstable["Age"].mean()) / vstable["Age"].std()
age_z.head(5)
Out[14]:
ID
1   -0.509721
2    0.117628
3   -0.901814
4    3.019117
5    0.274465
Name: Age, dtype: float64
In [15]:
vstable["Age-Std"] = age_z
vstable.head()
Out[15]:
Gender Income Age Rentals AvgPerVisit Genre Age-Std
ID
1 M 45000.0 25 27 2.5 Action -0.509721
2 F 54000.0 33 12 3.4 Drama 0.117628
3 F 32000.0 20 42 1.6 Comedy -0.901814
4 F 59000.0 70 16 4.2 Drama 3.019117
5 M 37000.0 35 25 3.2 Action 0.274465
In [17]:
inc_bins = pd.qcut(vstable.Income, 3)
inc_bins.head(10)
Out[17]:
ID
1     (29000, 49000]
2     (49000, 89000]
3     (29000, 49000]
4     (49000, 89000]
5     (29000, 49000]
6      [1000, 29000]
7      [1000, 29000]
8     (49000, 89000]
9     (29000, 49000]
10    (49000, 89000]
Name: Income, dtype: category
Categories (3, object): [[1000, 29000] < (29000, 49000] < (49000, 89000]]
In [18]:
inc_bins = pd.qcut(vstable.Income, [0, .33, .66, 1], labels=["low", "mid", "high"])
inc_bins
Out[18]:
ID
1      mid
2     high
3      mid
4     high
5      mid
6      low
7      low
8     high
9      mid
10    high
11     mid
12     low
13    high
14     mid
15    high
16     low
17     mid
18     low
19     low
20     low
21     mid
22     low
23     low
24    high
25     low
26    high
27    high
28    high
29     low
30     mid
31    high
32     mid
33     low
34     low
35    high
36     low
37    high
38     mid
39    high
40     low
Name: Income, dtype: category
Categories (3, object): [low < mid < high]
In [19]:
vs2 = pd.concat([vstable,inc_bins], axis=1)
vs2
Out[19]:
Gender Income Age Rentals AvgPerVisit Genre Age-Std Income
ID
1 M 45000.0 25 27 2.5 Action -0.509721 mid
2 F 54000.0 33 12 3.4 Drama 0.117628 high
3 F 32000.0 20 42 1.6 Comedy -0.901814 mid
4 F 59000.0 70 16 4.2 Drama 3.019117 high
5 M 37000.0 35 25 3.2 Action 0.274465 mid
6 M 18000.0 20 33 1.7 Action -0.901814 low
7 F 29000.0 45 19 3.8 Drama 1.058651 low
8 M 74000.0 25 31 2.4 Action -0.509721 high
9 M 38000.0 21 18 2.1 Comedy -0.823395 mid
10 F 65000.0 40 21 3.3 Drama 0.666558 high
11 F 41000.0 22 48 2.3 Drama -0.744977 mid
12 F 26000.0 22 29 2.9 Action -0.744977 low
13 M 83000.0 46 14 3.6 Comedy 1.137070 high
14 M 45000.0 36 24 2.7 Drama 0.352884 mid
15 M 68000.0 30 36 2.7 Comedy -0.117628 high
16 M 17000.0 19 26 2.2 Action -0.980233 low
17 M 36000.0 35 28 3.5 Drama 0.274465 mid
18 F 6000.0 16 39 1.8 Action -1.215489 low
19 F 24000.0 25 41 3.1 Comedy -0.509721 low
20 M 12000.0 16 23 2.2 Action -1.215489 low
21 F 47000.0 52 11 3.1 Drama 1.607582 mid
22 M 25000.0 33 16 2.9 Drama 0.117628 low
23 F 2000.0 15 30 2.5 Comedy -1.293907 low
24 F 79000.0 35 22 3.8 Drama 0.274465 high
25 M 1000.0 16 25 1.4 Comedy -1.215489 low
26 F 56000.0 35 40 2.6 Action 0.274465 high
27 F 62000.0 47 32 3.6 Drama 1.215489 high
28 M 57000.0 52 22 4.1 Comedy 1.607582 high
29 F 15000.0 18 37 2.1 Action -1.058651 low
30 M 41000.0 25 17 1.4 Action -0.509721 mid
31 F 49000.0 56 15 3.2 Comedy 1.921256 high
32 M 47000.0 30 21 3.1 Drama -0.117628 mid
33 M 23000.0 25 28 2.7 Action -0.509721 low
34 F 29000.0 32 19 2.9 Action 0.039209 low
35 M 74000.0 29 43 4.6 Action -0.196047 high
36 F 29000.0 21 34 2.3 Comedy -0.823395 low
37 M 89000.0 46 12 1.2 Comedy 1.137070 high
38 M 41000.0 38 20 3.3 Drama 0.509721 mid
39 F 68000.0 35 19 3.9 Comedy 0.274465 high
40 M 17000.0 19 32 1.8 Action -0.980233 low
In [21]:
vstable["inc-bins"] = inc_bins
vstable
Out[21]:
Gender Income Age Rentals AvgPerVisit Genre Age-Std inc-bins
ID
1 M 45000.0 25 27 2.5 Action -0.509721 mid
2 F 54000.0 33 12 3.4 Drama 0.117628 high
3 F 32000.0 20 42 1.6 Comedy -0.901814 mid
4 F 59000.0 70 16 4.2 Drama 3.019117 high
5 M 37000.0 35 25 3.2 Action 0.274465 mid
6 M 18000.0 20 33 1.7 Action -0.901814 low
7 F 29000.0 45 19 3.8 Drama 1.058651 low
8 M 74000.0 25 31 2.4 Action -0.509721 high
9 M 38000.0 21 18 2.1 Comedy -0.823395 mid
10 F 65000.0 40 21 3.3 Drama 0.666558 high
11 F 41000.0 22 48 2.3 Drama -0.744977 mid
12 F 26000.0 22 29 2.9 Action -0.744977 low
13 M 83000.0 46 14 3.6 Comedy 1.137070 high
14 M 45000.0 36 24 2.7 Drama 0.352884 mid
15 M 68000.0 30 36 2.7 Comedy -0.117628 high
16 M 17000.0 19 26 2.2 Action -0.980233 low
17 M 36000.0 35 28 3.5 Drama 0.274465 mid
18 F 6000.0 16 39 1.8 Action -1.215489 low
19 F 24000.0 25 41 3.1 Comedy -0.509721 low
20 M 12000.0 16 23 2.2 Action -1.215489 low
21 F 47000.0 52 11 3.1 Drama 1.607582 mid
22 M 25000.0 33 16 2.9 Drama 0.117628 low
23 F 2000.0 15 30 2.5 Comedy -1.293907 low
24 F 79000.0 35 22 3.8 Drama 0.274465 high
25 M 1000.0 16 25 1.4 Comedy -1.215489 low
26 F 56000.0 35 40 2.6 Action 0.274465 high
27 F 62000.0 47 32 3.6 Drama 1.215489 high
28 M 57000.0 52 22 4.1 Comedy 1.607582 high
29 F 15000.0 18 37 2.1 Action -1.058651 low
30 M 41000.0 25 17 1.4 Action -0.509721 mid
31 F 49000.0 56 15 3.2 Comedy 1.921256 high
32 M 47000.0 30 21 3.1 Drama -0.117628 mid
33 M 23000.0 25 28 2.7 Action -0.509721 low
34 F 29000.0 32 19 2.9 Action 0.039209 low
35 M 74000.0 29 43 4.6 Action -0.196047 high
36 F 29000.0 21 34 2.3 Comedy -0.823395 low
37 M 89000.0 46 12 1.2 Comedy 1.137070 high
38 M 41000.0 38 20 3.3 Drama 0.509721 mid
39 F 68000.0 35 19 3.9 Comedy 0.274465 high
40 M 17000.0 19 32 1.8 Action -0.980233 low
In [22]:
vs_numeric = vstable[["Age","Income","Rentals","AvgPerVisit"]]
vs_num_std = (vs_numeric - vs_numeric.mean()) / vs_numeric.std()
vs_num_std.head(10)
Out[22]:
Age Income Rentals AvgPerVisit
ID
1 -0.509721 0.152667 0.085991 -0.350971
2 0.117628 0.545239 -1.477480 0.728941
3 -0.901814 -0.414381 1.649462 -1.430883
4 3.019117 0.763334 -1.060555 1.688862
5 0.274465 -0.196286 -0.122472 0.488960
6 -0.901814 -1.025049 0.711379 -1.310893
7 1.058651 -0.545239 -0.747860 1.208901
8 -0.509721 1.417620 0.502917 -0.470962
9 -0.823395 -0.152667 -0.852092 -0.830932
10 0.666558 1.025049 -0.539398 0.608950
In [23]:
zscore = lambda x: (x - x.mean()) / x.std()
vs_num_std = vs_numeric.apply(zscore)
vs_num_std.head()
Out[23]:
Age Income Rentals AvgPerVisit
ID
1 -0.509721 0.152667 0.085991 -0.350971
2 0.117628 0.545239 -1.477480 0.728941
3 -0.901814 -0.414381 1.649462 -1.430883
4 3.019117 0.763334 -1.060555 1.688862
5 0.274465 -0.196286 -0.122472 0.488960
In [24]:
zscore = lambda x: ((x - x.mean()) / x.std()) if (x.dtypes==np.float64 or x.dtypes==np.int64) else x
vs_std = vstable.copy()
vs_std.apply(zscore).head()
Out[24]:
Gender Income Age Rentals AvgPerVisit Genre Age-Std inc-bins
ID
1 M 0.152667 -0.509721 0.085991 -0.350971 Action -0.509721 mid
2 F 0.545239 0.117628 -1.477480 0.728941 Drama 0.117628 high
3 F -0.414381 -0.901814 1.649462 -1.430883 Comedy -0.901814 mid
4 F 0.763334 3.019117 -1.060555 1.688862 Drama 3.019117 high
5 M -0.196286 0.274465 -0.122472 0.488960 Action 0.274465 mid
In [25]:
vstable.groupby("Gender").mean()
Out[25]:
Income Age Rentals AvgPerVisit Age-Std
Gender
F 40631.578947 33.631579 27.684211 2.968421 0.167155
M 42285.714286 29.571429 24.809524 2.633333 -0.151236
In [26]:
vstable.groupby("Genre").mean()
Out[26]:
Income Age Rentals AvgPerVisit Age-Std
Genre
Action 32666.666667 24.066667 29.933333 2.466667 -0.582912
Comedy 45000.000000 31.916667 25.666667 2.641667 0.032674
Drama 48461.538462 39.692308 22.307692 3.307692 0.642429
In [27]:
vstable.groupby("Genre").describe()
Out[27]:
Age Age-Std AvgPerVisit Income Rentals
Genre
Action count 15.000000 15.000000 15.000000 15.000000 15.000000
mean 24.066667 -0.582912 2.466667 32666.666667 29.933333
std 6.374802 0.499903 0.776132 21562.754484 7.591976
min 16.000000 -1.215489 1.400000 6000.000000 17.000000
25% 19.000000 -0.980233 1.950000 17000.000000 25.500000
50% 25.000000 -0.509721 2.400000 26000.000000 29.000000
75% 27.000000 -0.352884 2.800000 43000.000000 35.000000
max 35.000000 0.274465 4.600000 74000.000000 43.000000
Comedy count 12.000000 12.000000 12.000000 12.000000 12.000000
mean 31.916667 0.032674 2.641667 45000.000000 25.666667
std 14.650215 1.148850 0.967150 29073.574381 10.662878
min 15.000000 -1.293907 1.200000 1000.000000 12.000000
25% 20.750000 -0.843000 1.975000 27750.000000 17.250000
50% 27.500000 -0.313674 2.600000 43500.000000 23.500000
75% 46.000000 1.137070 3.300000 68000.000000 34.500000
max 56.000000 1.921256 4.100000 89000.000000 42.000000
Drama count 13.000000 13.000000 13.000000 13.000000 13.000000
mean 39.692308 0.642429 3.307692 48461.538462 22.307692
std 11.933040 0.935772 0.504086 15119.608596 9.672854
min 22.000000 -0.744977 2.300000 25000.000000 11.000000
25% 33.000000 0.117628 3.100000 41000.000000 16.000000
50% 36.000000 0.352884 3.300000 47000.000000 21.000000
75% 45.000000 1.058651 3.600000 59000.000000 24.000000
max 70.000000 3.019117 4.200000 79000.000000 48.000000
In [28]:
vstable["Income"].plot(kind="hist", bins=6)
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x9eb0c50>
In [29]:
vstable["Genre"].value_counts().plot(kind='bar')
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0xa142470>
In [30]:
temp1 = vstable["Genre"].value_counts()/vstable["Genre"].count()
temp2 = vstable["Gender"].value_counts()/vstable["Gender"].count()
temp2

fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_xlabel('Genre')
ax1.set_ylabel('Percentage')
ax1.set_title("Genre Distribution")
temp1.plot(kind='bar', grid = True)

ax1 = fig.add_subplot(122)
ax1.set_xlabel('Gender')
ax1.set_ylabel('Percentage')
ax1.set_title("Gender Distribution")
temp2.plot(kind='bar', grid = True)
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0xa487860>
In [31]:
vstable.plot(x="Income", y="Age", kind="scatter")
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0xa2e6908>
In [32]:
vstable.groupby(["Genre","Gender"])["Gender"].count()
Out[32]:
Genre   Gender
Action  F          5
        M         10
Comedy  F          6
        M          6
Drama   F          8
        M          5
Name: Gender, dtype: int64
In [33]:
gg = pd.crosstab(vstable["Genre"], vstable["Gender"])
gg
Out[33]:
Gender F M
Genre
Action 5 10
Comedy 6 6
Drama 8 5
In [34]:
plt.show(gg.plot(kind="bar"))
In [35]:
gg["percent_female"] = gg["F"]/(gg["F"]+gg["M"])
gg
Out[35]:
Gender F M percent_female
Genre
Action 5 10 0.333333
Comedy 6 6 0.500000
Drama 8 5 0.615385
In [36]:
plt.show(gg["percent_female"].plot(kind="bar"))

Suppose that we would like to find all "good cutomers", i.e., those with Rentals value of >= 30:

In [37]:
good_cust = vstable[vstable.Rentals>=30]
good_cust
Out[37]:
Gender Income Age Rentals AvgPerVisit Genre Age-Std inc-bins
ID
3 F 32000.0 20 42 1.6 Comedy -0.901814 mid
6 M 18000.0 20 33 1.7 Action -0.901814 low
8 M 74000.0 25 31 2.4 Action -0.509721 high
11 F 41000.0 22 48 2.3 Drama -0.744977 mid
15 M 68000.0 30 36 2.7 Comedy -0.117628 high
18 F 6000.0 16 39 1.8 Action -1.215489 low
19 F 24000.0 25 41 3.1 Comedy -0.509721 low
23 F 2000.0 15 30 2.5 Comedy -1.293907 low
26 F 56000.0 35 40 2.6 Action 0.274465 high
27 F 62000.0 47 32 3.6 Drama 1.215489 high
29 F 15000.0 18 37 2.1 Action -1.058651 low
35 M 74000.0 29 43 4.6 Action -0.196047 high
36 F 29000.0 21 34 2.3 Comedy -0.823395 low
40 M 17000.0 19 32 1.8 Action -0.980233 low
In [38]:
print "Good Customers:\n", good_cust.describe()
print "\n All Customers:\n", vstable.describe()
Good Customers:
             Income        Age    Rentals  AvgPerVisit    Age-Std
count     14.000000  14.000000  14.000000    14.000000  14.000000
mean   37000.000000  24.428571  37.000000     2.507143  -0.554532
std    25404.421178   8.599770   5.349335     0.818502   0.674382
min     2000.000000  15.000000  30.000000     1.600000  -1.293907
25%    17250.000000  19.250000  32.250000     1.875000  -0.960628
50%    30500.000000  21.500000  36.500000     2.350000  -0.784186
75%    60500.000000  28.000000  40.750000     2.675000  -0.274465
max    74000.000000  47.000000  48.000000     4.600000   1.215489

 All Customers:
             Income        Age    Rentals  AvgPerVisit       Age-Std
count     40.000000  40.000000  40.000000    40.000000  4.000000e+01
mean   41500.000000  31.500000  26.175000     2.792500 -1.110223e-17
std    22925.744123  12.752074   9.594035     0.833401  1.000000e+00
min     1000.000000  15.000000  11.000000     1.200000 -1.293907e+00
25%    24750.000000  21.000000  19.000000     2.200000 -8.233955e-01
50%    41000.000000  30.000000  25.000000     2.800000 -1.176279e-01
75%    57500.000000  36.500000  32.250000     3.325000  3.920931e-01
max    89000.000000  70.000000  48.000000     4.600000  3.019117e+00

Creating dummy variables and converting to standard spreadsheet format (all numeric attributes)

In [39]:
gender_bin = pd.get_dummies(vstable["Gender"], prefix="Gender")
gender_bin.head()
Out[39]:
Gender_F Gender_M
ID
1 0.0 1.0
2 1.0 0.0
3 1.0 0.0
4 1.0 0.0
5 0.0 1.0
In [40]:
vs_ssf = pd.get_dummies(vstable)
vs_ssf.head(10)
Out[40]:
Income Age Rentals AvgPerVisit Age-Std Gender_F Gender_M Genre_Action Genre_Comedy Genre_Drama inc-bins_low inc-bins_mid inc-bins_high
ID
1 45000.0 25 27 2.5 -0.509721 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0
2 54000.0 33 12 3.4 0.117628 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
3 32000.0 20 42 1.6 -0.901814 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
4 59000.0 70 16 4.2 3.019117 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
5 37000.0 35 25 3.2 0.274465 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0
6 18000.0 20 33 1.7 -0.901814 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0
7 29000.0 45 19 3.8 1.058651 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0
8 74000.0 25 31 2.4 -0.509721 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0
9 38000.0 21 18 2.1 -0.823395 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
10 65000.0 40 21 3.3 0.666558 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
In [42]:
vs_ssf.describe(include="all")
Out[42]:
Income Age Rentals AvgPerVisit Age-Std Gender_F Gender_M Genre_Action Genre_Comedy Genre_Drama inc-bins_low inc-bins_mid inc-bins_high
count 40.000000 40.000000 40.000000 40.000000 4.000000e+01 40.000000 40.000000 40.00000 40.000000 40.000000 40.00000 40.000000 40.000000
mean 41500.000000 31.500000 26.175000 2.792500 -1.110223e-17 0.475000 0.525000 0.37500 0.300000 0.325000 0.37500 0.275000 0.350000
std 22925.744123 12.752074 9.594035 0.833401 1.000000e+00 0.505736 0.505736 0.49029 0.464095 0.474342 0.49029 0.452203 0.483046
min 1000.000000 15.000000 11.000000 1.200000 -1.293907e+00 0.000000 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.000000
25% 24750.000000 21.000000 19.000000 2.200000 -8.233955e-01 0.000000 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.000000
50% 41000.000000 30.000000 25.000000 2.800000 -1.176279e-01 0.000000 1.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.000000
75% 57500.000000 36.500000 32.250000 3.325000 3.920931e-01 1.000000 1.000000 1.00000 1.000000 1.000000 1.00000 1.000000 1.000000
max 89000.000000 70.000000 48.000000 4.600000 3.019117e+00 1.000000 1.000000 1.00000 1.000000 1.000000 1.00000 1.000000 1.000000
In [43]:
vs_norm = (vs_ssf - vs_ssf.min()) / (vs_ssf.max()-vs_ssf.min())
vs_norm.head(10)
Out[43]:
Income Age Rentals AvgPerVisit Age-Std Gender_F Gender_M Genre_Action Genre_Comedy Genre_Drama inc-bins_low inc-bins_mid inc-bins_high
ID
1 0.500000 0.181818 0.432432 0.382353 0.181818 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0
2 0.602273 0.327273 0.027027 0.647059 0.327273 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
3 0.352273 0.090909 0.837838 0.117647 0.090909 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
4 0.659091 1.000000 0.135135 0.882353 1.000000 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
5 0.409091 0.363636 0.378378 0.588235 0.363636 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0
6 0.193182 0.090909 0.594595 0.147059 0.090909 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0
7 0.318182 0.545455 0.216216 0.764706 0.545455 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0
8 0.829545 0.181818 0.540541 0.352941 0.181818 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0
9 0.420455 0.109091 0.189189 0.264706 0.109091 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
10 0.727273 0.454545 0.270270 0.617647 0.454545 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0
In [44]:
vs_ssf.corr()
Out[44]:
Income Age Rentals AvgPerVisit Age-Std Gender_F Gender_M Genre_Action Genre_Comedy Genre_Drama inc-bins_low inc-bins_mid inc-bins_high
Income 1.000000 0.613769 -0.262472 0.468565 0.613769 -0.036490 0.036490 -0.302256 0.101217 0.213388 -0.797271 -0.016077 0.824278
Age 0.613769 1.000000 -0.547113 0.629107 1.000000 0.161022 -0.161022 -0.457274 0.021663 0.451453 -0.535195 -0.033349 0.574441
Rentals -0.262472 -0.547113 1.000000 -0.206353 -0.547113 0.151535 -0.151535 0.307303 -0.035128 -0.283266 0.209184 -0.040928 -0.174007
AvgPerVisit 0.468565 0.629107 -0.206353 1.000000 0.629107 0.203343 -0.203343 -0.306701 -0.119992 0.434413 -0.350627 -0.130462 0.478017
Age-Std 0.613769 1.000000 -0.547113 0.629107 1.000000 0.161022 -0.161022 -0.457274 0.021663 0.451453 -0.535195 -0.033349 0.574441
Gender_F -0.036490 0.161022 0.151535 0.203343 0.161022 1.000000 -1.000000 -0.219744 0.032774 0.195067 0.090483 -0.249464 0.141696
Gender_M 0.036490 -0.161022 -0.151535 -0.203343 -0.161022 -1.000000 1.000000 0.219744 -0.032774 -0.195067 -0.090483 0.249464 -0.141696
Genre_Action -0.302256 -0.457274 0.307303 -0.306701 -0.457274 -0.219744 0.219744 1.000000 -0.507093 -0.537484 0.360000 -0.130107 -0.243599
Genre_Comedy 0.101217 0.021663 -0.035128 -0.119992 0.021663 0.032774 -0.032774 -0.507093 1.000000 -0.454257 -0.056344 -0.158832 0.205879
Genre_Drama 0.213388 0.451453 -0.283266 0.434413 0.451453 0.195067 -0.195067 -0.537484 -0.454257 1.000000 -0.316978 0.289883 0.050358
inc-bins_low -0.797271 -0.535195 0.209184 -0.350627 -0.535195 0.090483 -0.090483 0.360000 -0.056344 -0.316978 1.000000 -0.477060 -0.568399
inc-bins_mid -0.016077 -0.033349 -0.040928 -0.130462 -0.033349 -0.249464 0.249464 -0.130107 -0.158832 0.289883 -0.477060 1.000000 -0.451934
inc-bins_high 0.824278 0.574441 -0.174007 0.478017 0.574441 0.141696 -0.141696 -0.243599 0.205879 0.050358 -0.568399 -0.451934 1.000000

The new table can be written into a file using to_csv method:

In [45]:
vs_norm.to_csv("Video_Store_Numeric.csv", float_format="%1.2f")
In [ ]: