import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the Video_Store data set; the first CSV column is used as the row index.
DATA_URL = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store.csv"
vstable = pd.read_csv(DATA_URL, index_col=0)

# Structural overview: dimensions, leading rows, column labels and dtypes.
vstable.shape
vstable.head(10)
vstable.columns
vstable.dtypes

# Store Income as floating point, then look at the dtypes and summaries again.
vstable = vstable.astype({"Income": float})
vstable.dtypes
vstable.info()
vstable.describe()
# Observed range of Income; reused below as the min-max scaling bounds.
min_sal, max_sal = vstable["Income"].min(), vstable["Income"].max()
print(min_sal, max_sal)
vstable.describe(include="all")
vstable[["Income", "Age"]].describe()

# Min-max normalization of Income: (value - min) / (max - min).
norm_sal = vstable["Income"].sub(min_sal).div(max_sal - min_sal)
norm_sal.head(10)

# Z-score standardization of Age: (value - mean) / standard deviation.
age_z = vstable["Age"].sub(vstable["Age"].mean()).div(vstable["Age"].std())
age_z.head(5)

# Attach the standardized ages to the table as a new column.
vstable["Age-Std"] = age_z
vstable.head()
# Discretize Income into three buckets whose edges are sample quantiles,
# so each bucket holds roughly the same number of rows.
inc_bins = pd.qcut(vstable["Income"], q=3)
inc_bins.head(10)

# Explicit quantile edges can be given instead, together with a label per bin.
inc_bins = pd.qcut(
    vstable["Income"],
    q=[0, .33, .66, 1],
    labels=["low", "mid", "high"],
)
inc_bins.head(10)
vstable["inc-bins"] = inc_bins
vstable.head(10)

# Columns can also be removed from the dataframe; this drops both derived
# columns in place.
vstable.drop(['Age-Std', 'inc-bins'], axis=1, inplace=True)
vstable.head()
def zscore(x):
    """Return the z-score standardization of a column, or the column unchanged.

    Numeric columns are transformed to ``(x - x.mean()) / x.std()`` (``std`` is
    whatever ``x.std()`` returns, i.e. pandas' default for the object passed).
    Non-numeric columns, and boolean columns, are returned as-is so the
    function can be applied to a frame with mixed column types.

    The numeric test uses ``pandas.api.types.is_numeric_dtype`` rather than
    comparing against ``np.float64`` / ``np.int64`` only, so other integer and
    float widths are standardized too instead of being silently skipped.
    """
    if pd.api.types.is_numeric_dtype(x) and not pd.api.types.is_bool_dtype(x):
        return (x - x.mean()) / x.std()
    return x


# Standardize the numeric attributes with whole-frame arithmetic: the column
# means and standard deviations are aligned to the columns of vs_numeric.
vs_numeric = vstable[["Age","Income","Rentals","AvgPerVisit"]]
vs_num_std = (vs_numeric - vs_numeric.mean()) / vs_numeric.std()
vs_num_std.head(10)

# The same standardization, applied column by column.
vs_num_std = vs_numeric.apply(zscore)
vs_num_std.head()

# Instead of separating the numeric attributes, the standardization can be
# conditioned on each column's data type. The result is kept in vs_std
# (apply() returns a new frame; vstable itself is not modified).
vs_std = vstable.apply(zscore)
vs_std.head()
# Per-group averages of the numeric attributes.
# numeric_only=True is passed explicitly: on pandas >= 2.0 mean() no longer
# drops non-numeric columns silently and raises TypeError when it meets one
# (the table holds string-valued columns such as Gender).
vstable.groupby("Gender").mean(numeric_only=True)
vstable.groupby("Genre").mean(numeric_only=True)

# Full descriptive statistics per Genre, transposed so the genres are columns.
vstable.groupby("Genre").describe().T
# Each stand-alone chart gets its own figure and is told which axes to use.
# Series.plot() draws on the current axes when no ``ax`` is given, so without
# this the bar chart would be painted over the histogram whenever both
# statements run against the same open figure.
hist_ax = plt.figure().add_subplot(111)
vstable["Income"].plot(kind="hist", bins=6, ax=hist_ax)

genre_ax = plt.figure().add_subplot(111)
vstable["Genre"].value_counts().plot(kind='bar', ax=genre_ax)

# Relative frequency (fraction of non-missing rows) of each Genre / Gender value.
temp1 = vstable["Genre"].value_counts()/vstable["Genre"].count()
temp2 = vstable["Gender"].value_counts()/vstable["Gender"].count()
temp2

# Side-by-side bar charts of the two distributions. The target axes is passed
# explicitly and the labels are set after plotting so they cannot be
# overwritten by the plotting call.
fig = plt.figure(figsize=(10,4))

ax1 = fig.add_subplot(121)
temp1.plot(kind='bar', grid=True, ax=ax1)
ax1.set_xlabel('Genre')
ax1.set_ylabel('Percentage')
ax1.set_title("Genre Distribution")

ax2 = fig.add_subplot(122)
temp2.plot(kind='bar', grid=True, ax=ax2)
ax2.set_xlabel('Gender')
ax2.set_ylabel('Percentage')
ax2.set_title("Gender Distribution")

# Income vs. Age scatter plots; the second one sizes each marker by Rentals
# and colors it by AvgPerVisit.
vstable.plot(x="Income", y="Age", kind="scatter")
vstable.plot(
    x="Income",
    y="Age",
    kind="scatter",
    alpha=0.8,
    s=vstable["Rentals"]*5,
    c="AvgPerVisit",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    figsize=(10,7),
)
# Row counts for every (Genre, Gender) combination, first as a grouped count
# and then as a Genre x Gender contingency table.
vstable.groupby(["Genre","Gender"])["Gender"].count()
gg = pd.crosstab(vstable["Genre"], vstable["Gender"])
gg

# Draw first, then call plt.show() with no arguments: show() does not take an
# Axes, and passing one is interpreted as (or rejected as) its ``block`` /
# ``close`` argument depending on the backend.
gg.plot(kind="bar")
plt.show()

# Fraction of the rows in each Genre whose Gender is "F".
gg["percent_female"] = gg["F"]/(gg["F"]+gg["M"])
gg

# A Series plot reuses the current axes if a figure is still open, so give
# this chart a fresh figure explicitly.
gg["percent_female"].plot(kind="bar", ax=plt.figure().add_subplot(111))
plt.show()
# Rows with at least 30 rentals, selected with a boolean mask.
has_many_rentals = vstable["Rentals"] >= 30
good_cust = vstable.loc[has_many_rentals]
good_cust

# Summary statistics of that subset next to those of the full table.
print("Good Customers:")
good_cust.describe()
print("All Customers:")
vstable.describe()
# One-hot encode Gender on its own, then every non-numeric column of the table.
# dtype=int is passed explicitly: on pandas >= 2.0 get_dummies() produces bool
# columns by default, which describe() leaves out of a mixed-type summary and
# which cannot be used in the subtraction performed by the min-max scaling below.
gender_bin = pd.get_dummies(vstable["Gender"], prefix="Gender", dtype=int)
gender_bin.head()
vs_ssf = pd.get_dummies(vstable, dtype=int)
vs_ssf.head(10)
vs_ssf.describe()

# Min-max normalization performed on the full numeric data set:
# (value - column min) / (column max - column min).
# NOTE(review): a column with a single distinct value has max == min and
# would come out as NaN here — confirm no such column exists in the data.
vs_norm = (vs_ssf - vs_ssf.min()) / (vs_ssf.max()-vs_ssf.min())
vs_norm.head(10)

# With every attribute numeric, correlation analysis can be run on the
# variables; the last line ranks them by their correlation with Rentals.
corr_matrix = vs_ssf.corr()
corr_matrix
corr_matrix["Rentals"].sort_values(ascending=False)

# Write the normalized table to disk with two decimal places per value.
vs_norm.to_csv("Video_Store_Numeric.csv", float_format="%1.2f")