#!/usr/bin/env python
# coding: utf-8

# Exploratory data analysis of the Video Store dataset with pandas:
# loading, type conversion, normalization, discretization, grouping,
# plotting, and conversion to an all-numeric ("spreadsheet") format.
# (Notebook export; the In[..] markers preserve the original cell order.)

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

vstable = pd.read_csv(
    "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store.csv",
    index_col=0)
vstable.shape

# In[3]:

vstable.head(10)

# In[4]:

vstable.columns

# In[5]:

vstable.dtypes

# #### Now we can convert columns to the appropriate type as necessary:

# In[6]:

vstable["Income"] = vstable["Income"].astype(float)
vstable.dtypes

# In[7]:

vstable.info()

# In[8]:

vstable.describe()

# In[9]:

min_sal = vstable["Income"].min()
max_sal = vstable["Income"].max()
print(min_sal, max_sal)

# In[10]:

vstable.describe(include="all")

# In[11]:

vstable[["Income", "Age"]].describe()

# #### We can perform data transformations such as normalization by
# #### directly applying the operation to the pandas Series:

# In[12]:

# Min-max normalization of Income to the [0, 1] range.
norm_sal = (vstable["Income"] - min_sal) / (max_sal - min_sal)
norm_sal.head(10)

# #### Z-score standardization on Age

# In[13]:

age_z = (vstable["Age"] - vstable["Age"].mean()) / vstable["Age"].std()
age_z.head(5)

# #### New columns can be added to the dataframe as needed

# In[14]:

vstable["Age-Std"] = age_z
vstable.head()

# #### Discretization with pandas

# In[15]:

# Discretize variable into equal-sized buckets based on rank or based on
# sample quantiles.
inc_bins = pd.qcut(vstable.Income, 3)
inc_bins.head(10)

# In[16]:

# We can specify an array of quantiles for discretization together with
# labels for the bins.
inc_bins = pd.qcut(vstable.Income, [0, .33, .66, 1],
                   labels=["low", "mid", "high"])
inc_bins.head(10)

# In[17]:

vstable["inc-bins"] = inc_bins
vstable.head(10)

# In[18]:

# We can also drop columns from the dataframe
vstable.drop(columns=['Age-Std', 'inc-bins'], inplace=True)
vstable.head()

# In[19]:

vs_numeric = vstable[["Age", "Income", "Rentals", "AvgPerVisit"]]
vs_num_std = (vs_numeric - vs_numeric.mean()) / vs_numeric.std()
vs_num_std.head(10)

# In[20]:

zscore = lambda x: (x - x.mean()) / x.std()
vs_num_std = vs_numeric.apply(zscore)
vs_num_std.head()

# In[21]:

# Instead of separating the numeric attributes, we can condition the
# standardization function on the data types.  is_numeric_dtype covers
# every numeric dtype (int32, float32, ...), not only the two 64-bit
# dtypes the hand-written check matched.
zscore = lambda x: ((x - x.mean()) / x.std()) if pd.api.types.is_numeric_dtype(x) else x
vs_std = vstable.copy()
vs_std.apply(zscore).head()

# #### Grouping and aggregating data

# In[22]:

# numeric_only=True is required in pandas >= 2.0: mean() on a GroupBy
# that contains non-numeric columns raises a TypeError there.
vstable.groupby("Gender").mean(numeric_only=True)

# In[23]:

vstable.groupby("Genre").mean(numeric_only=True)

# In[24]:

vstable.groupby("Genre").describe().T

# In[25]:

vstable["Income"].plot(kind="hist", bins=6)

# In[26]:

vstable["Genre"].value_counts().plot(kind='bar')

# In[27]:

# Side-by-side bar charts of the Genre and Gender distributions
# (as fractions of the total, not raw counts).
temp1 = vstable["Genre"].value_counts() / vstable["Genre"].count()
temp2 = vstable["Gender"].value_counts() / vstable["Gender"].count()
temp2
fig = plt.figure(figsize=(10, 4))
ax1 = fig.add_subplot(121)
ax1.set_xlabel('Genre')
ax1.set_ylabel('Percentage')
ax1.set_title("Genre Distribution")
temp1.plot(kind='bar', grid=True)
ax1 = fig.add_subplot(122)
ax1.set_xlabel('Gender')
ax1.set_ylabel('Percentage')
ax1.set_title("Gender Distribution")
temp2.plot(kind='bar', grid=True)

# In[28]:

vstable.plot(x="Income", y="Age", kind="scatter")

# In[29]:

# Scatter with two extra encoded dimensions: marker size ~ Rentals,
# marker color ~ AvgPerVisit.
vstable.plot(x="Income", y="Age", kind="scatter", alpha=0.8,
             s=vstable["Rentals"] * 5, c="AvgPerVisit",
             cmap=plt.get_cmap("jet"), colorbar=True, figsize=(10, 7))

# In[30]:

vstable.groupby(["Genre", "Gender"])["Gender"].count()

# In[31]:

gg = pd.crosstab(vstable["Genre"], vstable["Gender"])
gg

# In[32]:

# plt.show() takes no positional plot argument (only `block`); draw the
# plot first, then call show().
gg.plot(kind="bar")
plt.show()

# In[33]:

gg["percent_female"] = gg["F"] / (gg["F"] + gg["M"])
gg

# In[34]:

gg["percent_female"].plot(kind="bar")
plt.show()

# #### Suppose that we would like to find all "good customers", i.e.,
# #### those with a Rentals value of >= 30:

# In[35]:

good_cust = vstable[vstable.Rentals >= 30]
good_cust

# In[36]:

print("Good Customers:")
good_cust.describe()

# In[37]:

print("All Customers:")
vstable.describe()

# #### Creating dummy variables and converting to standard spreadsheet
# #### format (all numeric attributes)

# In[38]:

gender_bin = pd.get_dummies(vstable["Gender"], prefix="Gender")
gender_bin.head()

# In[39]:

# dtype=int keeps the dummy columns numeric (pandas >= 2.0 defaults to
# bool, which breaks the min-max arithmetic below: subtracting boolean
# frames is not supported).
vs_ssf = pd.get_dummies(vstable, dtype=int)
vs_ssf.head(10)

# In[40]:

vs_ssf.describe()

# In[41]:

# Min-max normalization performed on the full numeric data set
vs_norm = (vs_ssf - vs_ssf.min()) / (vs_ssf.max() - vs_ssf.min())
vs_norm.head(10)

# In[45]:

# After converting to all numeric attributes, we can perform correlation
# analysis on the variables.
corr_matrix = vs_ssf.corr()
corr_matrix

# In[46]:

corr_matrix["Rentals"].sort_values(ascending=False)

# #### The new table can be written into a file using the to_csv method:

# In[43]:

vs_norm.to_csv("Video_Store_Numeric.csv", float_format="%1.2f")