#!/usr/bin/env python
# coding: utf-8

# Exploratory data analysis of the Video Store dataset with pandas:
# loading, type conversion, normalization, discretization, grouping,
# plotting, and conversion to an all-numeric ("spreadsheet") format.
# (Notebook export; the In[..] markers preserve the original cell order.)

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

vstable = pd.read_csv(
    "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store.csv",
    index_col=0)
vstable.shape

# In[3]:

vstable.head(10)

# In[4]:

vstable.columns

# In[5]:

vstable.dtypes

# #### Now we can convert columns to the appropriate type as necessary:

# In[6]:

vstable["Income"] = vstable["Income"].astype(float)
vstable.dtypes

# In[7]:

vstable.info()

# In[8]:

vstable.describe()

# In[9]:

min_sal = vstable["Income"].min()
max_sal = vstable["Income"].max()
print(min_sal, max_sal)

# In[10]:

vstable.describe(include="all")

# In[11]:

vstable[["Income", "Age"]].describe()

# #### We can perform data transformations such as normalization by
# #### directly applying the operation to the pandas Series:

# In[12]:

# Min-max normalization of Income to the [0, 1] range.
norm_sal = (vstable["Income"] - min_sal) / (max_sal - min_sal)
norm_sal.head(10)

# #### Z-score standardization on Age

# In[13]:

age_z = (vstable["Age"] - vstable["Age"].mean()) / vstable["Age"].std()
age_z.head(5)

# #### New columns can be added to the dataframe as needed

# In[14]:

vstable["Age-Std"] = age_z
vstable.head()

# #### Discretization with pandas

# In[15]:

# Discretize variable into equal-sized buckets based on rank or based on
# sample quantiles.
inc_bins = pd.qcut(vstable.Income, 3)
inc_bins.head(10)

# In[16]:

# We can specify an array of quantiles for discretization together with
# labels for the bins.
inc_bins = pd.qcut(vstable.Income, [0, .33, .66, 1],
                   labels=["low", "mid", "high"])
inc_bins.head(10)

# In[17]:

vstable["inc-bins"] = inc_bins
vstable.head(10)

# In[18]:

# We can also drop columns from the dataframe
vstable.drop(columns=['Age-Std', 'inc-bins'], inplace=True)
vstable.head()

# In[19]:

vs_numeric = vstable[["Age", "Income", "Rentals", "AvgPerVisit"]]
vs_num_std = (vs_numeric - vs_numeric.mean()) / vs_numeric.std()
vs_num_std.head(10)

# In[20]:

zscore = lambda x: (x - x.mean()) / x.std()
vs_num_std = vs_numeric.apply(zscore)
vs_num_std.head()

# In[21]:

# Instead of separating the numeric attributes, we can condition the
# standardization function on the data types.  is_numeric_dtype covers
# every numeric dtype (int32, float32, ...), not only the two 64-bit
# dtypes the hand-written check matched.
zscore = lambda x: ((x - x.mean()) / x.std()) if pd.api.types.is_numeric_dtype(x) else x
vs_std = vstable.copy()
vs_std.apply(zscore).head()

# #### Grouping and aggregating data

# In[22]:

# numeric_only=True is required in pandas >= 2.0: mean() on a GroupBy
# that contains non-numeric columns raises a TypeError there.
vstable.groupby("Gender").mean(numeric_only=True)

# In[23]:

vstable.groupby("Genre").mean(numeric_only=True)

# In[24]:

vstable.groupby("Genre").describe().T

# In[25]:

vstable["Income"].plot(kind="hist", bins=6)

# In[26]:

vstable["Genre"].value_counts().plot(kind='bar')

# In[27]:

# Side-by-side bar charts of the Genre and Gender distributions
# (as fractions of the total, not raw counts).
temp1 = vstable["Genre"].value_counts() / vstable["Genre"].count()
temp2 = vstable["Gender"].value_counts() / vstable["Gender"].count()
temp2
fig = plt.figure(figsize=(10, 4))
ax1 = fig.add_subplot(121)
ax1.set_xlabel('Genre')
ax1.set_ylabel('Percentage')
ax1.set_title("Genre Distribution")
temp1.plot(kind='bar', grid=True)
ax1 = fig.add_subplot(122)
ax1.set_xlabel('Gender')
ax1.set_ylabel('Percentage')
ax1.set_title("Gender Distribution")
temp2.plot(kind='bar', grid=True)

# In[28]:

vstable.plot(x="Income", y="Age", kind="scatter")

# In[29]:

# Scatter with two extra encoded dimensions: marker size ~ Rentals,
# marker color ~ AvgPerVisit.
vstable.plot(x="Income", y="Age", kind="scatter", alpha=0.8,
             s=vstable["Rentals"] * 5, c="AvgPerVisit",
             cmap=plt.get_cmap("jet"), colorbar=True, figsize=(10, 7))

# In[30]:

vstable.groupby(["Genre", "Gender"])["Gender"].count()

# In[31]:

gg = pd.crosstab(vstable["Genre"], vstable["Gender"])
gg

# In[32]:

# plt.show() takes no positional plot argument (only `block`); draw the
# plot first, then call show().
gg.plot(kind="bar")
plt.show()

# In[33]:

gg["percent_female"] = gg["F"] / (gg["F"] + gg["M"])
gg

# In[34]:

gg["percent_female"].plot(kind="bar")
plt.show()

# #### Suppose that we would like to find all "good customers", i.e.,
# #### those with a Rentals value of >= 30:

# In[35]:

good_cust = vstable[vstable.Rentals >= 30]
good_cust

# In[36]:

print("Good Customers:")
good_cust.describe()

# In[37]:

print("All Customers:")
vstable.describe()

# #### Creating dummy variables and converting to standard spreadsheet
# #### format (all numeric attributes)

# In[38]:

gender_bin = pd.get_dummies(vstable["Gender"], prefix="Gender")
gender_bin.head()

# In[39]:

# dtype=int keeps the dummy columns numeric (pandas >= 2.0 defaults to
# bool, which breaks the min-max arithmetic below: subtracting boolean
# frames is not supported).
vs_ssf = pd.get_dummies(vstable, dtype=int)
vs_ssf.head(10)

# In[40]:

vs_ssf.describe()

# In[41]:

# Min-max normalization performed on the full numeric data set
vs_norm = (vs_ssf - vs_ssf.min()) / (vs_ssf.max() - vs_ssf.min())
vs_norm.head(10)

# In[45]:

# After converting to all numeric attributes, we can perform correlation
# analysis on the variables.
corr_matrix = vs_ssf.corr()
corr_matrix

# In[46]:

corr_matrix["Rentals"].sort_values(ascending=False)

# #### The new table can be written into a file using the to_csv method:

# In[43]:

vs_norm.to_csv("Video_Store_Numeric.csv", float_format="%1.2f")