#!/usr/bin/env python
# coding: utf-8
# I used and modified code and instructions from the Udacity ML Nanodegree.
# #Things we will cover: data exploration, data preprocessing, PCA, k-means, t-SNE
# In[131]:
# Modified image from Naruto Uzumaki, Pinterest.
ls images
# It is a way of extracting useful knowledge from the data "without any label", or a way of transforming the data into a meaningful format.
# The emphasis is on "without any label". This contrasts with supervised learning, where we learn from labeled training data and predict the labels of the test set.
# In this section, we will split the data into training and test sets; then we will explore the dataset.
# The dataset contains 150 samples, each with five attributes. We will drop the class variable and conduct unsupervised learning.
# In[9]:
print("Num of rows is: {0}, num of cols is {1}".format(iris_data.shape[0], iris_data.shape[1]))
iris_data.head()


# In[10]:
# Give the features of the data explicit names
iris_data.columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]


# In[11]:
iris_data.head()


# In[12]:
X_data = iris_data[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
y_data = iris_data["class"]


# In[13]:
y_data.head()


# In[14]:
# Encode the class labels as integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_data_en = le.fit_transform(y_data.values)
print(y_data_en[:10])
print(y_data_en)


# In[15]:
# Print the encoded class labels
le.classes_


# In[23]:
# By default, the data is split into a 75% training set and a 25% test set.
# By default, the dataset is shuffled before the split.
# Set random_state so the result can be replicated later (the same split occurs
# whenever the same random_state number is used).
# Here we keep the whole dataset as training data and use the split mainly to shuffle it.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data_en, test_size=0, random_state=7)


# In[26]:
print("X_train and X_test shape : {0} vs {1}".format(X_train.shape, X_test.shape))
print("y_train and y_test shape : {0} vs {1}".format(y_train.shape, y_test.shape))


# In[30]:
# Check the order of the data (verify that it was shuffled)
print(y_train[:20])


# In[31]:
# Inspect the numerical representation of each feature
X_train.describe()

# We will select three samples and explore them later. We will choose them so that they
# are distinct from each other (their feature values differ).
# # We will choose three samples as follows:
#
# In[32]:
# Choose the sample whose sepal_length is the maximum (>= 7.8)
X_train.loc[X_train["sepal_length"] >= 7.8]


# In[33]:
sample_1 = X_train.loc[X_train["sepal_length"] >= 7.8]


# In[39]:
# Choose the sample whose sepal length is closest to the mean value
X_train.loc[(abs(X_train["sepal_length"] - X_train["sepal_length"].mean())).idxmin()]


# In[40]:
sample_2 = X_train.loc[82]


# In[42]:
# Choose the sample whose petal length is closest to the mean value
X_train.loc[(abs(X_train["petal_length"] - X_train["petal_length"].mean())).idxmin()]


# In[43]:
sample_3 = X_train.loc[80]


# In[44]:
# Create a holder for the three chosen samples
samples = [sample_1, sample_2, sample_3]


# In[45]:
indices = [131, 82, 80]
samples = pd.DataFrame(X_train.loc[indices], columns=X_train.keys()).reset_index(drop=True)
print("Chosen samples of iris dataset")
samples
# One thing we can look for in the data is whether some features are predictive of others. For example, is it possible to predict the sepal length from the other three features? (This is just one example; the target does not have to be sepal length. Any feature can be inspected this way.)
# We will choose "sepal length" as the target variable and use the other features in a supervised learning model.
# In[46]:
# Copy the X_train data and split it into a target (dependent variable)
# and independent variables
pseudo_target = X_train['sepal_length']
pseudo_data = X_train.drop(['sepal_length'], axis=1)

# Split the data into training and testing sets: 75% goes to the training set.
X_pseudo_train, X_pseudo_test, y_pseudo_train, y_pseudo_test \
    = train_test_split(pseudo_data, pseudo_target, random_state=7)

# Use a decision tree regressor to fit and infer the target values
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=7)
regressor.fit(X_pseudo_train, y_pseudo_train)
y_pseudo_pred = regressor.predict(X_pseudo_test)

from sklearn.metrics import r2_score
score = r2_score(y_pseudo_test, y_pseudo_pred)
print(score)

# So about 83% of the variance of the target variable is explained by the other three features.
# There is clearly a dependency of this target variable (sepal length) on the other three features,
# so we could drop the "sepal length" feature to reduce the dimensionality (and thus the run-time cost).
# To get a sense of the data, we will plot the feature distributions.
# This way we can understand not only each feature's distribution but also the
# correlations between the features.
# In[47]:
pd.plotting.scatter_matrix(X_train, alpha=0.9, figsize=(14, 8), diagonal="kde")

# Sepal length and sepal width roughly follow a normal distribution (sepal length
# does not trace an exact bell curve, though).
# Petal length and petal width follow a bimodal distribution (meaning there are two groups, or two most frequent ranges of values).
# Positive correlation can be seen between 'sepal_length' and 'petal_length', and between 'petal_length' and 'petal_width'. 'Sepal_length' and 'petal_width' are also somewhat positively correlated, but it is less clear.
# We can confirm the correlations we inferred above by using a heatmap.

# In[48]:
sns.heatmap(X_train.corr(), annot=True)

# Positive correlations (in decreasing order) for the 3 pairs:
# petal_length & petal_width, sepal_length & petal_length, sepal_length & petal_width
# We will transform the data into an appropriate distribution.
# We will also detect outliers and deal with them.
# Modified image from Sasuke's Ninja Way
# In[129]:
ls images/

# Some machine-learning algorithms are sensitive to the data distribution. It is often preferable to scale the dataset beforehand.
# We will scale the dataset (with StandardScaler in this case) so that each feature has zero mean and unit variance. (This does not necessarily mean that each feature will follow a Gaussian distribution.)
# In[50]:
# Use StandardScaler so that each feature has zero mean and unit variance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
samples_scaled = scaler.transform(samples)

print("Mean of each feature before scaling\n{0}".format(X_train.mean(axis=0)))
print("Variance of each feature before scaling\n{0}".format(X_train.var(axis=0)))
print("Mean of each feature after scaling\n{0}".format(X_train_scaled.mean(axis=0)))
print("Variance of each feature after scaling\n{0}".format(X_train_scaled.var(axis=0)))


# In[51]:
X_train_scaled_df = pd.DataFrame({"sepal_length": X_train_scaled[:, 0],
                                  "sepal_width": X_train_scaled[:, 1],
                                  "petal_length": X_train_scaled[:, 2],
                                  "petal_width": X_train_scaled[:, 3]})


# In[52]:
pd.plotting.scatter_matrix(X_train_scaled_df, figsize=(14, 8), diagonal="kde")


# In[53]:
samples_scaled_df = pd.DataFrame({"sepal_length": samples_scaled[:, 0],
                                  "sepal_width": samples_scaled[:, 1],
                                  "petal_length": samples_scaled[:, 2],
                                  "petal_width": samples_scaled[:, 3]})
samples_scaled_df

# We will detect outliers and deal with them.
# We will use Tukey's method for identifying outliers.
# In[56]:
# Tukey's method: a point is an outlier if it lies more than 1.5 * IQR
# outside the interquartile range [Q1, Q3]
for feature in X_train_scaled_df.keys():
    q1 = np.percentile(X_train_scaled_df[feature], 25, axis=0)
    q3 = np.percentile(X_train_scaled_df[feature], 75, axis=0)
    step = 1.5 * float(q3 - q1)
    print("Data points considered outliers for the feature {0}".format(feature))
    print(X_train_scaled_df[~((X_train_scaled_df[feature] >= q1 - step) &
                              (X_train_scaled_df[feature] <= q3 + step))])


# In[58]:
sns.boxplot(data=X_train_scaled_df)

# Following Tukey's method for identifying outliers,
# we do not have outliers in this dataset.
# We will now look at the underlying structure of the data using PCA.
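# The PCA cells themselves do not appear in this export, but the later cells use
# `reduced_data`, `reduced_samples`, and `reduced_data_df`. The cell below is a minimal
# sketch (an assumption, not the original code) of how that step could look, with
# n_components=2 taken from the variance comment further down.

# In[ ]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the scaled training data (sketch, not the original cell)
pca = PCA(n_components=2, random_state=7)
reduced_data = pca.fit_transform(X_train_scaled)
reduced_samples = pca.transform(samples_scaled)

# How much of the total variance the two components capture
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total:", pca.explained_variance_ratio_.sum())

# Feature weights (loadings) of each component, used for the interpretation below
weights = pd.DataFrame(pca.components_,
                       columns=X_train_scaled_df.columns,
                       index=["PCA 1", "PCA 2"])
print(weights)

# DataFrame version of the reduced data with the column names the plotting cell expects
reduced_data_df = pd.DataFrame(reduced_data, columns=["PCA 1", "PCA 2"])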
# The sign (+/-) of each feature weight is meaningful only relative to the other features:
# the signs of a component can be flipped, so a sign by itself does not indicate anything.
# We will consider feature weights larger than 0.5 in absolute value to indicate a strong association with a component.
# PCA allows us to reduce the dimensionality of the data --
# this results in a lower computational cost. As a side effect, some of the total variance in the data is lost.
# We will use 2 components, since they capture about 95% of the total variance.
# We can see that the points on the right, near the middle of the principal component 2 axis, are irises with large (long) "petal length",
# "sepal width" and "sepal length".
# As we saw from the visualization of the feature weights,
# "petal length", "sepal width" and "sepal length" are most strongly associated with
# the first component, and "petal width" with the second component.
#
# In this section, we will use K-means to cluster the dataset.
# We will use the silhouette coefficient to choose the number of clusters.
# Two clusters give the best score, even though we know that there are actually three classes
# in the dataset.
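# The silhouette computation itself is not shown in this export; the cell below is a
# minimal sketch (an assumption, not the original code) of how the number of clusters
# could be chosen with the mean silhouette coefficient on the PCA-reduced data.

# In[ ]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Score K-means for several cluster counts and report the silhouette coefficient
for n_clusters in range(2, 7):
    km = KMeans(n_clusters=n_clusters, random_state=0)
    labels = km.fit_predict(reduced_data)
    score = silhouette_score(reduced_data, labels)
    print("n_clusters = {0}: silhouette score = {1:.3f}".format(n_clusters, score))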
# In[83]:
# Fit K-means with two clusters
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0)
clusterer = kmeans.fit(reduced_data)
preds = clusterer.predict(reduced_data)
centers = clusterer.cluster_centers_
samples_preds = clusterer.predict(reduced_samples)
print(centers)


# In[123]:
import matplotlib.cm as cm

def cluster_results(reduced_data, preds, centers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions.
    Adds cues for cluster centers and the selected sample data.
    '''
    predictions = pd.DataFrame(preds, columns=['Cluster'])
    plot_data = pd.concat([predictions, reduced_data], axis=1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize=(14, 8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on the assigned cluster
    for i, cluster in plot_data.groupby('Cluster'):
        cluster.plot(ax=ax, kind='scatter', x='PCA 1', y='PCA 2',
                     color=cmap(i * 1.0 / (len(centers) - 1)),
                     label='Cluster %i' % (i), s=30)

    # Plot the centers with indicators
    for i, c in enumerate(centers):
        ax.scatter(x=c[0], y=c[1], color='white', edgecolors='black',
                   alpha=1, linewidth=2, marker='o', s=200)
        ax.scatter(x=c[0], y=c[1], marker='$%d$' % (i), alpha=1, s=100)

    # Plot the transformed sample points
    ax.scatter(x=pca_samples[:, 0], y=pca_samples[:, 1],
               s=150, linewidth=4, color='black', marker='x')

    # Set the plot title
    ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number\n"
                 "Transformed Sample Data Marked by Black Cross")


# In[124]:
# Display the results of the clustering
cluster_results(reduced_data_df, preds, centers, reduced_samples)


# In[ ]:
# t-SNE tries to find a two-dimensional representation of the data that preserves the distances
# from the original dimensions (that is, points that are close stay close and points that are far apart stay far apart).
# Important to note: we are only able to interpret the class distribution here because we have the ground-truth labels.
# In[125]:
colors = ["#476A2A", "#7851B8", "#BD3430"]
from sklearn.manifold import TSNE
tsne = TSNE(random_state=42)
X_train_scaled_tsne = tsne.fit_transform(X_train_scaled)

plt.figure(figsize=(14, 8))
plt.xlim(X_train_scaled_tsne[:, 0].min(), X_train_scaled_tsne[:, 0].max() + 1)
plt.ylim(X_train_scaled_tsne[:, 1].min(), X_train_scaled_tsne[:, 1].max() + 1)

# Plot each point as its encoded class label, colored by class
for i in range(X_train_scaled_tsne.shape[0]):
    plt.text(X_train_scaled_tsne[i, 0], X_train_scaled_tsne[i, 1], str(y_train[i]),
             color=colors[y_train[i]], fontdict={'weight': 'bold', 'size': 9})

# Map each encoded label back to its class name for the title
class_index = ["{0}: {1}".format(class_num, le.inverse_transform([class_num])[0])
               for class_num in [0, 1, 2]]
plt.title(", ".join(class_index))

# We can see that the class setosa is well separated from the other two classes,
# versicolor and virginica. K-means may not have been able to separate the latter two classes.
# We could use these findings to explore other techniques on this dataset,
# such as other clustering methods, or conduct supervised learning on data whose dimensionality has been reduced by PCA. What if we perform clustering on the original, non-reduced dataset?
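# As a rough follow-up to that question, the cell below is a small sketch (an extra
# experiment, not an original cell): cluster the scaled but non-reduced data and compare
# the assignments with the clustering on the PCA-reduced data and with the true labels.

# In[ ]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# K-means on the scaled, non-reduced features
kmeans_full = KMeans(n_clusters=2, random_state=0)
preds_full = kmeans_full.fit_predict(X_train_scaled)

# Agreement with the clustering obtained on the PCA-reduced data
print("ARI vs. clustering on reduced data:", adjusted_rand_score(preds, preds_full))
# Agreement with the ground-truth class labels
print("ARI vs. true labels:", adjusted_rand_score(y_train, preds_full))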
##
# # In[ ]: