#!/usr/bin/env python
# coding: utf-8

# In[11]:


# importing packages/modules
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

sns.set_style('darkgrid')


# Objective: Use unsupervised learning to create distinct clusters representing the most abundant OTUs (Operational Taxonomic Units)/identities of bacteria found in a mouse, *Mus musculus*, based on the Schloss lab MiSeq standard operating procedures (SOPs).

# In[12]:


# import the csv file into jupyter
df = pd.read_csv("0.16.cons.taxonomy.csv", delimiter=";", index_col=0,
                 usecols=["otu", "size", "identity", "phylum", "class", "order",
                          "family", "genus", "genus_0", "genus_1"])


# In[13]:


# look at the first couple of rows
df.head()


# In[14]:


# give a concise summary of the dataset
df.info()


# In[15]:


sum(df['genus_0'] == df['genus_1'])
# there are 6 instances where the classification differs, i.e. where the columns
# genus_0 and genus_1 are not the same, so I will not drop the column


# In[16]:


# changing the type of the column to int32 to reduce memory usage
# the other columns are of the right type
df['size'] = df['size'].astype('int32')


# In[17]:


# check for NaNs or missing values
missing = df.isnull().sum().sum()  # total number of missing values across the whole dataframe
msg = 'There are {} missing values in this data set.'
print(msg.format(missing))


# In[18]:


# subsetting the dataframe, taking only the phylum column
phylum = df['phylum']

# counting each category
phylum.value_counts()


# We have a lot of Firmicutes and Proteobacteria. Usually it's **Firmicutes** and **Bacteroidetes** in healthy humans (debatable, it depends on a lot of factors). Let's change the strings in the columns to numbers using a handy trick with DictVectorizer. For unsupervised learning techniques to work, you need a matrix: rows and columns of numbers.

# In[19]:


# Make a boolean mask for categorical columns
df_mask = (df.dtypes == object)

# Get list of categorical column names
df_mask_columns = df.columns[df_mask].tolist()

# convert the dataframe into a list of per-row dictionaries
df_dict = df.to_dict("records")

# We don't want sparse matrices
dv = DictVectorizer(sparse=False)


# In[20]:


# Apply fit_transform to our dataset
df_encoded = dv.fit_transform(df_dict)

# New feature names and their column indices
print(dv.vocabulary_)


# In[21]:


# number of rows and columns in the matrix
# 106 rows and 240 columns
df_encoded.shape


# In[22]:


# t-SNE
# finds a two-dimensional representation of the data that preserves the distances between points as well as possible
# preserves information by finding the closest neighbours (points)

# instantiate the model
tsne = TSNE(random_state=42)

# fit and transform the data
otu_tsne = tsne.fit_transform(df_encoded)


# In[23]:


# plot
xs = otu_tsne[:, 0]
ys = otu_tsne[:, 1]
plt.scatter(xs, ys)
plt.show()


# The clusters are not so distinct; maybe changing the *learning rate* parameter could help (a quick sketch of this follows below). Let's then apply another technique and see how this goes: Principal Component Analysis (PCA), although Principal Coordinate Analysis (PCoA) would arguably be a better fit in this case.
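# Before moving on to PCA, here is a minimal sketch (not part of the original analysis) of how one might explore the t-SNE *learning rate* mentioned above. The specific values tried below are arbitrary guesses, not tuned recommendations.

# In[ ]:


# Sketch: compare t-SNE embeddings across a few candidate learning_rate values.
# The candidate values are arbitrary; adjust them to suit the data.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, lr in zip(axes, [10, 100, 500, 1000]):
    emb = TSNE(learning_rate=lr, random_state=42).fit_transform(df_encoded)
    ax.scatter(emb[:, 0], emb[:, 1], alpha=0.5)
    ax.set_title('learning_rate = {}'.format(lr))
plt.show()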
# In[24]:


# PCA removes the zero-variance features and transforms the data
# Another useful thing about PCA is that you can use it for feature extraction

stdscale = StandardScaler()  # a standard scaler ensures each feature (column) has a mean of 0 and a variance of 1
df_scaled = stdscale.fit_transform(df_encoded)  # apply it to our dataset


# In[25]:


# call PCA
model_pca = PCA()
transformed = model_pca.fit_transform(df_scaled)  # fit and transform
print(transformed.shape)  # look at the dimensions of the matrix


# In[26]:


# draw a scatter plot of the first two components, change the transparency of the points and draw them in red
plt.scatter(transformed[:, 0], transformed[:, 1], alpha=0.1, color='red')


# In[27]:


# Plot the cumulative explained variance of the principal components against the number of components
# draw a vertical line to make sure we're reading off the right point on the graph
plt.plot(np.cumsum(model_pca.explained_variance_ratio_))
plt.axvline(x=48, color='green', linestyle='--')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.title('Finding the ideal value of the n_components parameter for PCA')


# In[28]:


# new fit
# Let's adjust the number of components and see what we get
pca = PCA(n_components=48)
transformed = pca.fit_transform(df_scaled)
print(transformed.shape)
plt.scatter(transformed[:, 0], transformed[:, 1], alpha=0.1)


# In[29]:


phylum.value_counts()


# From the results, I suspect that the points between 0 and 2 on the *x-axis* and 0 and 2 on the *y-axis* could represent bacteria in the phylum **Firmicutes**, while the points to the left could be **Proteobacteria**. The isolated points could be **TM7** and **Bacteria_unclassified**. I've partially achieved my objective with this dataset. Later on, I'll run PCoA on this dataset and compare the results (a minimal sketch follows below). In addition, if you choose another part of the matrix you'll get a different result.
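# Since the plan is to run PCoA later, here is a minimal sketch of classical PCoA computed by hand on the scaled, encoded features. This is an assumption on my part, not the original analysis: it uses a plain Euclidean distance matrix to stay self-contained, whereas for OTU abundance data a Bray-Curtis dissimilarity would be the more usual choice.

# In[ ]:


# Sketch: classical Principal Coordinate Analysis (PCoA, i.e. classical MDS) with numpy.
from scipy.spatial.distance import pdist, squareform

D = squareform(pdist(df_scaled))             # pairwise Euclidean distance matrix (assumption)
n = D.shape[0]
J = np.eye(n) - np.ones((n, n)) / n          # centering matrix
B = -0.5 * J @ (D ** 2) @ J                  # double-centred squared distances
eigvals, eigvecs = np.linalg.eigh(B)         # eigendecomposition (ascending eigenvalues)
order = np.argsort(eigvals)[::-1]            # reorder eigenvalues in decreasing order
eigvals, eigvecs = eigvals[order], eigvecs[:, order]
coords = eigvecs[:, :2] * np.sqrt(np.maximum(eigvals[:2], 0))  # first two principal coordinates

plt.scatter(coords[:, 0], coords[:, 1], alpha=0.5)
plt.xlabel('PCo1')
plt.ylabel('PCo2')
plt.title('PCoA (classical MDS) on Euclidean distances')
plt.show()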