#!/usr/bin/env python
# coding: utf-8

# In[11]:


# importing packages/modules
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

sns.set_style('darkgrid')


# Objective: Use unsupervised learning to create distinct clusters representing the most abundant OTUs (Operational Taxonomic Units)/identities of bacteria found in a mouse, *Mus musculus*, based on the Schloss lab MiSeq standard operating procedures (SOPs).

# In[12]:


# import the csv file into jupyter
df = pd.read_csv("0.16.cons.taxonomy.csv", delimiter=";", index_col=0,
                 usecols=["otu", "size", "identity", "phylum", "class", "order",
                          "family", "genus", "genus_0", "genus_1"])


# In[13]:


# look at the first couple of rows
df.head()


# In[14]:


# give a concise summary of the dataset
df.info()


# In[15]:


sum(df['genus_0'] == df['genus_1'])
# there are 6 instances where the classification differs, i.e. where the columns
# genus_0 and genus_1 are not the same, so I will not drop the column


# In[16]:


# changing the type of the column to int32 to reduce memory usage
# the other columns are of the right type
df['size'] = df['size'].astype('int32')


# In[17]:


# check for NaNs or missing values
missing = df.isnull().sum().sum()  # total number of missing values across the whole dataframe
msg = 'There are {} missing values in this data set.'
print(msg.format(missing))


# In[18]:


# subsetting the dataframe, taking only the phylum column
phylum = df['phylum']

# counting each category
phylum.value_counts()


# We have a lot of Firmicutes and Proteobacteria. Usually it's **Firmicutes** and **Bacteroidetes** in healthy humans (debatable, it depends on a lot of factors). Let's change the strings in the columns to numbers using a handy trick with DictVectorizer. For unsupervised learning techniques to work, you need a matrix: rows and columns of numbers.

# In[19]:


# Make a boolean mask for categorical columns
df_mask = (df.dtypes == object)

# Get list of categorical column names
df_mask_columns = df.columns[df_mask].tolist()

# convert the dataframe into a list of per-row dictionaries
df_dict = df.to_dict("records")

# We don't want sparse matrices
dv = DictVectorizer(sparse=False)


# In[20]:


# Apply fit_transform to our dataset
df_encoded = dv.fit_transform(df_dict)

# New feature names and their column indices
print(dv.vocabulary_)


# In[21]:


# number of rows and columns in the matrix
# 106 rows and 240 columns
df_encoded.shape


# In[22]:


# t-SNE
# finds a two-dimensional representation of the data that preserves the distances between points as well as possible
# preserves information by finding the closest neighbours (points)

# instantiate the model
tsne = TSNE(random_state=42)

# fit and transform the data
otu_tsne = tsne.fit_transform(df_encoded)


# In[23]:


# plot
xs = otu_tsne[:, 0]
ys = otu_tsne[:, 1]
plt.scatter(xs, ys)
plt.show()


# The clusters are not so distinct; maybe changing the *learning rate* parameter could help (a quick sketch of this follows below). Let's then apply another technique and see how this goes: Principal Component Analysis (PCA), although Principal Coordinate Analysis (PCoA) would arguably be a better fit in this case.
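# Before moving on to PCA, here is a minimal sketch (not part of the original analysis) of how one might explore the t-SNE *learning rate* mentioned above. The specific values tried below are arbitrary guesses, not tuned recommendations.

# In[ ]:


# Sketch: compare t-SNE embeddings across a few candidate learning_rate values.
# The candidate values are arbitrary; adjust them to suit the data.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, lr in zip(axes, [10, 100, 500, 1000]):
    emb = TSNE(learning_rate=lr, random_state=42).fit_transform(df_encoded)
    ax.scatter(emb[:, 0], emb[:, 1], alpha=0.5)
    ax.set_title('learning_rate = {}'.format(lr))
plt.show()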
# In[24]:


# PCA removes the zero-variance features and transforms the data
# Another useful thing about PCA is that you can use it for feature extraction

stdscale = StandardScaler()  # a standard scaler ensures each feature (column) has a mean of 0 and a variance of 1
df_scaled = stdscale.fit_transform(df_encoded)  # apply it to our dataset


# In[25]:


# call PCA
model_pca = PCA()
transformed = model_pca.fit_transform(df_scaled)  # fit and transform
print(transformed.shape)  # look at the dimensions of the matrix


# In[26]:


# draw a scatter plot of the first two components, change the transparency of the points and draw them in red
plt.scatter(transformed[:, 0], transformed[:, 1], alpha=0.1, color='red')


# In[27]:


# Plot the cumulative explained variance of the principal components against the number of components
# draw a vertical line to make sure we're reading off the right point on the graph
plt.plot(np.cumsum(model_pca.explained_variance_ratio_))
plt.axvline(x=48, color='green', linestyle='--')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.title('Finding the ideal value of the n_components parameter for PCA')


# In[28]:


# new fit
# Let's adjust the number of components and see what we get
pca = PCA(n_components=48)
transformed = pca.fit_transform(df_scaled)
print(transformed.shape)
plt.scatter(transformed[:, 0], transformed[:, 1], alpha=0.1)


# In[29]:


phylum.value_counts()


# From the results, I suspect that the points between 0 and 2 on the *x-axis* and 0 and 2 on the *y-axis* could represent bacteria in the phylum **Firmicutes**, while the points to the left could be **Proteobacteria**. The isolated points could be **TM7** and **Bacteria_unclassified**. I've partially achieved my objective with this dataset. Later on, I'll run PCoA on this dataset and compare the results (a minimal sketch follows below). In addition, if you choose another part of the matrix you'll get a different result.
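# Since the plan is to run PCoA later, here is a minimal sketch of classical PCoA computed by hand on the scaled, encoded features. This is an assumption on my part, not the original analysis: it uses a plain Euclidean distance matrix to stay self-contained, whereas for OTU abundance data a Bray-Curtis dissimilarity would be the more usual choice.

# In[ ]:


# Sketch: classical Principal Coordinate Analysis (PCoA, i.e. classical MDS) with numpy.
from scipy.spatial.distance import pdist, squareform

D = squareform(pdist(df_scaled))             # pairwise Euclidean distance matrix (assumption)
n = D.shape[0]
J = np.eye(n) - np.ones((n, n)) / n          # centering matrix
B = -0.5 * J @ (D ** 2) @ J                  # double-centred squared distances
eigvals, eigvecs = np.linalg.eigh(B)         # eigendecomposition (ascending eigenvalues)
order = np.argsort(eigvals)[::-1]            # reorder eigenvalues in decreasing order
eigvals, eigvecs = eigvals[order], eigvecs[:, order]
coords = eigvecs[:, :2] * np.sqrt(np.maximum(eigvals[:2], 0))  # first two principal coordinates

plt.scatter(coords[:, 0], coords[:, 1], alpha=0.5)
plt.xlabel('PCo1')
plt.ylabel('PCo2')
plt.title('PCoA (classical MDS) on Euclidean distances')
plt.show()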