In [11]:
# importing packages/modules

import pandas as pd
from sklearn.manifold import TSNE
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

sns.set_style('darkgrid')

Objective: Use unsupervised learning to create distinct clusters to represent the most abundant OTU(Operational taxonomic units)/identities of bacteria found in a mouse Mus Musculus based on schloss lab MiSeq standard operating procedures (SOPs).

In [12]:
# import the csv file into jupyter

df = pd.read_csv("0.16.cons.taxonomy.csv", delimiter=";",index_col=0, usecols=["otu","size","identity","phylum","class","order","family","genus","genus_0","genus_1"])
In [13]:
# look at the first couple of rows
df.head()
Out[13]:
size identity phylum class order family genus genus_0 genus_1
otu
Otu001 61864 Bacteria(100) Bacteroidetes(100) Bacteroidia(100) Bacteroidales(100) Porphyromonadaceae(100) Barnesiella(96) Barnesiella_unclassified(96) Barnesiella_unclassified(96)
Otu002 23360 Bacteria(100) Firmicutes(100) Clostridia(100) Clostridiales(100) Lachnospiraceae(89) Lachnospiraceae_unclassified(61) Lachnospiraceae_unclassified(61) Lachnospiraceae_unclassified(61)
Otu003 6679 Bacteria(100) Firmicutes(100) Bacilli(100) Lactobacillales(100) Lactobacillaceae(95) Lactobacillus(95) Lactobacillus_unclassified(95) Lactobacillus_unclassified(95)
Otu004 6584 Bacteria(100) Bacteroidetes(100) Bacteroidia(100) Bacteroidales(100) Bacteroidaceae(100) Bacteroides(100) Bacteroides_unclassified(100) Bacteroides_unclassified(100)
Otu005 5376 Bacteria(100) Bacteroidetes(100) Bacteroidia(100) Bacteroidales(100) Rikenellaceae(100) Alistipes(100) Alistipes_unclassified(100) Alistipes_unclassified(100)
In [14]:
# give a concise summary of the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 106 entries, Otu001 to Otu106
Data columns (total 9 columns):
size        106 non-null int64
identity    106 non-null object
phylum      106 non-null object
class       106 non-null object
order       106 non-null object
family      106 non-null object
genus       106 non-null object
genus_0     106 non-null object
genus_1     106 non-null object
dtypes: int64(1), object(8)
memory usage: 8.3+ KB
In [15]:
sum(df['genus_0'] == df['genus_1']) 

# they are 6 instances where our classification is not the same i.e where the column genus_0 and genus_1 are not the 
# same, thus, I will not drop the column
Out[15]:
100
In [16]:
# changing the type of the column to int32 to reduce memory usage 
# the other columns are of the right type

df['size'] = df['size'].astype('int32')
In [17]:
# check for NaNs or missing values

missing = sum(pd.isnull(df).any())

msg = 'They are {} missing values in this data set.'

print(msg.format(missing))
They are 0 missing values in this data set.
In [18]:
# subsetting the dataframe taking only the phylum columns
phylum = df['phylum']

# counting each category 
phylum.value_counts()
Out[18]:
Firmicutes(100)               54
Proteobacteria(100)           14
TM7(100)                      12
Bacteria_unclassified(100)    10
Bacteroidetes(100)             7
Deinococcus-Thermus(100)       4
Actinobacteria(100)            4
Verrucomicrobia(100)           1
Name: phylum, dtype: int64

We have a lot of Firmicues and Proteobacteria. Usually it's Firmicutes and Bacteriodetes in healthy humans (debatable depends on a lot of factors). Let's change the strings in the columns to numbers using a cool trick with DictVectorizer. For unsupervised learning techniques to work; you need a matrix: rows and columns with numbers.

In [19]:
# Make a boolean mask for categorical columns
df_mask = (df.dtypes == object)

# Get list of categorical column names
df_mask_columns = df.columns[df_mask].tolist()

# conversion of dataframe to dictionary 
df_dict = df.to_dict("records")

# We don't want  sparse matrices
dv = DictVectorizer(sparse = False)
In [20]:
# Apply fit_transform to our dataset
df_encoded = dv.fit_transform(df_dict)

# New column names
print(dv.vocabulary_)
{'family=Cryomorphaceae(100)': 31, 'phylum=Firmicutes(100)': 235, 'class=Epsilonproteobacteria(100)': 9, 'genus_0=Anaerovorax_unclassified(86)': 111, 'family=Bacteria_unclassified(100)': 21, 'genus=Bacillus(100)': 67, 'class=Bacteroidia(100)': 5, 'order=Rhodobacterales(100)': 227, 'genus_1=Bacteroides_unclassified(100)': 167, 'order=Coriobacteridae(100)': 216, 'genus_1=Ruminococcaceae_unclassified(100)': 194, 'genus_0=Ruminococcaceae_unclassified(96)': 145, 'genus=Stenotrophomonas(100)': 99, 'genus=Clostridiales_unclassified(69)': 77, 'genus=Rhodobacter(100)': 95, 'genus=Staphylococcus(100)': 98, 'genus_0=Clostridiales_unclassified(100)': 122, 'family=Rikenellaceae(100)': 48, 'family=Bacillales_unclassified(100)': 19, 'class=Clostridia(100)': 7, 'genus_1=Staphylococcus_unclassified(100)': 196, 'family=Ruminococcaceae(100)': 49, 'order=Verrucomicrobiales(100)': 229, 'genus=Coprobacillus(100)': 78, 'family=Clostridia_unclassified(100)': 25, 'identity=Bacteria(100)': 205, 'genus_1=Clostridiales_unclassified(69)': 174, 'genus=Streptococcus(100)': 100, 'class=TM7_genera_incertae_sedis(100)': 14, 'genus_1=Aeromonas_unclassified(100)': 158, 'family=Moraxellaceae(100)': 42, 'genus=Gammaproteobacteria_unclassified(100)': 86, 'order=Rhizobiales(100)': 226, 'genus_1=Escherichia/Shigella_unclassified(100)': 180, 'family=Deinococcaceae(100)': 32, 'genus_0=Pseudomonadaceae_unclassified(70)': 141, 'class=Betaproteobacteria(100)': 6, 'genus=Lactobacillus(95)': 89, 'genus_0=Ruminococcaceae_unclassified(100)': 144, 'genus_0=Akkermansia_unclassified(100)': 109, 'genus_1=Bacteroidetes_unclassified(100)': 168, 'genus_1=Clostridium_unclassified(100)': 175, 'genus_0=Actinomycetaceae(53)': 107, 'genus_0=Staphylococcus_unclassified(100)': 146, 'family=Clostridiaceae(100)': 26, 'genus_1=Akkermansia_unclassified(100)': 159, 'genus_1=Clostridia_unclassified(100)': 171, 'genus_0=Porphyromonas_unclassified(100)': 140, 'phylum=Bacteria_unclassified(100)': 232, 'genus_0=Deinococcus_unclassified(100)': 129, 'family=Rhodobacteraceae(100)': 47, 'family=Bacillaceae(100)': 18, 'genus_1=Pseudomonadaceae_unclassified(70)': 191, 'genus=Actinomycineae(53)': 59, 'genus=Helicobacter(100)': 87, 'order=Flavobacteriales(100)': 221, 'genus_1=Clostridiales_unclassified(67)': 173, 'genus_0=Bacillales_unclassified(100)': 113, 'genus_1=Bacilli_unclassified(100)': 164, 'genus=unclassified_Clostridiales_unclassified(93)': 104, 'genus_0=Clostridiales_unclassified(69)': 124, 'genus_1=Fluviicola_unclassified(100)': 182, 'family=Ruminococcaceae(96)': 50, 'genus_0=Streptococcus_unclassified(100)': 148, 'family=Pseudomonadaceae(100)': 45, 'genus=Coriobacterineae(100)': 79, 'class=Bacilli(100)': 2, 'family=Erysipelotrichaceae(100)': 34, 'genus_0=Turicibacter_unclassified(100)': 150, 'family=Incertae_Sedis_XIII(86)': 38, 'order=Actinobacteridae(100)': 206, 'genus_1=Coprobacillus_unclassified(100)': 176, 'genus_0=Escherichia/Shigella_unclassified(100)': 131, 'family=unclassified_Clostridiales(63)': 56, 'genus=Bacillales_unclassified(100)': 65, 'genus_1=Cryomorphaceae_unclassified(100)': 177, 'genus=unclassified_Ruminococcaceae(62)': 105, 'phylum=Bacteroidetes(100)': 233, 'genus_0=TM7_genera_incertae_sedis_unclassified(100)': 149, 'family=Aeromonadaceae(100)': 17, 'phylum=Verrucomicrobia(100)': 238, 'order=Bacteroidetes_unclassified(100)': 212, 'family=Bacteroidaceae(100)': 22, 'class=Verrucomicrobiae(100)': 15, 'family=Helicobacteraceae(100)': 37, 'genus_0=Listeria_unclassified(100)': 138, 'genus_0=Rhodobacter_unclassified(100)': 143, 'genus=Bacteria_unclassified(100)': 68, 'order=TM7_genera_incertae_sedis_unclassified(100)': 228, 'genus_1=Lactobacillus_unclassified(95)': 186, 'genus_1=TM7_genera_incertae_sedis_unclassified(100)': 199, 'genus_1=Rhizobiales_unclassified(100)': 192, 'family=Bacteroidetes_unclassified(100)': 23, 'family=Neisseriaceae(100)': 43, 'order=Campylobacterales(100)': 213, 'order=Neisseriales(100)': 224, 'genus_1=Bacillus_unclassified(100)': 165, 'genus_1=Clostridiales_unclassified(100)': 172, 'class=Deinococci(100)': 8, 'order=Bacillales(100)': 208, 'genus=Aeromonas(100)': 60, 'genus_1=unclassified_Clostridiales_unclassified(63)': 202, 'order=Aeromonadales(100)': 207, 'family=Bifidobacteriales(100)': 24, 'family=Actinomycetales(54)': 16, 'genus_1=Deinococcus_unclassified(100)': 178, 'genus_1=Bacteria_unclassified(100)': 166, 'phylum=Actinobacteria(100)': 231, 'genus_0=Bacteria_unclassified(100)': 116, 'genus_0=Aeromonas_unclassified(100)': 108, 'genus_0=Bifidobacterium(100)': 120, 'genus_0=unclassified_Clostridiaceae_1(95)': 151, 'genus=Ruminococcaceae_unclassified(100)': 96, 'genus_0=Erysipelotrichaceae_unclassified(100)': 130, 'family=Xanthomonadaceae(100)': 55, 'order=Pseudomonadales(100)': 225, 'genus_0=Rhizobiales_unclassified(100)': 142, 'genus_1=Bacillales_unclassified(100)': 163, 'genus_1=Bifidobacterium_unclassified(100)': 170, 'genus_0=Cryomorphaceae_unclassified(100)': 128, 'order=Xanthomonadales(100)': 230, 'class=Flavobacteria(100)': 12, 'size': 239, 'genus_0=Alistipes_unclassified(100)': 110, 'order=Bacteria_unclassified(100)': 210, 'order=Clostridiales(100)': 215, 'order=Clostridia_unclassified(100)': 214, 'genus_1=Turicibacter_unclassified(100)': 200, 'genus=Fluviicola(100)': 85, 'genus_0=Clostridia_unclassified(100)': 121, 'order=Deinococcales(100)': 217, 'phylum=TM7(100)': 237, 'genus_0=Clostridium(100)': 125, 'genus_1=Olsenella(100)': 189, 'genus_1=Firmicutes_unclassified(100)': 181, 'genus_0=Stenotrophomonas_unclassified(100)': 147, 'genus_1=Alistipes_unclassified(100)': 160, 'class=Firmicutes_unclassified(100)': 11, 'order=Gammaproteobacteria_unclassified(100)': 222, 'family=Staphylococcaceae(100)': 51, 'genus=Clostridia_unclassified(100)': 73, 'class=Gammaproteobacteria(100)': 13, 'family=Bacilli_unclassified(100)': 20, 'genus=Clostridiales_unclassified(67)': 76, 'genus=Alistipes(100)': 62, 'genus_1=Barnesiella_unclassified(96)': 169, 'order=Lactobacillales(100)': 223, 'genus_1=Rhodobacter_unclassified(100)': 193, 'genus_0=Coriobacteriaceae(100)': 127, 'class=Alphaproteobacteria(100)': 1, 'genus=Bacteroidetes_unclassified(100)': 70, 'phylum=Proteobacteria(100)': 236, 'class=Erysipelotrichi(100)': 10, 'genus_1=Anaerovorax_unclassified(86)': 161, 'genus_1=Actinomyces(53)': 156, 'genus=Bacilli_unclassified(100)': 66, 'class=Bacteria_unclassified(100)': 3, 'genus=Lachnospiraceae_unclassified(61)': 88, 'genus_1=Stenotrophomonas_unclassified(100)': 197, 'genus_0=Bacillus_unclassified(100)': 115, 'genus_0=Bacteroidetes_unclassified(100)': 118, 'genus_0=Lachnospiraceae_unclassified(61)': 136, 'genus=Turicibacter(100)': 102, 'order=Bacteroidales(100)': 211, 'family=TM7_genera_incertae_sedis_unclassified(100)': 53, 'genus_0=Bacteroides_unclassified(100)': 117, 'class=Actinobacteria(100)': 0, 'genus_0=Firmicutes_unclassified(100)': 132, 'genus_0=Clostridiales_unclassified(67)': 123, 'genus_0=Coprobacillus_unclassified(100)': 126, 'genus=Listeria(100)': 90, 'genus=Firmicutes_unclassified(100)': 84, 'genus_1=Helicobacter_unclassified(100)': 184, 'genus_1=Neisseria_unclassified(100)': 188, 'genus_1=Acinetobacter_unclassified(100)': 155, 'genus=Barnesiella(96)': 71, 'genus=Anaerovorax(86)': 63, 'genus=Pseudomonadaceae_unclassified(70)': 93, 'genus_0=Bacilli_unclassified(100)': 114, 'order=Firmicutes_unclassified(100)': 220, 'genus=Porphyromonas(100)': 92, 'genus_0=Helicobacter_unclassified(100)': 135, 'order=Enterobacteriales(100)': 218, 'family=Clostridiales_unclassified(67)': 28, 'genus_1=unclassified_Clostridiales_unclassified(93)': 203, 'family=Lactobacillaceae(95)': 40, 'order=Bacilli_unclassified(100)': 209, 'genus_0=Acinetobacter_unclassified(100)': 106, 'class=Bacteroidetes_unclassified(100)': 4, 'family=Streptococcaceae(100)': 52, 'genus_0=Fluviicola_unclassified(100)': 133, 'genus_1=unclassified_Clostridiaceae_1_unclassified(95)': 201, 'genus=Bacteroides(100)': 69, 'family=Clostridiales_unclassified(69)': 29, 'genus_0=unclassified_Ruminococcaceae_unclassified(62)': 154, 'family=Verrucomicrobiaceae(100)': 54, 'genus=Clostridiales_unclassified(100)': 75, 'genus_0=Neisseria_unclassified(100)': 139, 'genus=Acinetobacter(100)': 58, 'genus=Deinococcus(100)': 81, 'genus=Erysipelotrichaceae_unclassified(100)': 82, 'family=Lachnospiraceae(89)': 39, 'genus=Akkermansia(100)': 61, 'genus_1=Ruminococcaceae_unclassified(96)': 195, 'family=Coriobacteriales(100)': 30, 'genus_1=Listeria_unclassified(100)': 187, 'genus_1=Porphyromonas_unclassified(100)': 190, 'genus_0=Bacillaceae_unclassified(100)': 112, 'genus_0=unclassified_Clostridiales_unclassified(93)': 153, 'phylum=Deinococcus-Thermus(100)': 234, 'genus_1=unclassified_Ruminococcaceae_unclassified(62)': 204, 'family=Enterobacteriaceae(100)': 33, 'genus_1=Bacillaceae_unclassified(100)': 162, 'family=Clostridiales_unclassified(100)': 27, 'genus_1=Erysipelotrichaceae_unclassified(100)': 179, 'genus=Escherichia/Shigella(100)': 83, 'order=Erysipelotrichales(100)': 219, 'genus=Cryomorphaceae_unclassified(100)': 80, 'genus=Bacillaceae_unclassified(100)': 64, 'family=Firmicutes_unclassified(100)': 35, 'genus_0=Barnesiella_unclassified(96)': 119, 'genus_0=unclassified_Clostridiales_unclassified(63)': 152, 'family=Listeriaceae(100)': 41, 'genus_1=Adlercreutzia(64)': 157, 'genus_1=Lachnospiraceae_unclassified(61)': 185, 'genus=unclassified_Clostridiales_unclassified(63)': 103, 'genus_1=Streptococcus_unclassified(100)': 198, 'family=unclassified_Clostridiales(93)': 57, 'genus_1=Gammaproteobacteria_unclassified(100)': 183, 'genus_0=Gammaproteobacteria_unclassified(100)': 134, 'genus=Bifidobacteriaceae(100)': 72, 'genus=Ruminococcaceae_unclassified(96)': 97, 'family=Gammaproteobacteria_unclassified(100)': 36, 'genus=Clostridiaceae_1(100)': 74, 'genus=TM7_genera_incertae_sedis_unclassified(100)': 101, 'family=Porphyromonadaceae(100)': 44, 'genus=Rhizobiales_unclassified(100)': 94, 'family=Rhizobiales_unclassified(100)': 46, 'genus=Neisseria(100)': 91, 'genus_0=Lactobacillus_unclassified(95)': 137}
In [21]:
# rows and number of column names in the matrix
# 106 rows and 240 columns
df_encoded.shape
Out[21]:
(106, 240)
In [22]:
# t-SNE
# find a two dimensional representation of the data that preserves the distances between points as best as possible
# preserves information by finding the closest neighbours (points)

# instantiate model
tsne = TSNE(random_state=42)

# fit and transform the data
otu_tsne = tsne.fit_transform(df_encoded)
In [23]:
# plot 
xs = otu_tsne[:,0]
ys = otu_tsne[:,1]
plt.scatter(xs, ys)
plt.show()

The clusters are not so distinct maybe changing the learning rate parameter could help. Let's apply another technique and see how this gones. Prinicipal Component analysis(PCoA), however, Principal Coordinate analysis would be more in this case.

In [24]:
# PCA removes the zero variance features and transforms the data
# Another cool thing about PCA is that you can use it for feature extraction

stdscale = StandardScaler() # use a standard scaler to ensure each row has a mean of 0 and a variance of 1
df_scaled = stdscale.fit_transform(df_encoded) # apply it to our dataset
In [25]:
# call PCA
model_pca = PCA()
transformed = model_pca.fit_transform(df_scaled) #fit
print(transformed.shape) # look at the dimensions of the matrix
(106, 106)
In [26]:
# draw a scatter plot of the first two columns, change the transparency of the points and draw the points in red
plt.scatter(transformed[:,0], transformed[:,1], alpha = 0.1, color = 'red')
Out[26]:
<matplotlib.collections.PathCollection at 0x11fd8e940>
In [27]:
# Plot explained total variance of principal components against number of components
# make a vertical line to ensure you are extracting the right point on the graph

plt.plot(np.cumsum(model_pca.explained_variance_ratio_))
plt.axvline(x=48, color='green', linestyle='--')
plt.xlabel('Number of components')
plt.ylabel('Principal components')
plt.title('Finding the ideal number of n_components parameter for PCA')
Out[27]:
Text(0.5,1,'Finding the ideal number of n_components parameter for PCA')
In [28]:
# new fit
# Let's try adjust the number of components and see what we'll get
pca = PCA(n_components=48)
transformed = model_pca.fit_transform(df_scaled)
print(transformed.shape)
plt.scatter(transformed[:,0], transformed[:,1], alpha = 0.1)
(106, 106)
Out[28]:
<matplotlib.collections.PathCollection at 0x11fec1a90>
In [29]:
phylum.value_counts()
Out[29]:
Firmicutes(100)               54
Proteobacteria(100)           14
TM7(100)                      12
Bacteria_unclassified(100)    10
Bacteroidetes(100)             7
Deinococcus-Thermus(100)       4
Actinobacteria(100)            4
Verrucomicrobia(100)           1
Name: phylum, dtype: int64

From the results, i suspect that the points between 0 and 2 x-axis and 0 and 2 y-axis could represent the bacteria which are on the phylum Firmicutes to left could be the Proteobacteria. Whereas, the isolated points could be TM7 and Bacteria_unclassified. I've achieved partially achieved my objective with this dataset. Later on, I'll run PCoA on this dataset and compare the results. In addition, if you choose another part of the matrix you'll get a different result.