#!/usr/bin/env python
# coding: utf-8

# ## Functional prediction of hypothetical TFs in bacteria using supervised machine learning: *Pseudomonas aeruginosa*

# Created by Emanuel Flores-Bautista, 2018. All code contained in this notebook is licensed under the [Creative Commons License 4.0](https://creativecommons.org/licenses/by/4.0/).

# This is the workflow we will use:
#
# 1. Load the TF-TF network from RegulonDB as a `DataFrame` and as a NetworkX graph.
# 2. Apply the Louvain clustering algorithm to find the modules (caveat: the cluster labels change on each run, but the clusters keep the same content, i.e. we have to make our predictions in one single run). Add a new node attribute to the network holding the cluster labels. Validate their functional relationships (e.g. SoxR and SoxS, and GadEWX, fall into the same modules, respectively).
# 3. Data preprocessing: check for missing data, normalize, and reduce dimensionality using PCA.
# 4. Multi-label classification setup: assign the cluster labels to the expression DataFrame.
# 5. Train a multi-label neural network on the expression data of the known TFs using the cluster labels (trying an SVM first is also an option). Based on this reference: https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f
# 6. Validate the performance. We can try four different algorithms and then predict with the best one.
# 7. Predict the labels for the TFs with unknown physiological function.
# 8. Analyze the results.

# In[1]:

pwd

# In[2]:

cd ../Desktop/uni/bioinfo/data/paeru/

# In[5]:

## Setting up the pyplot figures inside the notebook
get_ipython().run_line_magic('matplotlib', 'inline')
# Get SVG graphics from the notebook
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import random
import matplotlib as mpl

sns.set_style('white')
sns.set_context("talk", font_scale=1, rc={"lines.linewidth": 2.0, 'lines.markersize': 5})
sns.set_style("ticks")
sns.set_style({"xtick.direction": "in", "ytick.direction": "in"})
#mpl.rc('axes', prop_cycle=(cycler('color', ['b','g','y','m','c']) ))
mpl.rc('text', usetex=False)
tw = 1.5
sns.set_style({"xtick.major.size": 6, "ytick.major.size": 6,
               "xtick.minor.size": 4, "ytick.minor.size": 4,
               'axes.labelsize': 28,
               'xtick.major.width': tw, 'xtick.minor.width': tw,
               'ytick.major.width': tw, 'ytick.minor.width': tw})
mpl.rc('xtick', labelsize=28)
mpl.rc('ytick', labelsize=28)
mpl.rc('axes', linewidth=1.75)
plt.gcf().subplots_adjust(bottom=0.15)
sns.set_style({'axes.labelsize': 24})

# In[5]:

ls *.csv

# In[6]:

df_trn = pd.read_csv('paeru-trn.csv', comment='#')

# In[7]:

df_trn.head()

# In[8]:

# Pandas DataFrame to a NetworkX graph object
trn = nx.from_pandas_edgelist(df=df_trn, source='Regulator', target='Target gene',
                              edge_attr='mode of regulation')

# In[9]:

# Calculating eigenvector centrality to get the hubs
eigen_cen = nx.eigenvector_centrality(trn)
hubs = sorted(eigen_cen.items(), key=lambda cc: cc[1], reverse=True)[:10]
hubs

# In[10]:

import community

# In[11]:

# Running the Louvain clustering algorithm to divide our network into clusters
communities_trn = community.best_partition(trn)
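# A hedged aside on the caveat in step 2 above (an addition, not part of the original run): recent python-louvain releases accept a `random_state` keyword (older releases expose `randomize` instead), which makes the Louvain partition reproducible across runs. The exact keyword depends on the installed version.

# In[ ]:

# Reproducible Louvain partition, assuming a python-louvain version with random_state
communities_trn_fixed_seed = community.best_partition(trn, random_state=42)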
# In[12]:

# How many clusters do we get? (Louvain labels are zero-based, so the number of modules is this value plus one.)
max(communities_trn.values())

# In[13]:

# Extracting the largest connected component (LCC)
trn_lcc = max(nx.connected_component_subgraphs(trn), key=len)

# In[14]:

len(trn_lcc)

# In[15]:

# Running the Louvain clustering algorithm to divide our network into clusters
communities_trn = community.best_partition(trn_lcc)

# In[16]:

# How many clusters do we get with the TRN's LCC?
max(communities_trn.values())

# In[17]:

nx.set_node_attributes(trn_lcc, values=communities_trn, name='modularity')

# Let's extract the network clusters.

# In[18]:

cluster1 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 0]
cluster2 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 1]
cluster3 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 2]
cluster4 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 3]
cluster5 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 4]
cluster6 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 5]
cluster7 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 6]
cluster8 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 7]
cluster9 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 8]
cluster10 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 9]
cluster11 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 10]
cluster12 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 11]
cluster13 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 12]
cluster14 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 13]

# In[19]:

nx.draw(trn_lcc, node_color='lightyellow', with_labels=True, alpha=0.6, font_size=8)

# In[20]:

import csv

with open('list_tfs.txt') as inputfile:
    results = list(csv.reader(inputfile))

# In[21]:

results[:5]

# In[22]:

# Concatenating the lists with itertools
import itertools

tf_list = list(itertools.chain.from_iterable(results))

# In[23]:

tf_list[:5]

# #### Extracting the TF-TF network

# In[24]:

tf_tf_list = []

# In[25]:

for row in df_trn['Target gene']:
    if row in tf_list:
        tf_tf_list.append(1)
    else:
        tf_tf_list.append(0)

# In[26]:

df_trn['tf'] = tf_tf_list

# In[27]:

df_trn.head()

# In[28]:

tf_tf_net = df_trn[df_trn['tf'] == 1]

# In[29]:

tf_tf_net.shape

# In[30]:

tf_tf_net.head()

# In[31]:

# Pandas DataFrame to a NetworkX graph object
tf_trn = nx.from_pandas_edgelist(df=tf_tf_net, source='Regulator', target='Target gene',
                                 edge_attr='mode of regulation')

# In[32]:

# Calculating eigenvector centrality to get the hubs
eigen_cen = nx.eigenvector_centrality(tf_trn)
hubs = sorted(eigen_cen.items(), key=lambda cc: cc[1], reverse=True)[:10]
hubs

# In[33]:

# Clustering the TF-TF network
communities_trn = community.best_partition(tf_trn)

# In[34]:

# How many clusters do we get?
max(communities_trn.values())

# In[35]:

# Extracting the LCC
tf_tf_lcc = max(nx.connected_component_subgraphs(tf_trn), key=len)

# In[36]:

len(tf_tf_lcc)

# In[37]:

communities_trn = community.best_partition(tf_tf_lcc)
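# A hedged compatibility note (an addition, not part of the original run): nx.connected_component_subgraphs() was removed in NetworkX 2.4, so on newer versions the largest connected component can be obtained as shown below.

# In[ ]:

# Equivalent LCC extraction on NetworkX >= 2.4
tf_tf_lcc_nodes = max(nx.connected_components(tf_trn), key=len)
tf_tf_lcc_alt = tf_trn.subgraph(tf_tf_lcc_nodes).copy()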
# In[38]:

# How many clusters do we get?
print(max(communities_trn.values()) + 1)

# In[39]:

nx.set_node_attributes(tf_tf_lcc, values=communities_trn, name='modularity')

# In[40]:

cluster1 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 0]
cluster2 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 1]
cluster3 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 2]
cluster4 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 3]
cluster5 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 4]
cluster6 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 5]
cluster7 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 6]
cluster8 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 7]

# In[41]:

cluster1

# In[42]:

cluster2

# Now that we have our clusters of TFs, let's proceed to preprocess the expression data.

# ### Expression data pre-processing

# In[43]:

df_x = pd.read_csv("colombos_paeru_exprdata_20151029.txt", delimiter='\t', comment='#')
df_x.head()

# In[44]:

df_x.shape

# In[45]:

def test_missing_data(df, fname):
    """Look for missing entries."""
    assert np.all(df.notnull()), fname + ' contains missing data'

# In[46]:

fname = 'colombos_paeru_exprdata_20151029.txt'

# In[47]:

#test_missing_data(df_x, fname)

# In[48]:

## Replacing the NaNs with the median of each column (expression condition)
df_x = df_x.fillna(df_x.median())

# In[49]:

test_missing_data(df_x, fname)

# We're good to go.

# #### Dividing the expression and annotation datasets

# In[50]:

annot = df_x.iloc[:, :3]
exp = df_x.iloc[:, 3:]

# In[51]:

annot.head()

# In[52]:

from sklearn.preprocessing import StandardScaler as st

std_scaler = st()                      # initialize the StandardScaler object
df_ex = std_scaler.fit_transform(exp)  # transform the data (returns a numpy array)
df_ex = pd.DataFrame(df_ex)            # back to a DataFrame

# In[53]:

df_ex.describe()  ## check that mean = 0 and s.d. = 1

# Let's now apply a PCA to our normalized data.

# In[54]:

from sklearn.decomposition import PCA

# apply dimensionality reduction
pca = PCA(svd_solver='randomized', random_state=42).fit(df_ex)

# In[55]:

cum_exp_var = np.cumsum(pca.explained_variance_ratio_)

# look at it
plt.plot(cum_exp_var, color='salmon', alpha=0.7)
plt.xlabel('Number of dimensions')
plt.ylabel('Cumulative explained variance ratio')
#plt.axvline(x= 14 , color='k', linestyle='--')
plt.title('PCA Explained Variance', fontsize=20)
plt.xlim(0, 500)
#plt.savefig('PCA-var.tiff',dpi=500, bbox_inches='tight')

# In[56]:

vr = np.cumsum(pca.explained_variance_ratio_)

# In[57]:

vr[300]

# In[58]:

n_components = 300

# In[59]:

## Now let's transform our data with the fitted PCA, keeping the original index so we can keep track of the genes
df_pca = pca.transform(df_ex)
df_pca = pd.DataFrame(df_pca, index=df_x.index)

# In[60]:

df_exp = df_pca.iloc[:, :n_components]  ## We'll keep 300 principal components

# In[61]:

## changing the column names to PC1, PC2, ...
df_exp.columns = ['PC' + str(x) for x in range(1, n_components + 1)]

# In[62]:

df_exp.head()
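# A minimal sketch (an addition; the 95% threshold is an assumption): the number of components kept above is hard-coded to 300, but the same choice could be made programmatically as the smallest number of PCs reaching a target cumulative explained variance.

# In[ ]:

# First index where the cumulative explained variance crosses 0.95, plus one to get a count
n_95 = int(np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95)) + 1
print(n_95, 'components reach 95% of the variance')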
# ### Multi-label classification data preparation: assign the cluster labels to the expression DataFrame.

# In[63]:

df_trn.tail()

# In[64]:

del(df_trn['tf'])

# In[65]:

df_trn.columns = ['tf', 'operon', 'tg', 'regtype', 'ev', 'PMID', 'strain']

# #### Cluster 1:

# In[66]:

cluster1

# In[67]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_1 = df_trn[df_trn['tf'].isin(['gbuR', 'pfeR', 'pchR', 'fur', 'agmR', 'glpR'])]

# In[68]:

# Unique target genes (TGs) of the first cluster's regulons
cluster1_tgs = list(set(cluster_1['tg']))
print(len(cluster1_tgs))

# #### Cluster 2:

# In[69]:

cluster2

# In[70]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_2 = df_trn[df_trn['tf'].isin(['gacA', 'qscR', 'ampR', 'algQ', 'algR4', 'mvfR',
                                      'rhlR', 'rsaL', 'ihf', 'algR3', 'lasR'])]

# In[71]:

# Unique target genes (TGs) of the second cluster's regulons
cluster2_tgs = list(set(cluster_2['tg']))
# Let's see how many TGs cluster 2 has
print(len(cluster2_tgs))

# In[72]:

cluster3

# In[73]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_3 = df_trn[df_trn['tf'].isin(['fleQ', 'vfr', 'pvdS', 'hu', 'ptxR', 'mvtA',
                                      'ptxS', 'toxR', 'rpoD'])]

# In[74]:

# Unique target genes (TGs) of the third cluster's regulons
cluster3_tgs = list(set(cluster_3['tg']))
print(len(cluster3_tgs))

# In[75]:

cluster4

# In[76]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_4 = df_trn[df_trn['tf'].isin(['exsA', 'mexR', 'mexT', 'nalC', 'psrA', 'rpoS', 'exsD'])]

# In[77]:

# Unique target genes (TGs) of the fourth cluster's regulons
cluster4_tgs = list(set(cluster_4['tg']))
print(len(cluster4_tgs))
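# A hedged, generic version of the per-cluster blocks above and below (an addition; it assumes the node names in cluster1..cluster8 use the same spelling as the 'tf' column): collect each module's target genes directly from the TRN table instead of typing the TF names by hand.

# In[ ]:

tf_clusters = [cluster1, cluster2, cluster3, cluster4, cluster5, cluster6, cluster7, cluster8]
cluster_tgs_auto = [sorted(set(df_trn.loc[df_trn['tf'].isin(c), 'tg'])) for c in tf_clusters]
[len(tgs) for tgs in cluster_tgs_auto]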
# In[78]:

cluster5

# In[79]:

cluster_5 = df_trn[df_trn['tf'].isin(['dnr', 'anr', 'narL'])]

# In[80]:

cluster5_tgs = list(set(cluster_5['tg']))
print(len(cluster5_tgs))

# In[81]:

cluster6

# In[82]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_6 = df_trn[df_trn['tf'].isin(['PA0779', 'pilA', 'fhpR', 'rpoN', 'pilR'])]

# In[83]:

cluster6_tgs = list(set(cluster_6['tg']))
print(len(cluster6_tgs))

# In[84]:

cluster7

# In[85]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_7 = df_trn[df_trn['tf'].isin(['phoQ', 'pmrA', 'phoP', 'tbpA'])]

# In[86]:

cluster7_tgs = list(set(cluster_7['tg']))
print(len(cluster7_tgs))

# In[162]:

cluster2

# In[164]:

cluster_8 = df_trn[df_trn['tf'].isin(['algR', 'algZ', 'algU', 'mucB'])]

# In[165]:

cluster8_tgs = list(set(cluster_8['tg']))
print(len(cluster8_tgs))

# In[166]:

# Initializing the labels' lists
labels1 = []
labels2 = []
labels3 = []
labels4 = []
labels5 = []
labels6 = []
labels7 = []
labels8 = []

# In[167]:

## Setting up the 0/1 labels for each cluster
# C1
for row in df_x['gene name']:
    labels1.append(1 if row in cluster1_tgs else 0)
# C2
for row in df_x['gene name']:
    labels2.append(1 if row in cluster2_tgs else 0)
# C3
for row in df_x['gene name']:
    labels3.append(1 if row in cluster3_tgs else 0)
# C4
for row in df_x['gene name']:
    labels4.append(1 if row in cluster4_tgs else 0)
# C5
for row in df_x['gene name']:
    labels5.append(1 if row in cluster5_tgs else 0)
# C6
for row in df_x['gene name']:
    labels6.append(1 if row in cluster6_tgs else 0)
# C7
for row in df_x['gene name']:
    labels7.append(1 if row in cluster7_tgs else 0)
# C8
for row in df_x['gene name']:
    labels8.append(1 if row in cluster8_tgs else 0)

# In[168]:

# Adding the cluster labels as extra columns, one per cluster
df_exp['cluster 1'] = labels1
df_exp['cluster 2'] = labels2
df_exp['cluster 3'] = labels3
df_exp['cluster 4'] = labels4
df_exp['cluster 5'] = labels5
df_exp['cluster 6'] = labels6
df_exp['cluster 7'] = labels7
df_exp['cluster 8'] = labels8

# In[169]:

df_exp.head()
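# A more compact, equivalent sketch of the two labelling cells above (an addition; it assumes the gene names in the expression table match the TG lists exactly): pandas .isin() can build the same eight 0/1 columns without explicit loops.

# In[ ]:

all_cluster_tgs = [cluster1_tgs, cluster2_tgs, cluster3_tgs, cluster4_tgs,
                   cluster5_tgs, cluster6_tgs, cluster7_tgs, cluster8_tgs]
for i, tgs in enumerate(all_cluster_tgs, start=1):
    df_exp['cluster ' + str(i)] = df_x['gene name'].isin(tgs).astype(int).values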
# In[170]:

# Let's flag the target genes of all eight clusters.
# This will allow us to use only the genes that are regulated by TFs
# (i.e. the TGs of the TRN) as training data
TGs_list = []
for row in df_x['gene name']:
    if (row in cluster1_tgs or row in cluster2_tgs or row in cluster3_tgs or row in cluster4_tgs
            or row in cluster5_tgs or row in cluster6_tgs or row in cluster7_tgs or row in cluster8_tgs):
        TGs_list.append(1)
    else:
        TGs_list.append(0)

# In[171]:

## Adding the TG list as a column to the expression data
df_exp['TGs'] = TGs_list

# In[172]:

df_exp.head()

# In[173]:

## Let's keep the genes that are regulated by TFs
regulons_df = df_exp[df_exp['TGs'] == 1]

# In[174]:

del(regulons_df['TGs'])

# In[175]:

# And the genes that are not regulated by TFs
non_reg_df = df_exp[df_exp['TGs'] == 0]

# In[176]:

del(non_reg_df['TGs'])

# In[ ]:

# In[177]:

## Making a dataframe called noise by randomly picking
## genes that are NOT regulated by TFs, without replacement
noise = non_reg_df.sample(n=20, replace=False, axis=0, random_state=42)

# In[178]:

regulons_with_noise = pd.concat([regulons_df, noise])  ## unbiased train/test dataset
regulons_with_noise.shape  ## Let's look at the nrows and ncols

# In[185]:

X_data = regulons_with_noise.iloc[:, :-8]
y_data = regulons_with_noise.iloc[:, -8:]

# In[180]:

from sklearn.model_selection import train_test_split

# In[186]:

# The test subset will correspond to 30% of the data, chosen at random
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

# In[112]:

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from keras.metrics import categorical_accuracy
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# In[104]:

import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()
import holoviews as hv
hv.extension('bokeh')

# In[182]:

X_train.shape

# In[187]:

# softmax activation on the hidden layer
model = Sequential()
model.add(Dense(units=500, activation='softmax', input_dim=n_components))
model.add(Dense(units=8))  # 8-cluster classification output
model.compile(loss='mse', optimizer='RMSprop', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=80, batch_size=60)

# In[188]:

x = bokeh.plotting.figure(height=400, width=650,
                          x_axis_label='epoch',
                          y_axis_label='accuracy',
                          y_range=(0, 1),
                          title='Model Training Accuracy P. aeruginosa')
x.circle(x=np.arange(1, 81, 1), y=history.history['acc'], fill_alpha=0.3)
#x.line(t, p[:,1])
#x.line(t, p[:,2])
bokeh.io.show(x)
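# An alternative model formulation (an addition, not the network trained above): for multi-label targets, a sigmoid output layer trained with binary cross-entropy is the more conventional choice than a linear output with MSE.

# In[ ]:

alt_model = Sequential()
alt_model.add(Dense(units=500, activation='relu', input_dim=n_components))
alt_model.add(Dense(units=8, activation='sigmoid'))  # one independent probability per cluster
alt_model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
alt_history = alt_model.fit(X_train, y_train, epochs=80, batch_size=60)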
# In[243]:

# Keras simulations, summarized with Matplotlib
n_simulations = 30
train_acc = []
test_acc = []
for i in range(n_simulations):
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)
    model = Sequential()
    model.add(Dense(units=500, activation='softmax', input_dim=n_components))
    model.add(Dense(units=8))  # 8-cluster classification output
    model.compile(loss='mse', optimizer='RMSprop', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=80, batch_size=60)
    accuracy = history.history['acc']
    loss = history.history['loss']
    train_acc.append(accuracy[79])
    score = model.evaluate(X_test, y_test, verbose=0)
    test_acc.append(score[1])
    # summarize history for accuracy/loss
    plt.plot(accuracy, 'o', color='mediumaquamarine', alpha=0.3, markersize=5)
    plt.plot(loss, 'o', color='orangered', alpha=0.3, markersize=5)

plt.title('Keras Model Training $P. aeruginosa$', fontsize=15)
plt.ylabel('Acc / Loss')
plt.xlabel('epoch')
plt.ylim(0, 1.1)
plt.legend(['Acc.', 'Loss'], loc='best')
plt.savefig('keras-model-train-paeru.tiff', dpi=350)

# In[245]:

# Organism and split tags for each simulation (the organism string is an assumed label)
paeru = ['P. aeruginosa'] * len(train_acc)
train = ['train'] * len(train_acc)
test = ['test'] * len(train_acc)

# In[246]:

x = list(zip(train_acc, paeru, train))
y = list(zip(test_acc, paeru, test))

# In[247]:

x[:5]

# In[248]:

entries = x + y

# In[249]:

paeru_df = pd.DataFrame(entries, columns=['accuracy', 'organism', 'type'])

# In[250]:

paeru_df.to_csv('paeru-model.csv')

# In[251]:

sns.violinplot(x='organism', y='accuracy', hue='type', data=paeru_df,
               palette='pastel', scale='width', inner='quartile')
plt.ylim(.3, 1.1)

# In[252]:

y_pred = model.predict(X_test, batch_size=100)
y_pred_flat = np.round(y_pred.flatten())
y_test_flat = y_test.values.flatten()

# In[253]:

conf_mat = confusion_matrix(y_test_flat, y_pred_flat)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=['not in cluster', 'inside cluster'],
            yticklabels=['not in cluster', 'inside cluster'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Keras Classifier confusion matrix $P. aeruginosa$')
#plt.savefig('conf-mat-paeru-keras.tiff', dpi = 350)

# In[254]:

score = model.evaluate(X_test, y_test, verbose=0)
print('Keras model testing loss: {0:.3f} and accuracy: {1:.3f}'.format(score[0], score[1]))

# In[124]:

accuracy = history.history['acc']

# In[125]:

accuracy[79]

# We get a decent ~85-95% training accuracy with the Keras classifier. Now let's try other algorithms.
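# A hedged aside (an addition): the confusion matrix above flattens all eight labels into one binary matrix; scikit-learn >= 0.21 also offers multilabel_confusion_matrix, which returns one 2x2 matrix per cluster.

# In[ ]:

from sklearn.metrics import multilabel_confusion_matrix

# Clip the rounded network outputs into {0, 1} before comparing against the true labels
per_cluster_cm = multilabel_confusion_matrix(y_test.values, np.clip(np.round(y_pred), 0, 1))
per_cluster_cm.shape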
# ### Comparing the Keras classifier with other algorithms

# In[130]:

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# In[257]:

grid_params = {'max_depth': [5, 30],
               'min_samples_split': [5, 20]}

# In[258]:

fixed_params = {'n_estimators': 50,
                'random_state': 42,
                'n_jobs': -1}

# In[ ]:

# In[259]:

grid_rf = GridSearchCV(RandomForestClassifier(**fixed_params), param_grid=grid_params,
                       n_jobs=-1, scoring='neg_mean_absolute_error')

# In[260]:

grid_rf.fit(X_train, y_train)

# In[261]:

grid_rf.best_params_

# In[262]:

model = RandomForestClassifier(**fixed_params, **grid_rf.best_params_)

# In[263]:

model.fit(X_train, y_train)

# In[264]:

y_pred = model.predict(X_test)
y_pred_flat = y_pred.flatten()

# In[265]:

print(classification_report(y_test, y_pred))
print('Accuracy score : ', accuracy_score(y_test, y_pred))

# In[138]:

results = cross_val_score(model, X_train, y_train, cv=5)
print("Mean cross validation score: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# In[266]:

conf_mat = confusion_matrix(y_test_flat, y_pred_flat)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=['not in cluster', 'inside cluster'],
            yticklabels=['not in cluster', 'inside cluster'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Random Forest Confusion Matrix $P. aeruginosa$')
plt.savefig('conf-mat-paeru-rf.tiff', dpi=350)

# ### MLP

# In[267]:

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(1000, 20), random_state=42)
model.fit(X_train, y_train)

# In[268]:

y_pred = model.predict(X_test)
y_test_flat = y_test.values.flatten()
y_pred_flat = y_pred.flatten()

# In[269]:

print(classification_report(y_test, y_pred))
print('Accuracy score : ', accuracy_score(y_test, y_pred))

# In[143]:

results = cross_val_score(model, X_train, y_train, cv=5)
print("Mean cross validation score: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# In[270]:

conf_mat = confusion_matrix(y_test_flat, y_pred_flat)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=['not in cluster', 'inside cluster'],
            yticklabels=['not in cluster', 'inside cluster'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('MLP Confusion Matrix $P. aeruginosa$')
plt.savefig('conf-mat-paeru-mlp.tiff', dpi=350)

# In the case of *P. aeruginosa*, the best model is the MLP. One should tune its hyperparameters further for better results. Another useful next step would be feature engineering on the expression conditions to improve prediction accuracy.
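# A minimal sketch of the suggested tuning step (an addition; the parameter grid below is an assumption, not part of the original analysis): grid-search a few MLP hyperparameters on the same training split.

# In[ ]:

mlp_grid = GridSearchCV(MLPClassifier(solver='lbfgs', random_state=42),
                        param_grid={'alpha': [1e-5, 1e-3, 1e-1],
                                    'hidden_layer_sizes': [(500,), (1000, 20)]},
                        cv=3, n_jobs=-1)
mlp_grid.fit(X_train, y_train)
print(mlp_grid.best_params_, mlp_grid.best_score_)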