#!/usr/bin/env python
# coding: utf-8

# ## Functional prediction of hypothetical TFs in bacteria using supervised machine learning: *Pseudomonas aeruginosa*

# Created by Emanuel Flores-Bautista, 2018. All code contained in this notebook is licensed under the [Creative Commons License 4.0](https://creativecommons.org/licenses/by/4.0/).

# This is the workflow we will use:
#
# 1. Load the TF-TF network from RegulonDB as a `DataFrame` and as a NetworkX graph.
# 2. Apply the Louvain clustering algorithm to find the modules (caveat: the cluster labels change on each run, but the clusters keep the same content, i.e. we have to make our predictions in one single run). Add a new node attribute to the network holding the cluster labels. Validate their functional relationships (e.g. SoxR and SoxS, and GadEWX, fall into the same modules, respectively).
# 3. Data preprocessing: check for missing data, normalize, and reduce dimensionality using PCA.
# 4. Multi-label classification setup: assign the cluster labels to the expression DataFrame.
# 5. Train a multi-label neural network on the expression data of the known TFs using the cluster labels (trying an SVM first is also an option). Based on this reference: https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f
# 6. Validate the performance. We can try four different algorithms and then predict with the best one.
# 7. Predict the labels for the TFs with unknown physiological function.
# 8. Analyze the results.

# In[1]:

pwd

# In[2]:

cd ../Desktop/uni/bioinfo/data/paeru/

# In[5]:

## Setting up the pyplot figures inside the notebook
get_ipython().run_line_magic('matplotlib', 'inline')
# Get SVG graphics from the notebook
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import random
import matplotlib as mpl

sns.set_style('white')
sns.set_context("talk", font_scale=1, rc={"lines.linewidth": 2.0, 'lines.markersize': 5})
sns.set_style("ticks")
sns.set_style({"xtick.direction": "in", "ytick.direction": "in"})
#mpl.rc('axes', prop_cycle=(cycler('color', ['b','g','y','m','c']) ))
mpl.rc('text', usetex=False)
tw = 1.5
sns.set_style({"xtick.major.size": 6, "ytick.major.size": 6,
               "xtick.minor.size": 4, "ytick.minor.size": 4,
               'axes.labelsize': 28,
               'xtick.major.width': tw, 'xtick.minor.width': tw,
               'ytick.major.width': tw, 'ytick.minor.width': tw})
mpl.rc('xtick', labelsize=28)
mpl.rc('ytick', labelsize=28)
mpl.rc('axes', linewidth=1.75)
plt.gcf().subplots_adjust(bottom=0.15)
sns.set_style({'axes.labelsize': 24})

# In[5]:

ls *.csv

# In[6]:

df_trn = pd.read_csv('paeru-trn.csv', comment='#')

# In[7]:

df_trn.head()

# In[8]:

# Pandas DataFrame to a NetworkX graph object
trn = nx.from_pandas_edgelist(df=df_trn, source='Regulator', target='Target gene',
                              edge_attr='mode of regulation')

# In[9]:

# Calculating eigenvector centrality to get the hubs
eigen_cen = nx.eigenvector_centrality(trn)
hubs = sorted(eigen_cen.items(), key=lambda cc: cc[1], reverse=True)[:10]
hubs

# In[10]:

import community

# In[11]:

# Running the Louvain clustering algorithm to divide our network into clusters
communities_trn = community.best_partition(trn)
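# A hedged aside on the caveat in step 2 above (an addition, not part of the original run): recent python-louvain releases accept a `random_state` keyword (older releases expose `randomize` instead), which makes the Louvain partition reproducible across runs. The exact keyword depends on the installed version.

# In[ ]:

# Reproducible Louvain partition, assuming a python-louvain version with random_state
communities_trn_fixed_seed = community.best_partition(trn, random_state=42)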
# In[12]:

# How many clusters do we get? (Louvain labels are zero-based, so the number of modules is this value plus one.)
max(communities_trn.values())

# In[13]:

# Extracting the largest connected component (LCC)
trn_lcc = max(nx.connected_component_subgraphs(trn), key=len)

# In[14]:

len(trn_lcc)

# In[15]:

# Running the Louvain clustering algorithm to divide our network into clusters
communities_trn = community.best_partition(trn_lcc)

# In[16]:

# How many clusters do we get with the TRN's LCC?
max(communities_trn.values())

# In[17]:

nx.set_node_attributes(trn_lcc, values=communities_trn, name='modularity')

# Let's extract the network clusters.

# In[18]:

cluster1 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 0]
cluster2 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 1]
cluster3 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 2]
cluster4 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 3]
cluster5 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 4]
cluster6 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 5]
cluster7 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 6]
cluster8 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 7]
cluster9 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 8]
cluster10 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 9]
cluster11 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 10]
cluster12 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 11]
cluster13 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 12]
cluster14 = [n for n in trn_lcc.nodes() if trn_lcc.node[n]['modularity'] == 13]

# In[19]:

nx.draw(trn_lcc, node_color='lightyellow', with_labels=True, alpha=0.6, font_size=8)

# In[20]:

import csv

with open('list_tfs.txt') as inputfile:
    results = list(csv.reader(inputfile))

# In[21]:

results[:5]

# In[22]:

# Concatenating the lists with itertools
import itertools

tf_list = list(itertools.chain.from_iterable(results))

# In[23]:

tf_list[:5]

# #### Extracting the TF-TF network

# In[24]:

tf_tf_list = []

# In[25]:

for row in df_trn['Target gene']:
    if row in tf_list:
        tf_tf_list.append(1)
    else:
        tf_tf_list.append(0)

# In[26]:

df_trn['tf'] = tf_tf_list

# In[27]:

df_trn.head()

# In[28]:

tf_tf_net = df_trn[df_trn['tf'] == 1]

# In[29]:

tf_tf_net.shape

# In[30]:

tf_tf_net.head()

# In[31]:

# Pandas DataFrame to a NetworkX graph object
tf_trn = nx.from_pandas_edgelist(df=tf_tf_net, source='Regulator', target='Target gene',
                                 edge_attr='mode of regulation')

# In[32]:

# Calculating eigenvector centrality to get the hubs
eigen_cen = nx.eigenvector_centrality(tf_trn)
hubs = sorted(eigen_cen.items(), key=lambda cc: cc[1], reverse=True)[:10]
hubs

# In[33]:

# Clustering the TF-TF network
communities_trn = community.best_partition(tf_trn)

# In[34]:

# How many clusters do we get?
max(communities_trn.values())

# In[35]:

# Extracting the LCC
tf_tf_lcc = max(nx.connected_component_subgraphs(tf_trn), key=len)

# In[36]:

len(tf_tf_lcc)

# In[37]:

communities_trn = community.best_partition(tf_tf_lcc)
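# A hedged compatibility note (an addition, not part of the original run): nx.connected_component_subgraphs() was removed in NetworkX 2.4, so on newer versions the largest connected component can be obtained as shown below.

# In[ ]:

# Equivalent LCC extraction on NetworkX >= 2.4
tf_tf_lcc_nodes = max(nx.connected_components(tf_trn), key=len)
tf_tf_lcc_alt = tf_trn.subgraph(tf_tf_lcc_nodes).copy()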
# In[38]:

# How many clusters do we get?
print(max(communities_trn.values()) + 1)

# In[39]:

nx.set_node_attributes(tf_tf_lcc, values=communities_trn, name='modularity')

# In[40]:

cluster1 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 0]
cluster2 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 1]
cluster3 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 2]
cluster4 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 3]
cluster5 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 4]
cluster6 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 5]
cluster7 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 6]
cluster8 = [n for n in tf_tf_lcc.nodes() if tf_tf_lcc.node[n]['modularity'] == 7]

# In[41]:

cluster1

# In[42]:

cluster2

# Now that we have our clusters of TFs, let's proceed to preprocess the expression data.

# ### Expression data pre-processing

# In[43]:

df_x = pd.read_csv("colombos_paeru_exprdata_20151029.txt", delimiter='\t', comment='#')
df_x.head()

# In[44]:

df_x.shape

# In[45]:

def test_missing_data(df, fname):
    """Look for missing entries."""
    assert np.all(df.notnull()), fname + ' contains missing data'

# In[46]:

fname = 'colombos_paeru_exprdata_20151029.txt'

# In[47]:

#test_missing_data(df_x, fname)

# In[48]:

## Replacing the NaNs with the median of each column (expression condition)
df_x = df_x.fillna(df_x.median())

# In[49]:

test_missing_data(df_x, fname)

# We're good to go.

# #### Dividing the expression and annotation datasets

# In[50]:

annot = df_x.iloc[:, :3]
exp = df_x.iloc[:, 3:]

# In[51]:

annot.head()

# In[52]:

from sklearn.preprocessing import StandardScaler as st

std_scaler = st()                      # initialize the StandardScaler object
df_ex = std_scaler.fit_transform(exp)  # transform the data (returns a numpy array)
df_ex = pd.DataFrame(df_ex)            # back to a DataFrame

# In[53]:

df_ex.describe()  ## check that mean = 0 and s.d. = 1

# Let's now apply a PCA to our normalized data.

# In[54]:

from sklearn.decomposition import PCA

# apply dimensionality reduction
pca = PCA(svd_solver='randomized', random_state=42).fit(df_ex)

# In[55]:

cum_exp_var = np.cumsum(pca.explained_variance_ratio_)

# look at it
plt.plot(cum_exp_var, color='salmon', alpha=0.7)
plt.xlabel('Number of dimensions')
plt.ylabel('Cumulative explained variance ratio')
#plt.axvline(x= 14 , color='k', linestyle='--')
plt.title('PCA Explained Variance', fontsize=20)
plt.xlim(0, 500)
#plt.savefig('PCA-var.tiff',dpi=500, bbox_inches='tight')

# In[56]:

vr = np.cumsum(pca.explained_variance_ratio_)

# In[57]:

vr[300]

# In[58]:

n_components = 300

# In[59]:

## Now let's transform our data with the fitted PCA, keeping the original index so we can keep track of the genes
df_pca = pca.transform(df_ex)
df_pca = pd.DataFrame(df_pca, index=df_x.index)

# In[60]:

df_exp = df_pca.iloc[:, :n_components]  ## We'll keep 300 principal components

# In[61]:

## changing the column names to PC1, PC2, ...
df_exp.columns = ['PC' + str(x) for x in range(1, n_components + 1)]

# In[62]:

df_exp.head()
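# A minimal sketch (an addition; the 95% threshold is an assumption): the number of components kept above is hard-coded to 300, but the same choice could be made programmatically as the smallest number of PCs reaching a target cumulative explained variance.

# In[ ]:

# First index where the cumulative explained variance crosses 0.95, plus one to get a count
n_95 = int(np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95)) + 1
print(n_95, 'components reach 95% of the variance')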
# ### Multi-label classification data preparation: assign the cluster labels to the expression DataFrame.

# In[63]:

df_trn.tail()

# In[64]:

del(df_trn['tf'])

# In[65]:

df_trn.columns = ['tf', 'operon', 'tg', 'regtype', 'ev', 'PMID', 'strain']

# #### Cluster 1:

# In[66]:

cluster1

# In[67]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_1 = df_trn[df_trn['tf'].isin(['gbuR', 'pfeR', 'pchR', 'fur', 'agmR', 'glpR'])]

# In[68]:

# Unique target genes (TGs) of the first cluster's regulons
cluster1_tgs = list(set(cluster_1['tg']))
print(len(cluster1_tgs))

# #### Cluster 2:

# In[69]:

cluster2

# In[70]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_2 = df_trn[df_trn['tf'].isin(['gacA', 'qscR', 'ampR', 'algQ', 'algR4', 'mvfR',
                                      'rhlR', 'rsaL', 'ihf', 'algR3', 'lasR'])]

# In[71]:

# Unique target genes (TGs) of the second cluster's regulons
cluster2_tgs = list(set(cluster_2['tg']))
# Let's see how many TGs cluster 2 has
print(len(cluster2_tgs))

# In[72]:

cluster3

# In[73]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_3 = df_trn[df_trn['tf'].isin(['fleQ', 'vfr', 'pvdS', 'hu', 'ptxR', 'mvtA',
                                      'ptxS', 'toxR', 'rpoD'])]

# In[74]:

# Unique target genes (TGs) of the third cluster's regulons
cluster3_tgs = list(set(cluster_3['tg']))
print(len(cluster3_tgs))

# In[75]:

cluster4

# In[76]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_4 = df_trn[df_trn['tf'].isin(['exsA', 'mexR', 'mexT', 'nalC', 'psrA', 'rpoS', 'exsD'])]

# In[77]:

# Unique target genes (TGs) of the fourth cluster's regulons
cluster4_tgs = list(set(cluster_4['tg']))
print(len(cluster4_tgs))
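# A hedged, generic version of the per-cluster blocks above and below (an addition; it assumes the node names in cluster1..cluster8 use the same spelling as the 'tf' column): collect each module's target genes directly from the TRN table instead of typing the TF names by hand.

# In[ ]:

tf_clusters = [cluster1, cluster2, cluster3, cluster4, cluster5, cluster6, cluster7, cluster8]
cluster_tgs_auto = [sorted(set(df_trn.loc[df_trn['tf'].isin(c), 'tg'])) for c in tf_clusters]
[len(tgs) for tgs in cluster_tgs_auto]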
# In[78]:

cluster5

# In[79]:

cluster_5 = df_trn[df_trn['tf'].isin(['dnr', 'anr', 'narL'])]

# In[80]:

cluster5_tgs = list(set(cluster_5['tg']))
print(len(cluster5_tgs))

# In[81]:

cluster6

# In[82]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_6 = df_trn[df_trn['tf'].isin(['PA0779', 'pilA', 'fhpR', 'rpoN', 'pilR'])]

# In[83]:

cluster6_tgs = list(set(cluster_6['tg']))
print(len(cluster6_tgs))

# In[84]:

cluster7

# In[85]:

# Now let's filter the regulons of each TF in this cluster from the TRN
cluster_7 = df_trn[df_trn['tf'].isin(['phoQ', 'pmrA', 'phoP', 'tbpA'])]

# In[86]:

cluster7_tgs = list(set(cluster_7['tg']))
print(len(cluster7_tgs))

# In[162]:

cluster2

# In[164]:

cluster_8 = df_trn[df_trn['tf'].isin(['algR', 'algZ', 'algU', 'mucB'])]

# In[165]:

cluster8_tgs = list(set(cluster_8['tg']))
print(len(cluster8_tgs))

# In[166]:

# Initializing the labels' lists
labels1 = []
labels2 = []
labels3 = []
labels4 = []
labels5 = []
labels6 = []
labels7 = []
labels8 = []

# In[167]:

## Setting up the 0/1 labels for each cluster
# C1
for row in df_x['gene name']:
    labels1.append(1 if row in cluster1_tgs else 0)
# C2
for row in df_x['gene name']:
    labels2.append(1 if row in cluster2_tgs else 0)
# C3
for row in df_x['gene name']:
    labels3.append(1 if row in cluster3_tgs else 0)
# C4
for row in df_x['gene name']:
    labels4.append(1 if row in cluster4_tgs else 0)
# C5
for row in df_x['gene name']:
    labels5.append(1 if row in cluster5_tgs else 0)
# C6
for row in df_x['gene name']:
    labels6.append(1 if row in cluster6_tgs else 0)
# C7
for row in df_x['gene name']:
    labels7.append(1 if row in cluster7_tgs else 0)
# C8
for row in df_x['gene name']:
    labels8.append(1 if row in cluster8_tgs else 0)

# In[168]:

# Adding the cluster labels as extra columns, one per cluster
df_exp['cluster 1'] = labels1
df_exp['cluster 2'] = labels2
df_exp['cluster 3'] = labels3
df_exp['cluster 4'] = labels4
df_exp['cluster 5'] = labels5
df_exp['cluster 6'] = labels6
df_exp['cluster 7'] = labels7
df_exp['cluster 8'] = labels8

# In[169]:

df_exp.head()
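# A more compact, equivalent sketch of the two labelling cells above (an addition; it assumes the gene names in the expression table match the TG lists exactly): pandas .isin() can build the same eight 0/1 columns without explicit loops.

# In[ ]:

all_cluster_tgs = [cluster1_tgs, cluster2_tgs, cluster3_tgs, cluster4_tgs,
                   cluster5_tgs, cluster6_tgs, cluster7_tgs, cluster8_tgs]
for i, tgs in enumerate(all_cluster_tgs, start=1):
    df_exp['cluster ' + str(i)] = df_x['gene name'].isin(tgs).astype(int).values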
# In[170]:

# Let's flag the target genes of all eight clusters.
# This will allow us to use only the genes that are regulated by TFs
# (i.e. the TGs of the TRN) as training data
TGs_list = []
for row in df_x['gene name']:
    if (row in cluster1_tgs or row in cluster2_tgs or row in cluster3_tgs or row in cluster4_tgs
            or row in cluster5_tgs or row in cluster6_tgs or row in cluster7_tgs or row in cluster8_tgs):
        TGs_list.append(1)
    else:
        TGs_list.append(0)

# In[171]:

## Adding the TG list as a column to the expression data
df_exp['TGs'] = TGs_list

# In[172]:

df_exp.head()

# In[173]:

## Let's keep the genes that are regulated by TFs
regulons_df = df_exp[df_exp['TGs'] == 1]

# In[174]:

del(regulons_df['TGs'])

# In[175]:

# And the genes that are not regulated by TFs
non_reg_df = df_exp[df_exp['TGs'] == 0]

# In[176]:

del(non_reg_df['TGs'])

# In[ ]:

# In[177]:

## Making a dataframe called noise by randomly picking
## genes that are NOT regulated by TFs, without replacement
noise = non_reg_df.sample(n=20, replace=False, axis=0, random_state=42)

# In[178]:

regulons_with_noise = pd.concat([regulons_df, noise])  ## unbiased train/test dataset
regulons_with_noise.shape  ## Let's look at the nrows and ncols

# In[185]:

X_data = regulons_with_noise.iloc[:, :-8]
y_data = regulons_with_noise.iloc[:, -8:]

# In[180]:

from sklearn.model_selection import train_test_split

# In[186]:

# The test subset will correspond to 30% of the data, chosen at random
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

# In[112]:

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from keras.metrics import categorical_accuracy
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# In[104]:

import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()
import holoviews as hv
hv.extension('bokeh')

# In[182]:

X_train.shape

# In[187]:

# softmax activation on the hidden layer
model = Sequential()
model.add(Dense(units=500, activation='softmax', input_dim=n_components))
model.add(Dense(units=8))  # 8-cluster classification output
model.compile(loss='mse', optimizer='RMSprop', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=80, batch_size=60)

# In[188]:

x = bokeh.plotting.figure(height=400, width=650,
                          x_axis_label='epoch',
                          y_axis_label='accuracy',
                          y_range=(0, 1),
                          title='Model Training Accuracy P. aeruginosa')
x.circle(x=np.arange(1, 81, 1), y=history.history['acc'], fill_alpha=0.3)
#x.line(t, p[:,1])
#x.line(t, p[:,2])
bokeh.io.show(x)
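# An alternative model formulation (an addition, not the network trained above): for multi-label targets, a sigmoid output layer trained with binary cross-entropy is the more conventional choice than a linear output with MSE.

# In[ ]:

alt_model = Sequential()
alt_model.add(Dense(units=500, activation='relu', input_dim=n_components))
alt_model.add(Dense(units=8, activation='sigmoid'))  # one independent probability per cluster
alt_model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
alt_history = alt_model.fit(X_train, y_train, epochs=80, batch_size=60)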
# In[243]:

# Keras simulations, summarized with Matplotlib
n_simulations = 30
train_acc = []
test_acc = []
for i in range(n_simulations):
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)
    model = Sequential()
    model.add(Dense(units=500, activation='softmax', input_dim=n_components))
    model.add(Dense(units=8))  # 8-cluster classification output
    model.compile(loss='mse', optimizer='RMSprop', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=80, batch_size=60)
    accuracy = history.history['acc']
    loss = history.history['loss']
    train_acc.append(accuracy[79])
    score = model.evaluate(X_test, y_test, verbose=0)
    test_acc.append(score[1])
    # summarize history for accuracy/loss
    plt.plot(accuracy, 'o', color='mediumaquamarine', alpha=0.3, markersize=5)
    plt.plot(loss, 'o', color='orangered', alpha=0.3, markersize=5)

plt.title('Keras Model Training $P. aeruginosa$', fontsize=15)
plt.ylabel('Acc / Loss')
plt.xlabel('epoch')
plt.ylim(0, 1.1)
plt.legend(['Acc.', 'Loss'], loc='best')
plt.savefig('keras-model-train-paeru.tiff', dpi=350)

# In[245]:

# Organism and split tags for each simulation (the organism string is an assumed label)
paeru = ['P. aeruginosa'] * len(train_acc)
train = ['train'] * len(train_acc)
test = ['test'] * len(train_acc)

# In[246]:

x = list(zip(train_acc, paeru, train))
y = list(zip(test_acc, paeru, test))

# In[247]:

x[:5]

# In[248]:

entries = x + y

# In[249]:

paeru_df = pd.DataFrame(entries, columns=['accuracy', 'organism', 'type'])

# In[250]:

paeru_df.to_csv('paeru-model.csv')

# In[251]:

sns.violinplot(x='organism', y='accuracy', hue='type', data=paeru_df,
               palette='pastel', scale='width', inner='quartile')
plt.ylim(.3, 1.1)

# In[252]:

y_pred = model.predict(X_test, batch_size=100)
y_pred_flat = np.round(y_pred.flatten())
y_test_flat = y_test.values.flatten()

# In[253]:

conf_mat = confusion_matrix(y_test_flat, y_pred_flat)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=['not in cluster', 'inside cluster'],
            yticklabels=['not in cluster', 'inside cluster'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Keras Classifier confusion matrix $P. aeruginosa$')
#plt.savefig('conf-mat-paeru-keras.tiff', dpi = 350)

# In[254]:

score = model.evaluate(X_test, y_test, verbose=0)
print('Keras model testing loss: {0:.3f} and accuracy: {1:.3f}'.format(score[0], score[1]))

# In[124]:

accuracy = history.history['acc']

# In[125]:

accuracy[79]

# We get a decent ~85-95% training accuracy with the Keras classifier. Now let's try other algorithms.
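# A hedged aside (an addition): the confusion matrix above flattens all eight labels into one binary matrix; scikit-learn >= 0.21 also offers multilabel_confusion_matrix, which returns one 2x2 matrix per cluster.

# In[ ]:

from sklearn.metrics import multilabel_confusion_matrix

# Clip the rounded network outputs into {0, 1} before comparing against the true labels
per_cluster_cm = multilabel_confusion_matrix(y_test.values, np.clip(np.round(y_pred), 0, 1))
per_cluster_cm.shape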
# ### Comparing the Keras classifier with other algorithms

# In[130]:

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# In[257]:

grid_params = {'max_depth': [5, 30],
               'min_samples_split': [5, 20]}

# In[258]:

fixed_params = {'n_estimators': 50,
                'random_state': 42,
                'n_jobs': -1}

# In[ ]:

# In[259]:

grid_rf = GridSearchCV(RandomForestClassifier(**fixed_params), param_grid=grid_params,
                       n_jobs=-1, scoring='neg_mean_absolute_error')

# In[260]:

grid_rf.fit(X_train, y_train)

# In[261]:

grid_rf.best_params_

# In[262]:

model = RandomForestClassifier(**fixed_params, **grid_rf.best_params_)

# In[263]:

model.fit(X_train, y_train)

# In[264]:

y_pred = model.predict(X_test)
y_pred_flat = y_pred.flatten()

# In[265]:

print(classification_report(y_test, y_pred))
print('Accuracy score : ', accuracy_score(y_test, y_pred))

# In[138]:

results = cross_val_score(model, X_train, y_train, cv=5)
print("Mean cross validation score: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# In[266]:

conf_mat = confusion_matrix(y_test_flat, y_pred_flat)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=['not in cluster', 'inside cluster'],
            yticklabels=['not in cluster', 'inside cluster'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Random Forest Confusion Matrix $P. aeruginosa$')
plt.savefig('conf-mat-paeru-rf.tiff', dpi=350)

# ### MLP

# In[267]:

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(1000, 20), random_state=42)
model.fit(X_train, y_train)

# In[268]:

y_pred = model.predict(X_test)
y_test_flat = y_test.values.flatten()
y_pred_flat = y_pred.flatten()

# In[269]:

print(classification_report(y_test, y_pred))
print('Accuracy score : ', accuracy_score(y_test, y_pred))

# In[143]:

results = cross_val_score(model, X_train, y_train, cv=5)
print("Mean cross validation score: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# In[270]:

conf_mat = confusion_matrix(y_test_flat, y_pred_flat)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=['not in cluster', 'inside cluster'],
            yticklabels=['not in cluster', 'inside cluster'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('MLP Confusion Matrix $P. aeruginosa$')
plt.savefig('conf-mat-paeru-mlp.tiff', dpi=350)

# In the case of *P. aeruginosa*, the best model is the MLP. One should tune its hyperparameters further for better results. Another useful next step would be feature engineering on the expression conditions to improve prediction accuracy.
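# A minimal sketch of the suggested tuning step (an addition; the parameter grid below is an assumption, not part of the original analysis): grid-search a few MLP hyperparameters on the same training split.

# In[ ]:

mlp_grid = GridSearchCV(MLPClassifier(solver='lbfgs', random_state=42),
                        param_grid={'alpha': [1e-5, 1e-3, 1e-1],
                                    'hidden_layer_sizes': [(500,), (1000, 20)]},
                        cv=3, n_jobs=-1)
mlp_grid.fit(X_train, y_train)
print(mlp_grid.best_params_, mlp_grid.best_score_)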