#!/usr/bin/env python
# coding: utf-8

# Notebook export: matrix factorization of a drug / cell-line imaging screen,
# using KEGG pathway memberships of the mutated genes as structural priors.

# In[179]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.cbook import boxplot_stats
from tqdm.notebook import tqdm

sns.set_palette("coolwarm")

from DataManager import DataManager

# In[2]:

dm = DataManager()

# In[3]:

dm.data

# In[4]:

dm.cell_line_df

# # KEGG Data

# In[5]:

from Bio import SeqIO
from Bio.KEGG.REST import *
from Bio.KEGG.KGML import KGML_parser
#from Bio.Graphics.KGML_vis import KGMLCanvas
#from Bio.Graphics.ColorSpiral import ColorSpiral
from IPython.display import Image, HTML, IFrame

# In[6]:

# Download the list of all human (hsa) pathways and build
# {pathway_id: description}, stripping the "path:" prefix and species suffix.
human_pathways = kegg_list("pathway", "hsa").read()
#human_pathways.decode("utf-8").split("\n")[0:5]
human_pathways = human_pathways.split("\n")
human_pathways = [r.replace("path:", "").replace(" - Homo sapiens (human)", "")
                  for r in human_pathways if r]
human_pathways = {r.split("\t")[0]: r.split("\t")[1] for r in human_pathways}
print(len(human_pathways))

# In[7]:

human_pathways

# In[8]:

database = "genes"
organism = "hsa"
organism_tcode = "T01001"  # KEGG genome T-number for Homo sapiens
query = "MEK2"

# In[ ]:

# In[9]:

def find_KEGG_entries(gene, filtering=True):
    """Search the KEGG human genome (T01001) for a gene symbol.

    Parameters
    ----------
    gene : str
        Gene symbol to look for (e.g. "AKT1").
    filtering : bool
        If True, keep only hits whose gene-name list contains `gene` exactly;
        if False, return every hit.

    Returns
    -------
    list of dict with keys "entry_id", "gene_names", "description".
    """
    resp = kegg_find("T01001", gene).read()
    resp_list = resp.split("\n")
    resp_list = [r for r in resp_list if r]
    query_results = []
    for r in resp_list:
        # maxsplit=1: only the first tab separates id from the rest.
        entry_id, results = r.split("\t", 1)
        # BUG FIX: the original `results.split("; ")` raised ValueError when a
        # description itself contained "; " (or when no "; " was present);
        # partition is robust to both cases.
        genes_str, _, description = results.partition("; ")
        genes = genes_str.split(", ")
        if gene in genes or not filtering:
            query_results.append({"entry_id": entry_id,
                                  "gene_names": genes,
                                  "description": description})
    return query_results

# In[10]:

def parse_pathways(entry_id):
    """Fetch a KEGG gene entry and return its PATHWAY section.

    Parameters
    ----------
    entry_id : str
        KEGG gene identifier, e.g. "hsa:207".

    Returns
    -------
    dict mapping pathway id (e.g. "hsa04151") to its description.
    Returns {} when the entry has no PATHWAY section.
    """
    entry_content = kegg_get(entry_id).read()
    # BUG FIX: the original split on "PATHWAY"..."NETWORK", which raised
    # IndexError when PATHWAY was missing and mis-parsed entries whose section
    # after PATHWAY is not NETWORK.  Parse the flat file line-wise instead:
    # the section runs from the "PATHWAY" header to the next column-0 header.
    parsed_pathways = {}
    in_section = False
    for line in entry_content.split("\n"):
        if line.startswith("PATHWAY"):
            in_section = True
            line = line[len("PATHWAY"):]
        elif in_section and line[:1] not in (" ", ""):
            break  # reached the next section header
        elif not in_section:
            continue
        entry = line.strip()
        if entry:
            path_id, description = entry.split(" ", 1)
            parsed_pathways[path_id.strip()] = description.strip()
    return parsed_pathways

# In[11]:

parse_pathways("hsa:207")

# ## Gene Pathways

# ###
# AKT1

# In[12]:

find_KEGG_entries("AKT1")

# In[13]:

akt1_parsed = parse_pathways("hsa:207")  # AKT1
akt1_pathways = list(akt1_parsed.keys())
len(akt1_pathways)

# ### AKT1 & 2

# In[14]:

find_KEGG_entries("AKT2")

# In[15]:

akt2_parsed = parse_pathways("hsa:208")  # AKT2
akt2_pathways = list(akt2_parsed.keys())
akt1_and_2_pathways = list(set(akt1_pathways + akt2_pathways))
len(akt1_and_2_pathways)

# ### MEK2

# In[16]:

find_KEGG_entries("MEK2")

# In[17]:

mek2_parsed = parse_pathways("hsa:5605")  # MAP2K2 / MEK2
mek2_pathways = list(mek2_parsed.keys())
mek2_pathways

# ### P53

# In[18]:

find_KEGG_entries("P53")

# In[19]:

p53_parsed = parse_pathways("hsa:7157")  # TP53
p53_pathways = list(p53_parsed.keys())
len(p53_pathways)

# ### PTEN

# In[20]:

find_KEGG_entries("PTEN")

# In[21]:

pten_parsed = parse_pathways("hsa:5728")  # PTEN
pten_pathways = list(pten_parsed.keys())
len(pten_pathways)

# ### BAX

# In[22]:

find_KEGG_entries("BAX")

# In[23]:

bax_parsed = parse_pathways("hsa:581")  # BAX
bax_pathways = list(bax_parsed.keys())
len(bax_pathways)

# ### MEK1

# In[24]:

find_KEGG_entries("MEK1")

# In[25]:

# BUG FIX: the original queried "hsa:581" (BAX, copy-pasted from the cell
# above); MEK1 is MAP2K1 = hsa:5604.
mek1_parsed = parse_pathways("hsa:5604")
mek1_pathways = list(mek1_parsed.keys())
len(mek1_pathways)

# ### PI3KCA

# In[26]:

find_KEGG_entries("PI3K", filtering=False)

# In[27]:

pik3kca_parsed = parse_pathways("hsa:5290")  # PIK3CA
pik3kca_pathways = list(pik3kca_parsed.keys())
len(pik3kca_pathways)

# ### KRAS

# In[28]:

find_KEGG_entries("KRAS")

# In[29]:

kras_parsed = parse_pathways("hsa:3845")  # KRAS
kras_pathways = list(kras_parsed.keys())
len(kras_pathways)

# ### CTNNB1

# In[30]:

find_KEGG_entries("CTNNB1")

# In[31]:

ctnnb1_parsed = parse_pathways("hsa:1499")  # CTNNB1
ctnnb1_pathways = list(ctnnb1_parsed.keys())
len(ctnnb1_pathways)

# # Pathways Priors

# In[32]:

# Cell-line axis order is taken from the DataManager so it matches the integer
# encoding used in dm.data.  (The original also had a hard-coded literal list
# that was immediately overwritten by the next line; it was dead code:
# ["HCT116 P1", "HCT116 P2", "PI3KCA wt", "AKT1", "AKT1/2", "PTEN",
#  "KRAS wt", "MEK1", "MEK2", "CTNNB1 wt", "P53", "BAX"])
mutations_order = list(dm.cell_line_df["mutation"])
hsa_pathways = sorted(list(human_pathways.keys()))
pathway_lists = {
    "HCT116 P1": [],  # parental lines: no mutated gene -> empty pathway prior
    "HCT116 P2": [],
    "PI3KCA wt": pik3kca_pathways,
    "AKT1": akt1_pathways,
    "AKT1/2": akt1_and_2_pathways,
    "PTEN": pten_pathways,
    "KRAS wt": kras_pathways,
    "MEK1": mek1_pathways,
    "MEK2": mek2_pathways,
    "CTNNB1 wt": ctnnb1_pathways,
    "P53": p53_pathways,
    "BAX": bax_pathways
}

# In[33]:

mutations_order

# In[34]:

# Union of all pathways touched by any mutated gene, sorted so the row order
# of the masks is reproducible.
pathways_set = set()
for k, pw in pathway_lists.items():
    pathways_set.update(pw)
len(pathways_set)
pathways_set = sorted(list(pathways_set))

# In[35]:

cl_idxs = {}
# NOTE(review): Series.iteritems() was removed in pandas 2.0 — use .items()
# if the environment is ever upgraded.
for i, mut in dm.cell_line_df["mutation"].iteritems():
    cl_idxs[mut] = i

# In[36]:

cl_pathways_mask = np.zeros((len(pathways_set), len(mutations_order)))

# In[37]:

# Binary prior: entry (cell_line, pathway) is 1 iff the line's mutated gene is
# annotated to that pathway.
for i, mut in enumerate(mutations_order):
    impacted_pathways_idxs = [j for j, p in enumerate(pathways_set)
                              if p in pathway_lists[mut]]
    cl_pathways_mask[impacted_pathways_idxs, i] = 1
cl_pathways_mask = cl_pathways_mask.T
sns.heatmap(cl_pathways_mask)

# In[38]:

# Indexes of the DMSO (vehicle control) entries in the drug table.
dmso_indexes = []
for row in dm.drug_df.itertuples():
    #"dimethyl" in row.Name or "DMSO" in row.SecName or
    if "ctr" in row.GeneID and "DMSO" in row.Content:
        dmso_indexes.append(row.Index)
dmso_indexes

# In[39]:

# Drugs may a-priori act on any pathway, except the DMSO controls whose rows
# are forced to zero.
drug_pathways_mask = np.ones((len(pathways_set), len(dm.drug_df)))
drug_pathways_mask[:, dmso_indexes] = 0
drug_pathways_mask = drug_pathways_mask.T
sns.heatmap(drug_pathways_mask)

# # Features Processing

# In[40]:

control_samples = dm.get_control_samples()
control_samples

# In[41]:

# Mean feature vector of the control wells, in linear (de-log2) space.
avg_ctrl_features = dm.data.iloc[control_samples].drop(
    ["drug", "cell_line", "replicate"], axis=1)
avg_ctrl_features = np.mean(np.exp2(avg_ctrl_features.values), axis=0)
avg_ctrl_features

# In[42]:

# De-log2 every feature column and subtract the average control profile.
new_values = dm.data.values
new_values[:, 3:] = np.exp2(new_values[:, 3:]) - avg_ctrl_features
exp_df = pd.DataFrame(new_values, columns=dm.data.columns)
exp_df["drug"] = exp_df["drug"].astype(int)
exp_df["cell_line"] = exp_df["cell_line"].astype(int)
exp_df["replicate"] = exp_df["replicate"].astype(int)
exp_df

# # Model

# In[43]:

np.random.seed(42)
# Numpy prototype of the factorization: features ~= F @ (c + d).
ptw_ftr_matrix = np.random.randn(len(avg_ctrl_features),
                                 cl_pathways_mask.shape[0]) / 4
cl_ptw_matrix = np.multiply(np.random.rand(*cl_pathways_mask.shape), cl_pathways_mask)
drug_ptw_matrix = np.multiply(np.random.rand(*drug_pathways_mask.shape), drug_pathways_mask)

def predict_features(cl_id, drug_idx):
    """Additive prototype prediction: F @ c[cl_id] + F @ d[drug_idx]."""
    cl_ftrs = np.dot(ptw_ftr_matrix, cl_ptw_matrix[:, cl_id])
    drug_ftrs = np.dot(ptw_ftr_matrix, drug_ptw_matrix[:, drug_idx])
    return cl_ftrs.flatten() + drug_ftrs.flatten()

# In[44]:

def mse_reconstruction(ftrs, c, d, F):
    """Element-wise squared reconstruction error of ftrs ~= F @ (c + d)."""
    return (ftrs - np.dot(F, (c + d).reshape(-1, 1)))**2

def loss_function(ftrs, c, d, F, alpha=0.1, beta=0.01, gamma=0.1):
    """Reconstruction error + L1/L2 penalties on c and d + L2 penalty on F."""
    # BUG FIX: mse_reconstruction requires F; the original call omitted it
    # and would raise TypeError.
    reconstruction_loss = mse_reconstruction(ftrs, c, d, F)
    l1_loss = alpha*(np.sum(np.abs(c)) + np.sum(np.abs(d)))
    l2_loss = beta*(np.sum(c**2) + np.sum(d**2))
    l2_F_loss = gamma*(np.sum(np.multiply(F, F)))
    return reconstruction_loss + l1_loss + l2_loss + l2_F_loss

# def grad_c(ftrs, c, d, F):
#     mse_grad =

# # Pytorch Lightning Model

# In[45]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning import loggers, Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch import device as device_
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# In[87]:

class FeaturesDataset(Dataset):
    """Tensor dataset over a dataframe laid out as [drug, cell_line, replicate, features...].

    Each item is a pair (dcr, features) where dcr is the int32 triple
    (drug, cell_line, replicate).
    """

    def __init__(self, df, control_avg=None, train=None, val_split=0.1, standardize=False):
        super().__init__()
        self.control_avg = control_avg
        self.data = df.values[:, 3:]
        if control_avg is not None:
            self.data -= control_avg
        self.dcr = df.values[:, :3]
        if train is not None:
            # Deterministic split; train=True keeps the training part,
            # train=False the validation part, train=None the whole frame.
            train_idxs, val_idxs = train_test_split(list(range(len(self.data))),
                                                    test_size=val_split,
                                                    random_state=42)
            if train:
                self.data = self.data[train_idxs, :]
                self.dcr = self.dcr[train_idxs, :]
            else:
                self.data = self.data[val_idxs, :]
                self.dcr = self.dcr[val_idxs, :]
        self.standardize = standardize
        self.scaler = StandardScaler()
        if standardize:
            self.data = self.scaler.fit_transform(self.data)
        self.data = torch.tensor(self.data)
        self.dcr = torch.tensor(self.dcr, dtype=torch.int32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return (self.dcr[i, :], self.data[i, :])

    def get_control_cl_idxs(self):
        # Cell lines 4 and 11 are the parental HCT116 P1/P2 controls.
        # NOTE(review): hard-coded ids — confirm against dm.cell_line_df.
        return [i for i, dcr in enumerate(self.dcr) if dcr[1] in [4, 11]]

    def get_control_drug_idxs(self):
        # Samples treated with a DMSO control "drug".
        return [i for i, dcr in enumerate(self.dcr) if dcr[0] in dmso_indexes]

    def get_index(self, drug, cell_line, replicate):
        """Return the dataset index of a (drug, cell_line, replicate) triple, or None."""
        dcr = torch.tensor([drug, cell_line, replicate], dtype=self.dcr.dtype)
        for i in range(len(self.dcr)):
            # BUG FIX: `self.dcr[i, :] == dcr` is an element-wise tensor whose
            # truth value is ambiguous (RuntimeError); use torch.equal.
            if torch.equal(self.dcr[i, :], dcr):
                return i
        return None

class CustomLoss(nn.Module):
    """MSE reconstruction + L1/L2 penalties on both embeddings + L2 on F."""

    def __init__(self, alpha=0.1, beta=0.01, gamma=0.01):
        super().__init__()
        self.reconstr_loss = nn.MSELoss()
        self.l1_loss = nn.L1Loss(reduction="mean")
        self.l2_loss = nn.MSELoss()
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

    def forward(self, pred_ftr, target_ftr, c, d, F):
        reconstr_loss = self.reconstr_loss(pred_ftr, target_ftr)
        # BUG FIX: the second term of each penalty passed `c` with a
        # `zeros_like(d)` target; the drug embedding `d` was never penalized.
        l1_loss = self.alpha*(self.l1_loss(c, torch.zeros_like(c))
                              + self.l1_loss(d, torch.zeros_like(d)))
        l2_loss = self.beta*(self.l2_loss(c, torch.zeros_like(c))
                             + self.l2_loss(d, torch.zeros_like(d)))
        l2_loss_F = self.gamma*self.l2_loss(F, torch.zeros_like(F))
        return reconstr_loss + l1_loss + l2_loss + l2_loss_F

class MF_Model(LightningModule):
    """Masked additive matrix-factorization model.

    features(drug, cl) ~= act(F @ (d[drug] * drug_mask[drug]))
                        + act(F @ (c[cl]   * cl_mask[cl]))

    The masks zero out pathway coordinates that the priors say cannot be
    affected (control drugs, parental cell lines).
    """

    def __init__(self, cl_mask, drug_mask, hparams, train_dataset, val_dataset, dvc="cpu"):
        super().__init__()
        self.hparams = hparams
        self.dvc = dvc  # "cpu" or "cuda"
        # Masks are fixed priors, not trainable parameters.
        self.cl_mask = torch.tensor(cl_mask).detach()
        self.drug_mask = torch.tensor(drug_mask).detach()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        torch.manual_seed(42)
        self.cl_embedding = nn.Embedding(*cl_mask.shape)
        self.cl_embedding.weight.data *= self.cl_mask
        self.drug_embedding = nn.Embedding(*drug_mask.shape)
        self.drug_embedding.weight.data *= self.drug_mask
        n_pathways = drug_mask.shape[1]
        n_ftrs = hparams.get("n_ftrs", 385)
        # Shared pathway -> feature linear map (the "F" matrix), no bias.
        self.ftr_ptw = nn.Linear(n_pathways, n_ftrs, bias=False)
        self.layers = {"cl_embedding": self.cl_embedding,
                       "drug_embedding": self.drug_embedding,
                       "pathway_effect": self.ftr_ptw}
        self.lambda_ftr = hparams.get("lambda_ftr", 0.01)
        self.lambda_drg = hparams.get("lambda_drg", 0.01)
        self.loss_function = nn.MSELoss()
        self.activation = None
        activation = hparams.get("activation", None)
        if activation == "Sigmoid":
            self.activation = nn.Sigmoid()
        elif activation == "Tanh":
            self.activation = nn.Tanh()
        elif activation == "ReLU":
            self.activation = nn.ReLU()

    def get_matrix(self, name):
        """Return the (optionally activated) weight matrix of a named layer."""
        if name == "cl_embedding":
            matrix = self.cl_embedding.weight.data
            if self.activation is not None:
                matrix = self.activation(matrix)
            return matrix
        elif name == "drug_embedding":
            matrix = self.drug_embedding.weight.data
            if self.activation is not None:
                matrix = self.activation(matrix)
            return matrix
        elif name == "pathway_effect":
            return self.ftr_ptw.weight.data
        return None

    def freeze_layer(self, layer_name):
        """Disable gradients for one named layer."""
        assert layer_name in self.layers.keys()
        for param in self.layers[layer_name].parameters():
            param.requires_grad = False

    def freeze_all(self):
        """Disable gradients for every tracked layer."""
        for layer in self.layers.values():
            for param in layer.parameters():
                param.requires_grad = False

    def unfreeze_layer(self, layer_name):
        """Re-enable gradients for one named layer."""
        assert layer_name in self.layers.keys()
        for param in self.layers[layer_name].parameters():
            param.requires_grad = True

    def unfreeze_all(self):
        """Re-enable gradients for every tracked layer."""
        for layer in self.layers.values():
            for param in layer.parameters():
                param.requires_grad = True

    def forward(self, drg_idx, cl_idx):
        drg_idx = drg_idx.long()
        cl_idx = cl_idx.long()
        if self.dvc == "cuda":
            drg_idx = drg_idx.cuda()
            cl_idx = cl_idx.cuda()
        drug_embedding = self.drug_embedding(drg_idx)
        # BUG FIX: the original looked cell lines up in `self.drug_embedding`,
        # so `self.cl_embedding` was never used in the forward pass.
        cl_embedding = self.cl_embedding(cl_idx)
        drug_mask = self.drug_mask[drg_idx, :]
        cl_mask = self.cl_mask[cl_idx, :]
        if self.dvc == "cuda":
            drug_mask = drug_mask.cuda()
            cl_mask = cl_mask.cuda()
        # Out-of-place multiply (the original's in-place `*=` on an autograd
        # intermediate is fragile).
        drug_embedding = drug_embedding * drug_mask
        cl_embedding = cl_embedding * cl_mask
        drug_ftrs = self.ftr_ptw(drug_embedding)
        cl_ftrs = self.ftr_ptw(cl_embedding)
        if self.activation is not None:
            drug_ftrs = self.activation(drug_ftrs)
            cl_ftrs = self.activation(cl_ftrs)
        return drug_ftrs + cl_ftrs

    def configure_optimizers(self):
        lr = self.hparams.get("lr", 0.01)
        optimizer = torch.optim.SGD(self.parameters(), lr=lr)
        return optimizer

    def train_dataloader(self):
        loader = DataLoader(self.train_dataset,
                            batch_size=self.hparams["batch_size"],
                            num_workers=self.hparams["num_workers"],
                            shuffle=True, drop_last=True, pin_memory=True)
        return loader

    def val_dataloader(self):
        loader = DataLoader(self.val_dataset,
                            batch_size=self.hparams["batch_size"],
                            num_workers=self.hparams["num_workers"],
                            shuffle=False, drop_last=True, pin_memory=True)
        return loader

    def _plot_errors(self, errors_list):
        """Plot an optimization error trace."""
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.plot(range(len(errors_list)), errors_list)
        plt.show()

    def _fit_on_subset(self, optimizer, idxs, tol=0.00001, max_iter=1000):
        """Full-batch gradient descent on a fixed sample subset until the
        error change drops below `tol` (or `max_iter` is exceeded)."""
        dcr, ftrs = self.train_dataset[idxs]
        drg = dcr[:, 0].long()
        cl = dcr[:, 1].long()
        # Device-aware target (the original hard-coded .cuda() and crashed on CPU).
        target = ftrs.to(self.dvc)
        errors_list = []
        counter = 0
        while True:
            counter += 1
            pred_ftrs = self(drg, cl)
            error = self.loss_function(pred_ftrs, target)
            error.backward()
            errors_list.append(error.item())
            print("\rIteration {}, Error: {}".format(counter, errors_list[-1]), end="")
            optimizer.step()
            optimizer.zero_grad()
            if len(errors_list) > 2 and np.abs(errors_list[-1] - errors_list[-2]) < tol:
                break
            if counter > max_iter:
                break
        self._plot_errors(errors_list)

    def init_matrices(self, lr=0.01):
        """Three-stage warm start before the Lightning fit:

        1. Fit cl_embedding (+F) on DMSO-control samples, where the drug term
           is masked to zero.
        2. Fit drug_embedding on control cell-line samples, where the
           cell-line term is masked to zero.
        3. Refine F (pathway_effect) alone on the whole training set.
        """
        optimizer = torch.optim.SGD(self.parameters(), lr=lr)

        print("Initializing cl matrix value by using control drug samples only")
        self.unfreeze_all()
        self.freeze_layer("drug_embedding")
        self._fit_on_subset(optimizer, self.train_dataset.get_control_drug_idxs())

        print("Initializing drug matrix value by using control cl only and freezing ftrs")
        self.freeze_all()
        self.unfreeze_layer("drug_embedding")
        # BUG FIX: the original reused get_control_drug_idxs() here; the print
        # statement and the staging logic require the control *cell lines*.
        self._fit_on_subset(optimizer, self.train_dataset.get_control_cl_idxs())

        print("Refine ftrs by freezing the rest on the whole dataset")
        self.freeze_all()
        self.unfreeze_layer("pathway_effect")
        errors_list = []
        counter = 0
        loader = self.train_dataloader()
        while True:
            epoch_error = 0
            for i_batch, batch in enumerate(loader):
                dcr, ftrs = batch
                drg = dcr[:, 0].long()
                cl = dcr[:, 1].long()
                pred_ftrs = self(drg, cl)
                error = self.loss_function(pred_ftrs, ftrs.to(self.dvc))
                error.backward()
                epoch_error += error.item()
            errors_list.append(epoch_error / (i_batch + 1))
            counter += 1
            print("\rIteration {}, Error: {}".format(counter, errors_list[-1]), end="")
            optimizer.step()
            optimizer.zero_grad()
            if len(errors_list) > 2 and np.abs(errors_list[-1] - errors_list[-2]) < 0.00001:
                break
            if counter > 1000:
                break
        self._plot_errors(errors_list)

    def training_step(self, batch, batch_idx):
        dcr, ftrs = batch
        drg = dcr[:, 0].long()
        cl = dcr[:, 1].long()
        pred_ftrs = self(drg, cl)
        train_loss = self.loss_function(pred_ftrs, ftrs.to(self.dvc))
        # L1 sparsity penalties on the pathway-effect and drug matrices.
        ftr_layer_params = torch.cat([x.view(-1) for x in self.ftr_ptw.parameters()])
        l1_ftr = self.lambda_ftr * torch.norm(ftr_layer_params, 1)
        drg_layer_params = torch.cat([x.view(-1) for x in self.drug_embedding.parameters()])
        l1_drg = self.lambda_drg * torch.norm(drg_layer_params, 1)
        return {"loss": train_loss + l1_ftr + l1_drg,
                "l1_ftr": l1_ftr, "l1_drg": l1_drg}

    def validation_step(self, batch, batch_idx):
        dcr, ftrs = batch
        drg = dcr[:, 0].long()
        cl = dcr[:, 1].long()
        pred_ftrs = self(drg, cl)
        val_loss = self.loss_function(pred_ftrs, ftrs.to(self.dvc))
        ftr_layer_params = torch.cat([x.view(-1) for x in self.ftr_ptw.parameters()])
        l1_ftr = self.lambda_ftr * torch.norm(ftr_layer_params, 1)
        drg_layer_params = torch.cat([x.view(-1) for x in self.drug_embedding.parameters()])
        l1_drg = self.lambda_drg * torch.norm(drg_layer_params, 1)
        return {"loss": val_loss, "l1_ftr": l1_ftr, "l1_drg": l1_drg}

    def validation_epoch_end(self, outputs):
        if outputs:
            avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
            avg_l1_ftr_loss = torch.stack([x['l1_ftr'] for x in outputs]).mean()
            avg_l1_drg_loss = torch.stack([x['l1_drg'] for x in outputs]).mean()
            tensorboard_logs = {'val_loss': avg_loss,
                                "l1_ftr": avg_l1_ftr_loss,
                                "l1_drg": avg_l1_drg_loss}
            return {'val_loss': avg_loss, 'log': tensorboard_logs}
        return {}

# In[88]:

# NOTE(review): in the notebook this cell ran after the model was created;
# in script order `mf_model` does not exist yet (NameError), so it is
# commented out here.
# list(mf_model.drug_embedding.parameters())

# In[116]:

model_config = {"lr": 0.001, "batch_size": 32, "num_workers": 8,
                "activation": "Tanh", "lambda_ftr": 0.00001, "lambda_drg": 0.01}
train_ftrs_ds = FeaturesDataset(exp_df, None, train=True, standardize=True)
val_ftrs_ds = FeaturesDataset(exp_df, None, train=False, standardize=True)
device = device_("cuda" if torch.cuda.is_available() else "cpu")
mf_model = MF_Model(cl_pathways_mask, drug_pathways_mask, model_config,
                    train_ftrs_ds, val_ftrs_ds,
                    dvc=device.type).to(device).double()

# In[97]:

mf_model.init_matrices()

# In[63]:

pretrain_cl_emb = mf_model.get_matrix("cl_embedding").cpu().numpy().astype(float).copy()
#fig, ax = plt.subplots(figsize=(10,10))
#sns.heatmap(pretrain_cl_emb, ax=ax)
# Pre-training cell-line embedding (clustered on rows only).
gs = sns.clustermap(pretrain_cl_emb, col_cluster=False, method='centroid',
                    cmap="coolwarm", vmin=-1, vmax=1)
#gs.ax_heatmap.set_yticklabels(dm.cell_line_df["mutation"])
plt.show()

# In[64]:

pretrain_drug_emb = mf_model.get_matrix("drug_embedding").cpu().numpy().astype(float).copy()
#fig, ax = plt.subplots(figsize=(10,10))
#sns.heatmap(pretrain_drug_emb, ax=ax)
# with sns.color_palette("PuBuGn_d"):
#     sns.set_palette("husl")
sns.clustermap(pretrain_drug_emb, col_cluster=False, method='centroid',
               cmap="coolwarm", vmin=-1, vmax=1)
plt.show()

# In[65]:

pretrain_ftr_pat = mf_model.get_matrix("pathway_effect").cpu().numpy().astype('float').copy()
np.max(pretrain_ftr_pat)
# BUG FIX: the original computed np.abs(np.min(np.max(pretrain_ftr_pat)));
# np.min of a scalar is a no-op, so the colour limit ignored the matrix
# minimum entirely.  Use the true minimum for a symmetric scale.
lim = max(np.abs(np.max(pretrain_ftr_pat)), np.abs(np.min(pretrain_ftr_pat)))
#fig, ax = plt.subplots(figsize=(18,28))
sns.clustermap(pretrain_ftr_pat, col_cluster=False, cmap="coolwarm",
               vmin=-lim, vmax=lim)
plt.show()

# In[66]:

fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(pretrain_ftr_pat.flatten(), bins=30)
plt.show()

# In[117]:

# SECURITY NOTE(review): the Neptune API key is hard-coded below; it should be
# rotated and loaded from an environment variable (e.g. NEPTUNE_API_TOKEN)
# instead of being committed to source control.
neptune_logger = loggers.NeptuneLogger(
    api_key="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiYTNiN2FlZWQtODk2Yy00NmE5LThmMGItYWUxZmViYmE5OGQ4In0=",
    project_name="gvisona/idr0017",
    params=model_config,
    experiment_name="MF")

# Fix all seeds / cuDNN flags for a reproducible run.
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(42)

early_stopping_cb = EarlyStopping('val_loss', patience=5)
# mf_model = mf_model.cuda()
trainer = Trainer(gpus=1, max_epochs=100, logger=[neptune_logger],
                  callbacks=[early_stopping_cb])
trainer.fit(mf_model)

# In[134]:

# Post-training cell-line embedding.
cl_emb = mf_model.get_matrix("cl_embedding").cpu().numpy().astype(float)
#fig, ax = plt.subplots(figsize=(10,10))
#sns.heatmap(cl_emb, ax=ax)
gs = sns.clustermap(cl_emb, col_cluster=False, method='centroid',
                    cmap="coolwarm", vmin=-1, vmax=1)
#gs.ax_heatmap.set_yticklabels(dm.cell_line_df["mutation"])
plt.show()

# In[135]:
drug_emb = mf_model.get_matrix("drug_embedding").cpu().numpy().astype(float).copy()
#fig, ax = plt.subplots(figsize=(10,10))
#sns.heatmap(drug_emb, ax=ax)
sns.clustermap(drug_emb, col_cluster=False, method='centroid',
               cmap="coolwarm", vmin=-1, vmax=1)
plt.show()

# In[136]:

ftr_pat = mf_model.get_matrix("pathway_effect").cpu().numpy().astype('float').copy()
#fig, ax = plt.subplots(figsize=(18,28))
# BUG FIX: the original used np.abs(np.min(np.max(ftr_pat))), which equals
# np.abs(np.max(ftr_pat)) and ignores the matrix minimum; use the true
# minimum so the colour scale is genuinely symmetric.
lim = max(np.abs(np.max(ftr_pat)), np.abs(np.min(ftr_pat)))
#fig, ax = plt.subplots(figsize=(18,28))
sns.clustermap(ftr_pat, col_cluster=False, cmap="coolwarm", vmin=-lim, vmax=lim)
plt.show()

# In[137]:

np.save("cl_emb-idr59.npy", cl_emb)
np.save("drug_emb-idr59.npy", drug_emb)
np.save("ftr_pat-idr59.npy", ftr_pat)

# In[138]:

sns.heatmap(ftr_pat, cmap="coolwarm")

# In[133]:

# Standardize the full dataset with the scaler fitted on the training split,
# so reconstruction errors are measured in the model's input space.
transformed_data = train_ftrs_ds.scaler.transform(exp_df.values[:, 3:])
test_df = pd.DataFrame(np.concatenate((exp_df.values[:, :3], transformed_data), axis=1),
                       columns=dm.data.columns)
test_df["drug"] = test_df["drug"].astype(int)
test_df["cell_line"] = test_df["cell_line"].astype(int)
test_df["replicate"] = test_df["replicate"].astype(int)
reconstr_ds = FeaturesDataset(test_df, train=None)

# In[170]:

# Absolute and relative reconstruction errors over the whole dataset.
# Per-sample tensors are collected in lists and concatenated once — the
# original grew the result with torch.cat inside the loop (quadratic) and
# hard-coded .cuda(), which breaks CPU-only runs.
residual_chunks = []
relative_chunks = []
mf_model = mf_model.to(device)
for dcr, ftrs in reconstr_ds:
    d, c, r = dcr.long()
    target = ftrs.to(device)
    reconstr_ftrs = mf_model(torch.LongTensor([d]).to(device),
                             torch.LongTensor([c]).to(device))
    res = torch.abs(reconstr_ftrs - target)
    # NOTE(review): the 1e-5 offset avoids division by zero but can flip the
    # sign for small negative feature values — confirm this is intended.
    rel_err = torch.div(res, target + 1e-5)
    residual_chunks.append(res)
    relative_chunks.append(rel_err)
residuals = torch.cat(residual_chunks, axis=0)
relative_error = torch.cat(relative_chunks, axis=0)

# In[180]:

residuals_array = residuals.cpu().detach().numpy().flatten()
fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(residuals_array, bins=3000)
plt.show()

# In[181]:

fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(x=residuals_array, ax=ax)

# In[187]:

# Fraction of residuals flagged as boxplot outliers ("fliers").
outliers = [y for stat in boxplot_stats(residuals_array) for y in stat['fliers']]
len(outliers)/len(residuals_array)

# In[186]:

min(outliers)

# In[189]:

rel_residuals_array = relative_error.cpu().detach().numpy().flatten()
fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(rel_residuals_array)
plt.show()

# In[190]:

fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(x=rel_residuals_array, ax=ax)

# In[ ]: