import pandas as pd
import scipy
import numpy as np
import scipy.sparse as sp
import scipy.io as spio
from scipy.stats import pearsonr
import operator
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib as mpl
from matplotlib.text import TextPath
from matplotlib.patches import PathPatch, Rectangle
from matplotlib.font_manager import FontProperties
from matplotlib import gridspec
from matplotlib.ticker import FormatStrFormatter
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib.collections as collections
import isolearn.io as isoio
from analyze_aparent_designed_mpra_helpers import *
#Load designed MPRA data (Collapsed over experiment replicates)
seq_dict = isoio.load('../data/prepared_data/apa_array_data/apa_array_data_master_seq')
print("n = " + str(len(seq_dict['array_df'])))
print('Avg read count (avg over barcodes) = ' + str(np.mean(seq_dict['array_df'].query("n_barcodes >= 2")['mean_total_count'])))
print('Pooled read count (pooled over barcodes) = ' + str(np.mean(seq_dict['array_df'].query("n_barcodes >= 2")['pooled_total_count'])))
#Load variant dataframe
variant_dict = isoio.load('../data/prepared_data/apa_variant_data/apa_variant_data_master_seq')
print("n (variants) = " + str(len(variant_dict['variant_df'])))
#Load predictions
model_name = 'aparent_theano_legacy_30_31_34_pasaligned'
pred_dict = isoio.load('predictions/apa_array_data_legacy/' + model_name + '_predictions_master_seq')
#Join mpra dataframes with prediction table and calculate cut probabilities
seq_df, variant_df = append_predictions(seq_dict['array_df'], seq_dict['pooled_cuts'], variant_dict['variant_df'], variant_dict['pooled_cuts_var'], variant_dict['pooled_cuts_ref'], pred_dict['array_df'], pred_dict['cut_prob'])
n = 39833 Avg read count (avg over barcodes) = 431.92676310713546 Pooled read count (pooled over barcodes) = 3656.5562880452626 n (variants) = 21734
unique_experiments = seq_df.query("experiment == 'max_cut'")['subexperiment'].unique()
seq_df_filtered = seq_df.query("mean_total_count >= 100 and n_barcodes >= 5")
cut_df_filtered = seq_df.query("experiment == 'max_cut' and mean_total_count >= 10").copy()
#Map and evaluate sequences
def eval_logloss(cut_prob, target_pos) :
target = np.zeros(cut_prob.shape[0])
target[target_pos] = 0.8
target[target_pos - 1] = 0.10
target[target_pos + 1] = 0.10
ent = -np.dot(target, np.log((cut_prob + 0.001) / np.sum(cut_prob + 0.001)))
return ent
start_pos = 58
end_pos = 108
cut_poses = np.arange(end_pos - start_pos) + (start_pos - 56)
true_cut_column = 'cut_prob_true'
pred_cut_column = 'cut_prob_pred'
avg_position_true = []
avg_position_pred = []
cut_probs_true = []
cut_probs_pred = []
log_loss = []
for index, row in cut_df_filtered.iterrows() :
cut_prob_true = np.ravel(row[true_cut_column])
cut_prob_true = cut_prob_true[start_pos:end_pos] / np.sum(cut_prob_true[start_pos:end_pos])
cut_probs_true.append(cut_prob_true)
cut_prob_pred = np.ravel(row[pred_cut_column])
cut_prob_pred = cut_prob_pred[start_pos:end_pos] / np.sum(cut_prob_pred[start_pos:end_pos])
cut_probs_pred.append(cut_prob_pred)
avg_pos_true = np.dot(cut_prob_true, (np.arange(cut_prob_true.shape[0]) + start_pos))
avg_pos_pred = np.dot(cut_prob_pred, (np.arange(cut_prob_pred.shape[0]) + start_pos))
avg_position_true.append(avg_pos_true)
avg_position_pred.append(avg_pos_pred)
cut_pos_caught = False
for cut_pos in [60, 65, 70, 75, 80, 85, 90, 95, 100] :
if str(cut_pos) in row['subexperiment'] :
log_loss.append(eval_logloss(np.copy(cut_prob_true), cut_pos - 55 - (start_pos - 56)))
cut_pos_caught = True
break
if not cut_pos_caught :
log_loss.append(0)
cut_df_filtered['proxcut_prob_true'] = cut_probs_true
cut_df_filtered['proxcut_prob_pred'] = cut_probs_pred
cut_df_filtered['avgpos_true'] = avg_position_true
cut_df_filtered['avgpos_pred'] = avg_position_pred
cut_df_filtered['loss_logloss'] = log_loss
cut_snv_df_filtered = cut_df_filtered.query("variant == 'snv'").copy()
cut_df_filtered = cut_df_filtered.query("variant != 'snv'").copy()
print("Total number of Max Cut sequences = " + str(len(cut_df_filtered)))
df_human = seq_df_filtered.query("experiment == 'clinvar_wt' and variant == 'wt'")
human_cutprobs = np.array(df_human[true_cut_column].values.tolist())[:, start_pos:end_pos]
human_cutprobs = human_cutprobs / np.sum(human_cutprobs, axis=1).reshape(-1, 1)
human_cutprob = np.mean(human_cutprobs, axis=0)
objective_poses = [5, 10, 15, 20, 25, 30, 35, 40, 45]
Total number of Max Cut sequences = 1274
save_figs = False
print_verbose = False
cut_to_experiment = {}
pos_shift = 55
for cut_pos in [60, 65, 70, 75, 80, 85, 90, 95, 100] :
cut_to_experiment[cut_pos - pos_shift] = []
for unique_experiment in unique_experiments :
if str(cut_pos) in unique_experiment :
cut_to_experiment[cut_pos - pos_shift].append(unique_experiment)
#Detailed view per subexperiment
fig, ax = plt.subplots(1, len(cut_to_experiment), figsize=(1.5 * len(cut_to_experiment), 6))
subexperiment_blacklist = ['ent']
cut_pos_i = None
cut_pos = None
obj_pos_i = 0
for obj_pos, _ in sorted(cut_to_experiment.items(), key=lambda kv: kv[0]) :
cut_result_dict = {}
total_n_obj_pos_seqs = 0
for subexperiment in cut_to_experiment[obj_pos] :
n_subexp_seqs = len(np.nonzero(cut_df_filtered['subexperiment'] == subexperiment)[0])
if print_verbose :
print("Objective position = " + str(obj_pos) + ", subexperiment = " + subexperiment + ", # of sequences = " + str(n_subexp_seqs))
total_n_obj_pos_seqs += n_subexp_seqs
blacklisted = False
for subexp_keyword in subexperiment_blacklist :
if subexp_keyword in subexperiment :
blacklisted = True
break
if blacklisted :
continue
keep_index = np.nonzero(cut_df_filtered['subexperiment'] == subexperiment)[0]
prox_prob = np.array(cut_df_filtered.iloc[keep_index]['proxcut_prob_true'].values.tolist())
sort_index = np.argsort(np.ravel(cut_df_filtered.iloc[keep_index]['loss_logloss'].values))[::-1]
prox_prob = prox_prob[sort_index, :]
cut_result_dict[subexperiment] = {}
cut_result_dict[subexperiment]['cut_probs'] = prox_prob
cut_result_dict[subexperiment]['loss'] = np.mean(cut_df_filtered.iloc[keep_index]['loss_logloss'].values)
if print_verbose :
print("Objective position = " + str(obj_pos) + ", Total # of sequences = " + str(total_n_obj_pos_seqs))
cut_probs = []
cut_label_coords = []
prev_label_coords = [0]
cut_labels = []
objective_pos = 0
for subexperiment, vals in sorted(cut_result_dict.items(), key=lambda kv: kv[1]['loss'])[::-1] :
prox_prob = vals['cut_probs']
objective_pos = int(subexperiment.split('_')[-1]) - 55 + 10
prox_prob = np.concatenate([np.zeros((prox_prob.shape[0], 10)), prox_prob, np.zeros((prox_prob.shape[0], 10))], axis=1)
cut_probs.append(prox_prob[:, objective_pos - 10 - cut_poses[0]: objective_pos + 11 - cut_poses[0]])
cut_label_coords.append(prev_label_coords[-1] + float(prox_prob.shape[0]) / 2.)
prev_label_coords.append(prev_label_coords[-1] + float(prox_prob.shape[0]))
cut_labels.append('_'.join(subexperiment.split('_')[:-1]))
ax[obj_pos_i].axhline(y=prev_label_coords[-1], color='black', linewidth=2, linestyle='--')
cut_probs = np.vstack(cut_probs)
ax[obj_pos_i].imshow(cut_probs, cmap='Greens', vmin=0.05, vmax=0.3, aspect='auto')
ax[obj_pos_i].axvline(x=10, color='orange', linewidth=2, linestyle='--', alpha=0.2)
ax[obj_pos_i].set_xticks([0, 10, 20])
ax[obj_pos_i].set_xticklabels([objective_pos - 20, objective_pos - 10, objective_pos + 1], fontsize=8, fontweight='bold', ha='center', va='top')
ax[obj_pos_i].set_yticks(cut_label_coords)
ax[obj_pos_i].set_yticklabels(cut_labels, fontsize=8, fontweight='bold', rotation=90, ha='right', va='center')
obj_pos_i += 1
plt.tight_layout()
if save_figs :
plt.savefig('cut_subexperiment_map.png', transparent=True, dpi=150)
plt.savefig('cut_subexperiment_map.svg')
plt.savefig('cut_subexperiment_map.eps')
plt.show()
/home/johli/anaconda3/envs/aparent/lib/python3.6/site-packages/numpy/core/fromnumeric.py:56: FutureWarning: Series.nonzero() is deprecated and will be removed in a future version.Use Series.to_numpy().nonzero() instead return getattr(obj, method)(*args, **kwds)
#Mean statistics over all subexperiments
cut_df_to_use = cut_df_filtered.loc[cut_df_filtered.subexperiment.str.contains('GGCC') | cut_df_filtered.subexperiment.str.slice(start=-2).isin(['60', '65', '70', '75', '80'])]
plot_cut_profile(cut_df_to_use, cut_to_experiment, cut_poses, objective_poses, human_cutprob, figsize=(8, 5.5), save_fig_name=None, fig_dpi=150, plot_mode='mean', n_samples=None)#save_fig_name='cut_profile_all_subexps'
plot_cut_map(cut_df_to_use, cut_to_experiment, cut_poses, objective_poses, human_cutprob, figsize=(4, 6), save_fig_name=None, fig_dpi=150, plot_mode='mean', n_samples=100)#save_fig_name='cut_map_all_subexps'
plot_position_scatter(cut_df_to_use, cut_to_experiment, cut_poses, objective_poses, human_cutprob, variant_filter="variant == 'wt'", figsize=(5, 5), save_fig_name=None, fig_dpi=150)#save_fig_name='avgcut_pred_vs_true_all_subexps'
#Mean and Max statistics over A subexperiments
cut_nt = 'A'
aruns_objective = ''
ggcc_objective = '_GGCC'
cut_to_subexperiment = {
5 : [cut_nt + aruns_objective + '_60', cut_nt + aruns_objective + '_ent_60'],
10 : [cut_nt + aruns_objective + '_65', cut_nt + aruns_objective + '_ent_65'],
15 : [cut_nt + aruns_objective + '_70', cut_nt + aruns_objective + '_ent_70'],
20 : [cut_nt + aruns_objective + '_75', cut_nt + aruns_objective + '_ent_75'],
25 : [cut_nt + aruns_objective + '_80', cut_nt + aruns_objective + '_ent_80'],
30 : [cut_nt + ggcc_objective + aruns_objective + '_85', cut_nt + ggcc_objective + aruns_objective + '_ent_85'],
35 : [cut_nt + ggcc_objective + aruns_objective + '_90', cut_nt + ggcc_objective + aruns_objective + '_ent_90'],
40 : [cut_nt + ggcc_objective + aruns_objective + '_95', cut_nt + ggcc_objective + aruns_objective + '_ent_95'],
45 : [cut_nt + ggcc_objective + aruns_objective + '_100', cut_nt + ggcc_objective + aruns_objective + '_ent_100'],
}
print('Mean cut profile')
plot_cut_profile(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(8, 5.5), save_fig_name=None, fig_dpi=150, plot_mode='mean', n_samples=None)#save_fig_name='cut_profile_subexp_a_mean'
print('Max cut profile')
plot_cut_profile(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(8, 5.5), save_fig_name=None, fig_dpi=150, plot_mode='max', n_samples=5)#save_fig_name='cut_profile_subexp_a_max'
print('Mean cut map')
plot_cut_map(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(4, 6), save_fig_name=None, fig_dpi=150, plot_mode='mean', n_samples=100)#save_fig_name='cut_map_all_subexp_a_mean'
print('Max cut map')
plot_cut_map(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(4, 6), save_fig_name=None, fig_dpi=150, plot_mode='max', n_samples=25)#save_fig_name='cut_map_all_subexp_a_max'
plot_position_scatter(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, variant_filter="variant == 'wt' or variant == 'sampled'", figsize=(5, 5), save_fig_name=None, fig_dpi=150)#save_fig_name='avgcut_pred_vs_true_subexp_a'
Mean cut profile
Max cut profile
Mean cut map
Max cut map
#Mean and Max statistics over AT subexperiments
cut_nt = 'AT'
aruns_objective = ''
ggcc_objective = '_GGCC'
cut_to_subexperiment = {
5 : [cut_nt + aruns_objective + '_60', cut_nt + aruns_objective + '_ent_60'],
10 : [cut_nt + aruns_objective + '_65', cut_nt + aruns_objective + '_ent_65'],
15 : [cut_nt + aruns_objective + '_70', cut_nt + aruns_objective + '_ent_70'],
20 : [cut_nt + aruns_objective + '_75', cut_nt + aruns_objective + '_ent_75'],
25 : [cut_nt + aruns_objective + '_80', cut_nt + aruns_objective + '_ent_80'],
30 : [cut_nt + ggcc_objective + aruns_objective + '_85', cut_nt + ggcc_objective + aruns_objective + '_ent_85'],
35 : [cut_nt + ggcc_objective + aruns_objective + '_90', cut_nt + ggcc_objective + aruns_objective + '_ent_90'],
40 : [cut_nt + ggcc_objective + aruns_objective + '_95', cut_nt + ggcc_objective + aruns_objective + '_ent_95'],
45 : [cut_nt + ggcc_objective + aruns_objective + '_100', cut_nt + ggcc_objective + aruns_objective + '_ent_100'],
}
print('Mean cut profile')
plot_cut_profile(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(8, 5.5), save_fig_name=None, fig_dpi=150, plot_mode='mean', n_samples=None)#save_fig_name='cut_profile_subexp_at_mean'
print('Max cut profile')
plot_cut_profile(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(8, 5.5), save_fig_name=None, fig_dpi=150, plot_mode='max', n_samples=5)#save_fig_name='cut_profile_subexp_at_max'
print('Mean cut map')
plot_cut_map(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(4, 6), save_fig_name=None, fig_dpi=150, plot_mode='mean', n_samples=100)#save_fig_name='cut_map_subexp_at_mean'
print('Max cut map')
plot_cut_map(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(4, 6), save_fig_name=None, fig_dpi=150, plot_mode='max', n_samples=25)#save_fig_name='cut_map_subexp_at_max'
plot_position_scatter(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, variant_filter="variant == 'wt' or variant == 'sampled'", figsize=(5, 5), save_fig_name=None, fig_dpi=150)#save_fig_name='avgcut_pred_vs_true_subexp_at'
Mean cut profile
Max cut profile
Mean cut map
Max cut map
#Mean and Max statistics over A aruns subexperiments
cut_nt = 'A'
aruns_objective = '_aruns'
ggcc_objective = '_GGCC'
cut_to_subexperiment = {
5 : [cut_nt + aruns_objective + '_60', cut_nt + aruns_objective + '_ent_60'],
10 : [cut_nt + aruns_objective + '_65', cut_nt + aruns_objective + '_ent_65'],
15 : [cut_nt + aruns_objective + '_70', cut_nt + aruns_objective + '_ent_70'],
20 : [cut_nt + aruns_objective + '_75', cut_nt + aruns_objective + '_ent_75'],
25 : [cut_nt + aruns_objective + '_80', cut_nt + aruns_objective + '_ent_80'],
30 : [cut_nt + ggcc_objective + aruns_objective + '_85', cut_nt + ggcc_objective + aruns_objective + '_ent_85'],
35 : [cut_nt + ggcc_objective + aruns_objective + '_90', cut_nt + ggcc_objective + aruns_objective + '_ent_90'],
40 : [cut_nt + ggcc_objective + aruns_objective + '_95', cut_nt + ggcc_objective + aruns_objective + '_ent_95'],
45 : [cut_nt + ggcc_objective + aruns_objective + '_100', cut_nt + ggcc_objective + aruns_objective + '_ent_100'],
}
print('Mean cut profile')
plot_cut_profile(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(8, 5.5), save_fig_name=None, fig_dpi=150, plot_mode='mean', n_samples=None)#save_fig_name='cut_profile_subexp_aruns_mean'
print('Max cut profile')
plot_cut_profile(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(8, 5.5), save_fig_name=None, fig_dpi=150, plot_mode='max', n_samples=5)#save_fig_name='cut_profile_subexp_aruns_max'
print('Mean cut map')
plot_cut_map(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(4, 6), save_fig_name=None, fig_dpi=150, plot_mode='mean', n_samples=100)#save_fig_name='cut_map_subexp_aruns_mean'
print('Max cut map')
plot_cut_map(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, figsize=(4, 6), save_fig_name=None, fig_dpi=150, plot_mode='max', n_samples=25)#save_fig_name='cut_map_subexp_aruns_max'
plot_position_scatter(cut_df_filtered, cut_to_subexperiment, cut_poses, objective_poses, human_cutprob, variant_filter="variant == 'wt' or variant == 'sampled'", figsize=(5, 5), save_fig_name=None, fig_dpi=150)#save_fig_name='avgcut_pred_vs_true_subexp_aruns'
Mean cut profile
Max cut profile
Mean cut map
Max cut map