from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model
from keras import backend as K
import tensorflow as tf
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import aparent.visualization as vis
from aparent.predictor import *
import urllib
import urllib.request
import pickle
from time import sleep
from scipy.stats import ttest_ind
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression
from scipy.optimize import minimize
Using TensorFlow backend.
df = pd.read_csv('../../../aparent/data/polyadb_features_pas_3_large.csv', sep='\t')
save_dict = np.load("../../../aparent/data/polyadb_features_pas_3_large_no_x.npz")
m, l = save_dict['m'], save_dict['l']
s = np.load('../predictions/apa_polyadb_data/aparent_all_libs_resnet_no_clinvar_wt_ep_5_native_scores_large.npy')
#Mark intronic and a-rich sites
intron_mask = np.zeros(m.shape)
a_rich = np.zeros(m.shape)
for k in range(m.shape[1]) :
intron_mask[:, k] = np.array((df['pas_exists_' + str(k)] & df['site_type_' + str(k)].isin(['Intron', 'Internal_exon'])).astype(int).values, dtype=np.int32)
a_rich[:, k] = np.array((df['pas_exists_' + str(k)] & df['wide_seq_ext_' + str(k)].str.slice(175-70, 175-70+205).str.contains("AAAAAAAAA|AAAAAGAAAAA|AAAAACAAAAA|AAAAATAAAAA|AAAAAAGAAAA|AAAAAACAAAA|AAAAAATAAAA|AAAAGAAAAAA|AAAACAAAAAA|AAAATAAAAAA")).astype(int).values, dtype=np.int32)
print("s.shape = " + str(s.shape))
print("m.shape = " + str(m.shape))
print("l.shape = " + str(l.shape))
print("intron_mask.shape = " + str(intron_mask.shape))
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (305,306,308,312,313,315,319,320,322,326,327,329,333,334,336,340,341,343,347,348,350) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
s.shape = (15106, 50) m.shape = (15106, 50) l.shape = (15106, 50) intron_mask.shape = (15106, 50)
#Apply filters
min_total_l = 1
max_total_l = 1e9
min_l = np.min(l + (l == 0.) * 1e6, axis=-1)
max_l = np.max(l, axis=-1)
temp_dist_index = np.array([np.nonzero(m[i, :])[0][-1] for i in range(m.shape[0])])
keep_index = np.nonzero((intron_mask[np.arange(m.shape[0]), temp_dist_index] == 0.) & ((min_l >= min_total_l) & (max_l <= max_total_l)))[0]
s = s[keep_index, :]
m = m[keep_index, :]
l = l[keep_index, :]
intron_mask = intron_mask[keep_index, :]
a_rich = a_rich[keep_index, :]
print("s.shape = " + str(s.shape))
print("m.shape = " + str(m.shape))
print("l.shape = " + str(l.shape))
print("intron_mask.shape = " + str(intron_mask.shape))
print("a_rich.shape = " + str(a_rich.shape))
print("# non-a-rich proximal sites = " + str(int(np.sum(1. - a_rich[:, 0]))))
s.shape = (14666, 50) m.shape = (14666, 50) l.shape = (14666, 50) intron_mask.shape = (14666, 50) a_rich.shape = (14666, 50) # non-a-rich proximal sites = 13683
l_prox_cumulative = np.log(np.cumsum(l[:, ::-1], axis=1) * m[:, ::-1] + 1.)[:, ::-1]
l_cumulative = np.log(np.cumsum(l, axis=1) * m + 1.)
l = np.log(l * m + 1.)
prox_index = np.array([np.nonzero(m[i, :])[0][0] for i in range(m.shape[0])])
dist_index = np.array([np.nonzero(m[i, :])[0][-1] for i in range(m.shape[0])])
prox_mask = np.zeros(m.shape)
for i in range(m.shape[0]) :
prox_mask[i, prox_index[i]] = 1.
dist_mask = np.zeros(m.shape)
for i in range(m.shape[0]) :
dist_mask[i, dist_index[i]] = 1.
#Proximal-most vs Distal-most PAS scores (3' UTR)
s_utr3_dists = []
s_utr3_dist_means = []
s_utr3_dist_medians = []
s_utr3_dist_stds = []
for k in range(0, 30) :
s_d = []
for i in range(s.shape[0]) :
if dist_index[i] - k >= 0 and intron_mask[i, dist_index[i] - k] == 0 and a_rich[i, dist_index[i] - k] == 0 :
s_d.append(s[i, dist_index[i] - k] / np.log(2.))
s_utr3_dists.append(np.array(s_d))
if s_utr3_dists[-1].shape[0] > 1 :
s_utr3_dist_means.append(np.mean(s_utr3_dists[-1]))
s_utr3_dist_medians.append(np.median(s_utr3_dists[-1]))
s_utr3_dist_stds.append(np.std(s_utr3_dists[-1]))
else :
s_utr3_dist_means.append(0.)
s_utr3_dist_medians.append(0.)
s_utr3_dist_stds.append(0.)
s_utr3_dist_means = np.array(s_utr3_dist_means)
s_utr3_dist_medians = np.array(s_utr3_dist_medians)
s_utr3_dist_stds = np.array(s_utr3_dist_stds)
#Proximal-most vs Distal-most PAS scores (Introns)
s_intron_dists = []
s_intron_dist_means = []
s_intron_dist_medians = []
s_intron_dist_stds = []
for k in range(0, 30) :
s_d = []
for i in range(s.shape[0]) :
if dist_index[i] - k >= 0 and intron_mask[i, dist_index[i] - k] == 1 and a_rich[i, dist_index[i] - k] == 0 :
s_d.append(s[i, dist_index[i] - k] / np.log(2.))
s_intron_dists.append(np.array(s_d))
if s_intron_dists[-1].shape[0] > 1 :
s_intron_dist_means.append(np.mean(s_intron_dists[-1]))
s_intron_dist_medians.append(np.median(s_intron_dists[-1]))
s_intron_dist_stds.append(np.std(s_intron_dists[-1]))
else :
s_intron_dist_means.append(0.)
s_intron_dist_medians.append(0.)
s_intron_dist_stds.append(0.)
s_intron_dist_means = np.array(s_intron_dist_means)
s_intron_dist_medians = np.array(s_intron_dist_medians)
s_intron_dist_stds = np.array(s_intron_dist_stds)
s_dists = []
s_dist_colors = []
s_dist_names = []
for i, [s_utr3, s_intron] in enumerate(zip(s_utr3_dists[::-1], s_intron_dists[::-1])) :
s_dists.append(s_intron)
s_dists.append(s_utr3)
s_dist_colors.append('deepskyblue')
s_dist_colors.append('lightcoral')
s_dist_names.append("-" + str(len(s_utr3_dists)-i-1) if i != len(s_utr3_dists) - 1 else "Distal PAS")
s_dist_names.append("")
import seaborn as sns
f = plt.figure(figsize=(8, 4))
sns.stripplot(data=s_dists, alpha=0.05, jitter=0.25, palette=s_dist_colors)
plt.errorbar(np.arange(0, 58, 2), s_intron_dist_medians[::-1][:-1], yerr=s_intron_dist_stds[::-1][:-1], linewidth=3, color='blue', linestyle='-', label='Intron')
plt.errorbar(np.arange(1, 61, 2), s_utr3_dist_medians[::-1], yerr=s_utr3_dist_stds[::-1], linewidth=3, color='red', linestyle='-', label='UTR3')
plt.xlim(-0.5, 59.5)
#plt.ylim(-8., 4.)
plt.ylim(-11., 5.)
plt.xticks(
np.arange(60),
s_dist_names, fontsize=12, rotation=45
)
plt.yticks(fontsize=12)
plt.ylabel("APARENT2 Score", fontsize=12)
plt.legend(fontsize=12)
plt.tight_layout()
plt.show()
#Correlation between distance to next PAS and wildtype PAS score
from scipy.stats import spearmanr
lf = l[a_rich[:, 0] == 0, ...]
sf = s[a_rich[:, 0] == 0, ...]
print(spearmanr(lf[:, 1], sf[:, 0]))
x_vals = lf[:, 1] / np.log(10)
y_vals = sf[:, 0] / np.log(2.)
bins = [
[2.5, 3.0],
[3.0, 3.5],
[3.5, 4.0],
[4.0, 4.5],
[4.5, 5.0],
[5.0, 5.5],
]
colors = [
'#2f22d8',
'#9c00b8',
'#ce0091',
'#e80068',
'#f20041',
'#ee1919',
]
bin_names = []
y_dists = []
for i, [bin_start, bin_end] in enumerate(bins) :
y_dists.append(y_vals[(x_vals >= bin_start) & (x_vals < bin_end)])
if i == 0 :
bin_names.append("d < " + "10^" + str(bin_end) + "bp")
else :
bin_names.append(">= d < " + "10^" + str(bin_end) + "bp")
import seaborn as sns
f = plt.figure(figsize=(8, 4))
sns.stripplot(data=y_dists, alpha=0.1, jitter=0.25, palette=colors)
sns.boxplot(data=y_dists, showfliers=False, palette=colors)
plt.xlim(-0.5, 5.5)
plt.ylim(-11.5, 5.0)
plt.xticks(
np.arange(6),
bin_names, fontsize=12, rotation=30
)
plt.yticks(fontsize=12)
plt.ylabel("APARENT2 Score", fontsize=12)
plt.tight_layout()
plt.show()
SpearmanrResult(correlation=0.3358648988111347, pvalue=0.0)
#Distribution of distance between proximal-most and next PAS (intronic sites only)
lf = l[(a_rich[:, 0] == 0) & ((intron_mask[:, 0] == 1) & (intron_mask[:, 1] == 1)), ...]
x_vals = lf[:, 1] / np.log(10)
f = plt.figure(figsize=(4, 4))
plt.hist(x_vals, bins=40, range=(1.5, 5.5), color='deepskyblue')
plt.xlim(1.5, 5.5)
plt.xticks([2, 3, 4, 5], ["10^2", "10^3", "10^4", "10^5"], fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel("Distance to next PAS", fontsize=12)
plt.ylabel("# Genes", fontsize=12)
plt.tight_layout()
plt.show()
#Correlation between distance to next PAS and wildtype PAS score (intronic sites only)
from scipy.stats import spearmanr
lf = l[(a_rich[:, 0] == 0) & ((intron_mask[:, 0] == 1) & (intron_mask[:, 1] == 1)), ...]
sf = s[(a_rich[:, 0] == 0) & ((intron_mask[:, 0] == 1) & (intron_mask[:, 1] == 1)), ...]
print(spearmanr(lf[:, 1], sf[:, 0]))
x_vals = lf[:, 1] / np.log(10)
y_vals = sf[:, 0] / np.log(2.)
bins = [
[2.5, 3.0],
[3.0, 3.5],
[3.5, 4.0],
[4.0, 4.5],
[4.5, 5.0],
[5.0, 5.5],
]
colors = [
'#2f22d8',
'#9c00b8',
'#ce0091',
'#e80068',
'#f20041',
'#ee1919',
]
bin_names = []
y_dists = []
for i, [bin_start, bin_end] in enumerate(bins) :
y_dists.append(y_vals[(x_vals >= bin_start) & (x_vals < bin_end)])
if i == 0 :
bin_names.append("d < " + "10^" + str(bin_end) + "bp")
else :
bin_names.append(">= d < " + "10^" + str(bin_end) + "bp")
import seaborn as sns
f = plt.figure(figsize=(8, 4))
sns.stripplot(data=y_dists, alpha=0.1, jitter=0.25, palette=colors)
sns.boxplot(data=y_dists, showfliers=False, palette=colors)
plt.xlim(-0.5, 5.5)
plt.ylim(-11.5, 5.0)
plt.xticks(
np.arange(6),
bin_names, fontsize=12, rotation=30
)
plt.yticks(fontsize=12)
plt.ylabel("APARENT2 Score", fontsize=12)
plt.tight_layout()
plt.show()
SpearmanrResult(correlation=0.32224283360945594, pvalue=4.746817787025723e-190)
#Correlation between distance to next PAS and wildtype PAS score (smaller bin sizes)
from scipy.stats import spearmanr
lf = l[a_rich[:, 0] == 0, ...]
sf = s[a_rich[:, 0] == 0, ...]
print(spearmanr(lf[:, 1], sf[:, 0]))
x_vals = l[:, 1] / np.log(10)
y_vals = s[:, 0] / np.log(2.)
bins = [
[2.5, 2.75],
[2.75, 3.0],
[3.0, 3.25],
[3.25, 3.5],
[3.5, 3.75],
[3.75, 4.0],
[4.0, 4.25],
[4.25, 4.5],
[4.5, 4.75],
[4.75, 5.0],
[5.0, 5.25],
]
bin_names = []
y_dists = []
for i, [bin_start, bin_end] in enumerate(bins) :
y_dists.append(y_vals[(x_vals >= bin_start) & (x_vals < bin_end)])
if i == 0 :
bin_names.append("d < " + "10^" + str(bin_end) + "bp")
else :
bin_names.append(">= d < " + "10^" + str(bin_end) + "bp")
import seaborn as sns
f = plt.figure(figsize=(8, 4))
sns.stripplot(data=y_dists, alpha=0.1, jitter=0.25)
sns.boxplot(data=y_dists, showfliers=False)
plt.xlim(-0.5, 10.5)
plt.ylim(-11.5, 5.0)
plt.xticks(
np.arange(11),
bin_names, fontsize=12, rotation=30
)
plt.yticks(fontsize=12)
plt.ylabel("APARENT2 Score", fontsize=12)
plt.tight_layout()
plt.show()
SpearmanrResult(correlation=0.3358648988111347, pvalue=0.0)
#Correlation between distance to next PAS and wildtype PAS score (smaller bin sizes, intronic sites only)
from scipy.stats import spearmanr
lf = l[(a_rich[:, 0] == 0) & ((intron_mask[:, 0] == 1) & (intron_mask[:, 1] == 1)), ...]
sf = s[(a_rich[:, 0] == 0) & ((intron_mask[:, 0] == 1) & (intron_mask[:, 1] == 1)), ...]
print(spearmanr(lf[:, 1], sf[:, 0]))
x_vals = l[:, 1] / np.log(10)
y_vals = s[:, 0] / np.log(2.)
bins = [
[2.5, 2.75],
[2.75, 3.0],
[3.0, 3.25],
[3.25, 3.5],
[3.5, 3.75],
[3.75, 4.0],
[4.0, 4.25],
[4.25, 4.5],
[4.5, 4.75],
[4.75, 5.0],
[5.0, 5.25],
]
bin_names = []
y_dists = []
for i, [bin_start, bin_end] in enumerate(bins) :
y_dists.append(y_vals[(x_vals >= bin_start) & (x_vals < bin_end)])
if i == 0 :
bin_names.append("d < " + "10^" + str(bin_end) + "bp")
else :
bin_names.append(">= d < " + "10^" + str(bin_end) + "bp")
import seaborn as sns
f = plt.figure(figsize=(8, 4))
sns.stripplot(data=y_dists, alpha=0.1, jitter=0.25)
sns.boxplot(data=y_dists, showfliers=False)
plt.xlim(-0.5, 10.5)
plt.ylim(-11.5, 5.0)
plt.xticks(
np.arange(11),
bin_names, fontsize=12, rotation=30
)
plt.yticks(fontsize=12)
plt.ylabel("APARENT2 Score", fontsize=12)
plt.tight_layout()
plt.show()
SpearmanrResult(correlation=0.32224283360945594, pvalue=4.746817787025723e-190)