Running the following R script:
library('ggplot2')
library('dplyr')
setwd('/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/')
trim_summary = read.csv('./summaries/trim_summary_report', stringsAsFactors=FALSE, header=TRUE, sep='\t')
colnames(trim_summary) = c("Run_s", "pre_trim", "post_trim")
samples_with_geno = read.table('./summaries/samples_with_genotypes.txt', stringsAsFactors=FALSE)
subjects_with_geno = read.table('./summaries/subjects_with_genotypes.txt', stringsAsFactors=FALSE)
sra_runtable = read.csv('./summaries/SRARunTable.txt', stringsAsFactors=FALSE, header=TRUE, sep='\t')
summary_table = sra_runtable %>% filter(submitted_subject_id_s %in% subjects_with_geno$V1) %>% select(Run_s, Sample_Name_s, body_site_s, histological_type_s, submitted_subject_id_s)
summary_table = left_join(summary_table, trim_summary, by="Run_s")
g = ggplot(summary_table, aes(x = log10(post_trim), y = log10(pre_trim), color= histological_type_s)) + geom_point() + ggtitle('log-log plot of pre-trim and post-trim reads') + theme(legend.position = "none")
ggsave(filename = './gallery/trim_scatter.png', plot = g, width = 7, height = 7)
star1_summary = read.csv('./summaries/STAR_1pass_hg19_summary_report', stringsAsFactors=FALSE, header=TRUE, sep='\t')
colnames(star1_summary) = c("Run_s", "Star1_input", "Star1_unique", "Star1_multi", "Star1_toomany", "Star1_unmapped")
summary_table = left_join(summary_table, star1_summary, by="Run_s")
# There are two rows that are not completed: remove them
summary_table = summary_table %>% filter(!is.na(Star1_input))
g = ggplot(summary_table, aes(x = Star1_unmapped, y = Star1_input, color= histological_type_s)) + geom_point() + ggtitle('Plot of post-trim reads vs unmapped reads') + theme(legend.position = "none")
ggsave(filename = './gallery/star1_unmapped_scatter.png', plot = g, width = 7, height = 7)
g = ggplot(summary_table, aes(x = Star1_unique, y = Star1_input, color= histological_type_s)) + geom_point() + ggtitle('Plot of post-trim reads vs unique-mapping reads') + theme(legend.position = "none")
ggsave(filename = './gallery/star1_unique_scatter.png', plot = g, width = 7, height = 7)
g = ggplot(summary_table, aes(x = Star1_multi, y = Star1_input, color= histological_type_s)) + geom_point() + ggtitle('Plot of post-trim reads vs multi-mapping reads') + theme(legend.position = "none")
ggsave(filename = './gallery/star1_multi_scatter.png', plot = g, width = 7, height = 7)
g = ggplot(summary_table, aes(x = Star1_toomany, y = Star1_input, color= histological_type_s)) + geom_point() + ggtitle('Plot of post-trim reads vs reads mapped to too many locations') + theme(legend.position = "none")
ggsave(filename = './gallery/star1_toomany_scatter.png', plot = g, width = 7, height = 7)
# Which tissue is giving us a lot of unmapped reads?
unmapped_tissue = summary_table %>% group_by(histological_type_s) %>% summarise(mean=mean(Star1_unmapped))
# Turns out they are the blood samples, as expected.
star2_summary_featurecounts = read.csv('./summaries/STAR_2pass_hg19_featurecounts_summary_report', stringsAsFactors=FALSE, header=TRUE, sep='\t')
colnames(star2_summary_featurecounts) = c("Run_s", "Star2_input_f", "Star2_unique_f", "Star2_multi_f", "Star2_toomany_f", "Star2_unmapped_f")
star2_summary_rsem = read.csv('./summaries/STAR_2pass_hg19_rsem_summary_report', stringsAsFactors=FALSE, header=TRUE, sep='\t')
colnames(star2_summary_rsem) = c("Run_s", "Star2_input_r", "Star2_unique_r", "Star2_multi_r", "Star2_toomany_r", "Star2_unmapped_r")
summary_table = left_join(summary_table, star2_summary_featurecounts, by="Run_s")
summary_table = left_join(summary_table, star2_summary_rsem, by="Run_s")
# There are two rows that are not completed for rsem: remove them
summary_table = summary_table %>% filter(!is.na(Star2_input_r))
g = ggplot(summary_table, aes(x = Star2_unmapped_f, y = Star2_input_f, color= histological_type_s)) + geom_point() + ggtitle('Plot of post-trim reads vs unmapped reads') + theme(legend.position = "none")
ggsave(filename = './gallery/star2_unmapped_scatter.png', plot = g, width = 7, height = 7)
g = ggplot(summary_table, aes(x = Star2_unique_f, y = Star2_input_f, color= histological_type_s)) + geom_point() + ggtitle('Plot of post-trim reads vs unique-mapping reads') + theme(legend.position = "none")
ggsave(filename = './gallery/star2_unique_scatter.png', plot = g, width = 7, height = 7)
g = ggplot(summary_table, aes(x = Star2_multi_f, y = Star2_input_f, color= histological_type_s)) + geom_point() + ggtitle('Plot of post-trim reads vs multi-mapping reads') + theme(legend.position = "none")
ggsave(filename = './gallery/star2_multi_scatter.png', plot = g, width = 7, height = 7)
g = ggplot(summary_table, aes(x = Star2_toomany_f, y = Star2_input_f, color= histological_type_s)) + geom_point() + ggtitle('Plot of post-trim reads vs reads mapped to too many locations') + theme(legend.position = "none")
ggsave(filename = './gallery/star2_toomany_scatter.png', plot = g, width = 7, height = 7)
multimapping_tissue = summary_table %>% group_by(histological_type_s) %>% summarise(mean=mean(Star2_multi_f))
# The brown points are Blood, the pink points are stomach.
Returns the following plots:
from IPython.display import Image
Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/trim_scatter.png', width = 500, height = 500)
a = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/star1_unique_scatter.png', width = 500, height = 500)
b = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/star1_multi_scatter.png', width = 500, height = 500)
c = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/star1_toomany_scatter.png', width = 500, height = 500)
d = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/star1_unmapped_scatter.png', width = 500, height = 500)
display(a,b,c,d)
Brown points are blood samples, pink points are Stomach samples, two tissue types that show a lot of multi-mapping and unmapped reads.
a = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/star2_unique_scatter.png', width = 500, height = 500)
b = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/star2_multi_scatter.png', width = 500, height = 500)
c = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/star2_toomany_scatter.png', width = 500, height = 500)
d = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/star2_unmapped_scatter.png', width = 500, height = 500)
display(a,b,c,d)
The profiles look similar to the first STAR pass.
The details of this process is explained in the misc_meta_analysis notebook. This section showcases the correlation among the first N factors, and the segregation of the genotype factors based on race.
a = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/LFA_factors.png', width = 500, height = 500)
b = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/LFAPopulation-2D.png', width = 700, height = 700)
c = Image(filename='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gallery/LFAPopulation-3D.png', width = 500, height = 500)
display(a,b,c)
Plots of gene expression profiles plotted against the first three PCs, before and after correction for PEER factors with 3 genotype PCs: