%%bash cd /tigress/BEE/gtex/external_sources/hg19/ wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.pc_transcripts.fa.gz wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.lncRNA_transcripts.fa.gz %%bash cd /tigress/BEE/gtex/external_sources/hg19/ gzip -cd gencode.v19.lncRNA_transcripts.fa.gz gzip -cd gencode.v19.pc_transcripts.fa.gz mkdir /tigress/BEE/gtex/data/auxiliary_bj5 mkdir /tigress/BEE/gtex/data/auxiliary_bj5/normalization python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/isolate_longest_sequences_in_transcriptomes.py \ /tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/isolate_longest_sequences_in_transcriptomes.py \ /tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa %%bash # Transcripts python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \ /tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \ /tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt \ > /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.seq_length.txt tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt \ >> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.seq_length.txt # Genes python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt \ > /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.seq_length.txt tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt \ >> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.seq_length.txt %%bash # Transcripts python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \ /tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \ /tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt \ > /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.gc_content.txt tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt \ >> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.gc_content.txt # Genes python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa \ /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt \ > /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.gc_content.txt tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt \ >> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.gc_content.txt %%bash echo 'export R_LIBS=/tigress/BEE/tools/R_libs' >> ~/.bashrc source ~/.bashrc # In R: source("http://bioconductor.org/biocLite.R") biocLite("EDASeq") %%bash mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/ mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/ python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC_wrapper.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/ _certain_biotypes_TPM.txt \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/ sh /tigress/BEE/gtex/tmp/low_level_scripts/normalize_only_GC_wrapper.sh %%writefile /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC_wrapper.py ################################################## # normalize_only_GC.py # # ################################################## import glob from sys import argv import os.path in_path = argv[1] in_suffix = argv[2] in_path = in_path + '*' + in_suffix out_path = argv[3] # in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/*_certain_biotypes_TPM.txt' # out_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/' matrices = glob.glob(in_path + '*') master_script = "/tigress/BEE/gtex/tmp/low_level_scripts/normalize_only_GC_wrapper.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") seq_lengths, gc_content = {}, {} seq_lengths['pc_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt' seq_lengths['pc_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt' seq_lengths['lncRNA_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt' seq_lengths['lncRNA_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt' gc_content['pc_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt' gc_content['pc_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt' gc_content['lncRNA_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt' gc_content['lncRNA_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt' k = 0 for i, matrix in enumerate(matrices): out_matrix = out_path + matrix.split('/')[-1].replace('_TPM.txt','_TPM_GC.txt') print out_matrix if os.path.isfile(out_matrix): continue k+=1 genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0] sbatchfile = "/tigress/BEE/gtex/tmp/low_level_scripts/norm_GC_" + str(i) + ".slurm" sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J norm_%s # job name #SBATCH --mem=12000 # 12 GB requested #SBATCH -t 01:00:00 # to be placed in the test queue /usr/bin/Rscript /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC.R \ %s %s %s %s %s %s """%(i, matrix, seq_lengths['pc_'+genes_or_transcripts], seq_lengths['lncRNA_'+genes_or_transcripts], \ gc_content['pc_'+genes_or_transcripts], gc_content['lncRNA_'+genes_or_transcripts], out_matrix) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") master_handle.close() print 'sh %s'%(master_script) print 'Number of jobs:', k+1 %%writefile /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC.R ################################################## # normalize_only_GC.R # # Use EDASeq R package to normalize gene expression within-lane by GC-content # # See: # Risso, Davide, et al. "GC-content normalization for RNA-Seq data." BMC bioinformatics 12.1 (2011): 480. ################################################## #!/usr/bin/Rscript ## Collect arguments args <-commandArgs(TRUE) ## Default setting when no arguments passed if (length(args) != 6 ){ args <- c("--help") } ## Help section if("--help" %in% args) { cat(" normalize_only_GC.R Usage: ./normalize_only_GC.R /path/to/expression_matrix.txt /path/to/seq_length_protein_coding.txt \ /path/to/seq_length_lncRNA.txt /path/to/GC_content_protein_coding.txt \ /path/to/GC_content_lncRNA.txt /path/to/normalized_expression_matrix.txt\n") q(save="no") } library(EDASeq) expr = args[1] seq_length_pc = args[2] seq_length_lncRNA = args[3] GC_content_pc = args[4] GC_content_lncRNA = args[5] norm_expr_out = args[6] # expr = "/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM.txt" # seq_length_pc = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.pc_genes.seq_length.txt" # seq_length_lncRNA = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.lncRNA_genes.seq_length.txt" # GC_content_pc = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.pc_genes.gc_content.txt" # GC_content_lncRNA = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.lncRNA_genes.gc_content.txt" # norm_expr_out = "/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM_GC.txt" # import data expr = read.table(file=expr, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) seq_length_pc = read.table(file=seq_length_pc, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) seq_length_lncRNA = read.table(file=seq_length_lncRNA, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) GC_content_pc = read.table(file=GC_content_pc, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) GC_content_lncRNA = read.table(file=GC_content_lncRNA, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) # turn covariates into named vectors to be used by normalization functions df_to_named_vector <- function(df, col){ tmp <- as.numeric(df[,col]) names(tmp) <- row.names(df) return(tmp) } seq_length_pc <- df_to_named_vector(seq_length_pc, 'length') seq_length_lncRNA <- df_to_named_vector(seq_length_lncRNA, 'length') GC_content_pc <- df_to_named_vector(GC_content_pc, 'gccontent') GC_content_lncRNA <- df_to_named_vector(GC_content_lncRNA, 'gccontent') seqs_lncRNA <- intersect(rownames(expr), names(seq_length_lncRNA)) seqs_lncRNA <- intersect(seqs_lncRNA, names(GC_content_lncRNA)) expr_lncRNA <- as.matrix(expr[seqs_lncRNA, , drop=FALSE ]) seqs_pc <- intersect(rownames(expr), names(seq_length_pc)) seqs_pc <- intersect(seqs_pc, names(GC_content_pc)) expr_pc <- as.matrix(expr[seqs_pc, , drop=FALSE ]) # reduce expression matrices to only those genes/transcripts that are expressed (non-zero) in # at least 10% of samples for this particular tissue expr_lncRNA = as.matrix(expr_lncRNA[apply(expr_lncRNA, 1, function(x) length(x[x != 0])/length(x) >= 0.10),]) expr_pc = as.matrix(expr_pc[apply(expr_pc, 1, function(x) length(x[x != 0])/length(x) >= 0.10),]) # create eSet objects data_lncRNA <- newSeqExpressionSet(expr_lncRNA, featureData=AnnotatedDataFrame(data.frame(gc=GC_content_lncRNA[row.names(expr_lncRNA)])), round=FALSE) data_pc <- newSeqExpressionSet(expr_pc, featureData=AnnotatedDataFrame(data.frame(gc=GC_content_pc[row.names(expr_pc)])), round=FALSE) # Perform within-lane normalizations separately for lincRNA and mRNA because we would expect # different distributions # Within-lane normalization for GC-content norm_lncRNA <- normCounts(withinLaneNormalization(data_lncRNA, "gc", which="full", offset=FALSE, round=FALSE)) norm_pc <- normCounts(withinLaneNormalization(data_pc, "gc", which="full", offset=FALSE, round=FALSE)) # bind the pc and lncRNA dataframes together norm <- rbind(norm_pc,norm_lncRNA) write.table(norm, file=norm_expr_out, quote=FALSE, sep='\t', row.names=T, col.names=NA) %%bash mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/LFA # In R: source("http://bioconductor.org/biocLite.R") biocLite("peer") %%bash cp GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_PostImput_20genotPCs.txt \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/ # In R, run the following: genotype_PCs = read.table('GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_PostImput_20genotPCs.txt', sep = '\t', stringsAsFactors=FALSE, header=TRUE) rownames(genotype_PCs) = sapply(c(1:dim(genotype_PCs)[1]), function(x) {paste(strsplit(genotype_PCs$FID[x], split = "-")[[1]][1], strsplit(genotype_PCs$FID[x], split = "-")[[1]][2], sep = ".")}) genotype_PCs_top3 = genotype_PCs[,c("C1","C2","C3")] write.table(genotype_PCs_top3, file = "GTEx_Genotype_PCs_top3.txt", sep = '\t') load("subSample1MLFA1.RData") row.names(subSample1MLFA) = sapply(row.names(subSample1MLFA), function(x) {paste("GTEX.", x, sep = "")}) genotype_PCs_top3_LFA = subSample1MLFA[,c(1:3)] colnames(genotype_PCs_top3_LFA) = c("C1","C2","C3") write.table(genotype_PCs_top3_LFA, file = "LFA_Genotype_PCs_top3.txt", sep = "\t") # In R: install.packages("devtools") library("devtools") install_github("Storeylab/lfa") %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/LFA/lfaWrapper.sh #!/usr/bin/Rscript # sbatch -t 24:00:00 -N 1 --ntasks-per-node=1 --mail-type=begin --mail-type=end --mail-user=bj5@princeton.edu --mem=125000 ./lfaWrapper.sh library('lfa') library('Matrix') library('R.utils') NUM_SUBJECTS = 450 NUM_CHRS = 22 # Take in subject names tempRowNames = read.table('/tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr22.allelic_dosage.txt', header = TRUE) indNames = colnames(tempRowNames) indNames = indNames[2:NUM_SUBJECTS+1] remove(tempRowNames) indNames = substr(indNames,6,10) # Tidy the subject IDs for(i in seq(1,length(indNames))){ if(substr(indNames[i],5,5)=='.'){ indNames[i] = substr(indNames[i],1,4) } } numRow = 0 for(i in seq(1,NUM_CHRS)){ numRow = numRow + countLines(paste("/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr",i,".bim",sep='')) } print(numRow) genotypeHolder = matrix(nrow=numRow,ncol=NUM_SUBJECTS) genoHolderIndex = 1 for(i in seq(1,NUM_CHRS)){ dataLength = countLines(paste('/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr',i,'.bim',sep="")) temp = read.bed(paste('/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr',i,sep="")) genotypeHolder[genoHolderIndex:(genoHolderIndex + dataLength - 1),] = temp genoHolderIndex = genoHolderIndex + dataLength remove(temp) } print(object.size(genotypeHolder)) # Sample 1M snps to form genotype PCs for(i in seq(1,10)){ indices = sample(1:numRow,1000000,replace = FALSE) genotypeSample = genotypeHolder[indices,] subSample1MLFA = lfa(genotypeSample,d=51) rownames(subSample1MLFA) = indNames save(subSample1MLFA, file=paste('subSample1MLFA',i,'.RData',sep='')) } %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_wrapper.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/ \ _genes_certain_biotypes_counts_normalized.txt \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \ featurecounts /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER/ 10 15 20 25 30 35 40 45 50 55 60 %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_wrapper.py ################################################## # cis_peer_correction_chr11_wrapper.py # # ################################################## import glob from sys import argv import os in_path = argv[1] in_suffix = argv[2] genotype_pcs = argv[3] gene_metadata = argv[4] method = argv[5] # in_path = in_path + method + '/TPM_with_genotypes/' + '*' + '_' + method + in_suffix # out_path = argv[6] + "to be appended later" joblog_dir = argv[7] # Array of PEER factors that are given in the input peer_factors = argv[8:len(argv)] # Examples # argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/' # or # argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/' # argv[2] = '_genes_certain_biotypes_TPM.txt' # or # argv[2] = '_genes_certain_biotypes_counts_normalized.txt' # argv[3] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt' # argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt' # argv[5] = 'featurecounts' # argv[6] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/' # argv[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER' # argv[8:len(argv)] = ['10','20','30','40','50'] # Make the job log directories if not os.path.exists(joblog_dir): os.makedirs(joblog_dir) if not os.path.exists(joblog_dir + 'err'): os.makedirs(joblog_dir + 'err') if not os.path.exists(joblog_dir + 'out'): os.makedirs(joblog_dir + 'out') matrices = glob.glob(in_path + '*' + in_suffix) for peer_factor in peer_factors: out_path = argv[6] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' if not os.path.exists(out_path): os.makedirs(out_path) master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_" + peer_factor + "_cis_peer_correction_chr11_wrapper.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") # Examples # peer_factor = 10 # residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM.txt' # tissue_name = 'Adipose-Subcutaneous' factor_file_directory = out_path + 'PEER_factors' if not os.path.exists(factor_file_directory): os.makedirs(factor_file_directory) k = 0 for i, matrix in enumerate(matrices): filename = matrix.split('/')[-1] tissue_name = str.split(filename, '_'+method)[0] out_matrix = out_path + filename # out_matrix = out_path + filename.replace('_TPM.txt','_TPM_GC.txt') if os.path.isfile(out_matrix): continue k+=1 genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0] outfilename = method+"_"+tissue_name+"_"+peer_factor sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/PEER/" + method + '_TPM_GC_3GenPC_' + peer_factor + 'PEER1000_' + tissue_name + ".slurm" sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J PEER_%s_%s # job name #SBATCH --mem=6000 # 6 GB requested #SBATCH -t 04:00:00 #SBATCH -e %s # err output directory #SBATCH -o %s # out output directory /usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R \ %s %s %s %s %s %s %s """%(tissue_name, peer_factor, joblog_dir+'err/'+outfilename+'.err', joblog_dir+'out/'+outfilename+'.out', matrix, genotype_pcs, gene_metadata, peer_factor, out_matrix, tissue_name, factor_file_directory) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") master_handle.close() print 'sh %s'%(master_script) %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R #################################################### # cis_peer_correction_chr11.R # Exploratory: Testing with chr 11. # #################################################### #!/usr/bin/Rscript ## Collect arguments args <-commandArgs(TRUE) ## Default setting when no arguments passed if (length(args) != 7 ){ args <- c("--help") } ## Help section if("--help" %in% args) { cat(" cis_peer_correction.R Usage: ./cis_peer_correction.R /path/to/expression_matrix.txt /path/to/genotype_pcs.RData \ /path/to/gene_metadata/ num_of_peer_factors \ /path/to/normalized_expression_matrix.txt tissue_name /path/to/factor_files\n") q(save="no") } expr_matrix = args[1] genotype_pcs = args[2] gene_metadata = args[3] peer_factors = as.numeric(args[4]) residual_matrix = args[5] tissue_name = args[6] factor_file_directory = args[7] # expr_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM.txt' # expr_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_counts_normalized.txt' # genotype_pcs = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt' # gene_metadata = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt' # peer_factors = '10' # residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM_GC.txt' # tissue_name = 'Brain-Hypothalamus' # factor_file_directory = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/PEER_factors' library(peer) library(dplyr) # Load in the genotype PCs (top 3 will be used for this pipeline), saved as samSample1MLFA genotype_PCs_top3 = read.table(genotype_pcs, stringsAsFactors=FALSE, header=TRUE, sep='\t') gene_metadata = read.table(gene_metadata, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) # Load in the expression matrices # Also correct for covariates using PEER with K factors and N steps expr_matrix = read.table(file=expr_matrix, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) # The samples included are already sorted for samples with genotypes. # Data massaging: left_join is similar to a databaes left join, matching entries in first table to the second expr_matrix$gene_id = row.names(expr_matrix) gene_metadata$gene_id = row.names(gene_metadata) joined_matrix = left_join(expr_matrix, gene_metadata, by="gene_id") # Take only chr 11 for now. joined_matrix = filter(joined_matrix, chr == 11) joined_matrix = arrange(joined_matrix, start) row.names(joined_matrix) = joined_matrix$gene_id joined_matrix = select(joined_matrix, 1:(dim(joined_matrix)[2]-6)) # Optional: remove genes that are expressed in less than 10% of individuals. joined_matrix = joined_matrix[sapply(1:dim(joined_matrix)[1], function(x) {(sum(joined_matrix[x,]==0)/dim(joined_matrix)[2])<0.1}),] # PEER takes in an inverted matrix as input inverted_matrix = as.data.frame(t(data.matrix(joined_matrix))) genotype_PCs_top3_df = as.data.frame(genotype_PCs_top3[,1:3]) inverted_matrix$subject_id = row.names(inverted_matrix) genotype_PCs_top3_df$subject_id = row.names(genotype_PCs_top3_df) inverted_joined = left_join(inverted_matrix, genotype_PCs_top3_df, by="subject_id") subject_list = inverted_joined$subject_id # Top 3 PCs, can be changed for future tasks genotype_top3 = as.matrix(inverted_joined[,(dim(inverted_joined)[2]-2):dim(inverted_joined)[2]]) inverted_joined = as.matrix(select(inverted_joined, c(1:(dim(inverted_joined)[2]-4)))) # Now set up PEER for detection of hidden covariates model = PEER() PEER_setPhenoMean(model,inverted_joined) PEER_setCovariates(model, genotype_top3) PEER_setNk(model,peer_factors) PEER_update(model) # Now that PEER has been set up, we can obtain the residuals and save the resulting matrix: residuals = PEER_getResiduals(model) colnames(residuals) = colnames(inverted_joined) rownames(residuals) = subject_list write.table(as.data.frame(t(data.matrix(residuals))), file=residual_matrix, quote = FALSE, sep = "\t") # Also save the factors, weights, and alpha values: write.table(PEER_getX(model), file=paste(factor_file_directory, '/', tissue_name, '_Factors.txt', sep=""), row.names=FALSE, col.names=FALSE) write.table(PEER_getW(model), file=paste(factor_file_directory, '/', tissue_name, '_Loadings.txt', sep=""), row.names=FALSE, col.names=FALSE) write.table(PEER_getAlpha(model), file=paste(factor_file_directory, '/', tissue_name, '_Alphas.txt', sep=""), row.names=FALSE, col.names=FALSE) %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_rerun_wrapper.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/ \ _genes_certain_biotypes_counts_normalized.txt \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \ featurecounts /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER/ 10 15 20 25 30 35 40 45 50 55 60 %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_rerun_wrapper.py ################################################## # cis_peer_correction_chr11_rerun_wrapper.py # provide jobs that have failed with more time and memory # ################################################## import glob from sys import argv import os in_path = argv[1] in_suffix = argv[2] genotype_pcs = argv[3] gene_metadata = argv[4] method = argv[5] # in_path = in_path + method + '/TPM_with_genotypes/' + '*' + '_' + method + in_suffix # out_path = argv[6] + "to be appended later" joblog_dir = argv[7] # Array of PEER factors that are given in the input peer_factors = argv[8:len(argv)] # Examples # argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/' # or # argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/' # argv[2] = '_genes_certain_biotypes_TPM.txt' # or # argv[2] = '_genes_certain_biotypes_counts_normalized.txt' # argv[3] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt' # argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt' # argv[5] = 'featurecounts' # argv[6] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/' # argv[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER' # argv[8:len(argv)] = ['10','20','30','40','50'] # Create full tissue_list. tissue_list = [] matrices = glob.glob(in_path + '*' + in_suffix) for i, matrix in enumerate(matrices): filename = matrix.split('/')[-1] tissue_name = str.split(filename, '_'+method)[0] tissue_list.append(tissue_name) master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_cis_peer_correction_chr11_rerun_wrapper.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") k = 0 for peer_factor in peer_factors: print(peer_factor) out_path = argv[6] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' # Examples # peer_factor = 10 # residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM.txt' # tissue_name = 'Adipose-Subcutaneous' factor_file_directory = out_path + 'PEER_factors' for tissue in tissue_list: out_matrix = out_path + tissue + "_" + method + in_suffix # out_matrix = out_path + tissue + "_" + method + in_suffix.replace('_TPM.txt','_TPM_GC.txt') if os.path.isfile(out_matrix): continue print("Missing: " + tissue) genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0] outfilename = method+"_"+tissue+"_"+peer_factor sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/PEER/" + method + '_TPM_GC_3GenPC_' + peer_factor + 'PEER1000_' + tissue + ".slurm" sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J PEER_%s_%s # job name #SBATCH --mem=12000 # 12 GB requested #SBATCH -t 08:00:00 #SBATCH -e %s # err output directory #SBATCH -o %s # out output directory /usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R \ %s %s %s %s %s %s %s """%(tissue, peer_factor, joblog_dir+'err/'+outfilename+'.err', joblog_dir+'out/'+outfilename+'.out', matrix, genotype_pcs, gene_metadata, peer_factor, out_matrix, tissue, factor_file_directory) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") k+=1 master_handle.close() print 'sh %s'%(master_script) print 'Number of jobs:', k %%bash # Set up the MatrixEQTL env. cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_10PEER1000 mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_20PEER1000 mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_30PEER1000 mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_40PEER1000 mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_50PEER1000 %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \ featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 0 \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60 sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_10_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_15_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_20_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_25_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_30_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_35_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_40_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_45_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_50_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_55_Noimput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_60_Noimput_cis_matrix_eqtl_chr11_wrapper.sh python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \ featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60 sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_10_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_15_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_20_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_25_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_30_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_35_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_40_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_45_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_50_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_55_Imput_cis_matrix_eqtl_chr11_wrapper.sh sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_60_Imput_cis_matrix_eqtl_chr11_wrapper.sh %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py ################################################## # cis_matrix_eqtl_wrapper.py # Wrapper for the cis_matrix_eqtl.R file and submitting the batch jobs # ################################################## import glob from sys import argv import os.path in_path = argv[1] in_suffix = argv[2] method = argv[3] mapping_dir = argv[4] joblog_dir = argv[6] incl_imput = argv[7] analysis_dir = argv[8] # Array of PEER factors that are given in the input peer_factors = argv[9:len(argv)] # Examples # argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/' # argv[2] = '_genes_certain_biotypes_TPM_GC.txt' # argv[3] = 'featurecounts' # argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping' # argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/' # argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/' # argv[7] = '0' or '1' # argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis' # argv[9:len(argv)] = ['10','20','30','40','50'] # Make the job log directories if not os.path.exists(joblog_dir): os.makedirs(joblog_dir) if not os.path.exists(joblog_dir + 'err'): os.makedirs(joblog_dir + 'err') if not os.path.exists(joblog_dir + 'out'): os.makedirs(joblog_dir + 'out') if incl_imput == "0": imput = "Noimput" else: imput = "Imput" for peer_factor in peer_factors: print(peer_factor) in_path = argv[1] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' + '*' + '_' + method + in_suffix out_path = argv[5] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' if not os.path.exists(out_path): os.makedirs(out_path) matrices = glob.glob(in_path + '*') print("Number of matrices: " + str(len(matrices))) master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_" + peer_factor + '_' + imput + "_cis_matrix_eqtl_chr11_wrapper.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") # MatrixEQTL parameters incl_trans = '0' # Don't include trans chr_number = '11' # chromosome number cis_dist = '1e5' # cis window k = 0 for i, matrix in enumerate(matrices): filename = matrix.split('/')[-1] tissue_name = str.split(filename, '_'+method)[0] out_file = out_path + filename.replace('.txt','_MatrixEQTL') if os.path.isfile(out_file): continue k+=1 genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0] sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_' + peer_factor + '_MatrixEQTL_chr' + chr_number + '_' + tissue_name + '_' + imput + ".slurm" outfilename = method+"_"+tissue_name+"_"+peer_factor sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J meqtl_%s_%s # job name #SBATCH --mem=6000 # 6 GB requested #SBATCH -t 04:00:00 #SBATCH -e %s # err output directory #SBATCH -o %s # out output directory /usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \ %s %s %s %s %s %s %s %s %s %s """%(tissue_name, peer_factor, joblog_dir+'err/'+outfilename+'_'+imput+'.err', joblog_dir+'out/'+outfilename+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") master_handle.close() print 'sh %s'%(master_script) print 'Number of jobs:', k %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R #################################################### # cis_matrix_eqtl.R # Performs univariate cis-eQTL analysis using Matrix eQTL # #################################################### #!/usr/bin/Rscript # Collect arguments args <-commandArgs(TRUE) ## Default setting when no arguments passed if (length(args) != 10 ){ args <- c("--help") } ## Help section if("--help" %in% args) { cat(" cis_peer_correction.R Usage: ./cis_matrix_eqtl.R /path/to/expression_matrix.txt include_trans_binary \ include_imputed_binary chromosome_number cis_disance_def peer_factor \ /path/to/mapping/directory /path/to/output/files method /path/to/analysis/dir\n") q(save="no") } expression_file_location = args[1] incl_trans = args[2] incl_imput = args[3] chr_number = args[4] cis_dist = as.numeric(args[5]) peer_factor = args[6] # Example # args[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_45PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC.txt' # args[2] = '0' # args[3] = '0' # args[4] = '11' # args[5] = '1e5' # args[6] = '10' imput = 'Imput' if (incl_imput == '0') {imput = 'Noimput'} SNP_file_name = paste(args[7], '/genotype/genotypesChr', chr_number, '_', imput, '.txt', sep="") covariates_file_name = character() SNP_position_file = paste(args[7], '/references/SNP_positions/SNP_positions_Chr', chr_number, '.txt', sep="") gene_position_file = paste(args[7], '/references/gene_positions/gene_positions_Chr', chr_number, '.txt', sep="") output_file_name_cis = paste(args[8], '_Chr', chr_number, '_', imput, '_cis.txt', sep="") output_file_name_trans = paste(args[8], '_Chr', chr_number, '_', imput, '_trans.txt', sep="") method = paste(args[9]) expression_file_name = strsplit(expression_file_location, split="/")[[1]][length(strsplit(expression_file_location, split="/")[[1]])] tissue_name = strsplit(expression_file_name, split = paste('_', method, sep=""))[[1]][1] analysis_dir = paste(args[10], "/", tissue_name, sep="") # Example # args[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping' # args[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/ # ... TPM_GC_3GenPC_30PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL' # args[9] = 'featurecounts' # args[10] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis' # Need to check that the subjects read in from expression matrices match the subjects in genotype data library(dplyr) expression_matrix = read.csv(file = expression_file_location, header = TRUE, sep = '\t', stringsAsFactors = FALSE) genotype_matrix = read.csv(file = SNP_file_name, header = TRUE, sep = '\t', stringsAsFactors = FALSE) row.names(genotype_matrix) = genotype_matrix$SNP genotype_matrix = genotype_matrix[,sapply(colnames(expression_matrix), function(x) {match(x, colnames(genotype_matrix))})] # Also obtain genes and snps that are in the expression and genotype files, respectively. gene_positions = read.table(file = gene_position_file, header = FALSE, sep = '\t', stringsAsFactors = FALSE) snp_positions = read.table(file = SNP_position_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE) colnames(gene_positions) = c("gene_ID", "chr_num", "start", "end") gene_positions = mutate(gene_positions, chr = paste("chr", chr_num, sep = "")) gene_positions = select(gene_positions, c(gene_ID, chr, start, end)) # Make sure the list of genes and their ordering are the same expression_matrix = expression_matrix[rownames(expression_matrix) %in% gene_positions$gene_ID,] gene_positions = gene_positions[sapply(rownames(expression_matrix), function(x) {match(x, gene_positions$gene_ID)}),] # Import MatrixEQTL library(MatrixEQTL) # Use linear model with p threshold of 0.01 useModel = modelLINEAR; pvOutputThreshold_cis = 1e-2 pvOutputThreshold_trans = 0 if (incl_trans == '1') {pvOutputThreshold_trans = 1e-2} snps = SlicedData$new() snps$CreateFromMatrix(as.matrix(genotype_matrix)); gene = SlicedData$new() gene$CreateFromMatrix(as.matrix(expression_matrix)); gene_qnorm = gene$copy(); # Quantile-normalize the expression values - ties broken by averaging for( sl in 1:length(gene_qnorm) ) { mat = gene_qnorm[[sl]]; mat = t(apply(mat, 1, rank, ties.method = "average")); mat = qnorm(mat / (ncol(gene_qnorm)+1)); gene_qnorm[[sl]] = mat; } rm(sl, mat); # me = Matrix_eQTL_main( # snps = snps, # gene = gene, # output_file_name = output_file_trans, # pvOutputThreshold = pvOutputThreshold, # useModel = useModel, # verbose = TRUE, # pvalue.hist = "qqplot", # min.pv.by.genesnp = FALSE, # noFDRsaveMemory = FALSE); # Only for generating the P-value hist: me = Matrix_eQTL_main( snps = snps, gene = gene_qnorm, output_file_name = output_file_name_trans, pvOutputThreshold = pvOutputThreshold_trans, useModel = useModel, verbose = TRUE, output_file_name.cis = output_file_name_cis, pvOutputThreshold.cis = pvOutputThreshold_cis, snpspos = snp_positions, genepos = gene_positions, cisDist = cis_dist, pvalue.hist = 100, min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE) # Save the histogram output_file_name_hist = paste(analysis_dir, '/', tissue_name, '_', imput, '_Chr', chr_number, '_', peer_factor, '_hist.pdf', sep="") print(paste("Saving to:", output_file_name_hist)) pdf(output_file_name_hist) plot(me) dev.off() # repeat for the qqplot: me_q = Matrix_eQTL_main( snps = snps, gene = gene_qnorm, output_file_name = output_file_name_trans, pvOutputThreshold = pvOutputThreshold_trans, useModel = useModel, verbose = TRUE, output_file_name.cis = output_file_name_cis, pvOutputThreshold.cis = pvOutputThreshold_cis, snpspos = snp_positions, genepos = gene_positions, cisDist = cis_dist, pvalue.hist = "qqplot", min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE) # Save the qqplot output_file_name_qqplot = paste(analysis_dir, '/', tissue_name, '_', imput, '_Chr', chr_number, '_', peer_factor, '_qqplot.pdf', sep="") print(paste("Saving to:", output_file_name_hist)) pdf(output_file_name_qqplot) plot(me_q) dev.off() # Save the Matrix EQTL object as RData save(gene_qnorm, me, snps, file=paste(args[8], '_Chr', chr_number, '_', imput, '_me.RData', sep="")) python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \ featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 0 \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60 python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \ featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60 %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py ################################################## # cis_matrix_eqtl_rerun_wrapper.py # provide jobs that have failed with more time and memory # ################################################## import glob from sys import argv import os.path in_path = argv[1] in_suffix = argv[2] method = argv[3] mapping_dir = argv[4] joblog_dir = argv[6] incl_imput = argv[7] analysis_dir = argv[8] # Array of PEER factors that are given in the input peer_factors = argv[9:len(argv)] # Examples # argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/' # argv[2] = '_genes_certain_biotypes_TPM_GC.txt' # argv[3] = 'featurecounts' # argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping' # argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/' # argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/' # argv[7] = '0' or '1' # argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis' # argv[9:len(argv)] = ['10','20','30','40','50'] # Make tissue list: tissue_list = [] # Use 'TPM_GC_3GenPC_10PEER1000' to build tissue list in_path = in_path + method + '/TPM_GC_3GenPC_10PEER1000/' matrices = glob.glob(in_path + '*' + in_suffix) for i, matrix in enumerate(matrices): filename = matrix.split('/')[-1] tissue_name = str.split(filename, '_'+method)[0] tissue_list.append(tissue_name) if incl_imput == "0": imput = "Noimput" else: imput = "Imput" master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + '_' + imput + "_cis_matrix_eqtl_chr11_rerun_wrapper.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") k = 0 for peer_factor in peer_factors: print(peer_factor) out_path = argv[5] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' # MatrixEQTL parameters incl_trans = '0' # Don't include trans chr_number = '11' # chromosome number cis_dist = '1e5' # cis window for tissue in tissue_list: out_file = out_path + tissue + "_" + method + in_suffix.replace('.txt','_MatrixEQTL') check_file = out_path + tissue + "_" + method + in_suffix.replace('.txt','_MatrixEQTL_Chr') + chr_number + '_' + imput + '_me.RData' if os.path.isfile(check_file): continue k+=1 print "Missing: " + check_file genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0] sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_' + peer_factor + '_MatrixEQTL_chr' + chr_number + '_' + tissue + "_" + imput + ".slurm" outfilename = method+"_"+tissue+"_"+peer_factor sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J meqtl_%s_%s # job name #SBATCH --mem=12000 # 12 GB requested #SBATCH -t 08:00:00 # to be placed in the test queue #SBATCH -e %s # err output directory #SBATCH -o %s # out output directory /usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \ %s %s %s %s %s %s %s %s %s %s """%(tissue, peer_factor, joblog_dir+'err/'+outfilename+'_'+imput+'.err', joblog_dir+'out/'+outfilename+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") master_handle.close() print 'sh %s'%(master_script) print 'Number of jobs:', k %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py ################################################## # create_dirs.py # create necessary directories # ################################################## import glob from sys import argv import os.path in_path = argv[1] in_suffix = argv[2] method = argv[3] out_path = argv[4] + method + '/' in_path = in_path + method + '/TPM_GC_3GenPC_10PEER1000/' + '*' + '_' + method + in_suffix # in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/' # in_suffix = '_genes_certain_biotypes_TPM.txt' # method = 'featurecounts' # out_path = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/" directory = out_path + 'analysis' if not os.path.exists(directory): os.makedirs(directory) matrices = glob.glob(in_path + '*') print(len(matrices)) for i, matrix in enumerate(matrices): filename = matrix.split('/')[-1] tissue_name = str.split(filename, '_'+method)[0] if not os.path.exists(directory + '/' + tissue_name): os.makedirs(directory + '/' + tissue_name) if not os.path.exists(directory + '/' + tissue_name + '/Examples/'): os.makedirs(directory + '/' + tissue_name + '/Examples') %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \ featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ # In R: install.packages("ggplot2") %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_post_wrapper.py \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ featurecounts 1e5 \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt 100 %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_post_wrapper.py ################################################## # cis_matrix_eqtl_post_wrapper.py # # ################################################## import glob from sys import argv import os.path in_path = argv[1] method = argv[2] cis_cutoff = argv[3] gene_name_file = argv[4] snp_positions_file = argv[5] tissue_list_file = argv[6] num_indiv_cutoff = int(argv[7]) # Examples # argv[1] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/' # argv[2] = 'featurecounts' # argv[3] = '1e5' # argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt' # argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt' # argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt' # argv[7] = '100' master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_cis_matrix_eqtl_post_wrapper.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") # Make tissue list: f = open(tissue_list_file) for line in f.readlines(): entry = str.split(line.strip(),'\t') # If the number of individuals is greater than the cutoff: if int(entry[1]) >= num_indiv_cutoff: tissue = entry[0] sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/" + method + '_' + tissue + "_cis_matrix_eqtl_post.slurm" sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J %s_Matrix_post # job name #SBATCH --mem=4000 # 4 GB requested #SBATCH -e /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/err/Matrix_%s_post.err # err output directory #SBATCH -o /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/out/Matrix_%s_post.out # out output directory #SBATCH -t 5:00:00 /usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL.R \ %s %s %s %s \ %s \ %s \ 10 15 20 25 30 35 40 45 50 55 60 """%(tissue, tissue, tissue, in_path, method, cis_cutoff, tissue, gene_name_file, snp_positions_file) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") master_handle.close() print 'sh %s'%(master_script) %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL.R #################################################### # post_analyses_MatrixEQTL.R # Iterates through K PEER factors for every tissue and produces interesting graphs. # #################################################### #!/usr/bin/Rscript # Collect arguments args <-commandArgs(TRUE) library(dplyr) library(MatrixEQTL) library(ggplot2) # Load in data # load("Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL_Chr11_Imput_me.RData") # file_name = Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL_Chr11_Imput_me.RData in_dir = args[1] method = args[2] cis_dist = as.numeric(args[3]) in_dir = paste(in_dir, method, "/", sep='') tissue = args[4] gene_name_file = args[5] snp_position_file = args[6] peer_factors = args[7:length(args)] # args[1] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/' # args[2] = 'featurecounts' # args[3] = '1e5' # args[4] = 'WholeBlood' # args[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt' # args[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt' # args[7] = c(10,15,20,25,30,35,40,45,50,55,60) # Save the numer of significant pairs and unique genes detected at each level. Counts_by_tissue = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE) #Genes_snps_by_tissue = data.frame(tissue = character(0), imputed = character(0), gene_list = character(0), snp_list = character(0)) gene_names = read.csv(gene_name_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE) row.names(gene_names) = gene_names$gene_id print(dim(gene_names)) snp_positions = read.csv(snp_position_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE) row.names(snp_positions) = snp_positions$rsID print(dim(snp_positions)) print(tissue) for (peer in peer_factors) { print(peer) # First iterate through nonimputed: result_path = paste(in_dir, "TPM_GC_3GenPC_", peer, "PEER1000/", sep='') analysis_path = paste(in_dir, "analysis/", tissue, '/', sep='') # Match a regular expression pattern: result_file <- dir(result_path, pattern = paste(tissue,".*\\Noimput_me.RData",sep='')) # The result file of MatrixEQTL exists but the summary file (output of this) doesn't if (length(result_file) > 0) { load(paste(result_path, result_file[1], sep='')) # Plot top 5 eQTL - unique gene pairs, if they exist. temp = data.frame(me$cis$eqtls) temp$snps = as.character(temp$snps) temp$gene = as.character(temp$gene) top_genes = head(unique(temp$gene), n=5) # Only proceed if there are at least 5 genes. if (length(top_genes) == 5) { temp$TSS = gene_names[temp$gene,][,"start"] temp$TES = gene_names[temp$gene,][,"end"] temp$gene_name = gene_names[temp$gene,][,"gene_name"] temp$SNP_pos = snp_positions[temp$snps,][,"pos"] # Difference mapping: -1e5 to 0 if before TSS # scale from 1 to 1e5 if in gene # 1e5 to 2e5 if after TES temp$diff = sapply(c(1:dim(temp)[1]), function(x) { if ((temp$SNP_pos[x] - temp$TSS[x])<0) { temp$SNP_pos[x] - temp$TSS[x] } else if ((temp$SNP_pos[x] - temp$TES[x])>0) { temp$SNP_pos[x] - temp$TES[x] + cis_dist } else { cis_dist * (temp$SNP_pos[x] - temp$TSS[x]) / (temp$TES[x] - temp$TSS[x]) }}) tryCatch({ g = ggplot(temp[temp$FDR < 0.01,], aes(x = diff)) + geom_histogram(binwidth = 10000) + ggtitle('Distribution of distances to TSS') + xlab('Distance') + scale_x_discrete(breaks=c(0, 1e5), labels=c("TSS", "TES")) print(paste("Trying to save: ", analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep='')) ggsave(filename = paste(analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep=''), plot=g, width = 7, height = 7) }, error = function(e) { print(paste("Could not save: ", analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep='')) }) for (j in 1:5) { index = which(temp$gene == top_genes[j])[1] test_snp = snps$FindRow(temp[index,"snps"]) test_gene_q = gene_qnorm$FindRow(temp[index,"gene"]) test.df = data.frame(snp = as.numeric(test_snp$row), gene = as.numeric(test_gene_q$row)) row.names(test.df) = colnames(snps) test.df = test.df[!is.na(test.df$snp),] test.df$snp_chr = as.character(test.df$snp) test_name = as.character(temp[index,"gene_name"]) test_rsID = as.character(temp[index,"snps"]) tryCatch({ g = ggplot(test.df, aes(x = snp_chr, y = gene, color=snp_chr)) + geom_violin() + geom_jitter() + ggtitle(paste(test_name, test_rsID, sep='_')) + xlab(test_rsID) + ylab(test_name) print(paste("Trying to save: ", analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep='')) ggsave(filename = paste(analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''), plot = g, width = 7, height = 7) }, error = function(e) { print(paste("Could not save: ", analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep='')) }) } } # Take all n_pairs = dim(me$cis$eqtls)[1] n_genes = length(unique(me$cis$eqtls$gene)) Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "pval", imputed = "Noimput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE)) # Take FDR-controlled at 5% n_pairs = dim(filter(me$cis$eqtls, FDR <= 0.01))[1] n_genes = length(unique(filter(me$cis$eqtls, FDR <= 0.01)$gene)) Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "FDR", imputed = "Noimput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE)) } } if (dim(Counts_by_tissue)[1] != 0) { print(paste("Save: ", analysis_path, tissue, "_Summary_Noimput.RData", sep="")) save(Counts_by_tissue, file=paste(analysis_path, tissue, "_Summary_Noimput.RData", sep="")) } Counts_by_tissue = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE) for (peer in peer_factors) { print(peer) # Now iterate through imputed: # First iterate through nonimputed: result_path = paste(in_dir, "TPM_GC_3GenPC_", peer, "PEER1000/", sep='') analysis_path = paste(in_dir, "analysis/", tissue, '/', sep='') # Match a regular expression pattern: result_file <- dir(result_path, pattern = paste(tissue,".*\\Imput_me.RData",sep='')) if (length(result_file) > 0) { load(paste(result_path, result_file[1], sep='')) temp = data.frame(me$cis$eqtls) temp$snps = as.character(temp$snps) temp$gene = as.character(temp$gene) top_genes = head(unique(temp$gene), n=5) # Only proceed if there is at least 5 genes. if (length(top_genes) == 5) { temp$TSS = gene_names[temp$gene,][,"start"] temp$TES = gene_names[temp$gene,][,"end"] temp$gene_name = gene_names[temp$gene,][,"gene_name"] temp$SNP_pos = snp_positions[temp$snps,][,"pos"] # Difference mapping: -1e5 to 0 if before TSS # scale from 1 to 1e5 if in gene # 1e5 to 2e5 if after TES temp$diff = sapply(c(1:dim(temp)[1]), function(x) { if ((temp$SNP_pos[x] - temp$TSS[x])<0) { temp$SNP_pos[x] - temp$TSS[x] } else if ((temp$SNP_pos[x] - temp$TES[x])>0) { temp$SNP_pos[x] - temp$TES[x] + cis_dist } else { cis_dist * (temp$SNP_pos[x] - temp$TSS[x]) / (temp$TES[x] - temp$TSS[x]) }}) tryCatch({ g = ggplot(temp[temp$FDR < 0.01,], aes(x = diff)) + geom_histogram(binwidth = 10000) + ggtitle('Distribution of distances to TSS') + xlab('Distance') + scale_x_discrete(breaks=c(0, 1e5), labels=c("TSS", "TES")) print(paste("Trying to save: ", analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep='')) ggsave(filename = paste(analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep=''), plot=g, width = 7, height = 7) }, error = function(e) { print(paste("Could not save: ", analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep='')) }) for (j in 1:5) { index = which(temp$gene == top_genes[j])[1] test_snp = snps$FindRow(temp[index,"snps"]) test_gene_q = gene_qnorm$FindRow(temp[index,"gene"]) test.df = data.frame(snp = as.numeric(test_snp$row), gene = as.numeric(test_gene_q$row)) row.names(test.df) = colnames(snps) test.df = test.df[!is.na(test.df$snp),] test_name = as.character(temp[index,"gene_name"]) test_rsID = as.character(temp[index,"snps"]) tryCatch({ g = ggplot(test.df, aes(x = snp, y = gene, color=snp)) + geom_jitter() + ggtitle(paste(test_name, test_rsID, sep='_')) + xlab(test_rsID) + ylab(test_name) print(paste("Trying to save: ", analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep='')) ggsave(filename = paste(analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''), plot = g, width = 7, height = 7) }, error = function(e) { print(paste("Could not save: ", analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep='')) }) } } # Take all n_pairs = dim(me$cis$eqtls)[1] n_genes = length(unique(me$cis$eqtls$gene)) Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "pval", imputed = "Imput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE)) # Take FDR-controlled at 1% n_pairs = dim(filter(me$cis$eqtls, FDR <= 0.01))[1] n_genes = length(unique(filter(me$cis$eqtls, FDR <= 0.01)$gene)) Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "FDR", imputed = "Imput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE)) } } if (dim(Counts_by_tissue)[1] != 0) { print(paste("Save: ", analysis_path, tissue, "_Summary_Imput.RData", sep="")) save(Counts_by_tissue, file=paste(analysis_path, tissue, "_Summary_Imput.RData", sep="")) } %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL_aggregate.R #################################################### # post_analyses_MatrixEQTL_aggregate.R # Aggregates all results and optimizes for K PEER factors in a tissue-specific manner. # #################################################### #!/usr/bin/Rscript library(ggplot2) library(dplyr) analysis_path = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis/' tissue_list_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt' num_indiv_cutoff = 100 tissue_list = read.table(tissue_list_file, header=FALSE, sep="\t", stringsAsFactors=FALSE) colnames(tissue_list) = c("Tissue", "Count") rownames(tissue_list) = tissue_list["Tissue"][[1]] tissue_list = tissue_list[tissue_list["Count"]>=num_indiv_cutoff,] Counts_by_tissue_aggregate = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE) for (i in c(1:dim(tissue_list)[1])) { tissue = tissue_list["Tissue"][[1]][i] load(paste(analysis_path, tissue, "/", tissue, "_Summary_Imput.RData", sep="")) Counts_by_tissue_aggregate = rbind(Counts_by_tissue_aggregate, Counts_by_tissue) load(paste(analysis_path, tissue, "/", tissue, "_Summary_Noimput.RData", sep="")) Counts_by_tissue_aggregate = rbind(Counts_by_tissue_aggregate, Counts_by_tissue) } for (i in c(1:dim(tissue_list)[1])) { tissue = tissue_list["Tissue"][[1]][i] print(tissue) print(sum(Counts_by_tissue_aggregate["tissue"] == tissue)) } # We have the data frame with all counts. save(Counts_by_tissue_aggregate, file=paste(analysis_path, "All_Counts.RData", sep="")) # Save the plots of number for all tissues: g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "pval"), aes(x = peer_K, y = pairs, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none") ggsave(filename = paste(analysis_path, "All_tissues_pval_pairs.pdf", sep=''), plot = g, width = 10, height = 10) g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "FDR"), aes(x = peer_K, y = pairs, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none") ggsave(filename = paste(analysis_path, "All_tissues_FDR_pairs.pdf", sep=''), plot = g, width = 10, height = 10) g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "pval"), aes(x = peer_K, y = genes, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none") ggsave(filename = paste(analysis_path, "All_tissues_pval_genes.pdf", sep=''), plot = g, width = 10, height = 10) g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "FDR"), aes(x = peer_K, y = genes, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none") ggsave(filename = paste(analysis_path, "All_tissues_FDR_genes.pdf", sep=''), plot = g, width = 10, height = 10) # Optimize on the number of K PEER factors to use for each tissue: # Criteria 1: when it decreases peer_K_to_iterate = c(15, 20, 25, 30, 35, 40, 45, 50, 55, 60) tissue_list["optimal_dec"] = 10 for (i in c(1:dim(tissue_list)[1])) { tissue_name = tissue_list["Tissue"][[1]][i] current_opt = 10 current_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == 10)["pairs"][[1]][1] for (j in peer_K_to_iterate) { next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1] if (next_val <= current_val) { tissue_list[tissue_name, "optimal_dec"] = current_opt break } current_opt = j current_val = next_val } if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) { tissue_list[tissue_name, "optimal_dec"] = current_opt } } # Another Criteria: when it reaches 5% of maximal # Tunable alpha, when it reaches N% of maximal alpha = 0.05 tissue_list["optimal_95"] = 10 for (i in c(1:dim(tissue_list)[1])) { tissue_name = tissue_list["Tissue"][[1]][i] current_opt = 10 thresh = max(filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput")["pairs"]) * (1-alpha) for (j in peer_K_to_iterate) { next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1] if (next_val >= thresh) { tissue_list[tissue_name, "optimal_95"] = current_opt break } current_opt = j } if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) { tissue_list[tissue_name, "optimal_95"] = current_opt } } # Another Criteria: when it reaches 1% of maximal # Tunable alpha, when it reaches N% of maximal alpha = 0.01 tissue_list["optimal_99"] = 10 for (i in c(1:dim(tissue_list)[1])) { tissue_name = tissue_list["Tissue"][[1]][i] current_opt = 10 thresh = max(filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput")["pairs"]) * (1-alpha) for (j in peer_K_to_iterate) { next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1] if (next_val >= thresh) { tissue_list[tissue_name, "optimal_99"] = current_opt break } current_opt = j } if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) { tissue_list[tissue_name, "optimal_99"] = current_opt } } # Save the optimal K for future use: write.table(tissue_list, file='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt', row.names=FALSE, quote=FALSE, sep="\t") %%bash python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper_all.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \ featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/ \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/featurecounts/analysis \ /tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py \ /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \ featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/ %%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper_all.py ################################################## # cis_matrix_eqtl_wrapper_all.py # Wrapper for the cis_matrix_eqtl.R file and submitting the batch jobs # ################################################## import glob from sys import argv import os.path in_path = argv[1] in_suffix = argv[2] method = argv[3] mapping_dir = argv[4] joblog_dir = argv[6] incl_imput = argv[7] analysis_dir = argv[8] peer_factors_file = argv[9] # Examples # argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/' # argv[2] = '_genes_certain_biotypes_counts_normalized.txt' # argv[3] = 'featurecounts' # argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping' # argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/' # argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/' # argv[7] = '0' or '1' # argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/featurecounts/analysis' # argv[9] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt' # Make the job log directories if not os.path.exists(joblog_dir): os.makedirs(joblog_dir) if not os.path.exists(joblog_dir + 'err'): os.makedirs(joblog_dir + 'err') if not os.path.exists(joblog_dir + 'out'): os.makedirs(joblog_dir + 'out') if not os.path.exists(analysis_dir): os.makedirs(analysis_dir) if incl_imput == "0": imput = "Noimput" else: imput = "Imput" # Create the array of jobs to run: tissue_list = [] peer_factors = [] f = open(peer_factors_file) # First line is a header: f.readline() for line in f.readlines(): entry = str.split(line.strip(),'\t') tissue_list.append(entry[0]) # We use the optimal_99 critera: peer_factors.append(entry[-1]) out_path = argv[5] + method + '/' if not os.path.exists(out_path): os.makedirs(out_path) master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + '_' + imput + "_cis_matrix_eqtl_wrapper_all.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") # MatrixEQTL parameters incl_trans = '0' # Don't include trans cis_dist = '1e5' # cis window k = 0 for i in range(len(tissue_list)): tissue_name = tissue_list[i] peer_factor = peer_factors[i] filename = tissue_name + '_' + method + in_suffix matrix = argv[1] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' + filename out_file = out_path + filename.replace('.txt','_MatrixEQTL') # Iterate through 22 chromosomes for j in range(22): chr_number = str(j+1) sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_MatrixEQTL_chr' + chr_number + '_' + tissue_name + '_' + imput + ".slurm" job_outfile = method+"_"+tissue_name+"_cis_all" sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J meqtl_%s # job name #SBATCH --mem=8000 # 8 GB requested #SBATCH -t 06:00:00 # to be placed in the test queue #SBATCH -e %s # err output directory #SBATCH -o %s # out output directory /usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \ %s %s %s %s %s %s %s %s %s %s """%(tissue_name, joblog_dir+'err/'+job_outfile+'_'+imput+'.err', joblog_dir+'out/'+job_outfile+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") k += 1 master_handle.close() print 'sh %s'%(master_script) print 'Number of jobs:', k %%bash python /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control_wrapper.py %%writefile /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control_wrapper.py #################################################### # MatrixEQTL_cis_FDR_control_wrapper.py # # #################################################### tissue_list_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt' f = open(tissue_list_file) tissue_list = [] for line in f.readlines(): tissue_list.append(str.split(line.strip(), '\t')[0]) f.close() original_dir = "/tigress/bj5/MatrixEQTL/cis/featurecounts/" write_dir = "/tigress/bj5/cis_meta/MatrixEQTL/" master_script = "/tigress/bj5/cis_meta/scripts/MatrixEQTL_FDR_control.sh" master_handle=open(master_script, 'w') master_handle.write("#!/bin/bash\n\n") for item in tissue_list: sbatchfile = "/tigress/bj5/cis_meta/scripts/batch/MatrixEQTL_FDR_control_" + item + ".slurm" sbatchhandle=open(sbatchfile, 'w') cmd=r"""#!/bin/bash #SBATCH -J Mat_%s_FDR # job name #SBATCH --mem=4000 # 8 GB requested #SBATCH -t 01:00:00 # to be placed in the test queue #SBATCH -e %s # err output directory #SBATCH -o %s # out output directory /usr/bin/Rscript /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control.R \ %s %s %s """%(item, write_dir + item + ".err", write_dir + item + ".out", original_dir, item, write_dir) sbatchhandle.write(cmd) sbatchhandle.close() master_handle.write("sbatch " + sbatchfile + " \n") master_handle.close() print 'sh %s'%(master_script) %%writefile /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control.R #################################################### # MatrixEQTL_cis_FDR_control.R # # #################################################### #!/usr/bin/Rscript # Collect arguments args <-commandArgs(TRUE) # FDR Control of p-values from the null distribution original_dir = '/tigress/bj5/MatrixEQTL/cis/featurecounts/' tissue = 'Adipose-Subcutaneous' method = 'featurecounts' file_attr = '_genes_certain_biotypes_counts_normalized_MatrixEQTL_Chr' imputed = 'Imput' suffix = '_me.RData' write_dir = '/tigress/bj5/cis_meta/MatrixEQTL/' original_dir = args[1] tissue = args[2] write_dir = args[3] snps_list = character() genes_list = character() pvals_list = numeric() betas_list = numeric() statistic_list = numeric() total_tests = 0 total_called_eqtls = 0 for (i in c(1:22)) { print(i) filename = paste(original_dir, tissue, '_', method, file_attr, i, '_', imputed, suffix, sep='') load(filename) snps_list = append(snps_list, sapply(me$cis$eqtls$snps, as.character)) genes_list = append(genes_list, sapply(me$cis$eqtls$gene, as.character)) pvals_list = append(pvals_list, me$cis$eqtls$pvalue) betas_list = append(betas_list, me$cis$eqtls$beta) statistic_list = append(statistic_list, me$cis$eqtls$statistic) total_tests = total_tests + me$cis$ntests total_called_eqtls = total_called_eqtls + me$cis$neqtls } ordering = order(pvals_list) ordered_pvals_list = pvals_list[ordering] ordered_snps_list = snps_list[ordering] ordered_genes_list = genes_list[ordering] ordered_betas_list = betas_list[ordering] ordered_statistic_list = statistic_list[ordering] # Perform Benjamini-Hochberg alpha_hat = sapply(c(1:total_called_eqtls), function(x) {ordered_pvals_list[x] * total_tests / x}) alpha_hat[(alpha_hat >= 1.0)] = 1.0 cur_index = 1 alpha = numeric() ptm <- proc.time() while (cur_index <= length(alpha_hat)) { temp = alpha_hat[cur_index:length(alpha_hat)] min_index = which.min(temp) min_alpha = min(temp) alpha = append(alpha, rep(min_alpha, min_index)) cur_index = cur_index + min_index } print("FDR calculation time: ") proc.time() - ptm num_eqtls = sum(alpha <= 0.05) result = data.frame(snp = ordered_snps_list[1:num_eqtls], gene = ordered_genes_list[1:num_eqtls], pval = ordered_pvals_list[1:num_eqtls], beta = ordered_betas_list[1:num_eqtls], statistic = ordered_statistic_list[1:num_eqtls], FDR = alpha[1:num_eqtls]) write.table(result, file=paste(write_dir, tissue, "_FDR_5percent.txt", sep=""), quote = FALSE, sep = "\t", row.names = FALSE) print(paste("Total number of tests:", total_tests)) print(paste("Total eQTLs called at 5% FDR:", num_eqtls))