%%bash

cd /tigress/BEE/gtex/external_sources/hg19/
wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.pc_transcripts.fa.gz
wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.lncRNA_transcripts.fa.gz

%%bash

cd /tigress/BEE/gtex/external_sources/hg19/
gzip -cd gencode.v19.lncRNA_transcripts.fa.gz
gzip -cd gencode.v19.pc_transcripts.fa.gz

mkdir /tigress/BEE/gtex/data/auxiliary_bj5
mkdir /tigress/BEE/gtex/data/auxiliary_bj5/normalization

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/isolate_longest_sequences_in_transcriptomes.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/isolate_longest_sequences_in_transcriptomes.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa

%%bash
# Transcripts

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt

cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt \
> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.seq_length.txt 
tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt \
>> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.seq_length.txt 

# Genes

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_sequence_length.py \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt

cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt \
> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.seq_length.txt 
tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt \
>> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.seq_length.txt 

%%bash
# Transcripts

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.pc_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \
/tigress/BEE/gtex/external_sources/hg19/gencode.v19.lncRNA_transcripts.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt

cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt \
> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.gc_content.txt 
tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt \
>> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.transcripts.gc_content.txt 

# Genes

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.longest.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/compute_GC_content.py \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.longest.fa \
/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt

cat /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt \
> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.gc_content.txt 
tail -n+2 /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt \
>> /tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.genes.gc_content.txt 

%%bash
echo 'export R_LIBS=/tigress/BEE/tools/R_libs' >> ~/.bashrc
source ~/.bashrc

# In R:

source("http://bioconductor.org/biocLite.R")
biocLite("EDASeq")

%%bash

mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/
mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/
mkdir /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/

python /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/ _certain_biotypes_TPM.txt \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/

sh /tigress/BEE/gtex/tmp/low_level_scripts/normalize_only_GC_wrapper.sh

%%writefile /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC_wrapper.py
##################################################
#  normalize_only_GC.py
#
#
##################################################
import glob
from sys import argv
import os.path

in_path = argv[1]
in_suffix = argv[2]
in_path = in_path + '*' + in_suffix
out_path = argv[3]

# in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/*_certain_biotypes_TPM.txt'
# out_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/'

matrices = glob.glob(in_path + '*')

master_script = "/tigress/BEE/gtex/tmp/low_level_scripts/normalize_only_GC_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")

seq_lengths, gc_content = {}, {}

seq_lengths['pc_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.seq_length.txt'
seq_lengths['pc_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.seq_length.txt'
seq_lengths['lncRNA_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.seq_length.txt'
seq_lengths['lncRNA_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.seq_length.txt'

gc_content['pc_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_genes.gc_content.txt'
gc_content['pc_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.pc_transcripts.gc_content.txt'
gc_content['lncRNA_genes'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_genes.gc_content.txt'
gc_content['lncRNA_transcripts'] = '/tigress/BEE/gtex/data/auxiliary_bj5/normalization/gencode.v19.lncRNA_transcripts.gc_content.txt'

k = 0
for i, matrix in enumerate(matrices):
    out_matrix = out_path + matrix.split('/')[-1].replace('_TPM.txt','_TPM_GC.txt')
    print out_matrix
    if os.path.isfile(out_matrix):
        continue
    k+=1
    genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
    sbatchfile = "/tigress/BEE/gtex/tmp/low_level_scripts/norm_GC_" + str(i) + ".slurm"
    sbatchhandle=open(sbatchfile, 'w')
    cmd=r"""#!/bin/bash
#SBATCH -J norm_%s      # job name
#SBATCH --mem=12000           # 12 GB requested
#SBATCH -t 01:00:00           # to be placed in the test queue

/usr/bin/Rscript /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC.R \
%s %s %s %s %s %s

"""%(i, matrix, seq_lengths['pc_'+genes_or_transcripts], seq_lengths['lncRNA_'+genes_or_transcripts], \
     gc_content['pc_'+genes_or_transcripts], gc_content['lncRNA_'+genes_or_transcripts], out_matrix)
    sbatchhandle.write(cmd)
    sbatchhandle.close()
    master_handle.write("sbatch " + sbatchfile  + " \n")

master_handle.close()

print 'sh %s'%(master_script)
print 'Number of jobs:', k+1

%%writefile /tigress/BEE/gtex/analyses/group_general/top_level_scripts/normalize_only_GC.R
##################################################
#  normalize_only_GC.R
#
#  Use EDASeq R package to normalize gene expression within-lane by GC-content
# 
#  See:
#  Risso, Davide, et al. "GC-content normalization for RNA-Seq data." BMC bioinformatics 12.1 (2011): 480.
##################################################
#!/usr/bin/Rscript

## Collect arguments
args <-commandArgs(TRUE)

## Default setting when no arguments passed
if (length(args) != 6 ){
  args <- c("--help")
}
## Help section
if("--help" %in% args) {
  cat("
      normalize_only_GC.R
 
      Usage:
      ./normalize_only_GC.R /path/to/expression_matrix.txt /path/to/seq_length_protein_coding.txt \
      /path/to/seq_length_lncRNA.txt /path/to/GC_content_protein_coding.txt \
      /path/to/GC_content_lncRNA.txt /path/to/normalized_expression_matrix.txt\n")
  q(save="no")
}


library(EDASeq)

expr = args[1]
seq_length_pc = args[2]
seq_length_lncRNA = args[3]
GC_content_pc = args[4]
GC_content_lncRNA  = args[5]

norm_expr_out  = args[6]

# expr = "/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM.txt"
# seq_length_pc = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.pc_genes.seq_length.txt"
# seq_length_lncRNA = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.lncRNA_genes.seq_length.txt"
# GC_content_pc = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.pc_genes.gc_content.txt"
# GC_content_lncRNA = "/tigress/BEE/gtex/data/auxiliary/normalization/gencode.v19.lncRNA_genes.gc_content.txt"

# norm_expr_out = "/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM_GC.txt"

# import data
expr = read.table(file=expr, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) 
seq_length_pc = read.table(file=seq_length_pc, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) 
seq_length_lncRNA = read.table(file=seq_length_lncRNA, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) 
GC_content_pc = read.table(file=GC_content_pc, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) 
GC_content_lncRNA = read.table(file=GC_content_lncRNA, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) 

# turn covariates into named vectors to be used by normalization functions
df_to_named_vector <- function(df, col){
    tmp <- as.numeric(df[,col])
    names(tmp) <- row.names(df)
    return(tmp)
    }

seq_length_pc <- df_to_named_vector(seq_length_pc, 'length')
seq_length_lncRNA <- df_to_named_vector(seq_length_lncRNA, 'length')
GC_content_pc <- df_to_named_vector(GC_content_pc, 'gccontent')
GC_content_lncRNA <- df_to_named_vector(GC_content_lncRNA, 'gccontent')

seqs_lncRNA <- intersect(rownames(expr), names(seq_length_lncRNA))
seqs_lncRNA <- intersect(seqs_lncRNA, names(GC_content_lncRNA))
expr_lncRNA <- as.matrix(expr[seqs_lncRNA, , drop=FALSE ])

seqs_pc <- intersect(rownames(expr), names(seq_length_pc))
seqs_pc <- intersect(seqs_pc, names(GC_content_pc))
expr_pc <- as.matrix(expr[seqs_pc, , drop=FALSE ])

# reduce expression matrices to only those genes/transcripts that are expressed (non-zero) in
# at least 10% of samples for this particular tissue
expr_lncRNA = as.matrix(expr_lncRNA[apply(expr_lncRNA, 1, function(x) length(x[x != 0])/length(x) >= 0.10),])
expr_pc = as.matrix(expr_pc[apply(expr_pc, 1, function(x) length(x[x != 0])/length(x) >= 0.10),])
    
# create eSet objects
data_lncRNA <- newSeqExpressionSet(expr_lncRNA, featureData=AnnotatedDataFrame(data.frame(gc=GC_content_lncRNA[row.names(expr_lncRNA)])), round=FALSE)
data_pc <- newSeqExpressionSet(expr_pc, featureData=AnnotatedDataFrame(data.frame(gc=GC_content_pc[row.names(expr_pc)])), round=FALSE)

# Perform within-lane normalizations separately for lincRNA and mRNA because we would expect
# different distributions

# Within-lane normalization for GC-content
norm_lncRNA <- normCounts(withinLaneNormalization(data_lncRNA, "gc", which="full", offset=FALSE, round=FALSE))
norm_pc <- normCounts(withinLaneNormalization(data_pc, "gc", which="full", offset=FALSE, round=FALSE))

# bind the pc and lncRNA dataframes together
norm <- rbind(norm_pc,norm_lncRNA)

write.table(norm, file=norm_expr_out, quote=FALSE, sep='\t', row.names=T, col.names=NA)


%%bash

mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/LFA

# In R:

source("http://bioconductor.org/biocLite.R")
biocLite("peer")

%%bash

cp GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_PostImput_20genotPCs.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/

# In R, run the following:

genotype_PCs = read.table('GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_PostImput_20genotPCs.txt', sep = '\t', stringsAsFactors=FALSE, header=TRUE)
rownames(genotype_PCs) = sapply(c(1:dim(genotype_PCs)[1]), function(x) {paste(strsplit(genotype_PCs$FID[x], split = "-")[[1]][1], strsplit(genotype_PCs$FID[x], split = "-")[[1]][2], sep = ".")})
genotype_PCs_top3 = genotype_PCs[,c("C1","C2","C3")]
write.table(genotype_PCs_top3, file = "GTEx_Genotype_PCs_top3.txt", sep = '\t')

load("subSample1MLFA1.RData")
row.names(subSample1MLFA) = sapply(row.names(subSample1MLFA), function(x) {paste("GTEX.", x, sep = "")})
genotype_PCs_top3_LFA = subSample1MLFA[,c(1:3)]
colnames(genotype_PCs_top3_LFA) = c("C1","C2","C3")
write.table(genotype_PCs_top3_LFA, file = "LFA_Genotype_PCs_top3.txt", sep = "\t")


# In R:

install.packages("devtools")
library("devtools")
install_github("Storeylab/lfa")

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/LFA/lfaWrapper.sh

#!/usr/bin/Rscript
# sbatch -t 24:00:00 -N 1 --ntasks-per-node=1 --mail-type=begin --mail-type=end --mail-user=bj5@princeton.edu --mem=125000 ./lfaWrapper.sh
library('lfa')
library('Matrix')
library('R.utils')

NUM_SUBJECTS = 450
NUM_CHRS = 22

# Take in subject names
tempRowNames = read.table('/tigress/BEE/gtex/data/genotype/imputed_genotypes/allelic_dosage/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr22.allelic_dosage.txt', header = TRUE)
indNames = colnames(tempRowNames)
indNames = indNames[2:NUM_SUBJECTS+1]
remove(tempRowNames)
indNames = substr(indNames,6,10)

# Tidy the subject IDs
for(i in seq(1,length(indNames))){
    if(substr(indNames[i],5,5)=='.'){
        indNames[i] = substr(indNames[i],1,4)
    }
}

numRow = 0
for(i in seq(1,NUM_CHRS)){
    numRow = numRow + countLines(paste("/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr",i,".bim",sep=''))
}
print(numRow)
genotypeHolder = matrix(nrow=numRow,ncol=NUM_SUBJECTS)
genoHolderIndex = 1
for(i in seq(1,NUM_CHRS)){
    dataLength = countLines(paste('/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr',i,'.bim',sep=""))
    temp = read.bed(paste('/tigress/BEE/gtex/data/genotype/imputed_genotypes/plink/GTEx_Analysis_2015-01-12_OMNI_2.5M_5M_450Indiv_allchr_genot_imput_info04_maf05_HWEp1E6_dbSNPIDs.chr',i,sep=""))
    genotypeHolder[genoHolderIndex:(genoHolderIndex + dataLength - 1),] = temp
    genoHolderIndex = genoHolderIndex + dataLength
    remove(temp)
}
print(object.size(genotypeHolder))

# Sample 1M snps to form genotype PCs
for(i in seq(1,10)){
    indices = sample(1:numRow,1000000,replace = FALSE)
    genotypeSample = genotypeHolder[indices,]

    subSample1MLFA = lfa(genotypeSample,d=51)
    rownames(subSample1MLFA) = indNames
    save(subSample1MLFA, file=paste('subSample1MLFA',i,'.RData',sep=''))
}

%%bash

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/ \
_genes_certain_biotypes_counts_normalized.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \
featurecounts /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER/ 10 15 20 25 30 35 40 45 50 55 60 


%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_wrapper.py
##################################################
#  cis_peer_correction_chr11_wrapper.py
#
#
##################################################

import glob
from sys import argv
import os

in_path = argv[1]
in_suffix = argv[2]
genotype_pcs = argv[3]
gene_metadata = argv[4]
method = argv[5]
# in_path = in_path + method + '/TPM_with_genotypes/' + '*' + '_' + method + in_suffix
# out_path = argv[6] + "to be appended later"
joblog_dir = argv[7]

# Array of PEER factors that are given in the input
peer_factors = argv[8:len(argv)]

# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/'
# or
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/'
# argv[2] = '_genes_certain_biotypes_TPM.txt'
# or 
# argv[2] = '_genes_certain_biotypes_counts_normalized.txt'
# argv[3] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# argv[5] = 'featurecounts'
# argv[6] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER'
# argv[8:len(argv)] = ['10','20','30','40','50']

# Make the job log directories
if not os.path.exists(joblog_dir):
    os.makedirs(joblog_dir)
if not os.path.exists(joblog_dir + 'err'):
    os.makedirs(joblog_dir + 'err')
if not os.path.exists(joblog_dir + 'out'):
    os.makedirs(joblog_dir + 'out')

matrices = glob.glob(in_path + '*' + in_suffix)

for peer_factor in peer_factors:
    out_path = argv[6] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/'
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_" + peer_factor + "_cis_peer_correction_chr11_wrapper.sh"
    master_handle=open(master_script, 'w')
    master_handle.write("#!/bin/bash\n\n")

    # Examples
    # peer_factor = 10
    # residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM.txt'
    # tissue_name = 'Adipose-Subcutaneous'
    factor_file_directory = out_path + 'PEER_factors'
    if not os.path.exists(factor_file_directory):
        os.makedirs(factor_file_directory)

    k = 0
    for i, matrix in enumerate(matrices):
        filename = matrix.split('/')[-1]
        tissue_name = str.split(filename, '_'+method)[0]
        out_matrix = out_path + filename
        # out_matrix = out_path + filename.replace('_TPM.txt','_TPM_GC.txt')
        if os.path.isfile(out_matrix):
            continue
        k+=1
        genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
        outfilename = method+"_"+tissue_name+"_"+peer_factor
        sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/PEER/" + method + '_TPM_GC_3GenPC_' + peer_factor + 'PEER1000_' + tissue_name + ".slurm"
        sbatchhandle=open(sbatchfile, 'w')
        cmd=r"""#!/bin/bash
#SBATCH -J PEER_%s_%s   # job name
#SBATCH --mem=6000     # 6 GB requested
#SBATCH -t 04:00:00     
#SBATCH -e %s           # err output directory
#SBATCH -o %s           # out output directory

/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R \
%s %s %s %s %s %s %s

    """%(tissue_name, peer_factor, joblog_dir+'err/'+outfilename+'.err', joblog_dir+'out/'+outfilename+'.out', matrix, genotype_pcs, gene_metadata, peer_factor, out_matrix, tissue_name, factor_file_directory)
        sbatchhandle.write(cmd)
        sbatchhandle.close()
        master_handle.write("sbatch " + sbatchfile  + " \n")

    master_handle.close()

    print 'sh %s'%(master_script)

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R
####################################################
# cis_peer_correction_chr11.R
# Exploratory: Testing with chr 11.
#
####################################################

#!/usr/bin/Rscript

## Collect arguments
args <-commandArgs(TRUE)

## Default setting when no arguments passed
if (length(args) != 7 ){
  args <- c("--help")
}
## Help section
if("--help" %in% args) {
  cat("
      cis_peer_correction.R
 
      Usage:
      ./cis_peer_correction.R /path/to/expression_matrix.txt /path/to/genotype_pcs.RData \
      /path/to/gene_metadata/ num_of_peer_factors \
      /path/to/normalized_expression_matrix.txt tissue_name /path/to/factor_files\n")
  q(save="no")
}

expr_matrix = args[1]
genotype_pcs = args[2]
gene_metadata = args[3]
peer_factors = as.numeric(args[4])
residual_matrix = args[5]
tissue_name = args[6]
factor_file_directory = args[7]

# expr_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/featurecounts/TPM_with_genotypes/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM.txt'
# expr_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_counts_normalized.txt'
# genotype_pcs = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt'
# gene_metadata = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# peer_factors = '10'
# residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Brain-Hypothalamus_featurecounts_genes_certain_biotypes_TPM_GC.txt'
# tissue_name = 'Brain-Hypothalamus'
# factor_file_directory = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/PEER_factors'

library(peer)
library(dplyr)

# Load in the genotype PCs (top 3 will be used for this pipeline), saved as samSample1MLFA
genotype_PCs_top3 = read.table(genotype_pcs, stringsAsFactors=FALSE, header=TRUE, sep='\t')
gene_metadata = read.table(gene_metadata, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)

# Load in the expression matrices
# Also correct for covariates using PEER with K factors and N steps

expr_matrix = read.table(file=expr_matrix, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1)
# The samples included are already sorted for samples with genotypes.

# Data massaging: left_join is similar to a databaes left join, matching entries in first table to the second
expr_matrix$gene_id = row.names(expr_matrix)
gene_metadata$gene_id = row.names(gene_metadata)
joined_matrix = left_join(expr_matrix, gene_metadata, by="gene_id")
# Take only chr 11 for now.
joined_matrix = filter(joined_matrix, chr == 11)
joined_matrix = arrange(joined_matrix, start)
row.names(joined_matrix) = joined_matrix$gene_id
joined_matrix = select(joined_matrix, 1:(dim(joined_matrix)[2]-6))

# Optional: remove genes that are expressed in less than 10% of individuals.
joined_matrix = joined_matrix[sapply(1:dim(joined_matrix)[1], function(x) {(sum(joined_matrix[x,]==0)/dim(joined_matrix)[2])<0.1}),]

# PEER takes in an inverted matrix as input
inverted_matrix = as.data.frame(t(data.matrix(joined_matrix)))
genotype_PCs_top3_df = as.data.frame(genotype_PCs_top3[,1:3])
inverted_matrix$subject_id = row.names(inverted_matrix)
genotype_PCs_top3_df$subject_id = row.names(genotype_PCs_top3_df)
inverted_joined = left_join(inverted_matrix, genotype_PCs_top3_df, by="subject_id")
subject_list = inverted_joined$subject_id

# Top 3 PCs, can be changed for future tasks
genotype_top3 = as.matrix(inverted_joined[,(dim(inverted_joined)[2]-2):dim(inverted_joined)[2]])
inverted_joined = as.matrix(select(inverted_joined, c(1:(dim(inverted_joined)[2]-4))))

# Now set up PEER for detection of hidden covariates
model = PEER()

PEER_setPhenoMean(model,inverted_joined)
PEER_setCovariates(model, genotype_top3)
PEER_setNk(model,peer_factors)
PEER_update(model)

# Now that PEER has been set up, we can obtain the residuals and save the resulting matrix:
residuals = PEER_getResiduals(model)
colnames(residuals) = colnames(inverted_joined)
rownames(residuals) = subject_list

write.table(as.data.frame(t(data.matrix(residuals))), file=residual_matrix, quote = FALSE, sep = "\t")

# Also save the factors, weights, and alpha values:
write.table(PEER_getX(model), file=paste(factor_file_directory, '/', tissue_name, '_Factors.txt', sep=""), row.names=FALSE, col.names=FALSE)
write.table(PEER_getW(model), file=paste(factor_file_directory, '/', tissue_name, '_Loadings.txt', sep=""), row.names=FALSE, col.names=FALSE)
write.table(PEER_getAlpha(model), file=paste(factor_file_directory, '/', tissue_name, '_Alphas.txt', sep=""), row.names=FALSE, col.names=FALSE)


%%bash

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_rerun_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/ \
_genes_certain_biotypes_counts_normalized.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \
featurecounts /tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER/ 10 15 20 25 30 35 40 45 50 55 60 


%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11_rerun_wrapper.py
##################################################
#  cis_peer_correction_chr11_rerun_wrapper.py
#  provide jobs that have failed with more time and memory
#
##################################################

import glob
from sys import argv
import os

in_path = argv[1]
in_suffix = argv[2]
genotype_pcs = argv[3]
gene_metadata = argv[4]
method = argv[5]
# in_path = in_path + method + '/TPM_with_genotypes/' + '*' + '_' + method + in_suffix
# out_path = argv[6] + "to be appended later"
joblog_dir = argv[7]

# Array of PEER factors that are given in the input
peer_factors = argv[8:len(argv)]

# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/'
# or
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/normalized_by_length_GC_and_depth_hg19/'
# argv[2] = '_genes_certain_biotypes_TPM.txt'
# or 
# argv[2] = '_genes_certain_biotypes_counts_normalized.txt'
# argv[3] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/GTEx_Genotype_PCs_top3.txt'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# argv[5] = 'featurecounts'
# argv[6] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/PEER'
# argv[8:len(argv)] = ['10','20','30','40','50']

# Create full tissue_list.
tissue_list = []
matrices = glob.glob(in_path + '*' + in_suffix)
for i, matrix in enumerate(matrices):
    filename = matrix.split('/')[-1]
    tissue_name = str.split(filename, '_'+method)[0]
    tissue_list.append(tissue_name)

master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_cis_peer_correction_chr11_rerun_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")

k = 0
for peer_factor in peer_factors:
    print(peer_factor)
    out_path = argv[6] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/'

    # Examples
    # peer_factor = 10
    # residual_matrix = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_10PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM.txt'
    # tissue_name = 'Adipose-Subcutaneous'
    factor_file_directory = out_path + 'PEER_factors'

    for tissue in tissue_list:
        out_matrix = out_path + tissue + "_" + method + in_suffix
        # out_matrix = out_path + tissue + "_" + method + in_suffix.replace('_TPM.txt','_TPM_GC.txt')
        if os.path.isfile(out_matrix):
            continue
        print("Missing: " + tissue)
        genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
        outfilename = method+"_"+tissue+"_"+peer_factor
        sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/PEER/" + method + '_TPM_GC_3GenPC_' + peer_factor + 'PEER1000_' + tissue + ".slurm"
        sbatchhandle=open(sbatchfile, 'w')
        cmd=r"""#!/bin/bash
#SBATCH -J PEER_%s_%s      # job name
#SBATCH --mem=12000     # 12 GB requested
#SBATCH -t 08:00:00     
#SBATCH -e %s           # err output directory
#SBATCH -o %s           # out output directory

/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_peer_correction_chr11.R \
%s %s %s %s %s %s %s

    """%(tissue, peer_factor, joblog_dir+'err/'+outfilename+'.err', joblog_dir+'out/'+outfilename+'.out', matrix, genotype_pcs, gene_metadata, peer_factor, out_matrix, tissue, factor_file_directory)
        sbatchhandle.write(cmd)
        sbatchhandle.close()
        master_handle.write("sbatch " + sbatchfile  + " \n")
        k+=1

master_handle.close()

print 'sh %s'%(master_script)
print 'Number of jobs:', k

%%bash

# Set up the MatrixEQTL env.

cd /tigress/BEE/gtex/analyses/user_specific/cis-mapping
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts

mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_10PEER1000
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_20PEER1000
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_30PEER1000
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_40PEER1000
mkdir /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/TPM_GC_3GenPC_50PEER1000


%%bash 

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 0 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60

sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_10_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_15_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_20_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_25_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_30_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_35_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_40_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_45_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_50_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_55_Noimput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_60_Noimput_cis_matrix_eqtl_chr11_wrapper.sh

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60

sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_10_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_15_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_20_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_25_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_30_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_35_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_40_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_45_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_50_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_55_Imput_cis_matrix_eqtl_chr11_wrapper.sh
sh /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/featurecounts_60_Imput_cis_matrix_eqtl_chr11_wrapper.sh

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper.py
##################################################
#  cis_matrix_eqtl_wrapper.py
#  Wrapper for the cis_matrix_eqtl.R file and submitting the batch jobs
#
##################################################

import glob
from sys import argv
import os.path

in_path = argv[1]
in_suffix = argv[2]
method = argv[3]
mapping_dir = argv[4]
joblog_dir = argv[6]
incl_imput = argv[7]
analysis_dir = argv[8]

# Array of PEER factors that are given in the input
peer_factors = argv[9:len(argv)]

# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[2] = '_genes_certain_biotypes_TPM_GC.txt'
# argv[3] = 'featurecounts'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping'
# argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/'
# argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/'
# argv[7] = '0' or '1'
# argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis'
# argv[9:len(argv)] = ['10','20','30','40','50']

# Make the job log directories
if not os.path.exists(joblog_dir):
    os.makedirs(joblog_dir)
if not os.path.exists(joblog_dir + 'err'):
    os.makedirs(joblog_dir + 'err')
if not os.path.exists(joblog_dir + 'out'):
    os.makedirs(joblog_dir + 'out')

if incl_imput == "0":
    imput = "Noimput"
else:
    imput = "Imput"

for peer_factor in peer_factors:
    print(peer_factor)

    in_path = argv[1] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' + '*' + '_' + method + in_suffix
    out_path = argv[5] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/'

    if not os.path.exists(out_path):
        os.makedirs(out_path)

    matrices = glob.glob(in_path + '*')
    print("Number of matrices: " + str(len(matrices)))

    master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_" + peer_factor + '_' + imput + "_cis_matrix_eqtl_chr11_wrapper.sh"
    master_handle=open(master_script, 'w')
    master_handle.write("#!/bin/bash\n\n")

    # MatrixEQTL parameters
    incl_trans = '0'    # Don't include trans
    chr_number = '11'   # chromosome number
    cis_dist = '1e5'    # cis window

    k = 0
    for i, matrix in enumerate(matrices):
        filename = matrix.split('/')[-1]
        tissue_name = str.split(filename, '_'+method)[0]
        out_file = out_path + filename.replace('.txt','_MatrixEQTL')
        if os.path.isfile(out_file):
            continue
        k+=1
        genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
        sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_' + peer_factor + '_MatrixEQTL_chr' + chr_number + '_' + tissue_name + '_' + imput + ".slurm"
        outfilename = method+"_"+tissue_name+"_"+peer_factor
        sbatchhandle=open(sbatchfile, 'w')
        cmd=r"""#!/bin/bash
#SBATCH -J meqtl_%s_%s      # job name
#SBATCH --mem=6000           # 6 GB requested
#SBATCH -t 04:00:00           
#SBATCH -e %s           # err output directory
#SBATCH -o %s           # out output directory

/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \
%s %s %s %s %s %s %s %s %s %s

"""%(tissue_name, peer_factor, joblog_dir+'err/'+outfilename+'_'+imput+'.err', joblog_dir+'out/'+outfilename+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir)
        sbatchhandle.write(cmd)
        sbatchhandle.close()
        master_handle.write("sbatch " + sbatchfile  + " \n")

    master_handle.close()

    print 'sh %s'%(master_script)
    print 'Number of jobs:', k

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R
####################################################
# cis_matrix_eqtl.R
# Performs univariate cis-eQTL analysis using Matrix eQTL
#
####################################################

#!/usr/bin/Rscript

# Collect arguments
args <-commandArgs(TRUE)

## Default setting when no arguments passed
if (length(args) != 10 ){
  args <- c("--help")
}
## Help section
if("--help" %in% args) {
  cat("
      cis_peer_correction.R
 
      Usage:
      ./cis_matrix_eqtl.R /path/to/expression_matrix.txt include_trans_binary \
      include_imputed_binary chromosome_number cis_disance_def peer_factor  \
      /path/to/mapping/directory /path/to/output/files method /path/to/analysis/dir\n")
  q(save="no")
}

expression_file_location = args[1]
incl_trans = args[2]
incl_imput = args[3]
chr_number = args[4]
cis_dist = as.numeric(args[5])
peer_factor = args[6]

# Example
# args[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/featurecounts/TPM_GC_3GenPC_45PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC.txt'
# args[2] = '0'
# args[3] = '0'
# args[4] = '11'
# args[5] = '1e5'
# args[6] = '10'

imput = 'Imput'
if (incl_imput == '0') {imput = 'Noimput'}
SNP_file_name = paste(args[7], '/genotype/genotypesChr', chr_number, '_', imput, '.txt', sep="")
covariates_file_name = character()
SNP_position_file = paste(args[7], '/references/SNP_positions/SNP_positions_Chr', chr_number, '.txt', sep="")
gene_position_file = paste(args[7], '/references/gene_positions/gene_positions_Chr', chr_number, '.txt', sep="")

output_file_name_cis = paste(args[8], '_Chr', chr_number, '_', imput, '_cis.txt', sep="")
output_file_name_trans = paste(args[8], '_Chr', chr_number, '_', imput, '_trans.txt', sep="")

method = paste(args[9])
expression_file_name = strsplit(expression_file_location, split="/")[[1]][length(strsplit(expression_file_location, split="/")[[1]])]
tissue_name = strsplit(expression_file_name, split = paste('_', method, sep=""))[[1]][1] 
analysis_dir = paste(args[10], "/", tissue_name, sep="")

# Example
# args[7] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping'
# args[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/
# ... TPM_GC_3GenPC_30PEER1000/Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL'
# args[9] = 'featurecounts'
# args[10] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis'

# Need to check that the subjects read in from expression matrices match the subjects in genotype data
library(dplyr)

expression_matrix = read.csv(file = expression_file_location, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
genotype_matrix = read.csv(file = SNP_file_name, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
row.names(genotype_matrix) = genotype_matrix$SNP

genotype_matrix = genotype_matrix[,sapply(colnames(expression_matrix), function(x) {match(x, colnames(genotype_matrix))})]

# Also obtain genes and snps that are in the expression and genotype files, respectively.
gene_positions = read.table(file = gene_position_file, header = FALSE, sep = '\t', stringsAsFactors = FALSE)
snp_positions = read.table(file = SNP_position_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE)

colnames(gene_positions) = c("gene_ID", "chr_num", "start", "end")
gene_positions = mutate(gene_positions, chr = paste("chr", chr_num, sep = ""))
gene_positions = select(gene_positions, c(gene_ID, chr, start, end))
# Make sure the list of genes and their ordering are the same
expression_matrix = expression_matrix[rownames(expression_matrix) %in% gene_positions$gene_ID,]
gene_positions = gene_positions[sapply(rownames(expression_matrix), function(x) {match(x, gene_positions$gene_ID)}),]

# Import MatrixEQTL
library(MatrixEQTL)

# Use linear model with p threshold of 0.01
useModel = modelLINEAR;
pvOutputThreshold_cis = 1e-2
pvOutputThreshold_trans = 0
if (incl_trans == '1') {pvOutputThreshold_trans = 1e-2}

snps = SlicedData$new()
snps$CreateFromMatrix(as.matrix(genotype_matrix));

gene = SlicedData$new()
gene$CreateFromMatrix(as.matrix(expression_matrix));

gene_qnorm = gene$copy();

# Quantile-normalize the expression values - ties broken by averaging
for( sl in 1:length(gene_qnorm) ) {
  mat = gene_qnorm[[sl]];
  mat = t(apply(mat, 1, rank, ties.method = "average"));
  mat = qnorm(mat / (ncol(gene_qnorm)+1));
  gene_qnorm[[sl]] = mat;
}
rm(sl, mat);

# me = Matrix_eQTL_main(
#     snps = snps,
#     gene = gene,
#     output_file_name = output_file_trans,
#     pvOutputThreshold = pvOutputThreshold,
#     useModel = useModel, 
#     verbose = TRUE,
#     pvalue.hist = "qqplot",
#     min.pv.by.genesnp = FALSE,
#     noFDRsaveMemory = FALSE);

# Only for generating the P-value hist:
me = Matrix_eQTL_main(
    snps = snps,
    gene = gene_qnorm,
    output_file_name = output_file_name_trans,
    pvOutputThreshold = pvOutputThreshold_trans,
    useModel = useModel, 
    verbose = TRUE,
    output_file_name.cis = output_file_name_cis,
    pvOutputThreshold.cis = pvOutputThreshold_cis,
    snpspos = snp_positions, 
    genepos = gene_positions,
    cisDist = cis_dist,
    pvalue.hist = 100,
    min.pv.by.genesnp = FALSE,
    noFDRsaveMemory = FALSE)

# Save the histogram
output_file_name_hist = paste(analysis_dir, '/', tissue_name, '_', imput, '_Chr', chr_number, '_', peer_factor, '_hist.pdf', sep="")
print(paste("Saving to:", output_file_name_hist))
pdf(output_file_name_hist)
plot(me)
dev.off()

# repeat for the qqplot:
me_q = Matrix_eQTL_main(
  snps = snps,
  gene = gene_qnorm,
  output_file_name = output_file_name_trans,
  pvOutputThreshold = pvOutputThreshold_trans,
  useModel = useModel, 
  verbose = TRUE,
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snp_positions, 
  genepos = gene_positions,
  cisDist = cis_dist,
  pvalue.hist = "qqplot",
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE)

# Save the qqplot
output_file_name_qqplot = paste(analysis_dir, '/', tissue_name, '_', imput, '_Chr', chr_number, '_', peer_factor, '_qqplot.pdf', sep="")
print(paste("Saving to:", output_file_name_hist))
pdf(output_file_name_qqplot)
plot(me_q)
dev.off()

# Save the Matrix EQTL object as RData
save(gene_qnorm, me, snps, file=paste(args[8], '_Chr', chr_number, '_', imput, '_me.RData', sep=""))

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 0 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis 10 15 20 25 30 35 40 45 50 55 60


%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_rerun_wrapper.py
##################################################
#  cis_matrix_eqtl_rerun_wrapper.py
#  provide jobs that have failed with more time and memory
#
##################################################

import glob
from sys import argv
import os.path

in_path = argv[1]
in_suffix = argv[2]
method = argv[3]
mapping_dir = argv[4]
joblog_dir = argv[6]
incl_imput = argv[7]
analysis_dir = argv[8]

# Array of PEER factors that are given in the input
peer_factors = argv[9:len(argv)]

# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[2] = '_genes_certain_biotypes_TPM_GC.txt'
# argv[3] = 'featurecounts'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping'
# argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/'
# argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/'
# argv[7] = '0' or '1'
# argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis'
# argv[9:len(argv)] = ['10','20','30','40','50']

# Make tissue list:
tissue_list = []
# Use 'TPM_GC_3GenPC_10PEER1000' to build tissue list
in_path = in_path + method + '/TPM_GC_3GenPC_10PEER1000/'
matrices = glob.glob(in_path + '*' + in_suffix)
for i, matrix in enumerate(matrices):
    filename = matrix.split('/')[-1]
    tissue_name = str.split(filename, '_'+method)[0]
    tissue_list.append(tissue_name)

if incl_imput == "0":
    imput = "Noimput"
else:
    imput = "Imput"

master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + '_' + imput + "_cis_matrix_eqtl_chr11_rerun_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")

k = 0
for peer_factor in peer_factors:
    print(peer_factor)
    out_path = argv[5] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/'

    # MatrixEQTL parameters
    incl_trans = '0'    # Don't include trans
    chr_number = '11'   # chromosome number
    cis_dist = '1e5'    # cis window

    for tissue in tissue_list:
        out_file = out_path + tissue + "_" + method + in_suffix.replace('.txt','_MatrixEQTL')
        check_file = out_path + tissue + "_" + method + in_suffix.replace('.txt','_MatrixEQTL_Chr') + chr_number + '_' + imput + '_me.RData'
        if os.path.isfile(check_file):
            continue
        k+=1
        print "Missing: " + check_file
        genes_or_transcripts = ['genes' if 'genes' in matrix else 'transcripts'][0]
        sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_' + peer_factor + '_MatrixEQTL_chr' + chr_number + '_' + tissue + "_" + imput + ".slurm"
        outfilename = method+"_"+tissue+"_"+peer_factor
        sbatchhandle=open(sbatchfile, 'w')
        cmd=r"""#!/bin/bash
#SBATCH -J meqtl_%s_%s      # job name
#SBATCH --mem=12000           # 12 GB requested
#SBATCH -t 08:00:00           # to be placed in the test queue
#SBATCH -e %s           # err output directory
#SBATCH -o %s           # out output directory

/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \
%s %s %s %s %s %s %s %s %s %s

"""%(tissue, peer_factor, joblog_dir+'err/'+outfilename+'_'+imput+'.err', joblog_dir+'out/'+outfilename+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir)
        sbatchhandle.write(cmd)
        sbatchhandle.close()
        master_handle.write("sbatch " + sbatchfile  + " \n")

master_handle.close()

print 'sh %s'%(master_script)
print 'Number of jobs:', k

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py

##################################################
#  create_dirs.py
#  create necessary directories
#
##################################################

import glob
from sys import argv
import os.path

in_path = argv[1]
in_suffix = argv[2]
method = argv[3]
out_path = argv[4] + method + '/'

in_path = in_path + method + '/TPM_GC_3GenPC_10PEER1000/' + '*' + '_' + method + in_suffix

# in_path = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/expression_matrices/'
# in_suffix = '_genes_certain_biotypes_TPM.txt'
# method = 'featurecounts'
# out_path = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/"

directory = out_path + 'analysis'
if not os.path.exists(directory):
    os.makedirs(directory)

matrices = glob.glob(in_path + '*')
print(len(matrices))

for i, matrix in enumerate(matrices):
    filename = matrix.split('/')[-1]
    tissue_name = str.split(filename, '_'+method)[0]
    
    if not os.path.exists(directory + '/' + tissue_name):
        os.makedirs(directory + '/' + tissue_name)
    if not os.path.exists(directory + '/' + tissue_name + '/Examples/'):
        os.makedirs(directory + '/' + tissue_name + '/Examples')
        

%%bash 

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/

# In R:

install.packages("ggplot2")

%%bash

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_post_wrapper.py \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/ featurecounts 1e5 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt 100

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_post_wrapper.py
##################################################
#  cis_matrix_eqtl_post_wrapper.py
#  
#
##################################################

import glob
from sys import argv
import os.path

in_path = argv[1]
method = argv[2]
cis_cutoff = argv[3]
gene_name_file = argv[4]
snp_positions_file = argv[5]
tissue_list_file = argv[6]
num_indiv_cutoff = int(argv[7])

# Examples
# argv[1] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/'
# argv[2] = 'featurecounts'
# argv[3] = '1e5'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt'
# argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt'
# argv[7] = '100'

master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + "_cis_matrix_eqtl_post_wrapper.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")

# Make tissue list:
f = open(tissue_list_file)

for line in f.readlines():
    entry = str.split(line.strip(),'\t')
    # If the number of individuals is greater than the cutoff:
    if int(entry[1]) >= num_indiv_cutoff:
        tissue = entry[0]
        sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/" + method + '_' + tissue + "_cis_matrix_eqtl_post.slurm"
        sbatchhandle=open(sbatchfile, 'w')
        cmd=r"""#!/bin/bash
#SBATCH -J %s_Matrix_post      # job name
#SBATCH --mem=4000           # 4 GB requested
#SBATCH -e /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/err/Matrix_%s_post.err           # err output directory
#SBATCH -o /tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/out/Matrix_%s_post.out           # out output directory
#SBATCH -t 5:00:00           

/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL.R \
%s %s %s %s \
%s \
%s \
10 15 20 25 30 35 40 45 50 55 60
"""%(tissue, tissue, tissue, in_path, method, cis_cutoff, tissue, gene_name_file, snp_positions_file)
        sbatchhandle.write(cmd)
        sbatchhandle.close()
        master_handle.write("sbatch " + sbatchfile  + " \n")

master_handle.close()
print 'sh %s'%(master_script)

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL.R 
####################################################
# post_analyses_MatrixEQTL.R 
# Iterates through K PEER factors for every tissue and produces interesting graphs.
#
####################################################

#!/usr/bin/Rscript

# Collect arguments
args <-commandArgs(TRUE)

library(dplyr)
library(MatrixEQTL)
library(ggplot2)

# Load in data
# load("Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL_Chr11_Imput_me.RData")
# file_name = Adipose-Subcutaneous_featurecounts_genes_certain_biotypes_TPM_GC_MatrixEQTL_Chr11_Imput_me.RData

in_dir = args[1]
method = args[2]
cis_dist = as.numeric(args[3])
in_dir = paste(in_dir, method, "/", sep='')
tissue = args[4]
gene_name_file = args[5]
snp_position_file = args[6]
peer_factors = args[7:length(args)]

# args[1] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/'
# args[2] = 'featurecounts'
# args[3] = '1e5'
# args[4] = 'WholeBlood'
# args[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/gene_names_positions.txt'
# args[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/SNP_positions/SNP_positions_Chr11.txt'
# args[7] = c(10,15,20,25,30,35,40,45,50,55,60)

# Save the numer of significant pairs and unique genes detected at each level.
Counts_by_tissue = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE)
#Genes_snps_by_tissue = data.frame(tissue = character(0), imputed = character(0), gene_list = character(0), snp_list = character(0))

gene_names = read.csv(gene_name_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
row.names(gene_names) = gene_names$gene_id
print(dim(gene_names))
snp_positions = read.csv(snp_position_file, header = TRUE, sep = '\t', stringsAsFactors = FALSE)
row.names(snp_positions) = snp_positions$rsID
print(dim(snp_positions))
print(tissue)

for (peer in peer_factors) {
    print(peer)

    # First iterate through nonimputed:
    result_path = paste(in_dir, "TPM_GC_3GenPC_", peer, "PEER1000/", sep='')
    analysis_path = paste(in_dir, "analysis/", tissue, '/', sep='')
    # Match a regular expression pattern:
    result_file <- dir(result_path, pattern = paste(tissue,".*\\Noimput_me.RData",sep=''))

    # The result file of MatrixEQTL exists but the summary file (output of this) doesn't
    if (length(result_file) > 0) {

    load(paste(result_path, result_file[1], sep=''))

    # Plot top 5 eQTL - unique gene pairs, if they exist.
    temp = data.frame(me$cis$eqtls)
    temp$snps = as.character(temp$snps)
    temp$gene = as.character(temp$gene)
    
    top_genes = head(unique(temp$gene), n=5)
    # Only proceed if there are at least 5 genes.
    if (length(top_genes) == 5) {

        temp$TSS = gene_names[temp$gene,][,"start"]
        temp$TES = gene_names[temp$gene,][,"end"]
        temp$gene_name = gene_names[temp$gene,][,"gene_name"]
        temp$SNP_pos = snp_positions[temp$snps,][,"pos"]
        # Difference mapping: -1e5 to 0 if before TSS
        # scale from 1 to 1e5 if in gene
        # 1e5 to 2e5 if after TES
        temp$diff = sapply(c(1:dim(temp)[1]), function(x) {
            if ((temp$SNP_pos[x] - temp$TSS[x])<0) {
                temp$SNP_pos[x] - temp$TSS[x]
            } else if ((temp$SNP_pos[x] - temp$TES[x])>0) {
                temp$SNP_pos[x] - temp$TES[x] + cis_dist
            } else {
                cis_dist * (temp$SNP_pos[x] - temp$TSS[x]) / (temp$TES[x] - temp$TSS[x])
            }})

        tryCatch({
            g = ggplot(temp[temp$FDR < 0.01,], aes(x = diff)) + geom_histogram(binwidth = 10000) + ggtitle('Distribution of distances to TSS') + xlab('Distance') + scale_x_discrete(breaks=c(0, 1e5), labels=c("TSS", "TES"))
            print(paste("Trying to save: ", analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep=''))
            ggsave(filename = paste(analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep=''), plot=g, width = 7, height = 7)
        }, error = function(e) {
            print(paste("Could not save: ", analysis_path, tissue, "_Noimput_", peer, "_Distance_distribution.pdf", sep=''))
        })

        for (j in 1:5) {
            index = which(temp$gene == top_genes[j])[1]
            test_snp = snps$FindRow(temp[index,"snps"])
            test_gene_q = gene_qnorm$FindRow(temp[index,"gene"])

            test.df = data.frame(snp = as.numeric(test_snp$row), gene = as.numeric(test_gene_q$row))
            row.names(test.df) = colnames(snps)
            test.df = test.df[!is.na(test.df$snp),]
            test.df$snp_chr = as.character(test.df$snp)
            test_name = as.character(temp[index,"gene_name"])
            test_rsID = as.character(temp[index,"snps"])

            tryCatch({
                g = ggplot(test.df, aes(x = snp_chr, y = gene, color=snp_chr)) + geom_violin() + geom_jitter() + ggtitle(paste(test_name, test_rsID, sep='_')) + xlab(test_rsID) + ylab(test_name)
                print(paste("Trying to save: ", analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''))
                ggsave(filename = paste(analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''), plot = g, width = 7, height = 7)
            }, error = function(e) {
                print(paste("Could not save: ", analysis_path, "Examples/", tissue, "_Noimput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''))
            })
        }
    }
    
    # Take all
    n_pairs = dim(me$cis$eqtls)[1]
    n_genes = length(unique(me$cis$eqtls$gene))
    Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "pval", imputed = "Noimput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE))

    # Take FDR-controlled at 5%
    n_pairs = dim(filter(me$cis$eqtls, FDR <= 0.01))[1]
    n_genes = length(unique(filter(me$cis$eqtls, FDR <= 0.01)$gene))
    Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "FDR", imputed = "Noimput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE))
}
}

if (dim(Counts_by_tissue)[1] != 0) {
    print(paste("Save: ", analysis_path, tissue, "_Summary_Noimput.RData", sep=""))
    save(Counts_by_tissue, file=paste(analysis_path, tissue, "_Summary_Noimput.RData", sep=""))
}

Counts_by_tissue = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE)

for (peer in peer_factors) {
    print(peer)

    # Now iterate through imputed:
    # First iterate through nonimputed:
    result_path = paste(in_dir, "TPM_GC_3GenPC_", peer, "PEER1000/", sep='')
    analysis_path = paste(in_dir, "analysis/", tissue, '/', sep='')
    # Match a regular expression pattern:
    result_file <- dir(result_path, pattern = paste(tissue,".*\\Imput_me.RData",sep=''))

    if (length(result_file) > 0) {

    load(paste(result_path, result_file[1], sep=''))

    temp = data.frame(me$cis$eqtls)
    temp$snps = as.character(temp$snps)
    temp$gene = as.character(temp$gene)
    
    top_genes = head(unique(temp$gene), n=5)
    # Only proceed if there is at least 5 genes.
    if (length(top_genes) == 5) {

        temp$TSS = gene_names[temp$gene,][,"start"]
        temp$TES = gene_names[temp$gene,][,"end"]
        temp$gene_name = gene_names[temp$gene,][,"gene_name"]
        temp$SNP_pos = snp_positions[temp$snps,][,"pos"]
        # Difference mapping: -1e5 to 0 if before TSS
        # scale from 1 to 1e5 if in gene
        # 1e5 to 2e5 if after TES
        temp$diff = sapply(c(1:dim(temp)[1]), function(x) {
            if ((temp$SNP_pos[x] - temp$TSS[x])<0) {
                temp$SNP_pos[x] - temp$TSS[x]
            } else if ((temp$SNP_pos[x] - temp$TES[x])>0) {
                temp$SNP_pos[x] - temp$TES[x] + cis_dist
            } else {
                cis_dist * (temp$SNP_pos[x] - temp$TSS[x]) / (temp$TES[x] - temp$TSS[x])
            }})

        tryCatch({
            g = ggplot(temp[temp$FDR < 0.01,], aes(x = diff)) + geom_histogram(binwidth = 10000) + ggtitle('Distribution of distances to TSS') + xlab('Distance') + scale_x_discrete(breaks=c(0, 1e5), labels=c("TSS", "TES"))
            print(paste("Trying to save: ", analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep=''))
            ggsave(filename = paste(analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep=''), plot=g, width = 7, height = 7)
        }, error = function(e) {
            print(paste("Could not save: ", analysis_path, tissue, "_Imput_", peer, "_Distance_distribution.pdf", sep=''))
       })

        for (j in 1:5) {
            index = which(temp$gene == top_genes[j])[1]
            test_snp = snps$FindRow(temp[index,"snps"])
            test_gene_q = gene_qnorm$FindRow(temp[index,"gene"])

            test.df = data.frame(snp = as.numeric(test_snp$row), gene = as.numeric(test_gene_q$row))
            row.names(test.df) = colnames(snps)
            test.df = test.df[!is.na(test.df$snp),]
            test_name = as.character(temp[index,"gene_name"])
            test_rsID = as.character(temp[index,"snps"])

            tryCatch({
                g = ggplot(test.df, aes(x = snp, y = gene, color=snp)) + geom_jitter() + ggtitle(paste(test_name, test_rsID, sep='_')) + xlab(test_rsID) + ylab(test_name)
                print(paste("Trying to save: ", analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''))
                ggsave(filename = paste(analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''), plot = g, width = 7, height = 7)
            }, error = function(e) {
                print(paste("Could not save: ", analysis_path, "Examples/", tissue, "_Imput_", peer, "_", test_name, "_", test_rsID, ".pdf", sep=''))
           })
        }
    }

    # Take all
    n_pairs = dim(me$cis$eqtls)[1]
    n_genes = length(unique(me$cis$eqtls$gene))
    Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "pval", imputed = "Imput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE))

    # Take FDR-controlled at 1%
    n_pairs = dim(filter(me$cis$eqtls, FDR <= 0.01))[1]
    n_genes = length(unique(filter(me$cis$eqtls, FDR <= 0.01)$gene))
    Counts_by_tissue = rbind(Counts_by_tissue, data.frame(tissue = tissue, control = "FDR", imputed = "Imput", peer_K = peer, pairs = n_pairs, genes = n_genes, stringsAsFactors = FALSE))

}
}

if (dim(Counts_by_tissue)[1] != 0) {
    print(paste("Save: ", analysis_path, tissue, "_Summary_Imput.RData", sep=""))
    save(Counts_by_tissue, file=paste(analysis_path, tissue, "_Summary_Imput.RData", sep=""))
}

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/post_analyses_MatrixEQTL_aggregate.R 

####################################################
# post_analyses_MatrixEQTL_aggregate.R 
# Aggregates all results and optimizes for K PEER factors in a tissue-specific manner.
#
####################################################

#!/usr/bin/Rscript

library(ggplot2)
library(dplyr)

analysis_path = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/chr11_test/featurecounts/analysis/'
tissue_list_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt'
num_indiv_cutoff = 100

tissue_list = read.table(tissue_list_file, header=FALSE, sep="\t", stringsAsFactors=FALSE)
colnames(tissue_list) = c("Tissue", "Count")
rownames(tissue_list) = tissue_list["Tissue"][[1]]
tissue_list = tissue_list[tissue_list["Count"]>=num_indiv_cutoff,]

Counts_by_tissue_aggregate = data.frame(tissue = character(0), control = character(0), imputed = character(0), peer_K = numeric(0), pairs = numeric(0), genes = numeric(0), stringsAsFactors = FALSE)

for (i in c(1:dim(tissue_list)[1])) {
    tissue = tissue_list["Tissue"][[1]][i]
    load(paste(analysis_path, tissue, "/", tissue, "_Summary_Imput.RData", sep=""))
    Counts_by_tissue_aggregate = rbind(Counts_by_tissue_aggregate, Counts_by_tissue)
    load(paste(analysis_path, tissue, "/", tissue, "_Summary_Noimput.RData", sep=""))
    Counts_by_tissue_aggregate = rbind(Counts_by_tissue_aggregate, Counts_by_tissue)
}

for (i in c(1:dim(tissue_list)[1])) {
    tissue = tissue_list["Tissue"][[1]][i]
    print(tissue)
    print(sum(Counts_by_tissue_aggregate["tissue"] == tissue))
}

# We have the data frame with all counts.
save(Counts_by_tissue_aggregate, file=paste(analysis_path, "All_Counts.RData", sep=""))

# Save the plots of number for all tissues:
g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "pval"), aes(x = peer_K, y = pairs, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none")
ggsave(filename = paste(analysis_path, "All_tissues_pval_pairs.pdf", sep=''), plot = g, width = 10, height = 10)

g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "FDR"), aes(x = peer_K, y = pairs, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none")
ggsave(filename = paste(analysis_path, "All_tissues_FDR_pairs.pdf", sep=''), plot = g, width = 10, height = 10)

g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "pval"), aes(x = peer_K, y = genes, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none")
ggsave(filename = paste(analysis_path, "All_tissues_pval_genes.pdf", sep=''), plot = g, width = 10, height = 10)

g = ggplot(data = filter(Counts_by_tissue_aggregate, control == "FDR"), aes(x = peer_K, y = genes, color = tissue)) + geom_line(aes(group = tissue)) + facet_grid(. ~ imputed) + theme(legend.position = "none")
ggsave(filename = paste(analysis_path, "All_tissues_FDR_genes.pdf", sep=''), plot = g, width = 10, height = 10)


# Optimize on the number of K PEER factors to use for each tissue:
# Criteria 1: when it decreases
peer_K_to_iterate = c(15, 20, 25, 30, 35, 40, 45, 50, 55, 60)
tissue_list["optimal_dec"] = 10

for (i in c(1:dim(tissue_list)[1])) {
    tissue_name = tissue_list["Tissue"][[1]][i]
    current_opt = 10
    current_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == 10)["pairs"][[1]][1]
    for (j in peer_K_to_iterate) {
        next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1]
        if (next_val <= current_val) {
            tissue_list[tissue_name, "optimal_dec"] = current_opt
            break
        }
        current_opt = j
        current_val = next_val
    }
    if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) {
        tissue_list[tissue_name, "optimal_dec"] = current_opt
    }
}

# Another Criteria: when it reaches 5% of maximal
# Tunable alpha, when it reaches N% of maximal
alpha = 0.05
tissue_list["optimal_95"] = 10

for (i in c(1:dim(tissue_list)[1])) {
    tissue_name = tissue_list["Tissue"][[1]][i]
    current_opt = 10
    thresh = max(filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput")["pairs"]) * (1-alpha)
        for (j in peer_K_to_iterate) {
        next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1]
        if (next_val >= thresh) {
            tissue_list[tissue_name, "optimal_95"] = current_opt
            break
        }
        current_opt = j
    }
    if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) {
        tissue_list[tissue_name, "optimal_95"] = current_opt
    }
}

# Another Criteria: when it reaches 1% of maximal
# Tunable alpha, when it reaches N% of maximal
alpha = 0.01
tissue_list["optimal_99"] = 10

for (i in c(1:dim(tissue_list)[1])) {
    tissue_name = tissue_list["Tissue"][[1]][i]
    current_opt = 10
    thresh = max(filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput")["pairs"]) * (1-alpha)
        for (j in peer_K_to_iterate) {
        next_val = filter(Counts_by_tissue_aggregate, tissue==tissue_name, control=="FDR", imputed=="Imput", peer_K == j)["pairs"][[1]][1]
        if (next_val >= thresh) {
            tissue_list[tissue_name, "optimal_99"] = current_opt
            break
        }
        current_opt = j
    }
    if (current_opt == peer_K_to_iterate[length(peer_K_to_iterate)]) {
        tissue_list[tissue_name, "optimal_99"] = current_opt
    }
}

# Save the optimal K for future use:

write.table(tissue_list, file='/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt', row.names=FALSE, quote=FALSE, sep="\t")

%%bash

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper_all.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/ \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/ 1 \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/featurecounts/analysis \
/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt

python /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/create_dirs.py \
/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/ _genes_certain_biotypes_counts_normalized.txt \
featurecounts /tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/

%%writefile /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl_wrapper_all.py
##################################################
#  cis_matrix_eqtl_wrapper_all.py
#  Wrapper for the cis_matrix_eqtl.R file and submitting the batch jobs
#
##################################################

import glob
from sys import argv
import os.path

in_path = argv[1]
in_suffix = argv[2]
method = argv[3]
mapping_dir = argv[4]
joblog_dir = argv[6]
incl_imput = argv[7]
analysis_dir = argv[8]
peer_factors_file = argv[9]

# Examples
# argv[1] = '/tigress/BEE/gtex/data/phenotype/expression/expression_matrices/normalized/hg19/'
# argv[2] = '_genes_certain_biotypes_counts_normalized.txt'
# argv[3] = 'featurecounts'
# argv[4] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping'
# argv[5] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/'
# argv[6] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/joblogs/MatrixEQTL/'
# argv[7] = '0' or '1'
# argv[8] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/MatrixEQTL/featurecounts/analysis'
# argv[9] = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_optimal_PEER_k.txt'

# Make the job log directories
if not os.path.exists(joblog_dir):
    os.makedirs(joblog_dir)
if not os.path.exists(joblog_dir + 'err'):
    os.makedirs(joblog_dir + 'err')
if not os.path.exists(joblog_dir + 'out'):
    os.makedirs(joblog_dir + 'out')
if not os.path.exists(analysis_dir):
    os.makedirs(analysis_dir)

if incl_imput == "0":
    imput = "Noimput"
else:
    imput = "Imput"

# Create the array of jobs to run:
tissue_list = []
peer_factors = []
f = open(peer_factors_file)
# First line is a header:
f.readline()
for line in f.readlines():
    entry = str.split(line.strip(),'\t')
    tissue_list.append(entry[0])
    # We use the optimal_99 critera:
    peer_factors.append(entry[-1])

out_path = argv[5] + method + '/'

if not os.path.exists(out_path):
    os.makedirs(out_path)

master_script = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/" + method + '_' + imput + "_cis_matrix_eqtl_wrapper_all.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")

# MatrixEQTL parameters
incl_trans = '0'    # Don't include trans
cis_dist = '1e5'    # cis window

k = 0
for i in range(len(tissue_list)):
    tissue_name = tissue_list[i]
    peer_factor = peer_factors[i]
    filename = tissue_name + '_' + method + in_suffix
    matrix = argv[1] + method + '/TPM_GC_3GenPC_' + peer_factor + 'PEER1000/' + filename
    out_file = out_path + filename.replace('.txt','_MatrixEQTL')
    # Iterate through 22 chromosomes
    for j in range(22):
        chr_number = str(j+1)
        sbatchfile = "/tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/batch/MatrixEQTL/" + method + '_TPM_GC_3GenPC_MatrixEQTL_chr' + chr_number + '_' + tissue_name + '_' + imput + ".slurm"
        job_outfile = method+"_"+tissue_name+"_cis_all"
        sbatchhandle=open(sbatchfile, 'w')
        cmd=r"""#!/bin/bash
#SBATCH -J meqtl_%s     # job name
#SBATCH --mem=8000           # 8 GB requested
#SBATCH -t 06:00:00           # to be placed in the test queue
#SBATCH -e %s           # err output directory
#SBATCH -o %s           # out output directory

/usr/bin/Rscript /tigress/BEE/gtex/analyses/user_specific/cis-mapping/scripts/cis_matrix_eqtl.R \
%s %s %s %s %s %s %s %s %s %s
"""%(tissue_name, joblog_dir+'err/'+job_outfile+'_'+imput+'.err', joblog_dir+'out/'+job_outfile+'_'+imput+'.out', matrix, incl_trans, incl_imput, chr_number, cis_dist, peer_factor, mapping_dir, out_file, method, analysis_dir)
        sbatchhandle.write(cmd)
        sbatchhandle.close()
        master_handle.write("sbatch " + sbatchfile  + " \n")
        k += 1

master_handle.close()

print 'sh %s'%(master_script)
print 'Number of jobs:', k


%%bash

python /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control_wrapper.py

%%writefile /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control_wrapper.py
####################################################
# MatrixEQTL_cis_FDR_control_wrapper.py
# 
#
####################################################
tissue_list_file = '/tigress/BEE/gtex/analyses/user_specific/cis-mapping/references/tissues_by_number_of_samples.txt'
f = open(tissue_list_file)
tissue_list = []
for line in f.readlines():
    tissue_list.append(str.split(line.strip(), '\t')[0])
f.close()

original_dir = "/tigress/bj5/MatrixEQTL/cis/featurecounts/"
write_dir = "/tigress/bj5/cis_meta/MatrixEQTL/"

master_script = "/tigress/bj5/cis_meta/scripts/MatrixEQTL_FDR_control.sh"
master_handle=open(master_script, 'w')
master_handle.write("#!/bin/bash\n\n")

for item in tissue_list:
    sbatchfile = "/tigress/bj5/cis_meta/scripts/batch/MatrixEQTL_FDR_control_" + item + ".slurm"
    sbatchhandle=open(sbatchfile, 'w')
    cmd=r"""#!/bin/bash
#SBATCH -J Mat_%s_FDR     # job name
#SBATCH --mem=4000           # 8 GB requested
#SBATCH -t 01:00:00           # to be placed in the test queue
#SBATCH -e %s           # err output directory
#SBATCH -o %s           # out output directory

/usr/bin/Rscript /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control.R \
%s %s %s
"""%(item, write_dir + item + ".err", write_dir + item + ".out", original_dir, item, write_dir)
    sbatchhandle.write(cmd)
    sbatchhandle.close()
    master_handle.write("sbatch " + sbatchfile  + " \n")

master_handle.close()

print 'sh %s'%(master_script)

%%writefile /tigress/bj5/cis_meta/scripts/MatrixEQTL_cis_FDR_control.R
####################################################
# MatrixEQTL_cis_FDR_control.R
# 
#
####################################################

#!/usr/bin/Rscript

# Collect arguments
args <-commandArgs(TRUE)

# FDR Control of p-values from the null distribution
original_dir = '/tigress/bj5/MatrixEQTL/cis/featurecounts/'
tissue = 'Adipose-Subcutaneous'
method = 'featurecounts'
file_attr = '_genes_certain_biotypes_counts_normalized_MatrixEQTL_Chr'
imputed = 'Imput'
suffix = '_me.RData'
write_dir = '/tigress/bj5/cis_meta/MatrixEQTL/'

original_dir = args[1]
tissue = args[2]
write_dir = args[3]

snps_list = character()
genes_list = character()
pvals_list = numeric()
betas_list = numeric()
statistic_list = numeric()
total_tests = 0
total_called_eqtls = 0

for (i in c(1:22)) {
    print(i)
    filename = paste(original_dir, tissue, '_', method, file_attr, i, '_', imputed, suffix, sep='')
    load(filename)

    snps_list = append(snps_list, sapply(me$cis$eqtls$snps, as.character))
    genes_list = append(genes_list, sapply(me$cis$eqtls$gene, as.character))
    pvals_list = append(pvals_list, me$cis$eqtls$pvalue)
    betas_list = append(betas_list, me$cis$eqtls$beta)
    statistic_list = append(statistic_list, me$cis$eqtls$statistic)
    total_tests = total_tests + me$cis$ntests
    total_called_eqtls = total_called_eqtls + me$cis$neqtls
}

ordering = order(pvals_list)
ordered_pvals_list = pvals_list[ordering]
ordered_snps_list = snps_list[ordering]
ordered_genes_list = genes_list[ordering]
ordered_betas_list = betas_list[ordering]
ordered_statistic_list = statistic_list[ordering]

# Perform Benjamini-Hochberg
alpha_hat = sapply(c(1:total_called_eqtls), function(x) {ordered_pvals_list[x] * total_tests / x})
alpha_hat[(alpha_hat >= 1.0)] = 1.0
cur_index = 1

alpha = numeric()
ptm <- proc.time()
while (cur_index <= length(alpha_hat)) {
    temp = alpha_hat[cur_index:length(alpha_hat)]
    min_index = which.min(temp)
    min_alpha = min(temp)
    alpha = append(alpha, rep(min_alpha, min_index))
    cur_index = cur_index + min_index
}
print("FDR calculation time: ")
proc.time() - ptm

num_eqtls = sum(alpha <= 0.05)
result = data.frame(snp = ordered_snps_list[1:num_eqtls], gene = ordered_genes_list[1:num_eqtls], pval = ordered_pvals_list[1:num_eqtls], beta = ordered_betas_list[1:num_eqtls], statistic = ordered_statistic_list[1:num_eqtls], FDR = alpha[1:num_eqtls])
write.table(result, file=paste(write_dir, tissue, "_FDR_5percent.txt", sep=""), quote = FALSE, sep = "\t", row.names = FALSE)

print(paste("Total number of tests:", total_tests))
print(paste("Total eQTLs called at 5% FDR:", num_eqtls))