conda install brockman-pipeline
conda install ruby amused bedtools ucsc-twobittofa samtools trimmomatic r-jackstraw r-ggplot2 r-tsne -y
install.packages("devtools")
library(devtools)
install_github("Carldeboer/BrockmanR")
library(BrockmanR)
library(irlba)
Loading required package: ggplot2 Loading required package: tsne Loading required package: Matrix
chrom_size = read.table('../../input/mm9/mm9.chrom.sizes')
head(chrom_size)
chrom_size$V3 <- rep(0,dim(chrom_size)[1])
head(chrom_size)
chromBED <- chrom_size[, c("V1", "V3", "V2")]
head(chromBED)
V1 | V2 |
---|---|
<fct> | <int> |
chr1 | 197195432 |
chr2 | 181748087 |
chr3 | 159599783 |
chr4 | 155630120 |
chr5 | 152537259 |
chr6 | 149517037 |
V1 | V2 | V3 |
---|---|---|
<fct> | <int> | <dbl> |
chr1 | 197195432 | 0 |
chr2 | 181748087 | 0 |
chr3 | 159599783 | 0 |
chr4 | 155630120 | 0 |
chr5 | 152537259 | 0 |
chr6 | 149517037 | 0 |
V1 | V3 | V2 |
---|---|---|
<fct> | <dbl> | <int> |
chr1 | 0 | 197195432 |
chr2 | 0 | 181748087 |
chr3 | 0 | 159599783 |
chr4 | 0 | 155630120 |
chr5 | 0 | 152537259 |
chr6 | 0 | 149517037 |
# write.table(file = 'chroms.bed', chromBED,col.names = FALSE,row.names = FALSE,quote = FALSE,sep="\t")
bamfile <- list.files(path = "../../input/sc-bams_nodup/",pattern = "\\.bam$")
length(bamfile)
path_to_bamfile <- paste0("../../../../input/sc-bams_nodup/",bamfile)
cellnames <- sapply(strsplit(bamfile,'.',fixed = TRUE), "[[", 2)
sample_list_bam <- cbind(cellnames,path_to_bamfile)
head(sample_list_bam)
# write.table(file = './Brockman-master/Example/sample_list_bam.txt', sample_list_bam,col.names = FALSE,row.names = FALSE,quote = FALSE,sep="\t")
cellnames | path_to_bamfile |
---|---|
AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT | ../../../../input/sc-bams_nodup/BoneMarrow_62016.AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT.header.bam |
AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC | ../../../../input/sc-bams_nodup/BoneMarrow_62016.AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC.header.bam |
AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC | ../../../../input/sc-bams_nodup/BoneMarrow_62016.AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC.header.bam |
AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC | ../../../../input/sc-bams_nodup/BoneMarrow_62016.AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC.header.bam |
AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC | ../../../../input/sc-bams_nodup/BoneMarrow_62016.AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC.header.bam |
AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT | ../../../../input/sc-bams_nodup/BoneMarrow_62016.AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT.header.bam |
elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){
set.seed(2019)
mat = data.matrix(mat)
SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)
options(repr.plot.width=width, repr.plot.height=height)
df_plot = data.frame(PC=1:num_pcs, SD=SVD$d);
# print(SVD$d[1:num_pcs])
p <- ggplot(df_plot, aes(x = PC, y = SD)) +
geom_point(col="#cd5c5c",size = 1) +
ggtitle(title)
return(p)
}
bsub < example_run_pipeine_LSF.sh
start_time <- Sys.time()
metadata <- read.table('../../input/metadata.tsv',
header = TRUE,
stringsAsFactors=FALSE,quote="",row.names=1)
sampleDesc <- metadata
allData = BrockmanR::inputKMerFreqs(
fileNames = sprintf("./Brockman-master/Example/kmer_frequencies/%s.freq.gz",
rownames(metadata)),
IDs = rownames(metadata))
Inputting i=1/12178 Inputting i=51/12178 Inputting i=101/12178 Inputting i=151/12178 Inputting i=201/12178 Inputting i=251/12178 Inputting i=301/12178 Inputting i=351/12178 Inputting i=401/12178 Inputting i=451/12178 Inputting i=501/12178 Inputting i=551/12178 Inputting i=601/12178 Inputting i=651/12178 Inputting i=701/12178 Inputting i=751/12178 Inputting i=801/12178 Inputting i=851/12178 Inputting i=901/12178 Inputting i=951/12178 Inputting i=1001/12178 Inputting i=1051/12178 Inputting i=1101/12178 Inputting i=1151/12178 Inputting i=1201/12178 Inputting i=1251/12178 Inputting i=1301/12178 Inputting i=1351/12178 Inputting i=1401/12178 Inputting i=1451/12178 Inputting i=1501/12178 Inputting i=1551/12178 Inputting i=1601/12178 Inputting i=1651/12178 Inputting i=1701/12178 Inputting i=1751/12178 Inputting i=1801/12178 Inputting i=1851/12178 Inputting i=1901/12178 Inputting i=1951/12178 Inputting i=2001/12178 Inputting i=2051/12178 Inputting i=2101/12178 Inputting i=2151/12178 Inputting i=2201/12178 Inputting i=2251/12178 Inputting i=2301/12178 Inputting i=2351/12178 Inputting i=2401/12178 Inputting i=2451/12178 Inputting i=2501/12178 Inputting i=2551/12178 Inputting i=2601/12178 Inputting i=2651/12178 Inputting i=2701/12178 Inputting i=2751/12178 Inputting i=2801/12178 Inputting i=2851/12178 Inputting i=2901/12178 Inputting i=2951/12178 Inputting i=3001/12178 Inputting i=3051/12178 Inputting i=3101/12178 Inputting i=3151/12178 Inputting i=3201/12178 Inputting i=3251/12178 Inputting i=3301/12178 Inputting i=3351/12178 Inputting i=3401/12178 Inputting i=3451/12178 Inputting i=3501/12178 Inputting i=3551/12178 Inputting i=3601/12178 Inputting i=3651/12178 Inputting i=3701/12178 Inputting i=3751/12178 Inputting i=3801/12178 Inputting i=3851/12178 Inputting i=3901/12178 Inputting i=3951/12178 Inputting i=4001/12178 Inputting i=4051/12178 Inputting i=4101/12178 Inputting i=4151/12178 Inputting i=4201/12178 Inputting i=4251/12178 Inputting i=4301/12178 Inputting i=4351/12178 Inputting i=4401/12178 Inputting i=4451/12178 Inputting i=4501/12178 Inputting i=4551/12178 Inputting i=4601/12178 Inputting i=4651/12178 Inputting i=4701/12178 Inputting i=4751/12178 Inputting i=4801/12178 Inputting i=4851/12178 Inputting i=4901/12178 Inputting i=4951/12178 Inputting i=5001/12178 Inputting i=5051/12178 Inputting i=5101/12178 Inputting i=5151/12178 Inputting i=5201/12178 Inputting i=5251/12178 Inputting i=5301/12178 Inputting i=5351/12178 Inputting i=5401/12178 Inputting i=5451/12178 Inputting i=5501/12178 Inputting i=5551/12178 Inputting i=5601/12178 Inputting i=5651/12178 Inputting i=5701/12178 Inputting i=5751/12178 Inputting i=5801/12178 Inputting i=5851/12178 Inputting i=5901/12178 Inputting i=5951/12178 Inputting i=6001/12178 Inputting i=6051/12178 Inputting i=6101/12178 Inputting i=6151/12178 Inputting i=6201/12178 Inputting i=6251/12178 Inputting i=6301/12178 Inputting i=6351/12178 Inputting i=6401/12178 Inputting i=6451/12178 Inputting i=6501/12178 Inputting i=6551/12178 Inputting i=6601/12178 Inputting i=6651/12178 Inputting i=6701/12178 Inputting i=6751/12178 Inputting i=6801/12178 Inputting i=6851/12178 Inputting i=6901/12178 Inputting i=6951/12178 Inputting i=7001/12178 Inputting i=7051/12178 Inputting i=7101/12178 Inputting i=7151/12178 Inputting i=7201/12178 Inputting i=7251/12178 Inputting i=7301/12178 Inputting i=7351/12178 Inputting i=7401/12178 Inputting i=7451/12178 Inputting i=7501/12178 Inputting i=7551/12178 Inputting i=7601/12178 Inputting i=7651/12178 Inputting i=7701/12178 Inputting i=7751/12178 Inputting i=7801/12178 Inputting i=7851/12178 Inputting i=7901/12178 Inputting i=7951/12178 Inputting i=8001/12178 Inputting i=8051/12178 Inputting i=8101/12178 Inputting i=8151/12178 Inputting i=8201/12178 Inputting i=8251/12178 Inputting i=8301/12178 Inputting i=8351/12178 Inputting i=8401/12178 Inputting i=8451/12178 Inputting i=8501/12178 Inputting i=8551/12178 Inputting i=8601/12178 Inputting i=8651/12178 Inputting i=8701/12178 Inputting i=8751/12178 Inputting i=8801/12178 Inputting i=8851/12178 Inputting i=8901/12178 Inputting i=8951/12178 Inputting i=9001/12178 Inputting i=9051/12178 Inputting i=9101/12178 Inputting i=9151/12178 Inputting i=9201/12178 Inputting i=9251/12178 Inputting i=9301/12178 Inputting i=9351/12178 Inputting i=9401/12178 Inputting i=9451/12178 Inputting i=9501/12178 Inputting i=9551/12178 Inputting i=9601/12178 Inputting i=9651/12178 Inputting i=9701/12178 Inputting i=9751/12178 Inputting i=9801/12178 Inputting i=9851/12178 Inputting i=9901/12178 Inputting i=9951/12178 Inputting i=10001/12178 Inputting i=10051/12178 Inputting i=10101/12178 Inputting i=10151/12178 Inputting i=10201/12178 Inputting i=10251/12178 Inputting i=10301/12178 Inputting i=10351/12178 Inputting i=10401/12178 Inputting i=10451/12178 Inputting i=10501/12178 Inputting i=10551/12178 Inputting i=10601/12178 Inputting i=10651/12178 Inputting i=10701/12178 Inputting i=10751/12178 Inputting i=10801/12178 Inputting i=10851/12178 Inputting i=10901/12178 Inputting i=10951/12178 Inputting i=11001/12178 Inputting i=11051/12178 Inputting i=11101/12178 Inputting i=11151/12178 Inputting i=11201/12178 Inputting i=11251/12178 Inputting i=11301/12178 Inputting i=11351/12178 Inputting i=11401/12178 Inputting i=11451/12178 Inputting i=11501/12178 Inputting i=11551/12178 Inputting i=11601/12178 Inputting i=11651/12178 Inputting i=11701/12178 Inputting i=11751/12178 Inputting i=11801/12178 Inputting i=11851/12178 Inputting i=11901/12178 Inputting i=11951/12178 Inputting i=12001/12178 Inputting i=12051/12178 Inputting i=12101/12178 Inputting i=12151/12178
allData[1:5,1:5]
TCCGCGAACTAACTAGGTTGCTACGGTCATAGAGGC | TCCGCGAAAGGTCAGCTTTGCGGATAGTGTACTGAC | ATTACTCGTTGCCGTAGGCTTAATCTTGTATAGCCT | TCCGCGAAACCAGGCGCAAAGCTAGGTTGTACTGAC | ATTCAGAATCGTAGCATCGCGCAATGACCCTATCCT | |
---|---|---|---|---|---|
TTTTAAAA | 0.0116696442 | 0.0163818000 | 0.0116088289 | 0.0141069454 | 0.0129692197 |
TTTGCAAA | 0.0044008404 | 0.0046254494 | 0.0034826487 | 0.0046684855 | 0.0026603528 |
TTTGAAAA | 0.0060326127 | 0.0063359021 | 0.0056109340 | 0.0056326293 | 0.0046556173 |
TTTCGAAA | 0.0002472382 | 0.0001927271 | 0.0006449349 | 0.0003044664 | 0.0003325441 |
TTTCCAAA | 0.0044255642 | 0.0044327224 | 0.0040630901 | 0.0040976109 | 0.0035748490 |
p_elbow_BROCKMAN <- elbow_plot(allData,num_pcs = 30, title = 'PCA on BROCKMAN')
p_elbow_BROCKMAN
pcs = doKMerPCA(allData, nPCs = 10);
head(pcs)
Scaling data Doing PCA
pcs$nPCs
end_time <- Sys.time()
end_time - start_time
Time difference of 7.174332 hours
df_out = t(pcs$x)[1:pcs$nPCs,]
dim(df_out)
df_out[1:5,1:5]
all(colnames(df_out) == rownames(metadata))
saveRDS(df_out, file = '../../output/feature_matrices/FM_BROCKMAN_cusanovich2018subset.rds')
# combine our sample description table with the tSNE projection
pcs$tSNEProj = merge(pcs$tSNEProj,metadata, by.x="ID",by.y="row.names")
#pcs$tSNEProj$treated = !grepl("-rep",pcs$tSNEProj$celltype)
dim(pcs$tSNEProj)
head(pcs$tSNEProj)
ID | tSNE1 | tSNE2 | label |
---|---|---|---|
<chr> | <dbl> | <dbl> | <chr> |
AGCGATAGAACGAATTCGAAGCCTACGACCTATCCT | 87.78452 | -47.96175 | Lung |
AGCGATAGAACGAATTCGAAGCCTACGATATAGCCT | 71.48026 | 27.36386 | Lung |
AGCGATAGAACGAATTCGACTGAGCGACTATAGCCT | -84.67282 | -45.55337 | Lung |
AGCGATAGAACGAATTCGCAATGAGTCCCAGGACGT | -68.08292 | 17.79370 | Thymus |
AGCGATAGAACGAATTCGCCTCCGACGGGTACTGAC | 73.41360 | 14.74591 | Lung |
AGCGATAGAACGAATTCGGCGATTGCAGCCTATCCT | 27.77479 | 27.37983 | Heart |
options(repr.plot.width=6, repr.plot.height=4)
p=ggplot(pcs$tSNEProj, aes(x=tSNE1, y=tSNE2, colour=label)) + theme_classic() + geom_point();
print(p)
sessionInfo()
R version 3.5.1 (2018-07-02) Platform: x86_64-conda_cos6-linux-gnu (64-bit) Running under: CentOS Linux 7 (Core) Matrix products: default BLAS: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_BROCKMAN3/lib/R/lib/libRblas.so LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_BROCKMAN3/lib/R/lib/libRlapack.so locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 [7] LC_PAPER=en_US.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] irlba_2.3.2 Matrix_1.2-17 BrockmanR_0.0.0.9000 [4] tsne_0.1-3 ggplot2_3.2.0 loaded via a namespace (and not attached): [1] Rcpp_1.0.1 pillar_1.4.2 compiler_3.5.1 base64enc_0.1-3 [5] tools_3.5.1 zeallot_0.1.0 digest_0.6.20 uuid_0.1-2 [9] jsonlite_1.6 evaluate_0.14 tibble_2.1.3 gtable_0.3.0 [13] lattice_0.20-38 pkgconfig_2.0.2 rlang_0.4.0 IRdisplay_0.7.0 [17] IRkernel_1.0.1 repr_1.0.1 withr_2.1.2 dplyr_0.8.3 [21] vctrs_0.2.0 grid_3.5.1 tidyselect_0.2.5 glue_1.3.1 [25] R6_2.4.0 pbdZMQ_0.3-3 purrr_0.3.2 magrittr_1.5 [29] backports_1.1.4 scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1 [33] colorspace_1.4-1 labeling_0.3 lazyeval_0.2.2 munsell_0.5.0 [37] crayon_1.3.4
save.image(file = 'BROCKMAN_cusanovich2018subset.RData')