# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("cicero")
library(cicero)
library(data.table)
library(Matrix)
library(proxy)
library(reshape2)
library(BuenColors)
library(umap)
Loading required package: monocle Loading required package: Matrix Loading required package: Biobase Loading required package: BiocGenerics Loading required package: parallel Attaching package: ‘BiocGenerics’ The following objects are masked from ‘package:parallel’: clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB The following objects are masked from ‘package:Matrix’: colMeans, colSums, rowMeans, rowSums, which The following objects are masked from ‘package:stats’: IQR, mad, sd, var, xtabs The following objects are masked from ‘package:base’: anyDuplicated, append, as.data.frame, basename, cbind, colMeans, colnames, colSums, dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min Welcome to Bioconductor Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'. Loading required package: ggplot2 Loading required package: VGAM Loading required package: stats4 Loading required package: splines Loading required package: DDRTree Loading required package: irlba Loading required package: Gviz Loading required package: S4Vectors Attaching package: ‘S4Vectors’ The following object is masked from ‘package:Matrix’: expand The following object is masked from ‘package:base’: expand.grid Loading required package: IRanges Loading required package: GenomicRanges Loading required package: GenomeInfoDb Loading required package: grid Attaching package: 'data.table' The following object is masked from 'package:GenomicRanges': shift The following object is masked from 'package:IRanges': shift The following objects are masked from 'package:S4Vectors': first, second Attaching package: 'proxy' The following object is masked from 'package:IRanges': as.matrix The following object is masked from 'package:S4Vectors': as.matrix The following object is masked from 'package:Matrix': as.matrix The following objects are masked from 'package:stats': as.dist, dist The following object is masked from 'package:base': as.matrix Attaching package: 'reshape2' The following objects are masked from 'package:data.table': dcast, melt Loading required package: MASS
bsub < count_reads_peaks.sh
path = './count_reads_peaks_output/'
files <- list.files(path,pattern = "\\.txt$")
length(files)
#assuming tab separated values with a header
datalist = lapply(files, function(x)fread(paste0(path,x))$V4)
#assuming the same header/columns for all files
datafr = do.call("cbind", datalist)
dim(datafr)
df_regions = read.csv("../../input/atac_v1_pbmc_5k_peaks.bed",
sep = '\t',header=FALSE,stringsAsFactors=FALSE)
dim(df_regions)
peaknames = paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_")
head(peaknames)
head(sapply(strsplit(files,'\\.'),'[', 2))
colnames(datafr) = sapply(strsplit(files,'\\.'),'[', 2)
rownames(datafr) = peaknames
datafr[1:5,1:5]
AAACGAAAGCGCAATG-1 | AAACGAAAGGGTATCG-1 | AAACGAAAGTAACATG-1 | AAACGAAAGTTACACC-1 | AAACGAACAGAGATGC-1 | |
---|---|---|---|---|---|
chr1_10244_10510 | 1 | 0 | 1 | 6 | 0 |
chr1_237575_237942 | 0 | 0 | 0 | 0 | 0 |
chr1_565098_565554 | 0 | 0 | 6 | 0 | 0 |
chr1_569172_569645 | 0 | 0 | 8 | 6 | 0 |
chr1_713421_715095 | 4 | 4 | 10 | 4 | 0 |
dim(datafr)
# saveRDS(datafr, file = './datafr.rds')
# datafr = readRDS('./datafr.rds')
mat_sparse = as(datafr, "dgTMatrix")
cicero_data = data.frame(cbind(Peak=rownames(datafr)[mat_sparse@i+1],
Cell=colnames(datafr)[mat_sparse@j+1],
Count=mat_sparse@x),stringsAsFactors = FALSE)
cicero_data$Count = as.numeric(cicero_data$Count)
head(cicero_data)
Peak | Cell | Count |
---|---|---|
chr1_10244_10510 | AAACGAAAGCGCAATG-1 | 1 |
chr1_713421_715095 | AAACGAAAGCGCAATG-1 | 4 |
chr1_752386_753061 | AAACGAAAGCGCAATG-1 | 4 |
chr1_762084_763359 | AAACGAAAGCGCAATG-1 | 2 |
chr1_894003_896944 | AAACGAAAGCGCAATG-1 | 4 |
chr1_928466_937967 | AAACGAAAGCGCAATG-1 | 4 |
start_time <- Sys.time()
metadata <- read.table('../../input/metadata.tsv',
header = TRUE,
stringsAsFactors=FALSE,quote="",row.names=1)
input_cds <- make_atac_cds(cicero_data, binarize = TRUE)
pData(input_cds)$label = metadata[rownames(pData(input_cds)),'label']
dim(input_cds)
#Ensure there are no peaks included with zero reads
input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,]
dim(input_cds)
set.seed(2019)
input_cds <- detectGenes(input_cds)
input_cds <- estimateSizeFactors(input_cds)
input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=15,
reduction_method = 'tSNE', norm_method = "none")
tsne_coords <- t(reducedDimA(input_cds))
head(tsne_coords)
28.711891 | 25.01078 |
13.204125 | 31.22400 |
-7.203896 | 19.77712 |
-1.562194 | 23.16863 |
32.384441 | -10.08500 |
14.256548 | -25.58791 |
df_tsne_coords = data.frame(cbind(tsne_coords,as.character(pData(input_cds)$label)),stringsAsFactors = FALSE)
colnames(df_tsne_coords) = c('tsne_1','tsne_2','label')
df_tsne_coords$tsne_1 = as.numeric(df_tsne_coords$tsne_1)
df_tsne_coords$tsne_2 = as.numeric(df_tsne_coords$tsne_2)
options(repr.plot.width=4, repr.plot.height=3)
p <- ggplot(shuf(df_tsne_coords), aes(x = tsne_1, y = tsne_2, color = label)) +
geom_point(size = 1)+
ggtitle('tSNE') + theme_classic()
p
data("human.hg19.genome")
genome_ref = human.hg19.genome
file_tss='../../input/hg19/hg19-tss.bed'
head(genome_ref)
V1 | V2 |
---|---|
chr1 | 249250621 |
chr2 | 243199373 |
chr3 | 198022430 |
chr4 | 191154276 |
chr5 | 180915260 |
chr6 | 171115067 |
row.names(tsne_coords) <- row.names(pData(input_cds))
cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
conns <- run_cicero(cicero_cds, genome_ref) # Takes a few minutes to run
Overlap QC metrics: Cells per bin: 50 Maximum shared cells bin-bin: 44 Mean shared cells bin-bin: 0.540258832870187 Median shared cells bin-bin: 0
[1] "Starting Cicero" [1] "Calculating distance_parameter value" [1] "Running models" [1] "Assembling connections" [1] "Done"
gene_annotation <- read.table(file_tss,sep='\t')
names(gene_annotation)[4] <- "gene"
gene_annotation_pos <- subset(gene_annotation, V5 == "+")
gene_annotation_pos$V3 <- gene_annotation_pos$V2 + 1
gene_annotation_neg <- subset(gene_annotation, V5 == "-")
gene_annotation_neg$V2 <- gene_annotation_neg$V3 - 1
tss <- rbind(gene_annotation_pos, gene_annotation_neg)
input_cds <- annotate_cds_by_site(input_cds, tss)
# generate unnormalized gene activity matrix
unnorm_ga <- build_gene_activity_matrix(input_cds, conns)
unnorm_ga <- unnorm_ga[!Matrix::rowSums(unnorm_ga) == 0,]
# make a list of num_genes_expressed
num_genes <- pData(input_cds)$num_genes_expressed
names(num_genes) <- row.names(pData(input_cds))
# normalize
cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes)
fm_Cicero = as.matrix(cicero_gene_activities)
end_time <- Sys.time()
end_time - start_time
Time difference of 12.29284 mins
all(colnames(fm_Cicero) == rownames(metadata))
dim(fm_Cicero)
fm_Cicero[1:5,1:5]
AAACGAAAGCGCAATG-1 | AAACGAAAGGGTATCG-1 | AAACGAAAGTAACATG-1 | AAACGAAAGTTACACC-1 | AAACGAACAGAGATGC-1 | |
---|---|---|---|---|---|
A2LD1 | 0.0000000000 | 0.000000e+00 | 0.0000000000 | 0.0000000000 | 0.0002671473 |
A4GALT | 0.0000000000 | 0.000000e+00 | 0.0000000000 | 0.0000000000 | 0.0000000000 |
AAAS | 0.0000000000 | 0.000000e+00 | 0.0001690673 | 0.0001752499 | 0.0001030934 |
AACS | 0.0003260717 | 3.392227e-05 | 0.0001507090 | 0.0003147217 | 0.0000000000 |
AADAT | 0.0000000000 | 0.000000e+00 | 0.0000000000 | 0.0000000000 | 0.0000000000 |
saveRDS(fm_Cicero, file = '../../output/feature_matrices/FM_Cicero_10xpbmc5k.rds')
df_umap_Cicero <- umap(t(fm_Cicero))$layout
df_umap = data.frame(cbind(df_umap_Cicero,metadata[rownames(pData(input_cds)),'label']),stringsAsFactors = FALSE)
colnames(df_umap) = c('umap_1','umap_2','label')
df_umap$umap_1 = as.numeric(df_umap$umap_1)
df_umap$umap_2 = as.numeric(df_umap$umap_2)
df_umap$label = as.character(df_umap$label)
options(repr.plot.width=5, repr.plot.height=4)
p <- ggplot(df_umap, aes(x = umap_1, y = umap_2, color = label)) +
geom_point(size = 1) +
ggtitle('Cicero') + theme_classic()
p
sessionInfo()
R version 3.5.1 (2018-07-02) Platform: x86_64-conda_cos6-linux-gnu (64-bit) Running under: CentOS Linux 7 (Core) Matrix products: default BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_Cicero/lib/R/lib/libRblas.so locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 [7] LC_PAPER=en_US.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] grid splines stats4 parallel stats graphics grDevices [8] utils datasets methods base other attached packages: [1] umap_0.2.2.0 BuenColors_0.5.5 MASS_7.3-51.1 [4] reshape2_1.4.3 proxy_0.4-23 data.table_1.12.0 [7] cicero_1.0.15 Gviz_1.26.5 GenomicRanges_1.34.0 [10] GenomeInfoDb_1.18.1 IRanges_2.16.0 S4Vectors_0.20.1 [13] monocle_2.10.1 DDRTree_0.1.5 irlba_2.3.3 [16] VGAM_1.0-6 ggplot2_3.1.0 Biobase_2.42.0 [19] BiocGenerics_0.28.0 Matrix_1.2-17 loaded via a namespace (and not attached): [1] Rtsne_0.15 colorspace_1.4-1 [3] biovizBase_1.30.1 IRdisplay_0.7.0 [5] htmlTable_1.13.1 XVector_0.22.0 [7] base64enc_0.1-3 dichromat_2.0-0 [9] rstudioapi_0.10 ggrepel_0.8.0 [11] bit64_0.9-7 RSpectra_0.14-0 [13] AnnotationDbi_1.44.0 docopt_0.6.1 [15] knitr_1.22 glasso_1.10 [17] IRkernel_0.8.15 Formula_1.2-3 [19] jsonlite_1.6 Rsamtools_1.34.0 [21] cluster_2.0.7-1 pheatmap_1.0.12 [23] compiler_3.5.1 httr_1.4.0 [25] backports_1.1.3 assertthat_0.2.1 [27] lazyeval_0.2.2 limma_3.38.3 [29] acepack_1.4.1 htmltools_0.3.6 [31] prettyunits_1.0.2 tools_3.5.1 [33] igraph_1.2.4 gtable_0.3.0 [35] glue_1.3.1 GenomeInfoDbData_1.2.1 [37] RANN_2.6.1 dplyr_0.8.0.1 [39] Rcpp_1.0.1 slam_0.1-45 [41] Biostrings_2.50.2 rtracklayer_1.42.1 [43] xfun_0.6 stringr_1.4.0 [45] ensembldb_2.6.8 XML_3.98-1.19 [47] zlibbioc_1.28.0 scales_1.0.0 [49] BSgenome_1.50.0 VariantAnnotation_1.28.13 [51] hms_0.4.2 ProtGenerics_1.14.0 [53] SummarizedExperiment_1.12.0 AnnotationFilter_1.6.0 [55] RColorBrewer_1.1-2 curl_3.3 [57] reticulate_1.12 memoise_1.1.0 [59] gridExtra_2.3 biomaRt_2.38.0 [61] rpart_4.1-13 fastICA_1.2-1 [63] latticeExtra_0.6-28 stringi_1.4.3 [65] RSQLite_2.1.1 checkmate_1.9.1 [67] GenomicFeatures_1.34.8 densityClust_0.3 [69] BiocParallel_1.16.6 repr_0.19.2 [71] rlang_0.3.3 pkgconfig_2.0.2 [73] matrixStats_0.54.0 bitops_1.0-6 [75] qlcMatrix_0.9.7 evaluate_0.13 [77] lattice_0.20-38 purrr_0.3.2 [79] labeling_0.3 GenomicAlignments_1.18.1 [81] htmlwidgets_1.3 bit_1.1-14 [83] tidyselect_0.2.5 plyr_1.8.4 [85] magrittr_1.5 R6_2.4.0 [87] Hmisc_4.2-0 combinat_0.0-8 [89] pbdZMQ_0.3-3 DelayedArray_0.8.0 [91] DBI_1.0.0 pillar_1.3.1 [93] foreign_0.8-71 withr_2.1.2 [95] survival_2.43-3 RCurl_1.95-4.12 [97] nnet_7.3-12 tibble_2.1.1 [99] crayon_1.3.4 uuid_0.1-2 [101] viridis_0.5.1 progress_1.2.0 [103] blob_1.1.1 FNN_1.1.3 [105] HSMMSingleCell_1.2.0 sparsesvd_0.1-4 [107] digest_0.6.18 munsell_0.5.0 [109] viridisLite_0.3.0
save.image(file = 'Cicero_10xpbmc5k.RData')