Installation¶

In [1]:

# if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#   BiocManager::install("cicero")

Vignette: https://www.bioconductor.org/packages/devel/bioc/vignettes/cicero/inst/doc/website.html

In [ ]:

Import packages¶

In [2]:

library(cicero)
library(data.table)
library(Matrix)
library(proxy)
library(reshape2)
library(BuenColors)
library(umap)

Loading required package: monocle
Loading required package: Matrix
Loading required package: Biobase
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:Matrix’:

    colMeans, colSums, rowMeans, rowSums, which

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, tapply, union, unique,
    unsplit, which, which.max, which.min

Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

Loading required package: ggplot2
Loading required package: VGAM
Loading required package: stats4
Loading required package: splines
Loading required package: DDRTree
Loading required package: irlba
Loading required package: Gviz
Loading required package: S4Vectors

Attaching package: ‘S4Vectors’

The following object is masked from ‘package:Matrix’:

    expand

The following object is masked from ‘package:base’:

    expand.grid

Loading required package: IRanges
Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
Loading required package: grid

Attaching package: 'data.table'

The following object is masked from 'package:GenomicRanges':

    shift

The following object is masked from 'package:IRanges':

    shift

The following objects are masked from 'package:S4Vectors':

    first, second


Attaching package: 'proxy'

The following object is masked from 'package:IRanges':

    as.matrix

The following object is masked from 'package:S4Vectors':

    as.matrix

The following object is masked from 'package:Matrix':

    as.matrix

The following objects are masked from 'package:stats':

    as.dist, dist

The following object is masked from 'package:base':

    as.matrix


Attaching package: 'reshape2'

The following objects are masked from 'package:data.table':

    dcast, melt

Loading required package: MASS

In [ ]:

Preprocess¶

bsub < count_reads_peaks.sh

In [3]:

path = './count_reads_peaks_output/'
files <- list.files(path,pattern = "\\.txt$")
length(files)

5335

In [ ]:

#assuming tab separated values with a header    
datalist = lapply(files, function(x)fread(paste0(path,x))$V4) 
#assuming the same header/columns for all files
datafr = do.call("cbind", datalist)

In [5]:

dim(datafr)

97998
5335

In [7]:

df_regions = read.csv("../../input/atac_v1_pbmc_5k_peaks.bed",
                      sep = '\t',header=FALSE,stringsAsFactors=FALSE)

In [8]:

dim(df_regions)

97998
3

In [9]:

peaknames = paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_")

In [10]:

head(peaknames)

'chr1_10244_10510'
'chr1_237575_237942'
'chr1_565098_565554'
'chr1_569172_569645'
'chr1_713421_715095'
'chr1_752386_753061'

In [11]:

head(sapply(strsplit(files,'\\.'),'[', 2))

'AAACGAAAGCGCAATG-1'
'AAACGAAAGGGTATCG-1'
'AAACGAAAGTAACATG-1'
'AAACGAAAGTTACACC-1'
'AAACGAACAGAGATGC-1'
'AAACGAACATGCTATG-1'

In [12]:

colnames(datafr) = sapply(strsplit(files,'\\.'),'[', 2)
rownames(datafr) = peaknames

In [13]:

datafr[1:5,1:5]

	AAACGAAAGCGCAATG-1	AAACGAAAGGGTATCG-1	AAACGAAAGTAACATG-1	AAACGAAAGTTACACC-1
chr1_10244_10510	1	0	1	6
chr1_237575_237942	0	0	0	0
chr1_565098_565554	0	0	6	0
chr1_569172_569645	0	0	8	6
chr1_713421_715095	4	4	10	4

In [14]:

dim(datafr)

97998
5335

In [15]:

# saveRDS(datafr, file = './datafr.rds')
# datafr = readRDS('./datafr.rds')

In [16]:

mat_sparse = as(datafr, "dgTMatrix")
cicero_data = data.frame(cbind(Peak=rownames(datafr)[mat_sparse@i+1],
                               Cell=colnames(datafr)[mat_sparse@j+1],
                               Count=mat_sparse@x),stringsAsFactors = FALSE)
cicero_data$Count = as.numeric(cicero_data$Count)

In [17]:

head(cicero_data)

Peak	Cell	Count
chr1_10244_10510	AAACGAAAGCGCAATG-1	1
chr1_713421_715095	AAACGAAAGCGCAATG-1	4
chr1_752386_753061	AAACGAAAGCGCAATG-1	4
chr1_762084_763359	AAACGAAAGCGCAATG-1	2
chr1_894003_896944	AAACGAAAGCGCAATG-1	4
chr1_928466_937967	AAACGAAAGCGCAATG-1	4

In [ ]:

Obtain Feature Matrix¶

In [18]:

start_time <- Sys.time()

In [19]:

metadata <- read.table('../../input/metadata.tsv',
                         header = TRUE,
                         stringsAsFactors=FALSE,quote="",row.names=1)

In [20]:

input_cds <- make_atac_cds(cicero_data, binarize = TRUE)

In [21]:

pData(input_cds)$label = metadata[rownames(pData(input_cds)),'label']

In [22]:

dim(input_cds)

Features: 97959
Samples: 5335

In [23]:

#Ensure there are no peaks included with zero reads
input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,]

In [24]:

dim(input_cds)

Features: 97959
Samples: 5335

In [25]:

set.seed(2019)
input_cds <- detectGenes(input_cds)
input_cds <- estimateSizeFactors(input_cds)
input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=15,
                        reduction_method = 'tSNE', norm_method = "none")

In [26]:

tsne_coords <- t(reducedDimA(input_cds))

In [27]:

head(tsne_coords)

28.711891	25.01078
13.204125	31.22400
-7.203896	19.77712
-1.562194	23.16863
32.384441	-10.08500
14.256548	-25.58791

In [28]:

df_tsne_coords = data.frame(cbind(tsne_coords,as.character(pData(input_cds)$label)),stringsAsFactors = FALSE)
colnames(df_tsne_coords) = c('tsne_1','tsne_2','label')
df_tsne_coords$tsne_1 = as.numeric(df_tsne_coords$tsne_1)
df_tsne_coords$tsne_2 = as.numeric(df_tsne_coords$tsne_2)

In [29]:

options(repr.plot.width=4, repr.plot.height=3)
p <- ggplot(shuf(df_tsne_coords), aes(x = tsne_1, y = tsne_2, color = label)) +
  geom_point(size = 1)+
  ggtitle('tSNE') + theme_classic()
p

In [ ]:

In [30]:

data("human.hg19.genome")
genome_ref = human.hg19.genome
file_tss='../../input/hg19/hg19-tss.bed'

In [31]:

head(genome_ref)

V1	V2
chr1	249250621
chr2	243199373
chr3	198022430
chr4	191154276
chr5	180915260
chr6	171115067

In [32]:

row.names(tsne_coords) <- row.names(pData(input_cds))
cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)
conns <- run_cicero(cicero_cds, genome_ref) # Takes a few minutes to run

Overlap QC metrics:
Cells per bin: 50
Maximum shared cells bin-bin: 44
Mean shared cells bin-bin: 0.540258832870187
Median shared cells bin-bin: 0

[1] "Starting Cicero"
[1] "Calculating distance_parameter value"
[1] "Running models"
[1] "Assembling connections"
[1] "Done"

In [33]:

gene_annotation <- read.table(file_tss,sep='\t')
names(gene_annotation)[4] <- "gene"    
gene_annotation_pos <- subset(gene_annotation, V5 == "+")
gene_annotation_pos$V3 <- gene_annotation_pos$V2 + 1
gene_annotation_neg <- subset(gene_annotation, V5 == "-")
gene_annotation_neg$V2 <- gene_annotation_neg$V3 - 1
tss <- rbind(gene_annotation_pos, gene_annotation_neg)

In [34]:

input_cds <- annotate_cds_by_site(input_cds, tss)
# generate unnormalized gene activity matrix
unnorm_ga <- build_gene_activity_matrix(input_cds, conns)
unnorm_ga <- unnorm_ga[!Matrix::rowSums(unnorm_ga) == 0,]

# make a list of num_genes_expressed
num_genes <- pData(input_cds)$num_genes_expressed
names(num_genes) <- row.names(pData(input_cds))

# normalize
cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes)    
fm_Cicero = as.matrix(cicero_gene_activities)

In [35]:

end_time <- Sys.time()

In [36]:

end_time - start_time

Time difference of 12.29284 mins

In [ ]:

In [37]:

all(colnames(fm_Cicero) == rownames(metadata))

TRUE

In [43]:

dim(fm_Cicero)
fm_Cicero[1:5,1:5]

12492
5335

	AAACGAAAGCGCAATG-1	AAACGAAAGGGTATCG-1	AAACGAAAGTAACATG-1	AAACGAAAGTTACACC-1	AAACGAACAGAGATGC-1
A2LD1	0.0000000000	0.000000e+00	0.0000000000	0.0000000000	0.0002671473
A4GALT	0.0000000000	0.000000e+00	0.0000000000	0.0000000000	0.0000000000
AAAS	0.0000000000	0.000000e+00	0.0001690673	0.0001752499	0.0001030934
AACS	0.0003260717	3.392227e-05	0.0001507090	0.0003147217	0.0000000000
AADAT	0.0000000000	0.000000e+00	0.0000000000	0.0000000000	0.0000000000

In [ ]:

In [38]:

saveRDS(fm_Cicero, file = '../../output/feature_matrices/FM_Cicero_10xpbmc5k.rds')

In [ ]:

Downstream Analysis¶

In [39]:

df_umap_Cicero <- umap(t(fm_Cicero))$layout

In [44]:

df_umap = data.frame(cbind(df_umap_Cicero,metadata[rownames(pData(input_cds)),'label']),stringsAsFactors = FALSE)
colnames(df_umap) = c('umap_1','umap_2','label')
df_umap$umap_1 = as.numeric(df_umap$umap_1)
df_umap$umap_2 = as.numeric(df_umap$umap_2)
df_umap$label = as.character(df_umap$label)
options(repr.plot.width=5, repr.plot.height=4)
p <- ggplot(df_umap, aes(x = umap_1, y = umap_2, color = label)) +
  geom_point(size = 1) + 
  ggtitle('Cicero') + theme_classic()
p

In [45]:

sessionInfo()

R version 3.5.1 (2018-07-02)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_Cicero/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] grid      splines   stats4    parallel  stats     graphics  grDevices
 [8] utils     datasets  methods   base     

other attached packages:
 [1] umap_0.2.2.0         BuenColors_0.5.5     MASS_7.3-51.1       
 [4] reshape2_1.4.3       proxy_0.4-23         data.table_1.12.0   
 [7] cicero_1.0.15        Gviz_1.26.5          GenomicRanges_1.34.0
[10] GenomeInfoDb_1.18.1  IRanges_2.16.0       S4Vectors_0.20.1    
[13] monocle_2.10.1       DDRTree_0.1.5        irlba_2.3.3         
[16] VGAM_1.0-6           ggplot2_3.1.0        Biobase_2.42.0      
[19] BiocGenerics_0.28.0  Matrix_1.2-17       

loaded via a namespace (and not attached):
  [1] Rtsne_0.15                  colorspace_1.4-1           
  [3] biovizBase_1.30.1           IRdisplay_0.7.0            
  [5] htmlTable_1.13.1            XVector_0.22.0             
  [7] base64enc_0.1-3             dichromat_2.0-0            
  [9] rstudioapi_0.10             ggrepel_0.8.0              
 [11] bit64_0.9-7                 RSpectra_0.14-0            
 [13] AnnotationDbi_1.44.0        docopt_0.6.1               
 [15] knitr_1.22                  glasso_1.10                
 [17] IRkernel_0.8.15             Formula_1.2-3              
 [19] jsonlite_1.6                Rsamtools_1.34.0           
 [21] cluster_2.0.7-1             pheatmap_1.0.12            
 [23] compiler_3.5.1              httr_1.4.0                 
 [25] backports_1.1.3             assertthat_0.2.1           
 [27] lazyeval_0.2.2              limma_3.38.3               
 [29] acepack_1.4.1               htmltools_0.3.6            
 [31] prettyunits_1.0.2           tools_3.5.1                
 [33] igraph_1.2.4                gtable_0.3.0               
 [35] glue_1.3.1                  GenomeInfoDbData_1.2.1     
 [37] RANN_2.6.1                  dplyr_0.8.0.1              
 [39] Rcpp_1.0.1                  slam_0.1-45                
 [41] Biostrings_2.50.2           rtracklayer_1.42.1         
 [43] xfun_0.6                    stringr_1.4.0              
 [45] ensembldb_2.6.8             XML_3.98-1.19              
 [47] zlibbioc_1.28.0             scales_1.0.0               
 [49] BSgenome_1.50.0             VariantAnnotation_1.28.13  
 [51] hms_0.4.2                   ProtGenerics_1.14.0        
 [53] SummarizedExperiment_1.12.0 AnnotationFilter_1.6.0     
 [55] RColorBrewer_1.1-2          curl_3.3                   
 [57] reticulate_1.12             memoise_1.1.0              
 [59] gridExtra_2.3               biomaRt_2.38.0             
 [61] rpart_4.1-13                fastICA_1.2-1              
 [63] latticeExtra_0.6-28         stringi_1.4.3              
 [65] RSQLite_2.1.1               checkmate_1.9.1            
 [67] GenomicFeatures_1.34.8      densityClust_0.3           
 [69] BiocParallel_1.16.6         repr_0.19.2                
 [71] rlang_0.3.3                 pkgconfig_2.0.2            
 [73] matrixStats_0.54.0          bitops_1.0-6               
 [75] qlcMatrix_0.9.7             evaluate_0.13              
 [77] lattice_0.20-38             purrr_0.3.2                
 [79] labeling_0.3                GenomicAlignments_1.18.1   
 [81] htmlwidgets_1.3             bit_1.1-14                 
 [83] tidyselect_0.2.5            plyr_1.8.4                 
 [85] magrittr_1.5                R6_2.4.0                   
 [87] Hmisc_4.2-0                 combinat_0.0-8             
 [89] pbdZMQ_0.3-3                DelayedArray_0.8.0         
 [91] DBI_1.0.0                   pillar_1.3.1               
 [93] foreign_0.8-71              withr_2.1.2                
 [95] survival_2.43-3             RCurl_1.95-4.12            
 [97] nnet_7.3-12                 tibble_2.1.1               
 [99] crayon_1.3.4                uuid_0.1-2                 
[101] viridis_0.5.1               progress_1.2.0             
[103] blob_1.1.1                  FNN_1.1.3                  
[105] HSMMSingleCell_1.2.0        sparsesvd_0.1-4            
[107] digest_0.6.18               munsell_0.5.0              
[109] viridisLite_0.3.0

In [46]:

save.image(file = 'Cicero_10xpbmc5k.RData')

In [ ]: