Vignettes¶

http://atlas.gs.washington.edu/fly-atac/docs/

In [ ]:

Import packages¶

In [1]:

library(data.table)
library(dplyr)
library(Matrix)
library(BuenColors)
library(stringr)
library(cowplot)
library(RColorBrewer)
library(ggpubr)
library(irlba)
library(umap)
library(gplots)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:data.table’:

    between, first, last

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: MASS

Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select

Loading required package: ggplot2

Attaching package: ‘cowplot’

The following object is masked from ‘package:ggplot2’:

    ggsave

Loading required package: magrittr

Attaching package: ‘ggpubr’

The following object is masked from ‘package:cowplot’:

    get_legend


Attaching package: ‘gplots’

The following object is masked from ‘package:stats’:

    lowess

In [ ]:

Preprocess¶

make windows¶

bedtools makewindows -g ../input/mm9/mm9.chrom.sizes -w 5000 > mm9.windows.5kb.bed

Count reads¶

bsub < count_reads_windows.sh

In [2]:

path = './count_reads_windows_output/'
files <- list.files(path,pattern = "\\.txt$")
length(files)

12178

In [3]:

#assuming tab separated values with a header    
datalist = lapply(files, function(x)fread(paste0(path,x))$V4) 
#assuming the same header/columns for all files
datafr = do.call("cbind", datalist)

In [4]:

dim(datafr)

545170
12178

In [5]:

df_regions = read.csv("./mm9.windows.5kb.bed",
                      sep = '\t',header=FALSE,stringsAsFactors=FALSE)

In [6]:

dim(df_regions)
head(df_regions)

545170
3

A data.frame: 6 × 3
V1	V2	V3
<chr>	<int>	<int>
chr1	0	5000
chr1	5000	10000
chr1	10000	15000
chr1	15000	20000
chr1	20000	25000
chr1	25000	30000

In [7]:

winnames = paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_")

In [8]:

head(winnames)

'chr1_0_5000'
'chr1_5000_10000'
'chr1_10000_15000'
'chr1_15000_20000'
'chr1_20000_25000'
'chr1_25000_30000'

In [10]:

head(sapply(strsplit(files,'\\.'),'[', 2))

'AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT'
'AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC'
'AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC'
'AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC'
'AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC'
'AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT'

In [9]:

colnames(datafr) = sapply(strsplit(files,'\\.'),'[', 2)
rownames(datafr) = winnames

In [22]:

dim(datafr)
head(datafr[,1:3])

545170
12178

A matrix: 6 × 3 of type int
	AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT	AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC	AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC
chr1_0_5000	0	0	0
chr1_5000_10000	0	0	0
chr1_10000_15000	0	0	0
chr1_15000_20000	0	0	0
chr1_20000_25000	0	0	0
chr1_25000_30000	0	0	0

In [12]:

# saveRDS(datafr, file = './datafr.rds')
# datafr = readRDS('./datafr.rds')

In [13]:

elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){
    set.seed(2019) 
    mat = data.matrix(mat)
    SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)
    options(repr.plot.width=width, repr.plot.height=height)
    df_plot = data.frame(PC=1:num_pcs, SD=SVD$d);
#     print(SVD$d[1:num_pcs])
    p <- ggplot(df_plot, aes(x = PC, y = SD)) +
      geom_point(col="#cd5c5c",size = 1) + 
      ggtitle(title)
    return(p)
}

run_umap <- function(fm_mat){
    umap_object = umap(t(fm_mat),random_state = 2019)
    df_umap = umap_object$layout
    return(df_umap)
}


plot_umap <- function(df_umap,labels,title='UMAP',colormap=colormap){
    set.seed(2019) 
    df_umap = data.frame(cbind(df_umap,labels),stringsAsFactors = FALSE)
    colnames(df_umap) = c('umap1','umap2','celltype')
    df_umap$umap1 = as.numeric(df_umap$umap1)
    df_umap$umap2 = as.numeric(df_umap$umap2)
    options(repr.plot.width=4, repr.plot.height=4)
    p <- ggplot(shuf(df_umap), aes(x = umap1, y = umap2, color = celltype)) +
      geom_point(size = 1) + scale_color_manual(values = colormap,breaks=unique(labels)) +
      ggtitle(title)
    return(p)
}

In [ ]:

Identify Clades¶

In [14]:

metadata <- read.table('../input/metadata.tsv',
                         header = TRUE,
                         stringsAsFactors=FALSE,quote="",row.names=1)

In [15]:

binary_mat = as.matrix((datafr > 0) + 0)
binary_mat = Matrix(binary_mat, sparse = TRUE)

In [16]:

num_cells_ncounted = rowSums(binary_mat)
ncounts = binary_mat[which(num_cells_ncounted >= num_cells_ncounted[order(num_cells_ncounted,decreasing=T)[20000]]),]
new_counts = colSums(ncounts)
# ncounts = ncounts[,new_counts >= quantile(new_counts,probs=0.1)]
ncounts = ncounts[rowSums(ncounts) > 0,]

options(repr.plot.width=8, repr.plot.height=4)
par(mfrow=c(1,2))
hist(log10(num_cells_ncounted),main="No. of Cells Each Site is Observed In",breaks=50)
abline(v=log10(num_cells_ncounted[order(num_cells_ncounted,decreasing=T)[20000]]),lwd=2,col="indianred")
hist(log10(new_counts),main="Number of Sites Each Cell Uses",breaks=50)
# abline(v=log10(quantile(new_counts,probs=0.1)),lwd=2,col="indianred")

In [17]:

dim(ncounts)

20049
12178

In [18]:

nfreqs = t(t(ncounts) / Matrix::colSums(ncounts))
idf = as(log(1 + ncol(ncounts) / Matrix::rowSums(ncounts)), "sparseVector")
tf_idf_counts = as(Diagonal(x=as.vector(idf)), "sparseMatrix") %*% nfreqs

In [19]:

dim(tf_idf_counts)

20049
12178

In [23]:

p_elbow <- elbow_plot(tf_idf_counts,num_pcs = 100, title = 'PCA')

In [24]:

p_elbow

In [ ]:

In [26]:

set.seed(2019)
num_pcs = 25
SVD = irlba(tf_idf_counts, num_pcs, num_pcs)
sk_diag = matrix(0, nrow=num_pcs, ncol=num_pcs)
diag(sk_diag) = SVD$d

##remove component 1 as suggested
sk_diag[1,1] = 0

SVDumap_vd = t(sk_diag %*% t(SVD$v))

In [27]:

dim(SVDumap_vd)

12178
25

In [28]:

LSI_out = t(t(sk_diag %*% t(SVD$v)) %*% t(SVD$u))
LSI_out = t(scale(t(LSI_out)))
LSI_out[LSI_out > 1.5] = 1.5
LSI_out[LSI_out < -1.5] = -1.5

In [29]:

dim(LSI_out)

20049
12178

In [30]:

#This step can take a minute too
hclust_cells = hclust(proxy::dist(t(sk_diag %*% t(SVD$v)), method="cosine"), method="ward.D2")
hclust_genes = hclust(proxy::dist(t(sk_diag %*% t(SVD$u)), method="cosine"), method="ward.D2")

color_pal = colorRampPalette(brewer.pal(8, "Set2"))(length(unique(metadata$label)))
hmcols = colorpanel(100, "steelblue", "white", "tomato")
cells_tree_cut = cutree(hclust_cells, length(unique(metadata$label)))
lsi_cells = cbind(colnames(ncounts),cells_tree_cut)

heatmap¶

In [31]:

options(repr.plot.width=4, repr.plot.height=6)
heatmap.2(LSI_out, 
          col=hmcols,
          ColSideColors=color_pal[as.factor(cells_tree_cut)],
          #RowSideColors=color_pal[as.factor(genes_tree_cut)],
          Rowv = as.dendrogram(hclust_genes), Colv = as.dendrogram(hclust_cells),
          labRow=FALSE, labCol=FALSE, trace="none",  scale="none",
          useRaster=TRUE)

In [32]:

str(hclust_cells)

List of 7
 $ merge      : int [1:12177, 1:2] -7392 -5089 -5034 -5419 -5081 -7344 -4965 -5156 -5138 -5008 ...
 $ height     : num [1:12177] 0.00295 0.00361 0.00381 0.00384 0.00411 ...
 $ order      : int [1:12178] 5465 5198 5348 5122 5197 5679 5087 7058 4912 5669 ...
 $ labels     : NULL
 $ method     : chr "ward.D2"
 $ call       : language hclust(d = proxy::dist(t(sk_diag %*% t(SVD$v)), method = "cosine"), method = "ward.D2")
 $ dist.method: chr "cosine"
 - attr(*, "class")= chr "hclust"

In [ ]:

In [33]:

dim(SVDumap_vd)

12178
25

In [34]:

df_umap_LSI <- run_umap(t(SVDumap_vd))
df_umap_LSI = as.data.frame(df_umap_LSI)
row.names(df_umap_LSI) = colnames(datafr)

In [86]:

labels = metadata[colnames(datafr),'label']
num_colors = length(unique(metadata$label))
colormap = colorRampPalette(brewer.pal(8, "Dark2"))(num_colors)
names(colormap) = sort(unique(metadata$label))

In [87]:

p_LSI <- plot_umap(df_umap_LSI,labels = labels,colormap = colormap,title='Cusanovich2018')+labs(color='tissue')
p_LSI

In [88]:

labels2 = as.character(cells_tree_cut)
num_colors = length(unique(metadata$label))
colormap2= colorRampPalette(brewer.pal(8, "Set2"))(num_colors)
names(colormap2) = as.character(seq(1,num_colors))

In [89]:

p_LSI_cluster <- plot_umap(df_umap_LSI,labels = labels2,colormap = colormap2,title='Cusanovich2018')
p_LSI_cluster = p_LSI_cluster+labs(color='cluster')
p_LSI_cluster

In [90]:

# options(repr.plot.width=4*3, repr.plot.height=4*1)
# p_group = cowplot::plot_grid(p_LSI,p_LSI_cluster,
#                    labels = "AUTO",ncol = 2)
# p_group

In [91]:

p_LSI.legend <- get_legend(p_LSI)
p_LSI_cluster.legend <- get_legend(p_LSI_cluster)

In [98]:

p_LSI <- p_LSI + theme(legend.position='none')
p_LSI_cluster <- p_LSI_cluster + theme(legend.position='none')

options(repr.plot.width=4*3, repr.plot.height=4*1)
p_group = cowplot::plot_grid(p_LSI + theme(legend.position='none'), 
                             p_LSI.legend,
                             p_LSI_cluster + theme(legend.position='none'),
                             p_LSI_cluster.legend,
                             labels = c('A','','B',''),ncol = 4, rel_widths=c(1,0.5,1,0.5))
p_group

In [99]:

cowplot::ggsave(p_group,filename = 'Cusanovich2018_clades_cusanovich2018subset.pdf',width = 4*3, height = 4*1)

In [101]:

save.image(file = 'Cusanovich2018_cusanovich2018subset_idenfify_clades.RData')

In [94]:

df_pseudobulk = cbind(cellid=colnames(datafr),clusterid=labels2)

In [95]:

head(df_pseudobulk)

A matrix: 6 × 2 of type chr
cellid	clusterid
AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT	1
AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC	2
AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC	1
AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC	3
AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC	2
AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT	1

In [100]:

write.table(df_pseudobulk,'./peak_calling/pseudobulk.tsv',quote = FALSE,sep='\t',row.names = FALSE)

In [ ]: