library(data.table)
library(dplyr)
library(Matrix)
library(BuenColors)
library(stringr)
library(cowplot)
library(RColorBrewer)
library(ggpubr)
library(irlba)
library(umap)
library(gplots)
Attaching package: ‘dplyr’ The following objects are masked from ‘package:data.table’: between, first, last The following objects are masked from ‘package:stats’: filter, lag The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union Loading required package: MASS Attaching package: ‘MASS’ The following object is masked from ‘package:dplyr’: select Loading required package: ggplot2 Attaching package: ‘cowplot’ The following object is masked from ‘package:ggplot2’: ggsave Loading required package: magrittr Attaching package: ‘ggpubr’ The following object is masked from ‘package:cowplot’: get_legend Attaching package: ‘gplots’ The following object is masked from ‘package:stats’: lowess
bedtools makewindows -g ../input/mm9/mm9.chrom.sizes -w 5000 > mm9.windows.5kb.bed
bsub < count_reads_windows.sh
path = './count_reads_windows_output/'
files <- list.files(path,pattern = "\\.txt$")
length(files)
#assuming tab separated values with a header
datalist = lapply(files, function(x)fread(paste0(path,x))$V4)
#assuming the same header/columns for all files
datafr = do.call("cbind", datalist)
dim(datafr)
df_regions = read.csv("./mm9.windows.5kb.bed",
sep = '\t',header=FALSE,stringsAsFactors=FALSE)
dim(df_regions)
head(df_regions)
V1 | V2 | V3 |
---|---|---|
<chr> | <int> | <int> |
chr1 | 0 | 5000 |
chr1 | 5000 | 10000 |
chr1 | 10000 | 15000 |
chr1 | 15000 | 20000 |
chr1 | 20000 | 25000 |
chr1 | 25000 | 30000 |
winnames = paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_")
head(winnames)
head(sapply(strsplit(files,'\\.'),'[', 2))
colnames(datafr) = sapply(strsplit(files,'\\.'),'[', 2)
rownames(datafr) = winnames
dim(datafr)
head(datafr[,1:3])
AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT | AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC | AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC | |
---|---|---|---|
chr1_0_5000 | 0 | 0 | 0 |
chr1_5000_10000 | 0 | 0 | 0 |
chr1_10000_15000 | 0 | 0 | 0 |
chr1_15000_20000 | 0 | 0 | 0 |
chr1_20000_25000 | 0 | 0 | 0 |
chr1_25000_30000 | 0 | 0 | 0 |
# saveRDS(datafr, file = './datafr.rds')
# datafr = readRDS('./datafr.rds')
elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){
set.seed(2019)
mat = data.matrix(mat)
SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)
options(repr.plot.width=width, repr.plot.height=height)
df_plot = data.frame(PC=1:num_pcs, SD=SVD$d);
# print(SVD$d[1:num_pcs])
p <- ggplot(df_plot, aes(x = PC, y = SD)) +
geom_point(col="#cd5c5c",size = 1) +
ggtitle(title)
return(p)
}
run_umap <- function(fm_mat){
umap_object = umap(t(fm_mat),random_state = 2019)
df_umap = umap_object$layout
return(df_umap)
}
plot_umap <- function(df_umap,labels,title='UMAP',colormap=colormap){
set.seed(2019)
df_umap = data.frame(cbind(df_umap,labels),stringsAsFactors = FALSE)
colnames(df_umap) = c('umap1','umap2','celltype')
df_umap$umap1 = as.numeric(df_umap$umap1)
df_umap$umap2 = as.numeric(df_umap$umap2)
options(repr.plot.width=4, repr.plot.height=4)
p <- ggplot(shuf(df_umap), aes(x = umap1, y = umap2, color = celltype)) +
geom_point(size = 1) + scale_color_manual(values = colormap,breaks=unique(labels)) +
ggtitle(title)
return(p)
}
metadata <- read.table('../input/metadata.tsv',
header = TRUE,
stringsAsFactors=FALSE,quote="",row.names=1)
binary_mat = as.matrix((datafr > 0) + 0)
binary_mat = Matrix(binary_mat, sparse = TRUE)
num_cells_ncounted = rowSums(binary_mat)
ncounts = binary_mat[which(num_cells_ncounted >= num_cells_ncounted[order(num_cells_ncounted,decreasing=T)[20000]]),]
new_counts = colSums(ncounts)
# ncounts = ncounts[,new_counts >= quantile(new_counts,probs=0.1)]
ncounts = ncounts[rowSums(ncounts) > 0,]
options(repr.plot.width=8, repr.plot.height=4)
par(mfrow=c(1,2))
hist(log10(num_cells_ncounted),main="No. of Cells Each Site is Observed In",breaks=50)
abline(v=log10(num_cells_ncounted[order(num_cells_ncounted,decreasing=T)[20000]]),lwd=2,col="indianred")
hist(log10(new_counts),main="Number of Sites Each Cell Uses",breaks=50)
# abline(v=log10(quantile(new_counts,probs=0.1)),lwd=2,col="indianred")
dim(ncounts)
nfreqs = t(t(ncounts) / Matrix::colSums(ncounts))
idf = as(log(1 + ncol(ncounts) / Matrix::rowSums(ncounts)), "sparseVector")
tf_idf_counts = as(Diagonal(x=as.vector(idf)), "sparseMatrix") %*% nfreqs
dim(tf_idf_counts)
p_elbow <- elbow_plot(tf_idf_counts,num_pcs = 100, title = 'PCA')
p_elbow
set.seed(2019)
num_pcs = 25
SVD = irlba(tf_idf_counts, num_pcs, num_pcs)
sk_diag = matrix(0, nrow=num_pcs, ncol=num_pcs)
diag(sk_diag) = SVD$d
##remove component 1 as suggested
sk_diag[1,1] = 0
SVDumap_vd = t(sk_diag %*% t(SVD$v))
dim(SVDumap_vd)
LSI_out = t(t(sk_diag %*% t(SVD$v)) %*% t(SVD$u))
LSI_out = t(scale(t(LSI_out)))
LSI_out[LSI_out > 1.5] = 1.5
LSI_out[LSI_out < -1.5] = -1.5
dim(LSI_out)
#This step can take a minute too
hclust_cells = hclust(proxy::dist(t(sk_diag %*% t(SVD$v)), method="cosine"), method="ward.D2")
hclust_genes = hclust(proxy::dist(t(sk_diag %*% t(SVD$u)), method="cosine"), method="ward.D2")
color_pal = colorRampPalette(brewer.pal(8, "Set2"))(length(unique(metadata$label)))
hmcols = colorpanel(100, "steelblue", "white", "tomato")
cells_tree_cut = cutree(hclust_cells, length(unique(metadata$label)))
lsi_cells = cbind(colnames(ncounts),cells_tree_cut)
options(repr.plot.width=4, repr.plot.height=6)
heatmap.2(LSI_out,
col=hmcols,
ColSideColors=color_pal[as.factor(cells_tree_cut)],
#RowSideColors=color_pal[as.factor(genes_tree_cut)],
Rowv = as.dendrogram(hclust_genes), Colv = as.dendrogram(hclust_cells),
labRow=FALSE, labCol=FALSE, trace="none", scale="none",
useRaster=TRUE)
str(hclust_cells)
List of 7 $ merge : int [1:12177, 1:2] -7392 -5089 -5034 -5419 -5081 -7344 -4965 -5156 -5138 -5008 ... $ height : num [1:12177] 0.00295 0.00361 0.00381 0.00384 0.00411 ... $ order : int [1:12178] 5465 5198 5348 5122 5197 5679 5087 7058 4912 5669 ... $ labels : NULL $ method : chr "ward.D2" $ call : language hclust(d = proxy::dist(t(sk_diag %*% t(SVD$v)), method = "cosine"), method = "ward.D2") $ dist.method: chr "cosine" - attr(*, "class")= chr "hclust"
dim(SVDumap_vd)
df_umap_LSI <- run_umap(t(SVDumap_vd))
df_umap_LSI = as.data.frame(df_umap_LSI)
row.names(df_umap_LSI) = colnames(datafr)
labels = metadata[colnames(datafr),'label']
num_colors = length(unique(metadata$label))
colormap = colorRampPalette(brewer.pal(8, "Dark2"))(num_colors)
names(colormap) = sort(unique(metadata$label))
p_LSI <- plot_umap(df_umap_LSI,labels = labels,colormap = colormap,title='Cusanovich2018')+labs(color='tissue')
p_LSI
labels2 = as.character(cells_tree_cut)
num_colors = length(unique(metadata$label))
colormap2= colorRampPalette(brewer.pal(8, "Set2"))(num_colors)
names(colormap2) = as.character(seq(1,num_colors))
p_LSI_cluster <- plot_umap(df_umap_LSI,labels = labels2,colormap = colormap2,title='Cusanovich2018')
p_LSI_cluster = p_LSI_cluster+labs(color='cluster')
p_LSI_cluster
# options(repr.plot.width=4*3, repr.plot.height=4*1)
# p_group = cowplot::plot_grid(p_LSI,p_LSI_cluster,
# labels = "AUTO",ncol = 2)
# p_group
p_LSI.legend <- get_legend(p_LSI)
p_LSI_cluster.legend <- get_legend(p_LSI_cluster)
p_LSI <- p_LSI + theme(legend.position='none')
p_LSI_cluster <- p_LSI_cluster + theme(legend.position='none')
options(repr.plot.width=4*3, repr.plot.height=4*1)
p_group = cowplot::plot_grid(p_LSI + theme(legend.position='none'),
p_LSI.legend,
p_LSI_cluster + theme(legend.position='none'),
p_LSI_cluster.legend,
labels = c('A','','B',''),ncol = 4, rel_widths=c(1,0.5,1,0.5))
p_group
cowplot::ggsave(p_group,filename = 'Cusanovich2018_clades_cusanovich2018subset.pdf',width = 4*3, height = 4*1)
save.image(file = 'Cusanovich2018_cusanovich2018subset_idenfify_clades.RData')
df_pseudobulk = cbind(cellid=colnames(datafr),clusterid=labels2)
head(df_pseudobulk)
cellid | clusterid |
---|---|
AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT | 1 |
AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC | 2 |
AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC | 1 |
AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC | 3 |
AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC | 2 |
AGCGATAGAGTTGAATCAAAGCTAGGTTCCTATCCT | 1 |
write.table(df_pseudobulk,'./peak_calling/pseudobulk.tsv',quote = FALSE,sep='\t',row.names = FALSE)