library(data.table) library(dplyr) library(Matrix) library(BuenColors) library(stringr) library(cowplot) library(RColorBrewer) library(ggpubr) library(irlba) library(umap) library(gplots) path = './count_reads_windows_output/' files <- list.files(path,pattern = "\\.txt$") length(files) #assuming tab separated values with a header datalist = lapply(files, function(x)fread(paste0(path,x))$V4) #assuming the same header/columns for all files datafr = do.call("cbind", datalist) dim(datafr) df_regions = read.csv("./hg19.windows.5kb.bed", sep = '\t',header=FALSE,stringsAsFactors=FALSE) dim(df_regions) head(df_regions) winnames = paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_") head(winnames) head(sapply(strsplit(files,'\\.'),'[', 2)) colnames(datafr) = sapply(strsplit(files,'\\.'),'[', 2) rownames(datafr) = winnames dim(datafr) head(datafr) # saveRDS(datafr, file = './datafr.rds') # datafr = readRDS('./datafr.rds') elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){ set.seed(2019) mat = data.matrix(mat) SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center) options(repr.plot.width=width, repr.plot.height=height) df_plot = data.frame(PC=1:num_pcs, SD=SVD$d); # print(SVD$d[1:num_pcs]) p <- ggplot(df_plot, aes(x = PC, y = SD)) + geom_point(col="#cd5c5c",size = 1) + ggtitle(title) return(p) } run_umap <- function(fm_mat){ umap_object = umap(t(fm_mat),random_state = 2019) df_umap = umap_object$layout return(df_umap) } plot_umap <- function(df_umap,labels,title='UMAP',colormap=colormap){ set.seed(2019) df_umap = data.frame(cbind(df_umap,labels),stringsAsFactors = FALSE) colnames(df_umap) = c('umap1','umap2','celltype') df_umap$umap1 = as.numeric(df_umap$umap1) df_umap$umap2 = as.numeric(df_umap$umap2) options(repr.plot.width=4, repr.plot.height=4) p <- ggplot(shuf(df_umap), aes(x = umap1, y = umap2, color = celltype)) + geom_point(size = 1) + scale_color_manual(values = colormap,breaks=sort(unique(labels))) + ggtitle(title) return(p) } metadata <- read.table('../input/metadata.tsv', header = TRUE, stringsAsFactors=FALSE,quote="",row.names=1) metadata$label = as.character(metadata$label) binary_mat = as.matrix((datafr > 0) + 0) binary_mat = Matrix(binary_mat, sparse = TRUE) num_cells_ncounted = rowSums(binary_mat) ncounts = binary_mat[which(num_cells_ncounted >= num_cells_ncounted[order(num_cells_ncounted,decreasing=T)[20000]]),] new_counts = colSums(ncounts) # ncounts = ncounts[,new_counts >= quantile(new_counts,probs=0.1)] ncounts = ncounts[rowSums(ncounts) > 0,] options(repr.plot.width=8, repr.plot.height=4) par(mfrow=c(1,2)) hist(log10(num_cells_ncounted),main="No. of Cells Each Site is Observed In",breaks=50) abline(v=log10(num_cells_ncounted[order(num_cells_ncounted,decreasing=T)[20000]]),lwd=2,col="indianred") hist(log10(new_counts),main="Number of Sites Each Cell Uses",breaks=50) # abline(v=log10(quantile(new_counts,probs=0.1)),lwd=2,col="indianred") dim(ncounts) nfreqs = t(t(ncounts) / Matrix::colSums(ncounts)) idf = as(log(1 + ncol(ncounts) / Matrix::rowSums(ncounts)), "sparseVector") tf_idf_counts = as(Diagonal(x=as.vector(idf)), "sparseMatrix") %*% nfreqs dim(tf_idf_counts) p_elbow <- elbow_plot(tf_idf_counts,num_pcs = 200, title = 'PCA') p_elbow set.seed(2019) num_pcs = 100 SVD = irlba(tf_idf_counts, num_pcs, num_pcs) sk_diag = matrix(0, nrow=num_pcs, ncol=num_pcs) diag(sk_diag) = SVD$d ##remove component 1 as suggested sk_diag[1,1] = 0 SVDumap_vd = t(sk_diag %*% t(SVD$v)) dim(SVDumap_vd) LSI_out = t(t(sk_diag %*% t(SVD$v)) %*% t(SVD$u)) LSI_out = t(scale(t(LSI_out))) LSI_out[LSI_out > 1.5] = 1.5 LSI_out[LSI_out < -1.5] = -1.5 dim(LSI_out) #This step can take a minute too hclust_cells = hclust(proxy::dist(t(sk_diag %*% t(SVD$v)), method="cosine"), method="ward.D2") hclust_genes = hclust(proxy::dist(t(sk_diag %*% t(SVD$u)), method="cosine"), method="ward.D2") color_pal = colorRampPalette(brewer.pal(8, "Set2"))(length(unique(metadata$label))) hmcols = colorpanel(100, "steelblue", "white", "tomato") cells_tree_cut = cutree(hclust_cells, length(unique(metadata$label))) lsi_cells = cbind(colnames(ncounts),cells_tree_cut) options(repr.plot.width=4, repr.plot.height=6) heatmap.2(LSI_out, col=hmcols, ColSideColors=color_pal[as.factor(cells_tree_cut)], #RowSideColors=color_pal[as.factor(genes_tree_cut)], Rowv = as.dendrogram(hclust_genes), Colv = as.dendrogram(hclust_cells), labRow=FALSE, labCol=FALSE, trace="none", scale="none", useRaster=TRUE) str(hclust_cells) dim(SVDumap_vd) df_umap_LSI <- run_umap(t(SVDumap_vd)) df_umap_LSI = as.data.frame(df_umap_LSI) row.names(df_umap_LSI) = colnames(datafr) labels = metadata[colnames(datafr),'label'] num_colors = length(unique(metadata$label)) colormap = colorRampPalette(brewer.pal(8, "Accent"))(num_colors) names(colormap) = as.character(seq(1,num_colors)) p_LSI <- plot_umap(df_umap_LSI,labels = labels,colormap = colormap,title='Cusanovich2018') p_LSI = p_LSI +labs(color='cellranger') p_LSI labels2 = as.character(cells_tree_cut) num_colors = length(unique(metadata$label)) colormap2= colorRampPalette(brewer.pal(8, "Set2"))(num_colors) names(colormap2) = as.character(seq(1,num_colors)) p_LSI_cluster <- plot_umap(df_umap_LSI,labels = labels2,colormap = colormap2,title='Cusanovich2018') p_LSI_cluster = p_LSI_cluster+labs(color='cluster') p_LSI_cluster options(repr.plot.width=4*2.5, repr.plot.height=4*1) p_group = cowplot::plot_grid(p_LSI,p_LSI_cluster, labels = "AUTO",ncol = 2) p_group cowplot::ggsave(p_group,filename = 'Cusanovich2018_clades_10xpbmc5k.pdf',width = 4*2.5, height = 4*1) save.image(file = 'Cusanovich2018_10xpbmc5k_idenfify_clades.RData') df_pseudobulk = cbind(cellid=colnames(datafr),clusterid=labels2) head(df_pseudobulk) write.table(df_pseudobulk,'./peak_calling/pseudobulk.tsv',quote = FALSE,sep='\t',row.names = FALSE)