library(data.table)
library(Matrix)
library(proxy)
library(Rtsne)
library(densityClust)
library(data.table)
library(irlba)
library(umap)
library(ggplot2)
Attaching package: ‘proxy’ The following object is masked from ‘package:Matrix’: as.matrix The following objects are masked from ‘package:stats’: as.dist, dist The following object is masked from ‘package:base’: as.matrix
bsub < count_reads_peaks_erisone.sh
path = './count_reads_peaks_output/'
files <- list.files(path,pattern = "\\.txt$")
length(files)
#assuming tab separated values with a header
datalist = lapply(files, function(x)fread(paste0(path,x))$V4)
#assuming the same header/columns for all files
datafr = do.call("cbind", datalist)
dim(datafr)
df_regions = read.csv("../../input/atac_v1_pbmc_5k_peaks.bed",
sep = '\t',header=FALSE,stringsAsFactors=FALSE)
dim(df_regions)
peaknames = paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_")
head(peaknames)
head(sapply(strsplit(files,'\\.'),'[', 2))
colnames(datafr) = sapply(strsplit(files,'\\.'),'[', 2)
rownames(datafr) = peaknames
head(datafr)
AAACGAAAGCGCAATG-1 | AAACGAAAGGGTATCG-1 | AAACGAAAGTAACATG-1 | AAACGAAAGTTACACC-1 | AAACGAACAGAGATGC-1 | AAACGAACATGCTATG-1 | AAACGAAGTGCATCAT-1 | AAACGAAGTGGACGAT-1 | AAACGAAGTGGCCTCA-1 | AAACGAATCAGTGTAC-1 | ⋯ | TTTGGTTGTCAGAAAT-1 | TTTGGTTGTTGTATCG-1 | TTTGGTTTCAGTGGTT-1 | TTTGTGTAGGAAACTT-1 | TTTGTGTCAAGCCTTA-1 | TTTGTGTCACTCAAGT-1 | TTTGTGTCACTGGGCT-1 | TTTGTGTGTACGCAAG-1 | TTTGTGTGTCTGCGCA-1 | TTTGTGTTCAACTTGG-1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
chr1_10244_10510 | 1 | 0 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 |
chr1_237575_237942 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
chr1_565098_565554 | 0 | 0 | 6 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | ⋯ | 2 | 0 | 0 | 0 | 4 | 0 | 0 | 12 | 0 | 0 |
chr1_569172_569645 | 0 | 0 | 8 | 6 | 0 | 4 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
chr1_713421_715095 | 4 | 4 | 10 | 4 | 0 | 2 | 0 | 6 | 0 | 4 | ⋯ | 2 | 4 | 2 | 2 | 2 | 0 | 8 | 4 | 0 | 0 |
chr1_752386_753061 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
dim(datafr)
# saveRDS(datafr, file = './datafr.rds')
# datafr = readRDS('./datafr.rds')
elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){
set.seed(2019)
mat = data.matrix(mat)
SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)
options(repr.plot.width=width, repr.plot.height=height)
df_plot = data.frame(PC=1:num_pcs, SD=SVD$d);
# print(SVD$d[1:num_pcs])
p <- ggplot(df_plot, aes(x = PC, y = SD)) +
geom_point(col="#cd5c5c",size = 1) +
ggtitle(title)
return(p)
}
start_time <- Sys.time()
metadata <- read.table('../../input/metadata.tsv',
header = TRUE,
stringsAsFactors=FALSE,quote="",row.names=1)
binary_mat = as.matrix((datafr > 0) + 0)
binary_mat = Matrix(binary_mat, sparse = TRUE)
binary_mat[1:3,1:3]
3 x 3 sparse Matrix of class "dgCMatrix" AAACGAAAGCGCAATG-1 AAACGAAAGGGTATCG-1 AAACGAAAGTAACATG-1 chr1_10244_10510 1 . 1 chr1_237575_237942 . . . chr1_565098_565554 . . 1
num_cells_ncounted = rowSums(binary_mat)
ncounts = binary_mat[num_cells_ncounted >= dim(binary_mat)[2]*0.01,]
new_counts = colSums(ncounts)
# ncounts = ncounts[,new_counts >= quantile(new_counts,probs=0.1)]
ncounts = ncounts[rowSums(ncounts) > 0,]
options(repr.plot.width=8, repr.plot.height=4)
par(mfrow=c(1,2))
hist(log10(num_cells_ncounted),main="No. of Cells Each Site is Observed In",breaks=50)
abline(v=log10(min(num_cells_ncounted[num_cells_ncounted >= dim(binary_mat)[2]*0.01])),lwd=2,col="indianred")
hist(log10(new_counts),main="Number of Sites Each Cell Uses",breaks=50)
# abline(v=log10(quantile(new_counts,probs=0.1)),lwd=2,col="indianred")
dim(ncounts)
sexsites = c(grep("chrY",rownames(ncounts)),grep("chrX",rownames(ncounts)))
ncounts.nosex = ncounts[-sexsites,]
nfreqs = t(t(ncounts.nosex) / Matrix::colSums(ncounts.nosex))
idf = as(log(1 + ncol(ncounts.nosex) / Matrix::rowSums(ncounts.nosex)), "sparseVector")
tf_idf_counts = as(Diagonal(x=as.vector(idf)), "sparseMatrix") %*% nfreqs
dim(tf_idf_counts)
p_elbow_LSI <- elbow_plot(tf_idf_counts,num_pcs = 200, title = 'PCA on LSI')
p_elbow_LSI
set.seed(2019)
num_pcs = 150
SVDtsne = irlba(tf_idf_counts, num_pcs, num_pcs)
d_diagtsne = matrix(0, nrow=num_pcs, ncol=num_pcs)
diag(d_diagtsne) = SVDtsne$d
SVDtsne_vd = t(d_diagtsne %*% t(SVDtsne$v))
dim(SVDtsne_vd)
SVDtsne_vd[1:3,1:3]
-0.009063687 | -0.0002460163 | -0.0002805486 |
-0.008831733 | -0.0001022342 | -0.0004277593 |
-0.008985058 | -0.0001010684 | 0.0011672419 |
end_time <- Sys.time()
end_time - start_time
Time difference of 4.298949 mins
df_out = t(SVDtsne_vd)
colnames(df_out) = colnames(datafr)
rownames(df_out) = paste('PC',1:dim(SVDtsne_vd)[2])
dim(df_out)
all(colnames(df_out) == rownames(metadata))
saveRDS(df_out, file = '../../output/feature_matrices/FM_Cusanovich2018_10xpbmc5k.rds')
set.seed(0)
tsnetfidf = Rtsne(SVDtsne_vd,pca=F)
library(RColorBrewer)
plot.tsne <- function(x, labels,
main="A tSNE visualization",n=20,
pad=0.1, cex=0.65, pch=19, add=FALSE, legend.suffix="",
cex.main=1, cex.legend=1) {
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
layout = x
xylim = range(layout)
xylim = xylim + ((xylim[2]-xylim[1])*pad)*c(-0.5, 0.5)
if (!add) {
par(mar=c(0.2,0.7,1.2,0.7), ps=10)
plot(xylim, xylim, type="n", axes=F, frame=F)
rect(xylim[1], xylim[1], xylim[2], xylim[2], border="#aaaaaa", lwd=0.25)
}
points(layout[,1], layout[,2], col=col_vector[as.integer(labels)],
cex=cex, pch=pch)
mtext(side=3, main, cex=cex.main)
labels.u = unique(labels)
legend.pos = "topright"
legend.text = as.character(labels.u)
if (add) {
legend.pos = "bottomright"
legend.text = paste(as.character(labels.u), legend.suffix)
}
legend(legend.pos, legend=legend.text,
col=col_vector[as.integer(labels.u)],
bty="n", pch=pch, cex=cex.legend)
}
options(repr.plot.width=5, repr.plot.height=5)
plot.tsne(tsnetfidf$Y,as.factor(metadata[,'label']))
sessionInfo()
R version 3.5.1 (2018-07-02) Platform: x86_64-conda_cos6-linux-gnu (64-bit) Running under: CentOS Linux 7 (Core) Matrix products: default BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_sciATAC/lib/R/lib/libRblas.so locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 [7] LC_PAPER=en_US.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] RColorBrewer_1.1-2 ggplot2_3.1.1 umap_0.2.0.0 irlba_2.3.2 [5] densityClust_0.3 Rtsne_0.15 proxy_0.4-22 Matrix_1.2-17 [9] data.table_1.12.2 loaded via a namespace (and not attached): [1] Rcpp_1.0.1 compiler_3.5.1 pillar_1.3.1 plyr_1.8.4 [5] base64enc_0.1-3 tools_3.5.1 digest_0.6.18 uuid_0.1-2 [9] jsonlite_1.6 evaluate_0.13 tibble_2.1.1 gtable_0.3.0 [13] lattice_0.20-38 pkgconfig_2.0.2 rlang_0.3.4 IRdisplay_0.7.0 [17] ggrepel_0.8.0 IRkernel_0.8.15 gridExtra_2.3 withr_2.1.2 [21] repr_0.19.2 dplyr_0.8.0.1 grid_3.5.1 tidyselect_0.2.5 [25] reticulate_1.12 glue_1.3.1 R6_2.4.0 pbdZMQ_0.3-3 [29] purrr_0.3.2 magrittr_1.5 scales_1.0.0 htmltools_0.3.6 [33] assertthat_0.2.1 colorspace_1.4-1 labeling_0.3 lazyeval_0.2.2 [37] munsell_0.5.0 crayon_1.3.4 FNN_1.1.3
save.image(file = 'Cusanovich2018_10xpbmc5k.RData')