library(prabclus)
library(Matrix)
library(Rtsne)
library(ggplot2)
Loading required package: MASS Loading required package: mclust Package 'mclust' version 5.4.3 Type 'citation("mclust")' for citing this R package in publications.
bsub < count_reads_peaks.sh
path = './count_reads_peaks_output/'
files <- list.files(path,pattern = "\\.txt$")
length(files)
#assuming tab separated values with a header
datalist = lapply(files, function(x)fread(paste0(path,x))$V4)
#assuming the same header/columns for all files
datafr = do.call("cbind", datalist)
dim(datafr)
df_regions = read.csv("../../input/atac_v1_pbmc_5k_peaks.bed",
sep = '\t',header=FALSE,stringsAsFactors=FALSE)
dim(df_regions)
peaknames = paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_")
head(peaknames)
head(sapply(strsplit(files,'\\.'),'[', 2))
colnames(datafr) = sapply(strsplit(files,'\\.'),'[', 2)
rownames(datafr) = peaknames
datafr[1:5,1:5]
AAACGAAAGCGCAATG-1 | AAACGAAAGGGTATCG-1 | AAACGAAAGTAACATG-1 | AAACGAAAGTTACACC-1 | AAACGAACAGAGATGC-1 | |
---|---|---|---|---|---|
chr1_10244_10510 | 1 | 0 | 1 | 6 | 0 |
chr1_237575_237942 | 0 | 0 | 0 | 0 | 0 |
chr1_565098_565554 | 0 | 0 | 6 | 0 | 0 |
chr1_569172_569645 | 0 | 0 | 8 | 6 | 0 |
chr1_713421_715095 | 4 | 4 | 10 | 4 | 0 |
dim(datafr)
# saveRDS(datafr, file = './datafr.rds')
# datafr = readRDS('./datafr.rds')
filter_peaks <- function (datafr,cutoff = 0.01){
binary_mat = as.matrix((datafr > 0) + 0)
binary_mat = Matrix(binary_mat, sparse = TRUE)
num_cells_ncounted = Matrix::rowSums(binary_mat)
ncounts = binary_mat[num_cells_ncounted >= dim(binary_mat)[2]*cutoff,]
ncounts = ncounts[rowSums(ncounts) > 0,]
options(repr.plot.width=4, repr.plot.height=4)
hist(log10(num_cells_ncounted),main="No. of Cells Each Site is Observed In",breaks=50)
abline(v=log10(min(num_cells_ncounted[num_cells_ncounted >= dim(binary_mat)[2]*cutoff])),lwd=2,col="indianred")
# hist(log10(new_counts),main="Number of Sites Each Cell Uses",breaks=50)
datafr_filtered = datafr[rownames(ncounts),]
return(datafr_filtered)
}
getJaccardDist <- function(cdBinary){
if(colnames(cdBinary[,2:3])[1] == 'start' && colnames(cdBinary[,2:3])[2] == 'end'){
SingleCell.Binary <- cdBinary[,4:(dim(cdBinary)[2])]
}
else
SingleCell.Binary <- cdBinary
SingleCell.Binary.Jaccard <- jaccard(as.matrix(SingleCell.Binary))
return(SingleCell.Binary.Jaccard)
}
plot_tSNE <- function(cdBinary, nDimToUSE, groups=NULL, cellName, perplexity_division, ret.val=FALSE , text.label=FALSE, title=""){
if(missing(nDimToUSE)){
stop("ERROR: Number of PCA's \"nDimToUSE\" to use is missing")
}
if(missing(ret.val)){
ret.val = FALSE
}
if(text.label==TRUE & missing(cellName)){
stop("ERROR: Please give cellName")
}
if(missing(perplexity_division)){
stop("ERROR: Please enter the number with which you want to divide the dimension of your data for perplexity setting")
}
#SingleCell.Binary.Jaccard <- getJaccardDist(cdBinary)
#FinalPCAData <- t(SingleCell.Binary.Jaccard)
SingleCell.Binary.Jaccard <- getJaccardDist(cdBinary)
fit <- cmdscale(as.dist(SingleCell.Binary.Jaccard),eig=TRUE, k=nDimToUSE)
#pcaPRComp <- prcomp(FinalPCAData)
#rtsne_pca_out <- Rtsne(as.matrix(pcaPRComp$x[,1:nPCAToUSE]), perplexity = dim(pcaPRComp$x)[1]/perplexity_division, check_duplicates = FALSE)
rtsne_pca_out <- Rtsne(as.matrix(fit$points[,1:nDimToUSE]), perplexity = perplexity_division,
check_duplicates = FALSE, pca=FALSE, theta=0.01, max_iter=3000)
if(is.null(groups)){
if(ret.val==TRUE)
df<-data.frame(X=rtsne_pca_out$Y[,1],Y=rtsne_pca_out$Y[,2], cellName=cellName)
else
df<-data.frame(X=rtsne_pca_out$Y[,1],Y=rtsne_pca_out$Y[,2])
p1 <- ggplot(df, aes_string(x="X",y ="Y"))
}
else{
if(ret.val==TRUE)
df<-data.frame(X=rtsne_pca_out$Y[,1],Y=rtsne_pca_out$Y[,2], Cell=colnames(SingleCell.Binary.Jaccard), Batch=groups, cellName=cellName)
else
df<-data.frame(X=rtsne_pca_out$Y[,1],Y=rtsne_pca_out$Y[,2], Cell=colnames(SingleCell.Binary.Jaccard), Batch=groups)
p1 <- ggplot(df, aes_string(x="X",y ="Y", color="Batch"))
}
p1<-p1 + ggtitle("t-SNE plot")
p1<-p1 + geom_point(size = 2)
p1<-p1 + xlab(paste("Dim-1"))
p1<-p1 + ylab("Dim-2")+
theme_light(base_size=24) +
theme(strip.background = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5))
# if(length(levels(groups)) < 12)
# p1 <- p1 + scale_colour_brewer(palette="Set1")
if(text.label==TRUE)
p1<-p1 + geom_text(data=df,aes(label=cellName),alpha=0.5,size=4, vjust=1,hjust=0.5,angle=45)
print(p1)
return(df)
}
start_time <- Sys.time()
metadata <- read.table('../../input/metadata.tsv',
header = TRUE,
stringsAsFactors=FALSE,quote="",row.names=1)
datafr_filtered <- filter_peaks(datafr)
dim(datafr_filtered)
cdBinary = as.matrix((datafr_filtered > 0) + 0)
cdBinary = Matrix(cdBinary, sparse = TRUE)
## use the similar number of dimensions as shown in tutorial https://github.com/ManchesterBioinference/Scasat/blob/master/ScAsAT_functions_Buenrostro_All_Bam_Together.ipynb
k = 15
SingleCell.Binary.Jaccard <- getJaccardDist(cdBinary)
fit <- cmdscale(as.dist(SingleCell.Binary.Jaccard),eig=TRUE, k=k)
fm_Scasat = t(fit$points)
dim(fm_Scasat)
fm_Scasat[1:5,1:5]
AAACGAAAGCGCAATG-1 | AAACGAAAGGGTATCG-1 | AAACGAAAGTAACATG-1 | AAACGAAAGTTACACC-1 | AAACGAACAGAGATGC-1 |
---|---|---|---|---|
-0.051167899 | -0.039053205 | -0.24040989 | -0.22332404 | 0.094020841 |
-0.147709843 | -0.161547157 | 0.04736274 | 0.04024757 | -0.100804918 |
-0.006473506 | 0.035480921 | 0.06608409 | 0.07443681 | -0.021050537 |
0.010683836 | 0.017649322 | -0.01767384 | -0.06423900 | -0.013695877 |
0.027035383 | 0.001416199 | 0.00852579 | -0.01542106 | 0.001180007 |
end_time <- Sys.time()
end_time - start_time
Time difference of 1.80813 mins
rownames(fm_Scasat) = paste('Dim',1:dim(fm_Scasat)[1])
fm_Scasat[1:5,1:5]
AAACGAAAGCGCAATG-1 | AAACGAAAGGGTATCG-1 | AAACGAAAGTAACATG-1 | AAACGAAAGTTACACC-1 | AAACGAACAGAGATGC-1 | |
---|---|---|---|---|---|
Dim 1 | -0.051167899 | -0.039053205 | -0.24040989 | -0.22332404 | 0.094020841 |
Dim 2 | -0.147709843 | -0.161547157 | 0.04736274 | 0.04024757 | -0.100804918 |
Dim 3 | -0.006473506 | 0.035480921 | 0.06608409 | 0.07443681 | -0.021050537 |
Dim 4 | 0.010683836 | 0.017649322 | -0.01767384 | -0.06423900 | -0.013695877 |
Dim 5 | 0.027035383 | 0.001416199 | 0.00852579 | -0.01542106 | 0.001180007 |
all(colnames(fm_Scasat) == rownames(metadata))
saveRDS(fm_Scasat, file = '../../output/feature_matrices/FM_Scasat_10xpbmc5k.rds')
options(repr.plot.width=6, repr.plot.height=4)
set.seed(2019)
tsNE_out_2PCs <- plot_tSNE(cdBinary, k, groups=as.character(metadata$label), perplexity=30,
cellName=rownames(metadata),ret.val=TRUE, text.label=F)
sessionInfo()
R version 3.5.1 (2018-07-02) Platform: x86_64-conda_cos6-linux-gnu (64-bit) Running under: CentOS Linux 7 (Core) Matrix products: default BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_scasat/lib/R/lib/libRblas.so locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 [7] LC_PAPER=en_US.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] ggplot2_3.0.0 Rtsne_0.15 Matrix_1.2-14 prabclus_2.2-7 mclust_5.4.3 [6] MASS_7.3-50 loaded via a namespace (and not attached): [1] Rcpp_1.0.1 compiler_3.5.1 pillar_1.3.1 plyr_1.8.4 [5] bindr_0.1.1 base64enc_0.1-3 tools_3.5.1 digest_0.6.18 [9] uuid_0.1-2 jsonlite_1.5 evaluate_0.11 tibble_2.1.1 [13] gtable_0.2.0 lattice_0.20-35 pkgconfig_2.0.2 rlang_0.3.3 [17] IRdisplay_0.5.0 IRkernel_0.8.12 bindrcpp_0.2.2 withr_2.1.2 [21] repr_0.15.0 stringr_1.3.1 dplyr_0.7.6 tidyselect_0.2.5 [25] grid_3.5.1 glue_1.3.1 R6_2.4.0 pbdZMQ_0.3-3 [29] purrr_0.3.2 magrittr_1.5 scales_0.5.0 htmltools_0.3.6 [33] assertthat_0.2.1 colorspace_1.3-2 labeling_0.3 stringi_1.2.4 [37] lazyeval_0.2.1 munsell_0.5.0 crayon_1.3.4
save.image(file = 'Scasat_10xpbmc5k.RData')