library(data.table)
library(Matrix)
library(proxy)
library(Rtsne)
library(data.table)
library(irlba)
library(umap)
library(ggplot2)
Attaching package: ‘proxy’ The following object is masked from ‘package:Matrix’: as.matrix The following objects are masked from ‘package:stats’: as.dist, dist The following object is masked from ‘package:base’: as.matrix
bsub < count_reads_peaks_erisone.sh
path = './count_reads_peaks_output/'
files <- list.files(path,pattern = "\\.txt$")
length(files)
#assuming tab separated values with a header
datalist = lapply(files, function(x)fread(paste0(path,x))$V4)
#assuming the same header/columns for all files
datafr = do.call("cbind", datalist)
dim(datafr)
df_regions = read.csv("../../input/combined.sorted.merged.bed",
sep = '\t',header=FALSE,stringsAsFactors=FALSE)
dim(df_regions)
peaknames = paste(df_regions$V1,df_regions$V2,df_regions$V3,sep = "_")
head(peaknames)
head(sapply(strsplit(files,'\\.'),'[', 2))
colnames(datafr) = sapply(strsplit(files,'\\.'),'[', 2)
rownames(datafr) = peaknames
datafr[1:5,1:5]
AGCGATAGAATACGATAATGGCAGCTCGCAGGACGT | AGCGATAGAATATTACTTTCCGCGGACTGTACTGAC | AGCGATAGACCAGGCGCATGGCAGCTCGATAGAGGC | AGCGATAGAGATTACGTTGCGCAATGACGTACTGAC | AGCGATAGAGGTCAGCTTGGAGTTGCGTGTACTGAC | |
---|---|---|---|---|---|
chr1_3002715_3002962 | 0 | 0 | 0 | 0 | 0 |
chr1_3037090_3037634 | 0 | 0 | 0 | 0 | 0 |
chr1_3084622_3085850 | 0 | 0 | 0 | 0 | 0 |
chr1_3103610_3104006 | 0 | 0 | 0 | 0 | 0 |
chr1_3106869_3107182 | 0 | 0 | 0 | 0 | 0 |
dim(datafr)
# saveRDS(datafr, file = './datafr.rds')
# datafr = readRDS('./datafr.rds')
run_pca <- function(mat,num_pcs=50,remove_first_PC=FALSE,scale=FALSE,center=FALSE){
set.seed(2019)
mat = as.matrix(mat)
SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)
sk_diag = matrix(0, nrow=num_pcs, ncol=num_pcs)
diag(sk_diag) = SVD$d
if(remove_first_PC){
sk_diag[1,1] = 0
SVD_vd = (sk_diag %*% t(SVD$v))[2:num_pcs,]
}else{
SVD_vd = sk_diag %*% t(SVD$v)
}
return(SVD_vd)
}
elbow_plot <- function(mat,num_pcs=50,scale=FALSE,center=FALSE,title='',width=3,height=3){
set.seed(2019)
mat = data.matrix(mat)
SVD = irlba(mat, num_pcs, num_pcs,scale=scale,center=center)
options(repr.plot.width=width, repr.plot.height=height)
df_plot = data.frame(PC=1:num_pcs, SD=SVD$d);
# print(SVD$d[1:num_pcs])
p <- ggplot(df_plot, aes(x = PC, y = SD)) +
geom_point(col="#cd5c5c",size = 1) +
ggtitle(title)
return(p)
}
filter_peaks <- function (datafr,cutoff = 0.01){
binary_mat = as.matrix((datafr > 0) + 0)
binary_mat = Matrix(binary_mat, sparse = TRUE)
num_cells_ncounted = Matrix::rowSums(binary_mat)
ncounts = binary_mat[num_cells_ncounted >= dim(binary_mat)[2]*cutoff,]
ncounts = ncounts[rowSums(ncounts) > 0,]
options(repr.plot.width=4, repr.plot.height=4)
hist(log10(num_cells_ncounted),main="No. of Cells Each Site is Observed In",breaks=50)
abline(v=log10(min(num_cells_ncounted[num_cells_ncounted >= dim(binary_mat)[2]*cutoff])),lwd=2,col="indianred")
# hist(log10(new_counts),main="Number of Sites Each Cell Uses",breaks=50)
datafr_filtered = datafr[rownames(ncounts),]
return(datafr_filtered)
}
start_time <- Sys.time()
metadata <- read.table('../../input/metadata.tsv',
header = TRUE,
stringsAsFactors=FALSE,quote="",row.names=1)
datafr_filtered <- filter_peaks(datafr)
dim(datafr_filtered)
p_elbow_control <- elbow_plot(datafr_filtered,num_pcs = 100, title = 'PCA on the raw count')
p_elbow_control
fm_control = run_pca(datafr_filtered,num_pcs = 50)
dim(fm_control)
fm_control[1:3,1:3]
-13.397742 | -27.635632 | -60.01563053 |
-4.870369 | -6.288103 | 10.40814457 |
-1.223308 | -1.490986 | 0.05539264 |
end_time <- Sys.time()
end_time - start_time
Time difference of 7.680634 mins
colnames(fm_control) = colnames(datafr)
rownames(fm_control) = paste('PC',1:dim(fm_control)[1])
dim(fm_control)
all(colnames(fm_control) == rownames(metadata))
fm_control = fm_control[,rownames(metadata)]
dim(fm_control)
all(colnames(fm_control) == rownames(metadata))
saveRDS(fm_control, file = '../../output/feature_matrices/FM_Control_cusanovich2018subset.rds')
set.seed(0)
tsne_control = Rtsne(t(fm_control),pca=F)
library(RColorBrewer)
plot.tsne <- function(x, labels,
main="A tSNE visualization",n=20,
pad=0.1, cex=0.65, pch=19, add=FALSE, legend.suffix="",
cex.main=1, cex.legend=1) {
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
layout = x
xylim = range(layout)
xylim = xylim + ((xylim[2]-xylim[1])*pad)*c(-0.5, 0.5)
if (!add) {
par(mar=c(0.2,0.7,1.2,0.7), ps=10)
plot(xylim, xylim, type="n", axes=F, frame=F)
rect(xylim[1], xylim[1], xylim[2], xylim[2], border="#aaaaaa", lwd=0.25)
}
points(layout[,1], layout[,2], col=col_vector[as.integer(labels)],
cex=cex, pch=pch)
mtext(side=3, main, cex=cex.main)
labels.u = unique(labels)
legend.pos = "topright"
legend.text = as.character(labels.u)
if (add) {
legend.pos = "bottomright"
legend.text = paste(as.character(labels.u), legend.suffix)
}
legend(legend.pos, legend=legend.text,
col=col_vector[as.integer(labels.u)],
bty="n", pch=pch, cex=cex.legend)
}
options(repr.plot.width=5, repr.plot.height=5)
plot.tsne(tsne_control$Y,as.factor(metadata[,'label']))
sessionInfo()
R version 3.5.1 (2018-07-02) Platform: x86_64-conda_cos6-linux-gnu (64-bit) Running under: CentOS Linux 7 (Core) Matrix products: default BLAS/LAPACK: /data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_sciATAC/lib/R/lib/libRblas.so locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 [7] LC_PAPER=en_US.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] RColorBrewer_1.1-2 ggplot2_3.1.1 umap_0.2.0.0 irlba_2.3.2 [5] Rtsne_0.15 proxy_0.4-22 Matrix_1.2-17 data.table_1.12.2 loaded via a namespace (and not attached): [1] Rcpp_1.0.1 compiler_3.5.1 pillar_1.3.1 plyr_1.8.4 [5] base64enc_0.1-3 tools_3.5.1 digest_0.6.18 uuid_0.1-2 [9] jsonlite_1.6 evaluate_0.13 tibble_2.1.1 gtable_0.3.0 [13] lattice_0.20-38 pkgconfig_2.0.2 rlang_0.3.4 IRdisplay_0.7.0 [17] IRkernel_0.8.15 withr_2.1.2 repr_0.19.2 dplyr_0.8.0.1 [21] grid_3.5.1 tidyselect_0.2.5 reticulate_1.12 glue_1.3.1 [25] R6_2.4.0 pbdZMQ_0.3-3 purrr_0.3.2 magrittr_1.5 [29] scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1 colorspace_1.4-1 [33] labeling_0.3 lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
save.image(file = 'Control_cusanovich2018subset.RData')