devtools::install_github("zji90/SCRATdatamm9")
source("https://raw.githubusercontent.com/zji90/SCRATdata/master/installcode.R")
library(devtools)
library(GenomicAlignments)
library(Rsamtools)
library(SCRATdatahg19)
library(SCRAT)
Loading required package: BiocGenerics Loading required package: parallel Attaching package: ‘BiocGenerics’ The following objects are masked from ‘package:parallel’: clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB The following objects are masked from ‘package:stats’: IQR, mad, sd, var, xtabs The following objects are masked from ‘package:base’: anyDuplicated, append, as.data.frame, basename, cbind, colMeans, colnames, colSums, dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min Loading required package: S4Vectors Loading required package: stats4 Attaching package: ‘S4Vectors’ The following object is masked from ‘package:base’: expand.grid Loading required package: IRanges Loading required package: GenomeInfoDb Loading required package: GenomicRanges Loading required package: SummarizedExperiment Loading required package: Biobase Welcome to Bioconductor Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'. Loading required package: DelayedArray Loading required package: matrixStats Attaching package: ‘matrixStats’ The following objects are masked from ‘package:Biobase’: anyMissing, rowMedians Loading required package: BiocParallel Attaching package: ‘DelayedArray’ The following objects are masked from ‘package:matrixStats’: colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges The following objects are masked from ‘package:base’: aperm, apply Loading required package: Biostrings Loading required package: XVector Attaching package: ‘Biostrings’ The following object is masked from ‘package:DelayedArray’: type The following object is masked from ‘package:base’: strsplit Loading required package: Rsamtools Warning message: “replacing previous import ‘DT::dataTableOutput’ by ‘shiny::dataTableOutput’ when loading ‘SCRAT’”Warning message: “replacing previous import ‘DT::renderDataTable’ by ‘shiny::renderDataTable’ when loading ‘SCRAT’”Warning message: “replacing previous import ‘mclust::em’ by ‘shiny::em’ when loading ‘SCRAT’”
start_time = Sys.time()
metadata <- read.table('../input/metadata.tsv',
header = TRUE,
stringsAsFactors=FALSE,quote="",row.names=1)
SCRATsummary <- function (dir = "", genome, bamfile = NULL, singlepair = "automated",
removeblacklist = T, log2transform = T, adjustlen = T, featurelist = c("GENE",
"ENCL", "MOTIF_TRANSFAC", "MOTIF_JASPAR", "GSEA"), customfeature = NULL,
Genestarttype = "TSSup", Geneendtype = "TSSdown", Genestartbp = 3000,
Geneendbp = 1000, ENCLclunum = 2000, Motifflank = 100, GSEAterm = "c5.bp",
GSEAstarttype = "TSSup", GSEAendtype = "TSSdown", GSEAstartbp = 3000,
GSEAendbp = 1000)
{
if (is.null(bamfile)) {
bamfile <- list.files(dir, pattern = ".bam$")
}
datapath <- system.file("extdata", package = paste0("SCRATdata",
genome))
bamdata <- list()
for (i in bamfile) {
filepath <- file.path(dir, i)
if (singlepair == "automated") {
bamfile <- BamFile(filepath)
tmpsingle <- readGAlignments(bamfile)
tmppair <- readGAlignmentPairs(bamfile)
pairendtf <- testPairedEndBam(bamfile)
if (pairendtf) {
tmp <- tmppair
startpos <- pmin(start(first(tmp)), start(last(tmp)))
endpos <- pmax(end(first(tmp)), end(last(tmp)))
id <- which(!is.na(as.character(seqnames(tmp))))
tmp <- GRanges(seqnames=as.character(seqnames(tmp))[id],IRanges(start=startpos[id],end=endpos[id]))
}
else {
tmp <- GRanges(tmpsingle)
}
}
else if (singlepair == "single") {
tmp <- GRanges(readGAlignments(filepath))
}
else if (singlepair == "pair") {
tmp <- readGAlignmentPairs(filepath)
startpos <- pmin(start(first(tmp)), start(last(tmp)))
endpos <- pmax(end(first(tmp)), end(last(tmp)))
id <- which(!is.na(as.character(seqnames(tmp))))
tmp <- GRanges(seqnames=as.character(seqnames(tmp))[id],IRanges(start=startpos[id],end=endpos[id]))
}
if (removeblacklist) {
load(paste0(datapath, "/gr/blacklist.rda"))
tmp <- tmp[-as.matrix(findOverlaps(tmp, gr))[, 1],
]
}
bamdata[[i]] <- tmp
}
bamsummary <- sapply(bamdata, length)
allres <- NULL
datapath <- system.file("extdata", package = paste0("SCRATdata",
genome))
if ("GENE" %in% featurelist) {
print("Processing GENE features")
load(paste0(datapath, "/gr/generegion.rda"))
if (Genestarttype == "TSSup") {
grstart <- ifelse(as.character(strand(gr)) == "+",
start(gr) - as.numeric(Genestartbp), end(gr) +
as.numeric(Genestartbp))
}
else if (Genestarttype == "TSSdown") {
grstart <- ifelse(as.character(strand(gr)) == "+",
start(gr) + as.numeric(Genestartbp), end(gr) -
as.numeric(Genestartbp))
}
else if (Genestarttype == "TESup") {
grstart <- ifelse(as.character(strand(gr)) == "+",
end(gr) - as.numeric(Genestartbp), start(gr) +
as.numeric(Genestartbp))
}
else if (Genestarttype == "TESdown") {
grstart <- ifelse(as.character(strand(gr)) == "+",
end(gr) + as.numeric(Genestartbp), start(gr) -
as.numeric(Genestartbp))
}
if (Geneendtype == "TSSup") {
grend <- ifelse(as.character(strand(gr)) == "+",
start(gr) - as.numeric(Geneendbp), end(gr) +
as.numeric(Geneendbp))
}
else if (Geneendtype == "TSSdown") {
grend <- ifelse(as.character(strand(gr)) == "+",
start(gr) + as.numeric(Geneendbp), end(gr) -
as.numeric(Geneendbp))
}
else if (Geneendtype == "TESup") {
grend <- ifelse(as.character(strand(gr)) == "+",
end(gr) - as.numeric(Geneendbp), start(gr) +
as.numeric(Geneendbp))
}
else if (Geneendtype == "TESdown") {
grend <- ifelse(as.character(strand(gr)) == "+",
end(gr) + as.numeric(Geneendbp), start(gr) -
as.numeric(Geneendbp))
}
ngr <- names(gr)
gr <- GRanges(seqnames = seqnames(gr), IRanges(start = pmin(grstart,
grend), end = pmax(grstart, grend)))
names(gr) <- ngr
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- end(gr) - start(gr) + 1
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
}
if ("ENCL" %in% featurelist) {
print("Processing ENCL features")
load(paste0(datapath, "/gr/ENCL", ENCLclunum, ".rda"))
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- sapply(gr, function(i) sum(end(i) - start(i) +
1))
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
}
if ("MOTIF_TRANSFAC" %in% featurelist) {
print("Processing MOTIF_TRANSFAC features")
load(paste0(datapath, "/gr/transfac1.rda"))
gr <- flank(gr, as.numeric(Motifflank), both = T)
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- sapply(gr, function(i) sum(end(i) - start(i) +
1))
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
load(paste0(datapath, "/gr/transfac2.rda"))
gr <- flank(gr, as.numeric(Motifflank), both = T)
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- sapply(gr, function(i) sum(end(i) - start(i) +
1))
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
if (genome %in% c("hg19", "hg38")) {
load(paste0(datapath, "/gr/transfac3.rda"))
gr <- flank(gr, as.numeric(Motifflank), both = T)
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- sapply(gr, function(i) sum(end(i) -
start(i) + 1))
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
}
}
if ("MOTIF_JASPAR" %in% featurelist) {
print("Processing MOTIF_JASPAR features")
load(paste0(datapath, "/gr/jaspar1.rda"))
gr <- flank(gr, as.numeric(Motifflank), both = T)
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- sapply(gr, function(i) sum(end(i) - start(i) +
1))
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
load(paste0(datapath, "/gr/jaspar2.rda"))
gr <- flank(gr, as.numeric(Motifflank), both = T)
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- sapply(gr, function(i) sum(end(i) - start(i) +
1))
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
}
if ("GSEA" %in% featurelist) {
print("Processing GSEA features")
for (i in GSEAterm) {
load(paste0(datapath, "/gr/GSEA", i, ".rda"))
allgr <- gr
for (sgrn in names(allgr)) {
gr <- allgr[[sgrn]]
if (GSEAstarttype == "TSSup") {
grstart <- ifelse(as.character(strand(gr)) ==
"+", start(gr) - as.numeric(GSEAstartbp),
end(gr) + as.numeric(GSEAstartbp))
}
else if (GSEAstarttype == "TSSdown") {
grstart <- ifelse(as.character(strand(gr)) ==
"+", start(gr) + as.numeric(GSEAstartbp),
end(gr) - as.numeric(GSEAstartbp))
}
else if (GSEAstarttype == "TESup") {
grstart <- ifelse(as.character(strand(gr)) ==
"+", end(gr) - as.numeric(GSEAstartbp), start(gr) +
as.numeric(GSEAstartbp))
}
else if (GSEAstarttype == "TESdown") {
grstart <- ifelse(as.character(strand(gr)) ==
"+", end(gr) + as.numeric(GSEAstartbp), start(gr) -
as.numeric(GSEAstartbp))
}
if (GSEAendtype == "TSSup") {
grend <- ifelse(as.character(strand(gr)) ==
"+", start(gr) - as.numeric(GSEAendbp), end(gr) +
as.numeric(GSEAendbp))
}
else if (GSEAendtype == "TSSdown") {
grend <- ifelse(as.character(strand(gr)) ==
"+", start(gr) + as.numeric(GSEAendbp), end(gr) -
as.numeric(GSEAendbp))
}
else if (GSEAendtype == "TESup") {
grend <- ifelse(as.character(strand(gr)) ==
"+", end(gr) - as.numeric(GSEAendbp), start(gr) +
as.numeric(GSEAendbp))
}
else if (GSEAendtype == "TESdown") {
grend <- ifelse(as.character(strand(gr)) ==
"+", end(gr) + as.numeric(GSEAendbp), start(gr) -
as.numeric(GSEAendbp))
}
ngr <- names(gr)
gr <- GRanges(seqnames = seqnames(gr), IRanges(start = pmin(grstart,
grend), end = pmax(grstart, grend)))
names(gr) <- ngr
allgr[[sgrn]] <- gr
}
gr <- allgr
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- sapply(gr, function(i) sum(end(i) -
start(i) + 1))
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
}
}
if ("Custom" %in% featurelist) {
print("Processing custom features")
gr <- read.table(customfeature, as.is = T, sep = "\t")
gr <- GRanges(seqnames = gr[, 1], IRanges(start = gr[,
2], end = gr[, 3]))
tmp <- sapply(bamdata, function(i) countOverlaps(gr,
i))
tmp <- sweep(tmp, 2, bamsummary, "/") * 10000
if (log2transform) {
tmp <- log2(tmp + 1)
}
if (adjustlen) {
grrange <- end(gr) - start(gr) + 1
tmp <- sweep(tmp, 1, grrange, "/") * 1e+06
}
tmp <- tmp[rowSums(tmp) > 0, , drop = F]
allres <- rbind(allres, tmp)
}
allres
}
df_out <- SCRATsummary(dir = "../input/sc-bams_nodup/",
genome = "mm9",
featurelist="MOTIF_JASPAR",
removeblacklist = FALSE,
log2transform = FALSE, adjustlen = FALSE)
end_time <- Sys.time()
end_time - start_time
dim(df_out)
df_out[1:5,1:5]
head(sapply(strsplit(colnames(df_out), "\\."),'[',2))
colnames(df_out) = sapply(strsplit(colnames(df_out), "\\."),'[',2)
dim(df_out)
df_out[1:5,1:5]
if(! all(colnames(df_out) == rownames(metadata))){
df_out = df_out[,rownames(metadata)]
dim(df_out)
}
saveRDS(df_out, file = '../output/feature_matrices/FM_SCRAT_cusanovich2018subset_motifs_no_blacklist_filtering.rds')
sessionInfo()
save.image(file = '../output/SCRAT_cusanovich2018subset_no_blacklist_filtering.RData')