library(Seurat) library(dplyr) library(Matrix) # Load the PBMC dataset pbmc.data <- Read10X("data/pbmc3k_filtered_gene_bc_matrices/hg19") previous_time <- proc.time()[3] # Initialize the Seurat object with the raw (non-normalized data) # Note that this is slightly different than the older Seurat workflow, where log-normalized values were passed in directly. # You can continue to pass in log-normalized values, just set do.logNormalize=F in the next step. pbmc <- new("seurat", raw.data = pbmc.data) # Keep all genes expressed in >= 3 cells, keep all cells with >= 200 genes # Perform log-normalization, first scaling each cell to a total of 1e4 molecules (as in Macosko et al. Cell 2015) pbmc <- Setup(pbmc, min.cells = 3, min.genes = 200, do.logNormalize = T, total.expr = 1e4, project = "10X_PBMC") proc.time()[3] - previous_time # The number of genes and UMIs (nGene and nUMI) are automatically calculated for every object by Seurat. # For non-UMI data, nUMI represents the sum of the non-normalized values within a cell # We calculate the percentage of mitochondrial genes here and store it in percent.mito using the AddMetaData. # The % of UMI mapping to MT-genes is a common scRNA-seq QC metric. # NOTE: You must have the Matrix package loaded to calculate the percent.mito values. mito.genes <- grep("^MT-", rownames(pbmc@data), value = T) percent.mito <- colSums(expm1(pbmc@data[mito.genes, ]))/colSums(expm1(pbmc@data)) #AddMetaData adds columns to object@data.info, and is a great place to stash QC stats pbmc <- AddMetaData(pbmc, percent.mito, "percent.mito") VlnPlot(pbmc, c("nGene", "nUMI", "percent.mito"), nCol = 3) #GenePlot is typically used to visualize gene-gene relationships, but can be used for anything calculated by the object, i.e. columns in object@data.info, PC scores etc. #Since there is a rare subset of cells with an outlier level of high mitochondrial percentage, and also low UMI content, we filter these as well par(mfrow = c(1, 2)) GenePlot(pbmc, "nUMI", "percent.mito") GenePlot(pbmc, "nUMI", "nGene") #We filter out cells that have unique gene counts over 2,500 #Note that accept.high and accept.low can be used to define a 'gate', and can filter cells not only based on nGene but on anything in the object (as in GenePlot above) pbmc <- SubsetData(pbmc, subset.name = "nGene", accept.high = 2500) pbmc <- SubsetData(pbmc, subset.name = "percent.mito", accept.high = 0.05) previous_time <- proc.time()[3] #note that this overwrites pbmc@scale.data. Therefore, if you intend to use RegressOut, you can set do.scale=F and do.center=F in the original object to save some time. pbmc <- RegressOut(pbmc, latent.vars = c("nUMI", "percent.mito")) proc.time()[3] - previous_time pbmc <- MeanVarPlot(pbmc ,fxn.x = expMean, fxn.y = logVarDivMean, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5, do.contour = F) previous_time <- proc.time()[3] pbmc <- PCA(pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 5, genes.print = 5) proc.time()[3] - previous_time PCAPlot(pbmc, 1, 2) PCElbowPlot(pbmc) previous_time <- proc.time()[3] #save.SNN=T saves the SNN so that the SLM algorithm can be rerun using the same graph, but with a different resolution value (see docs for full details) pbmc <- FindClusters(pbmc, pc.use = 1:10, resolution = 0.6, print.output = 0, save.SNN = T) proc.time()[3] - previous_time previous_time <- proc.time()[3] pbmc <- RunTSNE(pbmc, dims.use = 1:10, do.fast = T) proc.time()[3] - previous_time previous_time <- proc.time()[3] # find all markers of cluster 1 cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25) print(head(cluster1.markers, 5)) proc.time()[3] - previous_time previous_time <- proc.time()[3] # find all markers distinguishing cluster 5 from clusters 0 and 3 cluster5.markers <- FindMarkers(pbmc, 5, c(0,3), min.pct = 0.25) print(head(cluster5.markers, 5)) proc.time()[3] - previous_time previous_time <- proc.time()[3] # find markers for every cluster compared to all remaining cells, report only the positive ones pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25) pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_diff) proc.time()[3] - previous_time