list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra", "PerformanceAnalytics", "corrplot") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] corrplot_0.84 PerformanceAnalytics_2.0.4
## [3] xts_0.12-0 zoo_1.8-8
## [5] factoextra_1.0.7 vegan_2.5-6
## [7] lattice_0.20-41 permute_0.9-5
## [9] plotly_4.9.2.1 viridis_0.5.1
## [11] viridisLite_0.3.0 Pstat_1.2
## [13] matrixStats_0.56.0 ggforce_0.3.1
## [15] methylKit_1.8.1 GenomicRanges_1.34.0
## [17] GenomeInfoDb_1.18.2 IRanges_2.16.0
## [19] S4Vectors_0.20.1 BiocGenerics_0.28.0
## [21] here_0.1 reshape2_1.4.4
## [23] forcats_0.5.0 stringr_1.4.0
## [25] dplyr_0.8.5 purrr_0.3.4
## [27] readr_1.3.1 tidyr_1.0.3
## [29] tibble_3.0.1 ggplot2_3.3.0
## [31] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ellipsis_0.3.0
## [3] mclust_5.4.5 rprojroot_1.3-2
## [5] qvalue_2.14.1 XVector_0.22.0
## [7] fs_1.4.1 rstudioapi_0.11
## [9] farver_2.0.3 ggrepel_0.8.2
## [11] fansi_0.4.1 mvtnorm_1.1-0
## [13] lubridate_1.7.8 xml2_1.3.2
## [15] splines_3.5.1 R.methodsS3_1.8.0
## [17] knitr_1.28 polyclip_1.10-0
## [19] jsonlite_1.6.1 Rsamtools_1.34.1
## [21] broom_0.5.6 cluster_2.1.0
## [23] dbplyr_1.4.3 R.oo_1.23.0
## [25] compiler_3.5.1 httr_1.4.1
## [27] backports_1.1.6 lazyeval_0.2.2
## [29] assertthat_0.2.1 Matrix_1.2-18
## [31] limma_3.38.3 cli_2.0.2
## [33] tweenr_1.0.1 htmltools_0.4.0
## [35] tools_3.5.1 coda_0.19-3
## [37] gtable_0.3.0 glue_1.4.0
## [39] GenomeInfoDbData_1.2.0 Rcpp_1.0.4.6
## [41] bbmle_1.0.23.1 Biobase_2.42.0
## [43] cellranger_1.1.0 vctrs_0.3.0
## [45] Biostrings_2.50.2 nlme_3.1-143
## [47] rtracklayer_1.42.2 xfun_0.13
## [49] fastseg_1.28.0 rvest_0.3.5
## [51] lifecycle_0.2.0 gtools_3.8.2
## [53] XML_3.99-0.3 zlibbioc_1.28.0
## [55] MASS_7.3-51.6 scales_1.1.1
## [57] hms_0.5.3 SummarizedExperiment_1.12.0
## [59] yaml_2.2.1 gridExtra_2.3
## [61] emdbook_1.3.12 bdsmatrix_1.3-4
## [63] stringi_1.4.6 BiocParallel_1.16.6
## [65] rlang_0.4.6 pkgconfig_2.0.3
## [67] bitops_1.0-6 evaluate_0.14
## [69] htmlwidgets_1.5.1 GenomicAlignments_1.18.1
## [71] tidyselect_1.1.0 plyr_1.8.6
## [73] magrittr_1.5 R6_2.4.1
## [75] generics_0.0.2 DelayedArray_0.8.0
## [77] DBI_1.1.0 mgcv_1.8-31
## [79] pillar_1.4.4 haven_2.2.0
## [81] withr_2.2.0 RCurl_1.98-1.2
## [83] modelr_0.1.7 crayon_1.3.4
## [85] rmarkdown_2.1 grid_3.5.1
## [87] readxl_1.3.1 data.table_1.12.8
## [89] reprex_0.3.0 digest_0.6.25
## [91] numDeriv_2016.8-1.1 R.utils_2.9.2
## [93] munsell_0.5.0 quadprog_1.5-8
getwd()
## [1] "/Volumes/Bumblebee/C.magister_methyl-oa/notebooks"
gannet
Files were transferred from Hyak on December 27th, 2020 using rsync
.
file.list=list("/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-06_S1_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-14_S2_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-22_S3_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-38_S4_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-04_S5_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-15_S6_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-33_S7_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-01_S8_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-06_S9_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-21_S10_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-24_S11_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-06_S12_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-11_S13_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-24_S14_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-02_S15_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-13_S16_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-28_S17_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-01_S18_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-08_S19_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
"/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-11_S20_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam")
summary <- read_delim(file="../qc-processing/MiSeq-QC/mbdall.txt", delim="\t") %>%
arrange(sample_mbd) %>% filter(sample_mbd != "NA")
## Parsed with column specification:
## cols(
## .default = col_double(),
## sample = col_character(),
## Low_CpGmeth = col_character(),
## treatment = col_character(),
## treatment_high_lowCO2 = col_character(),
## developmental_stage = col_character(),
## notes = col_character(),
## DNAiso_batch = col_character(),
## `pool for library?` = col_character()
## )
## See spec(...) for full column specifications.
head(summary)
## # A tibble: 6 x 30
## sample sample_mbd Low_CpGmeth tank treatment treatment_high_…
## <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 CH01-… 1 no 3 LC L
## 2 CH01-… 2 no 3 LC L
## 3 CH01-… 3 no 1 LB L
## 4 CH01-… 4 no 1 LB L
## 5 CH03-… 5 yes 1 LB L
## 6 CH03-… 6 yes 1 LB L
## # … with 24 more variables: developmental_stage <chr>, notes <chr>,
## # DNAiso_batch <chr>, conc_ng.uL <dbl>, yield_ug <dbl>,
## # MBD_concentration_ng.uL <dbl>, Total_recovery_ng <dbl>,
## # Percent_recovery <dbl>, MBD_date <dbl>,
## # library_bioanlyzer_mean_fragment_size_bp <dbl>,
## # library_bioanalyzer_molarity_pmol.L <dbl>, `pool for library?` <chr>,
## # qubit_concentration_ng.uL <dbl>, qubit_molarity_nM <dbl>,
## # library_4nM_uL <dbl>, Total_Reads <dbl>, Aligned_Reads <dbl>,
## # Unaligned_Reads <dbl>, Ambiguously_Aligned_Reads <dbl>, Unique_reads <dbl>,
## # perc_totalread_unique <dbl>, CpGs_Meth <dbl>, CHGs_Meth <dbl>,
## # CHHs_Meth <dbl>
cor(summary %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=.45)
#### Look closer at factors that correlate with CpGs_Meth (dark(ish) blue dots)
chart.Correlation(summary %>%
select(CpGs_Meth, library_bioanlyzer_mean_fragment_size_bp, qubit_concentration_ng.uL,
Total_Reads, Aligned_Reads, perc_totalread_unique, CHHs_Meth),
histogram=F, pch=19)
#### Plot meth ~ fragment size (bioanalyzer)
ggplot(summary,
aes(x=library_bioanlyzer_mean_fragment_size_bp, y=CpGs_Meth,
label=sample_mbd, color=Low_CpGmeth)) +
geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
geom_text(aes(label=sample_mbd), vjust = -0.8)
ggplot(summary,
aes(x=Total_Reads, y=CpGs_Meth,
label=sample_mbd, color=Low_CpGmeth)) +
geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
geom_text(aes(label=sample_mbd), vjust = -0.8)
ggplot(summary,
aes(x=Aligned_Reads, y=CpGs_Meth,
label=sample_mbd, color=Low_CpGmeth)) +
geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
geom_text(aes(label=sample_mbd), vjust = -0.8)
ggplot(summary,
aes(x=perc_totalread_unique, y=CpGs_Meth,
label=sample_mbd, color=Low_CpGmeth)) +
geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
geom_text(aes(label=sample_mbd), vjust = -0.8)
ggplot(summary,
aes(x=CHHs_Meth, y=CpGs_Meth,
label=sample_mbd, color=Low_CpGmeth)) +
geom_point(size=3) + scale_color_manual(values=c("black", "red")) +
geom_text(aes(label=sample_mbd), vjust = -0.8)
#### The following command reads sorted BAM files and calls methylation percentage per base, and creates a methylRaw object for CpG methylation. It also assigns minimum coverage of 2x and the treatments (in this case, the CO2 treatment)
myobj_MiSeq = processBismarkAln(location = file.list, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20"), assembly = "pilon_scaffolds.fasta", read.context="CpG", mincov=2, treatment = as.numeric(as.factor(summary$treatment_high_lowCO2)))
#getwd()
#save(myobj_MiSeq, file = "../qc-processing/MiSeq-QC/myobj_MiSeq")
load("../qc-processing/MiSeq-QC/myobj_MiSeq")
# Check out data for sample #1
head(myobj_MiSeq[[1]])
## chr start end strand coverage numCs numTs
## 1 contig_1_pilon 24401 24401 - 2 0 2
## 2 contig_1_pilon 24409 24409 - 2 0 2
## 3 contig_1_pilon 24419 24419 - 2 0 2
## 4 contig_10_pilon 34002 34002 + 2 0 2
## 5 contig_10_pilon 51920 51920 - 2 0 2
## 6 contig_10_pilon 55069 55069 + 2 0 2
# First look at % CpG methylation (panels)
for (i in 1:20) {
getMethylationStats(myobj_MiSeq[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
# Now look at coverage
for (i in 1:20) {
getCoverageStats(myobj_MiSeq[[i]],plot=TRUE,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
meth_unite=methylKit::unite(myobj_MiSeq, destrand=TRUE, min.per.group=1L)
#save(meth_unite, file = "../qc-processing/MiSeq-QC/meth_unite") #save object to file
meth_unite_reshaped <- melt(data=meth_unite, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
tidyr::separate(col = "variable" , into = c("variable", "sample"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
dcast(chr+start+end+strand+sample ~ variable) %>%
drop_na(coverage) %>%
mutate(percMeth = 100*(numCs/coverage)) %>%
mutate(sample=as.numeric(sample))
#save(meth_unite_reshaped, file = "../qc-processing/MiSeq-QC/meth_unite_reshaped")
# Load reshaped object if needed
load(file = "../qc-processing/MiSeq-QC/meth_unite_reshaped")
head(meth_unite_reshaped)
## chr start end strand sample coverage numCs numTs percMeth
## 1 contig_1003_pilon 25122 25122 + 1 10 10 0 100
## 2 contig_1003_pilon 25143 25143 + 1 7 7 0 100
## 3 contig_1003_pilon 25146 25146 + 1 3 3 0 100
## 4 contig_1003_pilon 25162 25162 + 1 5 5 0 100
## 5 contig_1003_pilon 25166 25166 + 1 5 3 2 60
## 6 contig_1003_pilon 25186 25186 + 1 7 7 0 100
meth_unite_reshaped %>%
group_by(sample) %>%
summarize(mean = mean(percMeth), nloci = n())
## # A tibble: 20 x 3
## sample mean nloci
## <dbl> <dbl> <int>
## 1 1 77.9 307273
## 2 2 72.6 372596
## 3 3 78.3 299331
## 4 4 72.7 333387
## 5 5 24.1 51783
## 6 6 21.6 60153
## 7 7 74.9 431389
## 8 8 65.5 139884
## 9 9 71.0 337914
## 10 10 74.8 414587
## 11 11 56.7 206273
## 12 12 76.0 280847
## 13 13 72.7 491751
## 14 14 72.3 258637
## 15 15 74.8 416299
## 16 16 35.2 132251
## 17 17 48.9 64462
## 18 18 77.0 347761
## 19 19 68.5 280665
## 20 20 17.8 31092
meth_unite_reshaped %>%
filter(coverage>=5) %>%
group_by(sample) %>%
summarize(mean = mean(percMeth,), nloci = n())
## # A tibble: 20 x 3
## sample mean nloci
## <dbl> <dbl> <int>
## 1 1 83.5 150500
## 2 2 78.6 165889
## 3 3 83.7 121088
## 4 4 80.9 158257
## 5 5 2.41 2773
## 6 6 2.18 3054
## 7 7 81.5 224040
## 8 8 69.3 10767
## 9 9 80.6 158567
## 10 10 80.2 240812
## 11 11 59.6 16414
## 12 12 81.4 97256
## 13 13 79.4 288343
## 14 14 78.8 113501
## 15 15 79.9 190444
## 16 16 9.69 6078
## 17 17 61.0 5172
## 18 18 82.3 197277
## 19 19 76.4 53183
## 20 20 1.84 1418
meth_unite_reshaped %>%
filter(coverage>=10) %>%
group_by(sample) %>%
summarize(mean = mean(percMeth), nloci = n())
## # A tibble: 20 x 3
## sample mean nloci
## <dbl> <dbl> <int>
## 1 1 84.7 75634
## 2 2 80.1 48507
## 3 3 85.2 36303
## 4 4 82.7 78369
## 5 5 2.29 864
## 6 6 1.88 919
## 7 7 83.3 85681
## 8 8 54.1 417
## 9 9 83.9 82386
## 10 10 81.6 127130
## 11 11 21.0 1281
## 12 12 82.7 25360
## 13 13 81.1 132739
## 14 14 80.0 56056
## 15 15 81.4 66130
## 16 16 3.15 1908
## 17 17 51.5 846
## 18 18 83.5 113142
## 19 19 72.6 3528
## 20 20 1.38 366
meth_unite_reshaped %>%
filter(coverage>=15) %>%
group_by(sample) %>%
summarize(mean = mean(percMeth), n = n())
## # A tibble: 20 x 3
## sample mean n
## <dbl> <dbl> <int>
## 1 1 85.2 39878
## 2 2 80.4 11926
## 3 3 85.5 9520
## 4 4 83.5 41526
## 5 5 1.96 379
## 6 6 1.49 419
## 7 7 84.1 27937
## 8 8 36.3 74
## 9 9 85.3 52617
## 10 10 82.3 66801
## 11 11 14.0 465
## 12 12 83.0 5656
## 13 13 82.0 52841
## 14 14 80.6 30239
## 15 15 81.8 19507
## 16 16 2.64 910
## 17 17 29.8 214
## 18 18 84.3 68228
## 19 19 54.9 336
## 20 20 1.03 154
The following samples have low CpG methylation: 5, 6, 9, 13, 14, kind of 19. Check out each sample individually
meth_unite_reshaped %>%
ggplot() + geom_point(aes(x=coverage, y=percMeth)) +
facet_wrap(~sample)
MiSeq_5x=filterByCoverage(myobj_MiSeq,lo.count=5,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
save(MiSeq_5x, file="../qc-processing/MiSeq-QC/MiSeq_5x")
for (i in 1:20) {
getMethylationStats(MiSeq_5x[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
MiSeq_10x=filterByCoverage(myobj_MiSeq,lo.count=10,lo.perc=NULL,
hi.count=100,hi.perc=NULL)
save(MiSeq_10x, file="../qc-processing/MiSeq-QC/MiSeq_10x")
for (i in 1:20) {
getMethylationStats(MiSeq_10x[[i]],plot=T,both.strands=TRUE)
mtext(paste("Sample", i, sep=" "), side=3, line = -6)
}
clusterSamples(meth_unite, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 20
PCASamples(meth_unite)