Load libraries

list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra", "PerformanceAnalytics", "corrplot") #add new libraries here 

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Obtain session information

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.15.7
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] corrplot_0.84              PerformanceAnalytics_2.0.4
##  [3] xts_0.12-0                 zoo_1.8-8                 
##  [5] factoextra_1.0.7           vegan_2.5-6               
##  [7] lattice_0.20-41            permute_0.9-5             
##  [9] plotly_4.9.2.1             viridis_0.5.1             
## [11] viridisLite_0.3.0          Pstat_1.2                 
## [13] matrixStats_0.56.0         ggforce_0.3.1             
## [15] methylKit_1.8.1            GenomicRanges_1.34.0      
## [17] GenomeInfoDb_1.18.2        IRanges_2.16.0            
## [19] S4Vectors_0.20.1           BiocGenerics_0.28.0       
## [21] here_0.1                   reshape2_1.4.4            
## [23] forcats_0.5.0              stringr_1.4.0             
## [25] dplyr_0.8.5                purrr_0.3.4               
## [27] readr_1.3.1                tidyr_1.0.3               
## [29] tibble_3.0.1               ggplot2_3.3.0             
## [31] tidyverse_1.3.0           
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1            ellipsis_0.3.0             
##  [3] mclust_5.4.5                rprojroot_1.3-2            
##  [5] qvalue_2.14.1               XVector_0.22.0             
##  [7] fs_1.4.1                    rstudioapi_0.11            
##  [9] farver_2.0.3                ggrepel_0.8.2              
## [11] fansi_0.4.1                 mvtnorm_1.1-0              
## [13] lubridate_1.7.8             xml2_1.3.2                 
## [15] splines_3.5.1               R.methodsS3_1.8.0          
## [17] knitr_1.28                  polyclip_1.10-0            
## [19] jsonlite_1.6.1              Rsamtools_1.34.1           
## [21] broom_0.5.6                 cluster_2.1.0              
## [23] dbplyr_1.4.3                R.oo_1.23.0                
## [25] compiler_3.5.1              httr_1.4.1                 
## [27] backports_1.1.6             lazyeval_0.2.2             
## [29] assertthat_0.2.1            Matrix_1.2-18              
## [31] limma_3.38.3                cli_2.0.2                  
## [33] tweenr_1.0.1                htmltools_0.4.0            
## [35] tools_3.5.1                 coda_0.19-3                
## [37] gtable_0.3.0                glue_1.4.0                 
## [39] GenomeInfoDbData_1.2.0      Rcpp_1.0.4.6               
## [41] bbmle_1.0.23.1              Biobase_2.42.0             
## [43] cellranger_1.1.0            vctrs_0.3.0                
## [45] Biostrings_2.50.2           nlme_3.1-143               
## [47] rtracklayer_1.42.2          xfun_0.13                  
## [49] fastseg_1.28.0              rvest_0.3.5                
## [51] lifecycle_0.2.0             gtools_3.8.2               
## [53] XML_3.99-0.3                zlibbioc_1.28.0            
## [55] MASS_7.3-51.6               scales_1.1.1               
## [57] hms_0.5.3                   SummarizedExperiment_1.12.0
## [59] yaml_2.2.1                  gridExtra_2.3              
## [61] emdbook_1.3.12              bdsmatrix_1.3-4            
## [63] stringi_1.4.6               BiocParallel_1.16.6        
## [65] rlang_0.4.6                 pkgconfig_2.0.3            
## [67] bitops_1.0-6                evaluate_0.14              
## [69] htmlwidgets_1.5.1           GenomicAlignments_1.18.1   
## [71] tidyselect_1.1.0            plyr_1.8.6                 
## [73] magrittr_1.5                R6_2.4.1                   
## [75] generics_0.0.2              DelayedArray_0.8.0         
## [77] DBI_1.1.0                   mgcv_1.8-31                
## [79] pillar_1.4.4                haven_2.2.0                
## [81] withr_2.2.0                 RCurl_1.98-1.2             
## [83] modelr_0.1.7                crayon_1.3.4               
## [85] rmarkdown_2.1               grid_3.5.1                 
## [87] readxl_1.3.1                data.table_1.12.8          
## [89] reprex_0.3.0                digest_0.6.25              
## [91] numDeriv_2016.8-1.1         R.utils_2.9.2              
## [93] munsell_0.5.0               quadprog_1.5-8

Check current directory

getwd()
## [1] "/Volumes/Bumblebee/C.magister_methyl-oa/notebooks"

Download files from gannet

Create list of all bismark data files, which have reads that are already trimmed and aligned. These BAM files are also sorted and indexed.

IMPORTANT NOTE: The location of these files depends on where the user saved them. I (Laura) used an external hard drive.

Files were transferred from Hyak on December 27th, 2020 using rsync.

file.list=list("/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-06_S1_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-14_S2_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-22_S3_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH01-38_S4_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-04_S5_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-15_S6_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH03-33_S7_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-01_S8_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-06_S9_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-21_S10_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH05-24_S11_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-06_S12_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-11_S13_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH07-24_S14_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-02_S15_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-13_S16_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH09-28_S17_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-01_S18_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-08_S19_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam",
               "/Volumes/Bumblebee/C.magister_methyl-oa/qc-processing/MiSeq-QC/bismark/CH10-11_S20_L001_R1_001_val_1_bismark_bt2_pe.deduplicated.sorted.bam")

Read in MiSeq bismark summary with sample treatment info

summary <- read_delim(file="../qc-processing/MiSeq-QC/mbdall.txt", delim="\t") %>%
  arrange(sample_mbd) %>% filter(sample_mbd != "NA")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   sample = col_character(),
##   Low_CpGmeth = col_character(),
##   treatment = col_character(),
##   treatment_high_lowCO2 = col_character(),
##   developmental_stage = col_character(),
##   notes = col_character(),
##   DNAiso_batch = col_character(),
##   `pool for library?` = col_character()
## )
## See spec(...) for full column specifications.
head(summary)
## # A tibble: 6 x 30
##   sample sample_mbd Low_CpGmeth  tank treatment treatment_high_…
##   <chr>       <dbl> <chr>       <dbl> <chr>     <chr>           
## 1 CH01-…          1 no              3 LC        L               
## 2 CH01-…          2 no              3 LC        L               
## 3 CH01-…          3 no              1 LB        L               
## 4 CH01-…          4 no              1 LB        L               
## 5 CH03-…          5 yes             1 LB        L               
## 6 CH03-…          6 yes             1 LB        L               
## # … with 24 more variables: developmental_stage <chr>, notes <chr>,
## #   DNAiso_batch <chr>, conc_ng.uL <dbl>, yield_ug <dbl>,
## #   MBD_concentration_ng.uL <dbl>, Total_recovery_ng <dbl>,
## #   Percent_recovery <dbl>, MBD_date <dbl>,
## #   library_bioanlyzer_mean_fragment_size_bp <dbl>,
## #   library_bioanalyzer_molarity_pmol.L <dbl>, `pool for library?` <chr>,
## #   qubit_concentration_ng.uL <dbl>, qubit_molarity_nM <dbl>,
## #   library_4nM_uL <dbl>, Total_Reads <dbl>, Aligned_Reads <dbl>,
## #   Unaligned_Reads <dbl>, Ambiguously_Aligned_Reads <dbl>, Unique_reads <dbl>,
## #   perc_totalread_unique <dbl>, CpGs_Meth <dbl>, CHGs_Meth <dbl>,
## #   CHHs_Meth <dbl>

investigate possible factors influencing low %CpGmeth

cor(summary %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=.45)