bsmaploc="/Users/Shared/Apps/bsmap-2.74/" #to confirm you current directory run the command and you should see a wd directory !ls cd wd #This command downloads a archived file including six BS-seq libraries (4.3 Gb) #!wget http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_HTSdata/BiGo_lar_fastq_mcf.tgz !curl -O http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_HTSdata/BiGo_lar_fastq_mcf.tgz #uncompress files !tar -zxvf BiGo_lar_fastq_mcf.tgz #remove BiGo_lar_fastq_mcf.tgz #!rm BiGo_lar_fastq_mcf.tgz #Downloading the oyster genome #!wget http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.22.dna_sm.genome.fa !curl -O http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.22.dna_sm.genome.fa for i in ("M1","T1D3","T1D5", "M3", "T3D3", "T3D5"): !{bsmaploc}bsmap \ -a mcf_{i}_R1.fastq \ -b mcf_{i}_R2.fastq \ -d Crassostrea_gigas.GCA_000297895.1.22.dna_sm.genome.fa \ -o bsmap_out_{i}.sam \ -p 8 for i in ("M1","T1D3","T1D5", "M3", "T3D3", "T3D5"): !python {bsmaploc}methratio.py \ -d Crassostrea_gigas.GCA_000297895.1.22.dna_sm.genome.fa \ -u -z -g \ -o methratio_out_{i}.txt \ -s {bsmaploc}samtools \ bsmap_out_{i}.sam \ #first methratio files are converted to filter for CG context, 3x coverage (mr3x.awk), and reformatting (mr_gg.awk.sh). #due to issue passing variable to awk, simple scripts were used (included in repository) for i in ("M1","T1D3","T1D5", "M3", "T3D3", "T3D5"): !echo {i} !grep "[A-Z][A-Z]CG[A-Z]" methratio_out_{i}CG.txt !awk -f ../scripts/mr3x.awk methratio_out_{i}CG.txt > mr3x.{i}.txt !awk -f ../scripts/mr_gg.awk.sh mr3x.{i}.txt > mkfmt_{i}.txt %pylab inline %load_ext rpy2.ipython %%R #UNCOMMENT IF YOU NEED TO INSTALL PACKAGES # dependencies #install.packages( c("data.table","devtools")) #source("http://bioconductor.org/biocLite.R") #biocLite(c("GenomicRanges","IRanges")) # install the development version from github #library(devtools) #install_github("al2na/methylKit",build_vignettes=FALSE) %R library(methylKit) %%R file.list <- list ('mkfmt_M1.txt', 'mkfmt_T1D3.txt', 'mkfmt_T1D5.txt', 'mkfmt_M3.txt', 'mkfmt_T3D3.txt', 'mkfmt_T3D5.txt' ) %%R myobj=read(file.list,sample.id=list("1_sperm","1_72hpf","1_120hpf","3_sperm","3_72hpf","3_120hpf"),assembly="v9",treatment=c(0,0,0,1,1,1)) %%R meth<-unite(myobj) #getCorrelation(meth,plot=T) hc<- clusterSamples(meth, dist="correlation", method="ward", plot=T) #PCA<-PCASamples(meth) %%R #Family-specific DMLs #note that file.list was defined in prior section DMLobj=read(file.list,sample.id=list("M1","T1D3","T1D5","M3","T3D3","T3D5"),assembly="v9",treatment=c(1,1,1,0,0,0), context="CpG") lin<-unite(DMLobj) lin.pooled <- pool(lin, sample.ids <- c("lin_1", "lin_3")) lin_DML.fisher <- calculateDiffMeth(lin.pooled) select(lin_DML.fisher, 1) lin_DML_p <- getData(lin_DML.fisher) lin_DML_filt <- lin_DML_p[lin_DML_p$pvalue < 0.01 & lin_DML_p$meth.diff > 25,] write.csv(lin_DML_filt,file="lin_DML_filt") !wc -l lin_DML_filt %%R file.list <- list ('mkfmt_M1.txt', 'mkfmt_T1D3.txt', 'mkfmt_M3.txt', 'mkfmt_T3D3.txt' ) %%R #Developmentally different DMLs (Males v Day3) DMLobj=read(file.list,sample.id=list("M1","T1D3","M3","T3D3"), assembly="v9",treatment=c(1,0,1,0), context="CpG") DevelMvD3<-unite(DMLobj) DevelMvD3.pooled <- pool(DevelMvD3, sample.ids <- c("Males", "Day3")) DevelMvD3_DML.fisher <- calculateDiffMeth(DevelMvD3.pooled) select(DevelMvD3_DML.fisher, 1) DevelMvD3_DML_p <- getData(DevelMvD3_DML.fisher) DevelMvD3_DML_filt <- DevelMvD3_DML_p[DevelMvD3_DML_p$pvalue < 0.01 & DevelMvD3_DML_p$meth.diff > 25,] write.csv(DevelMvD3_DML_filt,file="DevelMvD3_DML_filt") !wc -l DevelMvD3_DML_filt %%R file.list <- list ('mkfmt_M1.txt', 'mkfmt_T1D5.txt', 'mkfmt_M3.txt', 'mkfmt_T3D5.txt' ) %%R #Developmentally different DMLs (Males v Day5) DMLobj=read(file.list,sample.id=list("M1","T1D5","M3","T3D5"), assembly="v9",treatment=c(1,0,1,0), context="CpG") DevelMvD5<-unite(DMLobj) DevelMvD5.pooled <- pool(DevelMvD5, sample.ids <- c("Males", "Day5")) DevelMvD5_DML.fisher <- calculateDiffMeth(DevelMvD5.pooled) select(DevelMvD5_DML.fisher, 1) DevelMvD5_DML_p <- getData(DevelMvD5_DML.fisher) DevelMvD5_DML_filt <- DevelMvD5_DML_p[DevelMvD5_DML_p$pvalue < 0.01 & DevelMvD5_DML_p$meth.diff > 25,] write.csv(DevelMvD5_DML_filt,file="DevelMvD5_DML_filt") !wc -l DevelMvD5_DML_filt %%R file.list <- list ('mkfmt_T1D3.txt', 'mkfmt_T1D5.txt', 'mkfmt_T3D3.txt', 'mkfmt_T3D5.txt' ) %%R #Developmentally different DMLs (Day3 v Day5) DMLobj=read(file.list,sample.id=list("T1D3","T1D5","T3D3","T3D5"), assembly="v9",treatment=c(1,0,1,0), context="CpG") DevelD3vD5<-unite(DMLobj) DevelD3vD5.pooled <- pool(DevelD3vD5, sample.ids <- c("Day3", "Day5")) DevelD3vD5_DML.fisher <- calculateDiffMeth(DevelD3vD5.pooled) select(DevelD3vD5_DML.fisher, 1) DevelD3vD5_DML_p <- getData(DevelD3vD5_DML.fisher) DevelD3vD5_DML_filt <- DevelD3vD5_DML_p[DevelD3vD5_DML_p$pvalue < 0.01 & DevelD3vD5_DML_p$meth.diff > 25,] write.csv(DevelD3vD5_DML_filt,file="DevelD3vD5_DML_filt") !wc -l DevelD3vD5_DML_filt #removing column titles !tail -n +2 DevelMvD5_DML_filt > DevelMvD5_DML !tail -n +2 DevelD3vD5_DML_filt > DevelD3vD5_DML #Concatenate all developmetnally different DMLs to one file !cat DevelMvD3_DML_filt DevelMvD5_DML DevelD3vD5_DML > Devel_DML_filt !wc -l Devel_DML_filt !tail -n +2 lin_DML_filt | awk -F, '{print $2, $3, $4, "DML_lin" }' | tr -d '"' | tr ' ' "\t" > lineage_dml.bed !wc -l lineage_dml.bed !tail -n +2 Devel_DML_filt | awk -F, '{print $2, $3, $4, "DML_dev" }' | tr -d '"' | tr ' ' "\t" > dev_dml.bed !wc -l dev_dml.bed #In order to find location of DMLs oyster genome tracks will be downloaded #and intersectbed (bedtools suite) run #Note track with all CG's is large (~977mb) cd genome_tracks for i in ("exon","intron","TEx","gene","1k5p_gene_promoter","CG"): !curl -O http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_{i}.gff cd .. for i in ("exon","intron","TEx","1k5p_gene_promoter"): !intersectbed \ -u \ -a lineage_dml.bed \ -b ./genome_tracks/Cgigas_v9_{i}.gff \ > {i}_intersect_DML_lin_u.txt !wc -l {i}_intersect_DML_lin_u.txt > lin{i} #Concatenate counts of genomic regions into one table for lineage-specific DMLs !cat linintron linexon lin1k5p_gene_promoter linTEx > lintable !awk 'FNR==NR{sum+=$1;next}; {print $0,sum}' lintable{,} > lin_total !awk '{print $2, $1, $3, (($1/$3)*100)}' lin_total > lineage_DMLs for i in ("exon","intron","TEx","1k5p_gene_promoter"): !intersectbed \ -u \ -a dev_dml.bed \ -b ./genome_tracks/Cgigas_v9_{i}.gff \ > {i}_intersect_DML_dev_u.txt !wc -l {i}_intersect_DML_dev_u.txt > dev{i} #Concatenate counts of genomic regions into one table for developmentally different DMLs !cat devintron devexon dev1k5p_gene_promoter devTEx > devtable !awk 'FNR==NR{sum+=$1;next}; {print $0,sum}' devtable{,} > dev_total !awk '{print $2, $1, $3, (($1/$3)*100)}' dev_total > developmental_DMLs for i in ("exon","intron","TEx","gene","1k5p_gene_promoter"): !intersectbed \ -u \ -a ./genome_tracks/Cgigas_v9_CG.gff \ -b ./genome_tracks/Cgigas_v9_{i}.gff \ > {i}_intersect_CG_u.txt !wc -l {i}_intersect_CG_u.txt > CG{i} #Concatenate counts of genomic regions into one table for all CGs in oyster genome !cat CGintron CGexon CG1k5p_gene_promoter CGTEx > CGtable !awk 'FNR==NR{sum+=$1;next}; {print $0,sum}' CGtable{,} > CG_total !awk '{print $2, $1, $3, (($1/$3)*100)}' CG_total > all_CGs !paste -d" " lineage_DMLs developmental_DMLs all_CGs > StackedBars !awk '{print $4, $8, $12}' StackedBars | tr ' ' "\t" > StackedBars_DMLs %%R DMLs<-as.matrix(read.table('StackedBars_DMLs', header=F)) colnames(DMLs)<-c("Lin DMLs","Devel DMLs", "All CpGs") par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=T) par(xpd=T, mar=par()$mar+c(0,0,0,5)) barplot(as.matrix(DMLs), col=c("#BCCCC1", "#8DAB96", "#4A7958", "#2F583B"), ylab="Proportion of CpG within a genomic region (%)") legend("topright",inset=c(-0.63,-0), legend=c("Transposable Element", "Promoter Region", "Exon", "Intron"), pch=c(19,19,19), col=c("#2F583B","#4A7958","#8DAB96","#BCCCC1")) #Formatting family-specific DML files for stats !wc -l lineage_dml.bed > lineage_countstotal !wc -l ./genome_tracks/Cgigas_v9_CG.gff > CG_countstotal !cat linTEx lineage_countstotal > Lineage_TEs !awk '{print $1}' Lineage_TEs > Lineage_TEs_counts !cat CGTEx CG_countstotal > CG_TEs !awk '{print $1}' CG_TEs > CG_TEs_counts !paste Lineage_TEs_counts CG_TEs_counts > LinTEs_combined !awk '{print $1, $2}' LinTEs_combined > Lineage_TEs_stats %%R #Stats for TEs: family-specific LinStats<- read.table('Lineage_TEs_stats') chisq.test(LinStats) #formatting developmental DML files for stats !wc -l dev_dml.bed > dev_countstotal !cat devTEx dev_countstotal > Dev_TEs !awk '{print $1}' Dev_TEs > Dev_TEs_counts !paste Dev_TEs_counts CG_TEs_counts > DevTEs_combined !awk '{print $1, $2}' DevTEs_combined > Dev_TEs_stats %%R #Stats for TEs: developmentally different DevStats<-read.table('Dev_TEs_stats') chisq.test(DevStats)