bsmaploc="/Users/Shared/Apps/bsmap-2.74/"

#to confirm you current directory run the command and you should see a wd directory
!ls


cd wd

#This command downloads a archived file including six BS-seq libraries (4.3 Gb)

#!wget http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_HTSdata/BiGo_lar_fastq_mcf.tgz
!curl -O http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_HTSdata/BiGo_lar_fastq_mcf.tgz    


#uncompress files
!tar -zxvf BiGo_lar_fastq_mcf.tgz

#remove BiGo_lar_fastq_mcf.tgz
#!rm BiGo_lar_fastq_mcf.tgz 

#Downloading the oyster genome 
#!wget http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.22.dna_sm.genome.fa
!curl -O http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_ensembl_tracks/Crassostrea_gigas.GCA_000297895.1.22.dna_sm.genome.fa    
    

for i in ("M1","T1D3","T1D5", "M3", "T3D3", "T3D5"):
    !{bsmaploc}bsmap \
-a mcf_{i}_R1.fastq \
-b mcf_{i}_R2.fastq \
-d Crassostrea_gigas.GCA_000297895.1.22.dna_sm.genome.fa \
-o bsmap_out_{i}.sam \
-p 8

for i in ("M1","T1D3","T1D5", "M3", "T3D3", "T3D5"):
    !python {bsmaploc}methratio.py \
-d Crassostrea_gigas.GCA_000297895.1.22.dna_sm.genome.fa \
-u -z -g \
-o methratio_out_{i}.txt \
-s {bsmaploc}samtools \
bsmap_out_{i}.sam \


#first methratio files are converted to filter for CG context, 3x coverage (mr3x.awk), and reformatting (mr_gg.awk.sh).
#due to issue passing variable to awk, simple scripts were used (included in repository)
for i in ("M1","T1D3","T1D5", "M3", "T3D3", "T3D5"):
    !echo {i}
    !grep "[A-Z][A-Z]CG[A-Z]" <methratio_out_{i}.txt> methratio_out_{i}CG.txt
    !awk -f ../scripts/mr3x.awk methratio_out_{i}CG.txt > mr3x.{i}.txt
    !awk -f ../scripts/mr_gg.awk.sh mr3x.{i}.txt > mkfmt_{i}.txt


%pylab inline

%load_ext rpy2.ipython


%%R
#UNCOMMENT IF YOU NEED TO INSTALL PACKAGES
# dependencies
#install.packages( c("data.table","devtools"))
#source("http://bioconductor.org/biocLite.R")
#biocLite(c("GenomicRanges","IRanges"))

# install the development version from github
#library(devtools)
#install_github("al2na/methylKit",build_vignettes=FALSE)


%R library(methylKit)


%%R file.list <- list 
('mkfmt_M1.txt',
 'mkfmt_T1D3.txt',
 'mkfmt_T1D5.txt',
 'mkfmt_M3.txt',
 'mkfmt_T3D3.txt',
 'mkfmt_T3D5.txt'
)


%%R
myobj=read(file.list,sample.id=list("1_sperm","1_72hpf","1_120hpf","3_sperm","3_72hpf","3_120hpf"),assembly="v9",treatment=c(0,0,0,1,1,1))

%%R
meth<-unite(myobj)
#getCorrelation(meth,plot=T)
hc<- clusterSamples(meth, dist="correlation", method="ward", plot=T)
#PCA<-PCASamples(meth)

%%R
#Family-specific DMLs
#note that file.list was defined in prior section
DMLobj=read(file.list,sample.id=list("M1","T1D3","T1D5","M3","T3D3","T3D5"),assembly="v9",treatment=c(1,1,1,0,0,0), context="CpG")
lin<-unite(DMLobj)
lin.pooled <- pool(lin, sample.ids <- c("lin_1", "lin_3"))
lin_DML.fisher <- calculateDiffMeth(lin.pooled)
select(lin_DML.fisher, 1)
lin_DML_p <- getData(lin_DML.fisher)
lin_DML_filt <- lin_DML_p[lin_DML_p$pvalue < 0.01 & lin_DML_p$meth.diff > 25,]
write.csv(lin_DML_filt,file="lin_DML_filt")

!wc -l lin_DML_filt

%%R file.list <- list 
('mkfmt_M1.txt',
 'mkfmt_T1D3.txt',
 'mkfmt_M3.txt',
 'mkfmt_T3D3.txt'
)

%%R 
#Developmentally different DMLs (Males v Day3)
DMLobj=read(file.list,sample.id=list("M1","T1D3","M3","T3D3"), assembly="v9",treatment=c(1,0,1,0), context="CpG")
DevelMvD3<-unite(DMLobj)
DevelMvD3.pooled <- pool(DevelMvD3, sample.ids <- c("Males", "Day3"))
DevelMvD3_DML.fisher <- calculateDiffMeth(DevelMvD3.pooled)
select(DevelMvD3_DML.fisher, 1)
DevelMvD3_DML_p <- getData(DevelMvD3_DML.fisher)
DevelMvD3_DML_filt <- DevelMvD3_DML_p[DevelMvD3_DML_p$pvalue < 0.01 & DevelMvD3_DML_p$meth.diff > 25,]
write.csv(DevelMvD3_DML_filt,file="DevelMvD3_DML_filt")

!wc -l DevelMvD3_DML_filt

%%R file.list <- list 
('mkfmt_M1.txt',
 'mkfmt_T1D5.txt',
 'mkfmt_M3.txt',
 'mkfmt_T3D5.txt'
)

%%R 
#Developmentally different DMLs (Males v Day5)
DMLobj=read(file.list,sample.id=list("M1","T1D5","M3","T3D5"), assembly="v9",treatment=c(1,0,1,0), context="CpG")
DevelMvD5<-unite(DMLobj)
DevelMvD5.pooled <- pool(DevelMvD5, sample.ids <- c("Males", "Day5"))
DevelMvD5_DML.fisher <- calculateDiffMeth(DevelMvD5.pooled)
select(DevelMvD5_DML.fisher, 1)
DevelMvD5_DML_p <- getData(DevelMvD5_DML.fisher)
DevelMvD5_DML_filt <- DevelMvD5_DML_p[DevelMvD5_DML_p$pvalue < 0.01 & DevelMvD5_DML_p$meth.diff > 25,]
write.csv(DevelMvD5_DML_filt,file="DevelMvD5_DML_filt")

!wc -l DevelMvD5_DML_filt

%%R file.list <- list 
('mkfmt_T1D3.txt',
 'mkfmt_T1D5.txt',
 'mkfmt_T3D3.txt',
 'mkfmt_T3D5.txt'
)

%%R 
#Developmentally different DMLs (Day3 v Day5)
DMLobj=read(file.list,sample.id=list("T1D3","T1D5","T3D3","T3D5"), assembly="v9",treatment=c(1,0,1,0), context="CpG")
DevelD3vD5<-unite(DMLobj)
DevelD3vD5.pooled <- pool(DevelD3vD5, sample.ids <- c("Day3", "Day5"))
DevelD3vD5_DML.fisher <- calculateDiffMeth(DevelD3vD5.pooled)
select(DevelD3vD5_DML.fisher, 1)
DevelD3vD5_DML_p <- getData(DevelD3vD5_DML.fisher)
DevelD3vD5_DML_filt <- DevelD3vD5_DML_p[DevelD3vD5_DML_p$pvalue < 0.01 & DevelD3vD5_DML_p$meth.diff > 25,]
write.csv(DevelD3vD5_DML_filt,file="DevelD3vD5_DML_filt")

!wc -l DevelD3vD5_DML_filt

#removing column titles
!tail -n +2 DevelMvD5_DML_filt > DevelMvD5_DML
!tail -n +2 DevelD3vD5_DML_filt > DevelD3vD5_DML 

#Concatenate all developmetnally different DMLs to one file
!cat DevelMvD3_DML_filt DevelMvD5_DML DevelD3vD5_DML > Devel_DML_filt

!wc -l Devel_DML_filt

!tail -n +2 lin_DML_filt | awk -F, '{print $2, $3, $4, "DML_lin" }' | tr -d '"' | tr ' ' "\t"  > lineage_dml.bed

!wc -l lineage_dml.bed

!tail -n +2 Devel_DML_filt | awk -F, '{print $2, $3, $4, "DML_dev" }' | tr -d '"' | tr ' ' "\t"  > dev_dml.bed

!wc -l dev_dml.bed

#In order to find location of DMLs oyster genome tracks will be downloaded
#and intersectbed (bedtools suite) run
#Note track with all CG's is large (~977mb)

cd genome_tracks


for i in ("exon","intron","TEx","gene","1k5p_gene_promoter","CG"):
    !curl -O http://eagle.fish.washington.edu/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_{i}.gff 


cd ..

for i in ("exon","intron","TEx","1k5p_gene_promoter"):
    !intersectbed \
    -u \
    -a lineage_dml.bed \
    -b ./genome_tracks/Cgigas_v9_{i}.gff \
    > {i}_intersect_DML_lin_u.txt
    !wc -l {i}_intersect_DML_lin_u.txt > lin{i}

#Concatenate counts of genomic regions into one table for lineage-specific DMLs
!cat linintron linexon lin1k5p_gene_promoter linTEx > lintable

!awk 'FNR==NR{sum+=$1;next}; {print $0,sum}' lintable{,} > lin_total

!awk '{print $2, $1, $3, (($1/$3)*100)}' lin_total > lineage_DMLs

for i in ("exon","intron","TEx","1k5p_gene_promoter"):
    !intersectbed \
    -u \
    -a dev_dml.bed \
    -b ./genome_tracks/Cgigas_v9_{i}.gff \
    > {i}_intersect_DML_dev_u.txt
    !wc -l {i}_intersect_DML_dev_u.txt > dev{i}

#Concatenate counts of genomic regions into one table for developmentally different DMLs
!cat devintron devexon dev1k5p_gene_promoter devTEx  > devtable

!awk 'FNR==NR{sum+=$1;next}; {print $0,sum}' devtable{,} > dev_total

!awk '{print $2, $1, $3, (($1/$3)*100)}' dev_total > developmental_DMLs

for i in ("exon","intron","TEx","gene","1k5p_gene_promoter"):
    !intersectbed \
    -u \
    -a ./genome_tracks/Cgigas_v9_CG.gff \
    -b ./genome_tracks/Cgigas_v9_{i}.gff \
    > {i}_intersect_CG_u.txt
    !wc -l {i}_intersect_CG_u.txt > CG{i}

#Concatenate counts of genomic regions into one table for all CGs in oyster genome
!cat CGintron CGexon CG1k5p_gene_promoter CGTEx > CGtable

!awk 'FNR==NR{sum+=$1;next}; {print $0,sum}' CGtable{,} > CG_total

!awk '{print $2, $1, $3, (($1/$3)*100)}' CG_total > all_CGs

!paste -d" " lineage_DMLs developmental_DMLs all_CGs > StackedBars

!awk '{print $4, $8, $12}' StackedBars | tr ' ' "\t" > StackedBars_DMLs

%%R 
DMLs<-as.matrix(read.table('StackedBars_DMLs', header=F))
colnames(DMLs)<-c("Lin DMLs","Devel DMLs", "All CpGs")
par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=T)
par(xpd=T, mar=par()$mar+c(0,0,0,5))
barplot(as.matrix(DMLs), col=c("#BCCCC1", "#8DAB96", "#4A7958", "#2F583B"), ylab="Proportion of CpG within a genomic region (%)")
legend("topright",inset=c(-0.63,-0), legend=c("Transposable Element", "Promoter Region", "Exon", "Intron"), pch=c(19,19,19), col=c("#2F583B","#4A7958","#8DAB96","#BCCCC1"))

#Formatting family-specific DML files for stats
!wc -l lineage_dml.bed > lineage_countstotal

!wc -l ./genome_tracks/Cgigas_v9_CG.gff > CG_countstotal

!cat linTEx lineage_countstotal > Lineage_TEs

!awk '{print $1}' Lineage_TEs > Lineage_TEs_counts

!cat CGTEx CG_countstotal > CG_TEs

!awk '{print $1}' CG_TEs > CG_TEs_counts

!paste Lineage_TEs_counts CG_TEs_counts > LinTEs_combined

!awk '{print $1, $2}' LinTEs_combined > Lineage_TEs_stats

%%R
#Stats for TEs: family-specific
LinStats<- read.table('Lineage_TEs_stats')
chisq.test(LinStats)

#formatting developmental DML files for stats
!wc -l dev_dml.bed > dev_countstotal

!cat devTEx dev_countstotal > Dev_TEs

!awk '{print $1}' Dev_TEs > Dev_TEs_counts

!paste Dev_TEs_counts CG_TEs_counts > DevTEs_combined

!awk '{print $1, $2}' DevTEs_combined > Dev_TEs_stats

%%R
#Stats for TEs: developmentally different
DevStats<-read.table('Dev_TEs_stats')
chisq.test(DevStats)