#Tophat complete, had to go back and quality trim individual files

#checking accepted hits 
!samtools view -c /Volumes/web/cnidarian/BiGill_RNAseqGalaxy_Tophat.bam

#only mapped reads
!samtools view -c -F 4 /Volumes/web/cnidarian/BiGill_RNAseqGalaxy_Tophat.bam

!samtools flagstat /Volumes/web/cnidarian/BiGill_RNAseqGalaxy_Tophat.bam

#Cufflinks needs GTF
!head /Volumes/web/cnidarian/TJGR_CDSgffread_filtered.gtf

#hung for days; restart

!head /Volumes/web/cnidarian/BiGillCufflinks_assembled_transcripts1.gtf

!wc /Volumes/web/cnidarian/BiGillCufflinks_transcripts_exp1.tabular

!wc /Volumes/web/cnidarian/BiGillCufflinks_gene_exp1.tabular
!coveragebed -hist -abam /Volumes/web/cnidarian/BiGill_RNAseqGalaxy_Tophat.bam -b /Volumes/web/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_exon.gff > /Volumes/web/cnidarian/BiGill_ThBAM_cov_exon_1.txt
!head /Volumes/web/cnidarian/BiGill_ThBAM_cov_exon_1.txt

from pandas import *

# read data from data file into a pandas DataFrame  
ex = read_table("http://eagle.fish.washington.edu/cnidarian/BiGill_ThBAM_cov_exon_1.txt", # name of the data file
            #sep=",", # what character separates each column?
            na_values=["", " "]) # what values should be considered "blank" values?

print ex
#need to figure out column titles @fu

ex ['0.1396011'].hist(bins=50);
#Axis limits are changed using the axis([xmin, xmax, ymin, ymax]) function.
#plt.axis([0, 1, 0, 400000])

!coveragebed -abam /Volumes/web/cnidarian/BiGill_RNAseqGalaxy_Tophat.bam -b /Volumes/web/trilobite/Crassostrea_gigas_v9_tracks/Cgigas_v9_exon.gff > /Volumes/web/cnidarian/BiGill_ThBAM_cov_exon_2.txt

!head /Volumes/web/cnidarian/BiGill_ThBAM_cov_exon_2.txt
DEFAULT: After each interval in B, coverageBed will report:

The number of features in A that overlapped (by at least one base pair) the B interval.
The number of bases in B that had non-zero coverage from features in A.
The length of the entry in B.
The fraction of bases in B that had non-zero coverage from features in A.
Below are the number of features in A (N=...) overlapping B and fraction of bases in B with coverage.

Chromosome  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

BED FILE B  ***************     ***************     ******    **************

BED File A  ^^^^ ^^^^              ^^             ^^^^^^^^^    ^^^ ^^ ^^^^
              ^^^^^^^^                                      ^^^^^ ^^^^^ ^^

Result      [  N=3, 10/15 ]     [  N=1, 2/15 ]     [N=1,6/6]   [N=6, 12/14 ]

#into SQLShare

!head /Volumes/web/cnidarian/BiGill_ThBam_exon_feature.csv

!tr ',' "\t" </Volumes/web/cnidarian/BiGill_ThBam_exon_feature.csv> /Volumes/web/cnidarian/BiGill_ThBam_exon_feature.igv

!head /Volumes/web/cnidarian/BiGill_ThBam_exon_feature.igv

!head /Volumes/web/cnidarian/BiGill_ThBam_exon_feature.sorted.igv

!head /Volumes/web/cnidarian/BiGill_exonexp_gene.csv

!tr ',' "\t" </Volumes/web/cnidarian/BiGill_exonexp_gene.csv> /Volumes/web/cnidarian/BiGill_exonexp_gene.txt

!head /Volumes/web/cnidarian/BiGill_exonexp_gene.txt


#need to make igv file
#clean up Gene name and join
#find tails of genes with uniform versus variable expression @fu

!head /Volumes/web/cnidarian/BiGill_mCG_feature.csv

!tr ',' "\t" </Volumes/web/cnidarian/BiGill_mCG_feature.csv> /Volumes/web/cnidarian/BiGill_mCG_feature.igv

!samtools sort -h


#complete, though oddly not many reads map.....


#snapshot

from pandas import *

# read data from data file into a pandas DataFrame  
BiGillrna = read_table("http://eagle.fish.washington.edu/cnidarian/BiGill_RNAseq_gene.1" # name of the data file
           #sep=",", # what character separates each column?
           #na_values=["", " "]) # what values should be considered "blank" values?
           )

BiGillrna

BiGillrna['Total gene reads'].hist(bins=1000);
#Axis limits are changed using the axis([xmin, xmax, ymin, ymax]) function.
#plt.axis([0, 0, 0, 20000])
#plt.xlabel('x', fontsize=20)
#plt.ylabel('RPKM', fontsize= 20)
#plt.title('Gill', fontsize= 20)

"""
Simple demo of a scatter plot.
"""
import numpy as np
import matplotlib.pyplot as plt


N = 50
x = BiGillrna['Total gene reads']
y = np.random.rand(N)
area = np.pi * (15 * np.random.rand(N))**2 # 0 to 15 point radiuses

plt.scatter(x, y, s=area, alpha=0.5)
plt.show()