#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Record when and where the notebook was run: date, OS release, hostname,
# CPU details and memory. The bash script is assembled one command per entry;
# the trailing empty entry supplies the final newline.
get_ipython().run_cell_magic(
    'bash',
    '',
    "\n".join([
        'echo "TODAY\'S DATE:"',
        'date',
        'echo "------------"',
        'echo ""',
        '#Display operating system info',
        'lsb_release -a',
        'echo ""',
        'echo "------------"',
        'echo "HOSTNAME: "; hostname ',
        'echo ""',
        'echo "------------"',
        'echo "Computer Specs:"',
        'echo ""',
        'lscpu',
        'echo ""',
        'echo "------------"',
        'echo ""',
        'echo "Memory Specs"',
        'echo ""',
        'free -mh',
        '',
    ]),
)


# In[7]:


# Working directory for this analysis. It is kept as a Python variable and
# also exported with %env so the bash cells can read it as ${wd}.
wd = "/home/sam/analyses/20191029_pgen_v074.a4_genome_feature_counts"
get_ipython().run_line_magic('env', 'wd=' + wd)

# Set list of column header names
gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']

# rsync source for the GFFs.
# NOTE: the variable name is spelled "rysnc_owl" (sic); the rsync bash cell
# reads it under exactly this name, so it is left unchanged here.
get_ipython().run_line_magic('env', 'rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/')

# Glob selecting the v074.a4 feature GFFs to transfer.
get_ipython().run_line_magic('env', 'gffs=Panopea-generosa-vv0.74.a4.[Cegmrt]*.gff3')

# Options for the optional wget download.
# - %env does not run a shell, so the download directory is spliced in from
#   the Python variable rather than written as a shell-style ${wd} reference.
# - "--quiet" is the wget option name ("--quiety" is not recognised).
# - The accept pattern targets the a4 annotation, matching ${gffs} and the
#   --reject pattern.
get_ipython().run_line_magic(
    'env',
    "wget_gffs=--directory-prefix=" + wd
    + " --recursive --quiet --no-directories --no-check-certificate --no-parent"
    + " --accept 'Panopea-generosa-vv0.74.a4.[Cegmrt]*.gff3'"
    + " --reject 'Panopea-generosa-vv0.74.a4.repeat_region.gff3'"
    + " https://owl.fish.washington.edu/halfshell/genomic-databank/"
)


# ### Import Python modules

# In[8]:


import fnmatch
import os
import pandas


# #### Create necessary directories

# In[9]:


# Create the working directory (and any missing parents). The variable is
# quoted so the path stays a single argument even if it contains spaces.
get_ipython().run_cell_magic('bash', '', 'mkdir --parents "${wd}"\n')


# In[10]:


# Change into the working directory so the following cells can use relative
# paths. The bare notebook automagic form (`cd {wd}`) is not valid Python in
# an exported script, so the %cd magic is invoked explicitly instead.
get_ipython().run_line_magic('cd', wd)


# #### Download _Panopea generosa_ GFFs for v074.a4.
# 
# Info on GFFs is here: [https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)

# In[13]:


# Copy the GFFs matching ${gffs} from ${rysnc_owl} into the current
# directory, delete the repeat-region GFF, then list what was transferred.
# The bash script is assembled one line per entry; the leading and trailing
# empty entries supply the first and final newlines.
get_ipython().run_cell_magic(
    'bash',
    '',
    "\n".join([
        '',
        'rsync \\',
        '--archive \\',
        '--verbose \\',
        '--progress \\',
        '--include="${gffs}" \\',
        '--exclude="*" \\',
        '"${rysnc_owl}" \\',
        '.',
        '',
        '# Remove unneeded repeats GFF',
        'rm Panopea-generosa-vv0.74.a4.repeat_region.gff3',
        'echo ""',
        'echo ""',
        'echo "----------------------------------------------------------"',
        '',
        'ls -lh',
        '',
    ]),
)


# #### If you need to download via wget instead, uncomment the lines in the cell below

# In[14]:


# %%bash
# time \
# wget "${wget_gffs}"

# ls -lh ${wd}


# In[17]:


# Show the first lines of the CDS GFF to check its layout before parsing.
get_ipython().run_cell_magic(
    magic_name='bash',
    line='',
    cell='head Panopea-generosa-vv0.74.a4.CDS.gff3\n',
)


# ### Get sequence length stats for each GFF

# In[18]:


def load_gff_with_lengths(gff_path, column_names):
    """Load a GFF3 file and add a per-feature ``seqlength`` column.

    The first 3 rows of the file are skipped (GFF header lines) and the rest
    is read as tab-separated data with no header row.

    Args:
        gff_path: Path to the GFF3 file.
        column_names: Names to assign to the columns; must include
            ``start`` and ``end``.

    Returns:
        pandas.DataFrame with the named columns plus ``seqlength``.
    """
    gff = pandas.read_csv(gff_path, header=None, skiprows=3, sep="\t")

    # Rename columns
    gff.columns = column_names

    # GFF coordinates are 1-based and inclusive, so a feature's length is
    # end - start + 1. Computed on whole columns rather than row by row.
    gff['seqlength'] = gff['end'] - gff['start'] + 1

    return gff


# Report sequence length stats for every v074.a4 GFF in the current directory.
# os.listdir() returns names in arbitrary order, so sort for reproducible output.
for file in sorted(fnmatch.filter(os.listdir('.'), 'Panopea-generosa-vv0.74.a4*.gff3')):
    print('\n' * 2)
    print(file)
    print("-------------------------")

    gff = load_gff_with_lengths(file, gff_header)

    # Apply functions in list to seqlength column
    gff_stats = gff['seqlength'].agg(['mean', 'min', 'median', 'max'])

    print(gff_stats)


# In[ ]: