#!/usr/bin/env python
# coding: utf-8
"""Sequence-length statistics for the Panopea generosa v074.a4 GFF3 files.

Exported from a Jupyter notebook. The steps in ``main()`` call
``get_ipython()`` cell/line magics (``%%bash``, ``%env``), so the script has
to be executed by IPython; importing the module only defines the constants
and ``gff_seqlength_stats`` and performs no work.
"""

import fnmatch
import os

import pandas

# Analysis working directory (also exported to the shell as ``wd``).
wd = "/home/sam/analyses/20191029_pgen_v074.a4_genome_feature_counts"

# Set list of column header names
gff_header = [
    'seqid',
    'source',
    'type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attributes',
]

# Shell-style pattern selecting the GFF files to summarise.
GFF_PATTERN = 'Panopea-generosa-vv0.74.a4*.gff3'


def gff_seqlength_stats(gff_path, header_rows=3):
    """Return mean/min/median/max feature length for one GFF3 file.

    Args:
        gff_path: Path (or file-like object) of a tab-separated GFF3 file
            with the nine columns named in ``gff_header``.
        header_rows: Number of leading header lines to skip before the
            feature rows start (the notebook used 3).

    Returns:
        pandas.Series indexed by ``'mean'``, ``'min'``, ``'median'`` and
        ``'max'``.
    """
    gff = pandas.read_csv(gff_path, header=None, skiprows=header_rows, sep="\t")

    # Rename columns
    gff.columns = gff_header

    # Subtract start value from end value.
    # Have to add 1 so that sequence length can't equal zero
    # (i.e. adjust for 1-based counting system).
    # Column arithmetic replaces the row-by-row apply(); same values, far
    # less per-row overhead.
    gff['seqlength'] = gff['end'] - gff['start'] + 1

    # Apply functions in list to seqlength column
    return gff['seqlength'].agg(['mean', 'min', 'median', 'max'])


def _bash(script):
    """Run ``script`` through IPython's ``%%bash`` cell magic."""
    get_ipython().run_cell_magic('bash', '', script)


def _set_env(assignment):
    """Export ``NAME=value`` to the environment via IPython's ``%env``."""
    get_ipython().run_line_magic('env', assignment)


def main():
    """Run the notebook cells in their original order."""
    # In[1]:
    # Record date, OS, hostname, CPU and memory details of the machine.
    _bash(
        'echo "TODAY\'S DATE:"\n'
        'date\n'
        'echo "------------"\n'
        'echo ""\n'
        '#Display operating system info\n'
        'lsb_release -a\n'
        'echo ""\n'
        'echo "------------"\n'
        'echo "HOSTNAME: "; hostname \n'
        'echo ""\n'
        'echo "------------"\n'
        'echo "Computer Specs:"\n'
        'echo ""\n'
        'lscpu\n'
        'echo ""\n'
        'echo "------------"\n'
        'echo ""\n'
        'echo "Memory Specs"\n'
        'echo ""\n'
        'free -mh\n'
    )

    # In[7]:
    # Environment variables consumed by the bash cells below.
    _set_env('wd=/home/sam/analyses/20191029_pgen_v074.a4_genome_feature_counts')
    # NOTE: the variable name's spelling ("rysnc") is kept unchanged; the
    # rsync cell below reads it under exactly this name.
    _set_env('rysnc_owl=owl:/volume1/web/halfshell/genomic-databank/')
    _set_env('gffs=Panopea-generosa-vv0.74.a4.[Cegmrt]*.gff3')
    # Fixed relative to the notebook: "--quiety" -> "--quiet", and the
    # --accept pattern now targets a4 like every other pattern in this file.
    # NOTE(review): IPython may expand "{wd}" in this value before it is
    # stored - confirm with `%env wget_gffs` before enabling the wget cell.
    _set_env(
        "wget_gffs=--directory-prefix=${wd} "
        "--recursive "
        "--quiet "
        "--no-directories "
        "--no-check-certificate "
        "--no-parent "
        "--accept 'Panopea-generosa-vv0.74.a4.[Cegmrt]*.gff3' "
        "--reject 'Panopea-generosa-vv0.74.a4.repeat_region.gff3' "
        "https://owl.fish.washington.edu/halfshell/genomic-databank/"
    )

    # #### Create necessary directories

    # In[9]:
    _bash('mkdir --parents ${wd}\n')

    # In[10]:
    # The notebook used the `cd {wd}` automagic, which is not valid Python
    # in a script; os.chdir() changes the process working directory instead.
    os.chdir(wd)

    # #### Download _Panopea generosa_ GFFs for v074.a4.
    #
    # Info on GFFs is here:
    # [Genomic Resources wiki](https://github.com/RobertsLab/resources/wiki/Genomic-Resources#genome-feature-tracks-3)

    # In[13]:
    _bash(
        '\n'
        'rsync \\\n'
        '--archive \\\n'
        '--verbose \\\n'
        '--progress \\\n'
        '--include="${gffs}" \\\n'
        '--exclude="*" \\\n'
        '"${rysnc_owl}" \\\n'
        '.\n'
        '\n'
        '# Remove unneeded repeats GFF\n'
        'rm Panopea-generosa-vv0.74.a4.repeat_region.gff3\n'
        'echo ""\n'
        'echo ""\n'
        'echo "----------------------------------------------------------"\n'
        '\n'
        'ls -lh\n'
    )

    # #### If need to download via wget, uncomment lines in the cell below

    # In[14]:
    # %%bash
    # time \
    # wget "${wget_gffs}"
    # ls -lh ${wd}

    # In[17]:
    _bash('head Panopea-generosa-vv0.74.a4.CDS.gff3\n')

    # ### Get sequence length stats for each GFF feature type

    # In[18]:
    # Sorted so the report order does not depend on directory listing order.
    for name in sorted(os.listdir('.')):
        if not fnmatch.fnmatch(name, GFF_PATTERN):
            continue
        print('\n' * 2)
        print(name)
        print("-------------------------")
        print(gff_seqlength_stats(name))

    # In[ ]:


if __name__ == "__main__":
    main()