#!/usr/bin/env python
# coding: utf-8

# ## Verify NCBI _O.lurida_ Genome Submission with GFF Annotations

# In[1]:

# Single handle on the active IPython shell; every cell in this section goes
# through it instead of calling get_ipython() again.
ipy = get_ipython()

# Record the run environment: date, OS release, hostname, CPU and memory.
ipy.run_cell_magic(
    'bash',
    '',
    'echo "TODAY\'S DATE:"\n'
    'date\n'
    'echo "------------"\n'
    'echo ""\n'
    '#Display operating system info\n'
    'lsb_release -a\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "HOSTNAME: "; hostname \n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "Computer Specs:"\n'
    'echo ""\n'
    'lscpu\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo ""\n'
    'echo "Memory Specs"\n'
    'echo ""\n'
    'free -mh\n',
)


# ### Set variables
#
# - `%env` indicates a bash variable; without `%env` is Python variable

# In[1]:

# Each (name, value) pair is exported with the `%env` line magic, in the
# order listed, so the bash cells can read it as ${name}.
env_settings = (
    # Directories, input/output files
    ('data_dir', '/home/samb/data/O_lurida/genomes'),
    ('analysis_dir', '/home/samb/analyses/20210513_olur_NCBI_genome-submission-prep'),
    ('genome_fasta', 'Olurida_v081.fa'),
    ('orig_gff', 'Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff'),
    ('new_gff', 'Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added_no-fasta.gff'),
    ('sqn', '20210513_Olurida-v081.sqn'),
    # NCBI verification program
    ('table2asn', '/home/samb/programs/linux64.table2asn_GFF'),
    # Locus tag prefix from NCBI BioProject PRJNA393719
    ('locus_tag', 'CGC61'),
)
for env_name, env_value in env_settings:
    ipy.run_line_magic('env', f'{env_name}={env_value}')


# ### Create output directory

# In[3]:

# Create the analysis directory (with any missing parents) and list it.
ipy.run_cell_magic(
    'bash',
    '',
    'mkdir --parents "${analysis_dir}"\n'
    'ls -lh "${analysis_dir}"\n',
)


# ### Download FastA and GFF
#
# If needing to use URLs:
#
# GFF: https://owl.fish.washington.edu/halfshell/genomic-databank/Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff
#
# - MD5: `f54512bd964f45645c34b1e8e403a2b0`
#
# FastA: http://owl.fish.washington.edu/halfshell/genomic-databank/Olurida_v081.fa
#
# - MD5: `3ac56372bd62038f264d27eef0883bd3`

# In[4]:

# Copy the genome FastA and the original GFF into ${data_dir} with rsync,
# then list the directory.
# NOTE(review): `owl` is presumably an SSH host alias configured on the
# machine running the notebook -- confirm before re-running elsewhere.
ipy.run_cell_magic(
    'bash',
    '',
    'rsync -avp owl:/volume1/web/halfshell/genomic-databank/Olurida_v081.fa "${data_dir}"\n'
    '\n'
    'rsync -avp owl:/volume1/web/halfshell/genomic-databank/Olurida_v081_genome_snap02.all.renamed.putative_function.domain_added.gff "${data_dir}"\n'
    '\n'
    'ls -lh "${data_dir}"\n',
)


# ### Fix GFF formatting
#
# GFF is _not_ in a standard format. When [generated by MAKER on 20190709](https://robertslab.github.io/sams-notebook/2019/07/09/Genome-Annotation-Olurida_v081-with-MAKER-and-Tissue-specific-Transcriptomes-on-Mox.html), the setting used included the genome sequences appended to the end of the GFF. These need to be removed.

# #### Figure out beginning of FastA seqs

# ##### Identify first FastA sequence and look at a few lines before that.
#
# Uses the `-n` option to display line numbers before each result.

# In[3]:

# grep prints every FastA header line (`^>`) with 5 lines of leading context
# and line numbers; `head -n 5` keeps only the first five lines of output.
ipy.run_cell_magic(
    'bash',
    '',
    'cd ${data_dir}\n'
    'grep -n -B 5 "^>" ${orig_gff} | head -n 5\n',
)


# #### Extract GFF

# In[4]:

# Write the first 11574858 lines of the original GFF to the new GFF, then
# print the last line of the new file as a check.
# NOTE(review): the line count is hard-coded, presumably taken from the
# output of the grep cell above -- re-derive it if the input GFF changes.
ipy.run_cell_magic(
    'bash',
    '',
    'cd ${data_dir}\n'
    'head -n 11574858 ${orig_gff} > ${new_gff}\n'
    '\n'
    '# Check the new file\n'
    'tail -n 1 ${new_gff}\n',
)


# #### Generate checksum for new GFF

# In[5]:

# `>` overwrites checksums.md5 in ${data_dir} with the MD5 of the new GFF.
ipy.run_cell_magic(
    'bash',
    '',
    'cd ${data_dir}\n'
    'md5sum ${new_gff} > checksums.md5\n',
)


# ### Run NCBI verification
#
# Options were taken from here (and are the most basic options):
#
# https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/
#
# See end of notebook for the full help menu with explanations of each/every possible option.
#
# Based on experience, there's a _lot_ of error output, so I've redirected stderr to `/dev/null`. Plus, all of that info is described in the resulting output files.
# In[9]:

# Run the NCBI table2asn program on the genome FastA (-i) and the FastA-free
# GFF (-f), writing the submission file to ${analysis_dir}/${sqn} (-o), then
# list the analysis directory. The command is timed with `time`, and stderr
# is discarded (`2> /dev/null`).
# Option meanings are documented at
# https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/ and are not restated here.
get_ipython().run_cell_magic(
    'bash',
    '',
    'time \\\n'
    '${table2asn} \\\n'
    '-M n \\\n'
    '-J \\\n'
    '-c w \\\n'
    '-euk \\\n'
    '-locus-tag-prefix ${locus_tag} \\\n'
    '-gaps-min 10 \\\n'
    '-l unspecified \\\n'
    '-i ${data_dir}/${genome_fasta} \\\n'
    '-f ${data_dir}/${new_gff} \\\n'
    '-o ${analysis_dir}/${sqn} \\\n'
    '-Z \\\n'
    '2> /dev/null\n'
    '\n'
    '# List files\n'
    'ls -lh ${analysis_dir}\n',
)


# #### Check for errors
#
# - `.stats` explanations are here: https://www.ncbi.nlm.nih.gov/genbank/genome_validation/
#
# - Any errors need to be resolved prior to submission.
#
# - `.dr` explanations are here: https://www.ncbi.nlm.nih.gov/genbank/asndisc/#fatal
#
# - FATAL categories _might_ need to be resolved prior to submission.

# In[11]:

# Print the validation summary. The report name is derived from the `sqn`
# environment variable: `${sqn%.sqn}` strips the trailing `.sqn`, so this
# cell keeps pointing at the current run's report if `sqn` is changed.
# With sqn=20210513_Olurida-v081.sqn this is 20210513_Olurida-v081.stats.
get_ipython().run_cell_magic(
    'bash',
    '',
    'cat "${analysis_dir}/${sqn%.sqn}.stats"\n',
)


# In[12]:

# Show the first 20 lines of the discrepancy report, using the same
# `sqn`-derived basename as the `.stats` cell.
get_ipython().run_cell_magic(
    'bash',
    '',
    'head -n 20 "${analysis_dir}/${sqn%.sqn}.dr"\n',
)