#!/usr/bin/env python # coding: utf-8 # ###BLASTN C.gigas against NCBI nt DB # Code explanation (by line) # 1. Calls blastn program (!blastn) and specifies to use blastn (-task blastn). # 2. Specifies which query file to blast. # 3. Specifies which database file to blast against (located on Hummingbird /Volumes/Data/blast_dbs). # 4. Specifies output format. In this case, output format 6 with subject scientific names. # 5. Specifies maximum number of DB matches to save. # 6. Specifies number of CPUs to use. # 7. Output file name. # 8. Directs stderr output to file instead of printing to screen. # In[7]: get_ipython().system('blastn -task blastn -query Owl/halfshell/EmmaBS400.fa -db nt -outfmt "6 sscinames" -max_target_seqs 1 -num_threads 16 -out 20150501_nt_blastn.tab 2> 20150501_nt_blastn.err') # Interrupted kernel because I glanced at the output file and saw this (after running for ~6hrs!): # In[8]: get_ipython().system('head -30 20150501_nt_blastn.tab') # The issue is that the output only contains the species info and not the rest of the formatting (e-vals, bit scores, etc) that are part of BLASTn format 6. # # It turns out that specifying format 6 just specifies tab-delimited output. If an "additional" feature is specified from the default output of format 6, then the defaults no longer apply; you have to specify them all. # ###Re-run BLAST with correct output formatting # In[ ]: get_ipython().system('blastn -task blastn -query Owl/halfshell/EmmaBS400.fa -db nt -outfmt "6 qseqid sseqid pident length evalue stitle sscinames" -max_target_seqs 1 -num_threads 16 -out 20150501_nt_blastn.tab 2> 20150501_nt_blastn.err') # ####Verify file output # In[10]: get_ipython().system('head -10 20150501_nt_blastn.tab') # ####Count number of matched sequences # In[17]: get_ipython().system('wc -l 20150501_nt_blastn.tab') # ####Check best matches based on e-value # The code below cuts columns (fields; -f) 5 and 7 (e-value and species) of the input file, sorts and displays first 100 entries (head -100) # In[15]: get_ipython().system('cut -f5,7 20150501_nt_blastn.tab | sort | head -100') # ###Extract all ouput with e-values less than or equal to 0.001 # The code uses awk to obtain all lines in the file that have e-values less than or equal to 0.001 (```$5<=0.001```). The e-values are found in column (i.e. field) 5 of the input file ($5). # In[79]: get_ipython().run_cell_magic('bash', '', "awk ' $5<=0.001' 20150501_nt_blastn.tab > 20150501_Cgigas_larvae_OA_blastn_evals_0.001.txt\n") # ###Count number of lines produced from previous command # In[81]: get_ipython().system('wc -l 20150501_Cgigas_larvae_OA_blastn_evals_0.001.txt') # ###Store number of lines as Python variable # The code stores the number of lines in the Python variable "total" from the output of the bash command (designated by the "!"). # In[92]: total = get_ipython().getoutput('wc -l < 20150501_Cgigas_larvae_OA_blastn_evals_0.001.txt') # ###Verify variable has correct number # Notice that the number of lines is stored as a string list. This is denoted by the brackets and single quotes in the output. # In[93]: print total # ###Convert string list variable to integer # This enables the number to be more easily used for subsequent calculations. # In[96]: total = int(total[0]) # ###Verify variable is now and integer and no longer a string list # This is confirmed by the lack of brackets around the output. # In[97]: print total # ###Count the number of each species in the BLAST matches # The code uses awk again to extract all lines with e-values <=0.001. Those lines are then cut on column (i.e. field; -f) 7, sorted (which is necessary for the subsequent "uniq" command), and the uniq entries are counted (```uniq -c```). # In[70]: get_ipython().run_cell_magic('bash', '', "awk ' $5<=0.001' 20150501_nt_blastn.tab | cut -f7 | sort | uniq -c > 20150501_Cgigas_larvae_OA_unique_blastn_evals.txt\n") # ###Check output file # In[71]: get_ipython().system('head 20150501_Cgigas_larvae_OA_unique_blastn_evals.txt') # ###Remove leading whitespace from file # Although difficult to notice, the previous command where the number of unique entities are counted results in an output with a bunch of spaces at the beginning of each line that I don't want. # # The command below utilizes sed. Sed creates a backup of the input file (```-i.bu```; ".bu" is a custom suffix for the backup file - you can enter anything you'd like) and then removes (substitutes, thus the ```s/```) all spaces (``` */```) found only at the beginning (```^```) of each line. # In[72]: get_ipython().system("sed -i.bu 's/^ *//g' 20150501_Cgigas_larvae_OA_unique_blastn_evals.txt") # ###Verify removal of leading whitespaces # Note how the output is shifted to the left compared to how it was above. # In[73]: get_ipython().system('head -30 20150501_Cgigas_larvae_OA_unique_blastn_evals.txt') # ###Count number of unique species # In[75]: get_ipython().system('wc -l 20150501_Cgigas_larvae_OA_unique_blastn_evals.txt') # ###Copy file to Eagle # In[77]: get_ipython().system('cp 20150501_Cgigas_larvae_OA_unique_blastn_evals.txt /Volumes/Eagle/Arabidopsis/') # ###View top 15 species of BLAST matches # The code sorts the file in reverse (```-r```) numerical (```-n```) order on the first column (```-k1```) and displays the first 15 lines (```head -15```). # In[78]: get_ipython().system('sort -rn -k1 20150501_Cgigas_larvae_OA_unique_blastn_evals.txt | head -15') # ###Store Crassostrea gigas counts in variable # The code uses bash grep to find any lines with "Crassostrea gigas" on them and then uses awk to print the first column (i.e. field; $1) which contains the count. # In[108]: gigas_count = get_ipython().getoutput("grep 'Crassostrea gigas' 20150501_Cgigas_larvae_OA_unique_blastn_evals.txt | awk '{ print $1 }'") # ###Confirm count is stored in variable # In[109]: print gigas_count # ###Convert Python variable value from string list to floating number # Conversion to float is necessary based on the desired output we want later. The output I want later will be a value less than 1. Integers can only display whole numbers. # In[110]: gigas_count = float(gigas_count[0]) # ###Verify conversion from string list to float # In[111]: print gigas_count # ###Calculate percentage of Crassostrea gigas BLAST hits # In[113]: print (gigas_count/total) * 100