#!/usr/bin/env python # coding: utf-8 # ##Count the total number of sequences in the FASTQ file and store in variable # This command uses bash commands to count the number of lines in the FASTQ file (```wc-l```), # divides the total number of lines by ```4``` (there are 4 lines per read in Illumina FASTQ files). # The ```echo``` command is used to print the result to the screen, which gets stored in the variable: # ```TotalSeqs``` # In[5]: TotalSeqs = get_ipython().getoutput('echo $((`wc -l < 2112_lane1_NoIndex_L001_R1_001.fastq` / 4))') # In[6]: #Prints the value stored in TotalSeqs. #Notice that this is a Python string list and is not an integer! print TotalSeqs # In[7]: #Converts the value in the TotalSeqs string list at index 0 (TotalSeqs[0]) to #an integer value of base 10. #This conversion will be used repeatedly throughout this notebook to allow #mathematical calculations using the numbers generated by bash commands. TotalSeqs = int(TotalSeqs[0], 10) # In[13]: print TotalSeqs # --- # ##Use bash ```grep``` and ```wc -l``` to count all the instances of the TruSeq adaptor sequence: # # GATCGGAAGAGCACACGTCTGAACTCCAGTCAC # In[21]: TruSeq_adaptor_grep = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[22]: #Converts the value in the TruSeq_adaptor_grep string list at index 0 (TruSeq_adaptor_grep[0]) to #an integer value of base 10. TruSeq_adaptor_grep = int(TruSeq_adaptor_grep[0]) # In[23]: print TruSeq_adaptor_grep # --- # ###Percentage of Reads Containing TruSeq adaptor sequence # In[24]: #Calculates percentage of reads having TruSeq adaptor sequences. #Uses "float" to convert integer values to floating point decimals. Necessary since #the calculation on integers would be < 1 & would result in an answer of '0'. print ((float(TruSeq_adaptor_grep)/TotalSeqs)*100) # --- # ##Use bash ```grep``` and ```wc -l``` to count all the instances of the Epinext universal primer sequence: # # AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT # In[1]: EpinextUniversal_grep = get_ipython().getoutput("grep -o 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[2]: #Converts the value in the EpinextUniversal_grep string list at index 0 (EpinextUniversal_grep[0]) to #an integer value of base 10. EpinextUniversal_grep = int(EpinextUniversal_grep[0]) # In[3]: print EpinextUniversal_grep # --- # ###Percentage of Reads Containing Epinext Universal Primer sequence # In[8]: #Calculates percentage of reads having TruSeq adaptor sequences. #Uses "float" to convert integer values to floating point decimals. Necessary since #the calculation on integers would be < 1 & would result in an answer of '0'. print ((float(EpinextUniversal_grep)/TotalSeqs)*100) # --- # ##Use ```fastx_barcode_splitter``` to identify TruSeq adaptor sequence. # ####The ```fastx_barcode_splitter``` is a component of fastx_toolkit-0.0.13.2 # In[25]: #The full-lengths barcode file used by fastx_barcode_splitter. get_ipython().system('head TruSeqAdaptor.txt') # ###Look for TruSeq adaptor at beginning of lines # In[36]: #Gunzip the gzipped FASTQ file. #Pipe the output of that to fastx_barcode_splitter.pl #fastx_barcode_splitter uses a default mismatch value = 1 #Specify barcode file (--bcfile TruSeqAdaptor.txt) #Specify to look for barcode at beginning of file (--bol) #Specify output location and append a prefix to new file name (--prefix ./bol_) #Specify new file name suffix (--suffix ".fastq") #Print data to screen and output file (tee bol_TruSeqAdaptor_stats.txt) get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqAdaptor.txt --bol --prefix ./bol_ --suffix ".fastq" | tee bol_TruSeqAdaptor_stats.txt') # In[37]: #Uses awk to capture the second field (the "Count" column; print $2) from #the second line (FNR == 2) of the bol_TruSeqAdaptor_stats.txt #Stores the value in the variable TruSeqAdaptor_fastx_bol as a Python string list. TruSeqAdaptor_fastx_bol = get_ipython().getoutput("awk 'FNR == 2 {print $2}' bol_TruSeqAdaptor_stats.txt") # In[38]: print TruSeqAdaptor_fastx_bol # In[39]: #Converts the value in the TruSeqAdaptor_fastx_bol string list at index 0 (TruSeqAdaptor_fastx_bol[0]) to #an integer value of base 10. TruSeqAdaptor_fastx_bol = int(TruSeqAdaptor_fastx_bol[0]) # In[40]: print TruSeqAdaptor_fastx_bol # --- # ###Percentage of Reads Containing TruSeq adaptor sequence at Beginning of Lines # In[41]: #Calculates percentage of reads having TruSeq adaptor sequences. #Uses "float" to convert integer values to floating point decimals. Necessary since #the calculation on integers would be < 1 & would result in an answer of '0'. print ((float(TruSeqAdaptor_fastx_bol)/TotalSeqs)*100) # --- # ###Look for TruSeq adaptor at end of lines # In[43]: #Gunzip the gzipped FASTQ file. #Pipe the output of that to fastx_barcode_splitter.pl #fastx_barcode_splitter uses a default mismatch value = 1 #Specify barcode file (--bcfile TruSeqAdaptor.txt) #Specify to look for barcode at beginning of file (--bol) #Specify output location and append a prefix to new file name (--prefix ./bol_) #Specify new file name suffix (--suffix ".fastq") #Print data to screen and output file (tee bol_TruSeqAdaptor_stats.txt) get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqAdaptor.txt --eol --prefix ./eol_ --suffix ".fastq" | tee eol_TruSeqAdaptor_stats.txt') # In[44]: #Uses awk to capture the second field (the "Count" column; print $2) from #the second line (FNR == 2) of the TruSeqAdaptor_fastx_eol.txt #Stores the value in the variable TruSeqAdaptor_fastx_eol as a Python string list. TruSeqAdaptor_fastx_eol = get_ipython().getoutput("awk 'FNR == 2 {print $2}' eol_TruSeqAdaptor_stats.txt") # In[45]: #Converts the value in the TruSeqAdaptor_fastx_bol string list at index 0 (TruSeqAdaptor_fastx_bol[0]) to #an integer value of base 10. TruSeqAdaptor_fastx_eol = int(TruSeqAdaptor_fastx_eol[0]) # In[46]: print TruSeqAdaptor_fastx_eol # --- # ###Percentage of Reads Containing TruSeq adaptor sequence at End of Lines # In[47]: #Calculates percentage of reads having TruSeq adaptor sequences. #Uses "float" to convert integer values to floating point decimals. Necessary since #the calculation on integers would be < 1 & would result in an answer of '0'. print ((float(TruSeqAdaptor_fastx_eol)/TotalSeqs)*100)