#!/usr/bin/env python # coding: utf-8 # ##FASTQC Analysis # In[2]: get_ipython().system('fastqc --version') # In[1]: get_ipython().system('fastqc 2112_lane1_NoIndex_L001_R1_001.fastq.gz') # In[8]: get_ipython().system('cp 2112_lane1_NoIndex_L001_R1_001_fastqc.* /Volumes/Eagle/Arabidopsis/iPythonNotebooks/') # In[9]: get_ipython().system('mv /Volumes/Eagle/Arabidopsis/iPythonNotebooks/2112_lane1_NoIndex_L001_R1_001_fastqc.html /Volumes/Eagle/Arabidopsis/iPythonNotebooks/20150313_2112_lane1_NoIndex_L001_R1_001_fastqc.html') # In[10]: get_ipython().system('mv /Volumes/Eagle/Arabidopsis/iPythonNotebooks/2112_lane1_NoIndex_L001_R1_001_fastqc.zip /Volumes/Eagle/Arabidopsis/iPythonNotebooks/20150313_2112_lane1_NoIndex_L001_R1_001_fastqc.zip') # In[1]: from IPython.display import HTML HTML('') # ##Illumina Index Identification Methods Comparison # In[6]: pwd # ####Count the total number of sequences in the FASTQ file and store in variable # This command uses bash commands to count the number of lines in the FASTQ file (```wc-l```), # divides the total number of lines by ```4``` (there are 4 lines per read in Illumina FASTQ files). # The ```echo``` command is used to print the result to the screen, which gets stored in the variable: # ```TotalSeqs``` # In[127]: TotalSeqs = get_ipython().getoutput('echo $((`wc -l < 2112_lane1_NoIndex_L001_R1_001.fastq` / 4))') # In[128]: #Prints the value stored in TotalSeqs. #Notice that this is a Python string list and is not an integer! print TotalSeqs # In[129]: #Converts the value in the TotalSeqs string list at index 0 (TotalSeqs[0]) to #an integer value of base 10. #This conversion will be used repeatedly throughout this notebook to allow #mathematical calculations using the numbers generated by bash commands. TotalSeqs = int(TotalSeqs[0], 10) # In[130]: print TotalSeqs # ##Use bash ```grep``` and ```wc -l``` to count all the instances of each of the full-length TruSeq adaptor/index sequences. # # The index sequence is indicated in each of the respective variable names. # # Additionally, the Epigentek barcode number is indicated in the variable names (e.g. BC1 = barcode 1). # In[43]: BC1_ATCACG_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[44]: #Converts the value in the BC1_ATCACG_full string list at index 0 (BC1_ATCACG_full[0]) to #an integer value of base 10. BC1_ATCACG_full = int(BC1_ATCACG_full[0] ,10) # In[45]: print BC1_ATCACG_full # --- # In[46]: BC3_TTAGGC_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[47]: #Converts the value in the BC3_TTAGGC_full string list at index 0 (BC3_TTAGGC_full[0]) to #an integer value of base 10. BC3_TTAGGC_full = int(BC3_TTAGGC_full[0], 10) # In[48]: print BC3_TTAGGC_full # --- # In[49]: BC4_TGACCA_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[50]: #Converts the value in the BC4_TGACCA_full string list at index 0 (BC4_TGACCA_full[0]) to #an integer value of base 10. BC4_TGACCA_full = int(BC4_TGACCA_full[0], 10) # In[51]: print BC4_TGACCA_full # --- # In[80]: BC5_ACAGTG_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[82]: #Converts the value in the BC5_ACAGTG_full string list at index 0 (BC5_ACAGTG_full[0]) to #an integer value of base 10. BC5_ACAGTG_full = int(BC5_ACAGTG_full[0], 10) # In[83]: print BC5_ACAGTG_full # --- # In[55]: BC6_GCCAAT_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[56]: #Converts the value in the BC6_GCCAAT_full string list at index 0 (BC6_GCCAAT_full[0]) to #an integer value of base 10. BC6_GCCAAT_full = int(BC6_GCCAAT_full[0], 10) # In[57]: print BC6_GCCAAT_full # --- # In[58]: BC7_CAGATC_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[59]: #Converts the value in the BC7_CAGATC_full string list at index 0 (BC7_CAGATC_full[0]) to #an integer value of base 10. BC7_CAGATC_full = int(BC7_CAGATC_full[0], 10) # In[60]: print BC7_CAGATC_full # --- # ###Total Number of Full-length Barcodes Identified # In[69]: #Adds all of the counts from each full-length Illumina adaptor/index sequence. #Saves to variable "sum_full". sum_full = BC1_ATCACG_full + BC3_TTAGGC_full + BC4_TGACCA_full + BC5_ACAGTG_full + BC6_GCCAAT_full + BC7_CAGATC_full # In[70]: print sum_full # --- # ###Percentage of Reads Containing Full-length Barcodes # In[131]: #Calculates percentage of reads having full-lenght Illumina adaptor/index sequences. #Uses "float" to convert integer values to floating point decimals. Necessary since #the calculation on integers would be < 1 & would result in an answer of '0'. print ((float(sum_full)/TotalSeqs)*100) # --- # ##Use bash ```grep``` and ```wc -l``` to count all the instances of each of the TruSeq index sequences. # # The index sequence is indicated in each of the respective variable names. # # Additionally, the Epigentek barcode number is indicated in the variable names (e.g. BC1 = barcode 1). # In[84]: BC1_ATCACG = get_ipython().getoutput("grep -o 'ATCACG' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[85]: #Converts the value in the BC1_ATCACG string list at index 0 (BC1_ATCACG[0]) to #an integer value of base 10. BC1_ATCACG = int(BC1_ATCACG[0] ,10) # In[86]: print BC1_ATCACG # --- # In[87]: BC3_TTAGGC = get_ipython().getoutput("grep -o 'TTAGGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[88]: #Converts the value in the BC3_TTAGGC string list at index 0 (BC3_TTAGGC[0]) to #an integer value of base 10. BC3_TTAGGC = int(BC3_TTAGGC[0] ,10) # In[89]: print BC3_TTAGGC # --- # In[15]: BC4_TGACCA = get_ipython().getoutput("grep -o 'TGACCA' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[16]: #Converts the value in the BC4_TGACCA string list at index 0 (BC4_TGACCA[0]) to #an integer value of base 10. BC4_TGACCA = int(BC4_TGACCA[0]) # In[17]: print BC4_TGACCA # --- # In[93]: BC5_ACAGTG = get_ipython().getoutput("grep -o 'ACAGTG' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[94]: #Converts the value in the BC5_ACAGTG string list at index 0 (BC5_ACAGTG[0]) to #an integer value of base 10. BC5_ACAGTG = int(BC5_ACAGTG[0] ,10) # In[95]: print BC5_ACAGTG # --- # In[105]: BC6_GCCAAT = get_ipython().getoutput("grep -o 'GCCAAT' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[106]: #Converts the value in the BC6_GCCAAT string list at index 0 (BC6_GCCAAT[0]) to #an integer value of base 10. BC6_GCCAAT = int(BC6_GCCAAT[0] ,10) # In[107]: print BC6_GCCAAT # --- # In[108]: BC7_CAGATC = get_ipython().getoutput("grep -o 'CAGATC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l") # In[109]: #Converts the value in the BC7_CAGATC string list at index 0 (BC7_CAGATC[0]) to #an integer value of base 10. BC7_CAGATC = int(BC7_CAGATC[0] ,10) # In[110]: print BC7_CAGATC # --- # ###Total Number of Short Barcodes Identified # In[111]: #Adds all of the counts from each Illumina adaptor/index sequence. #Saves to variable "sum_short". sum_short = BC1_ATCACG + BC3_TTAGGC + BC4_TGACCA + BC5_ACAGTG + BC6_GCCAAT + BC7_CAGATC # In[112]: print sum_short # ###Percentage of Reads Containing Short Barcodes # In[132]: #Calculates percentage of reads having full-lenght Illumina adaptor/index sequences. #Uses "float" to convert integer values to floating point decimals. Necessary since #the calculation on integers would be < 1 & would result in an answer of '0'. print ((float(sum_short)/TotalSeqs)*100) # --- # ##Use ```fastx_barcode_splitter``` to identify full-length TruSeq adaptor/index sequences. # ####The ```fastx_barcode_splitter``` is a component of fastx_toolkit-0.0.13.2 # In[138]: #The full-lengths barcode file used by fastx_barcode_splitter. get_ipython().system('head TruSeqBarcodesLong.txt') # ###Look for full-length barcodes at beginning of lines # In[139]: #Gunzip the gzipped FASTQ file. #Pipe the output of that to fastx_barcode_splitter.pl #fastx_barcode_splitter uses a default mismatch value = 1 #Specify barcode file (--bcfile TruSeqBarcodesLong.txt) #Specify to look for barcode at beginning of file (--bol) #Specify output location and append a prefix to new file name (--prefix ./bol_) #Specify new file name suffix (--suffix ".fastq") get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqBarcodesLong.txt --bol --prefix ./bol_long_ --suffix ".fastq" | tee bol_long_stats.txt') # ###Look for full-length barcodes at end of lines # In[140]: #Gunzip the gzipped FASTQ file. #Pipe the output of that to fastx_barcode_splitter.pl #fastx_barcode_splitter uses a default mismatch value = 1 #Specify barcode file (--bcfile TruSeqBarcodesLong.txt) #Specify to look for barcode at beginning of file (--eol) #Specify output location and append a prefix to new file name (--prefix ./eol_) #Specify new file name suffix (--suffix ".fastq") get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqBarcodes.txt --eol --prefix ./eol_long_ --suffix ".fastq" | tee eol_long_stats.txt') # ##Use ```fastx_barcode_splitter``` to identify TruSeq index sequences. # In[141]: #The full-lenghts barcode file used by fastx_barcode_splitter. get_ipython().system('head TruSeqBarcodesShort.txt') # ###Look for index sequences at beginning of lines # In[142]: #Gunzip the gzipped FASTQ file. #Pipe the output of that to fastx_barcode_splitter.pl #fastx_barcode_splitter uses a default mismatch value = 1 #Specify barcode file (--bcfile TruSeqBarcodesShort.txt) #Specify to look for barcode at beginning of file (--bol) #Specify output location and append a prefix to new file name (--prefix ./bol_) #Specify new file name suffix (--suffix ".fastq") get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqBarcodesShort.txt --bol --prefix ./bol_ --suffix ".fastq" | tee bol_short_stats.txt') # ###Look for index sequences at end of lines # In[143]: #Gunzip the gzipped FASTQ file. #Pipe the output of that to fastx_barcode_splitter.pl #fastx_barcode_splitter uses a default mismatch value = 1 #Specify barcode file (--bcfile TruSeqBarcodesShort.txt) #Specify to look for barcode at beginning of file (--eol) #Specify output location and append a prefix to new file name (--prefix ./eol_) #Specify new file name suffix (--suffix ".fastq") get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqBarcodesShort.txt --eol --prefix ./eol_ --suffix ".fastq" | tee eol_short_stats.txt')