#!/usr/bin/env python
# coding: utf-8
# ##FASTQC Analysis
# In[2]:
get_ipython().system('fastqc --version')
# In[1]:
get_ipython().system('fastqc 2112_lane1_NoIndex_L001_R1_001.fastq.gz')
# In[8]:
get_ipython().system('cp 2112_lane1_NoIndex_L001_R1_001_fastqc.* /Volumes/Eagle/Arabidopsis/iPythonNotebooks/')
# In[9]:
get_ipython().system('mv /Volumes/Eagle/Arabidopsis/iPythonNotebooks/2112_lane1_NoIndex_L001_R1_001_fastqc.html /Volumes/Eagle/Arabidopsis/iPythonNotebooks/20150313_2112_lane1_NoIndex_L001_R1_001_fastqc.html')
# In[10]:
get_ipython().system('mv /Volumes/Eagle/Arabidopsis/iPythonNotebooks/2112_lane1_NoIndex_L001_R1_001_fastqc.zip /Volumes/Eagle/Arabidopsis/iPythonNotebooks/20150313_2112_lane1_NoIndex_L001_R1_001_fastqc.zip')
# In[1]:
from IPython.display import HTML
HTML('')
# ##Illumina Index Identification Methods Comparison
# In[6]:
pwd
# ####Count the total number of sequences in the FASTQ file and store in variable
# This command uses bash commands to count the number of lines in the FASTQ file (```wc-l```),
# divides the total number of lines by ```4``` (there are 4 lines per read in Illumina FASTQ files).
# The ```echo``` command is used to print the result to the screen, which gets stored in the variable:
# ```TotalSeqs```
# In[127]:
TotalSeqs = get_ipython().getoutput('echo $((`wc -l < 2112_lane1_NoIndex_L001_R1_001.fastq` / 4))')
# In[128]:
#Prints the value stored in TotalSeqs.
#Notice that this is a Python string list and is not an integer!
print TotalSeqs
# In[129]:
#Converts the value in the TotalSeqs string list at index 0 (TotalSeqs[0]) to
#an integer value of base 10.
#This conversion will be used repeatedly throughout this notebook to allow
#mathematical calculations using the numbers generated by bash commands.
TotalSeqs = int(TotalSeqs[0], 10)
# In[130]:
print TotalSeqs
# ##Use bash ```grep``` and ```wc -l``` to count all the instances of each of the full-length TruSeq adaptor/index sequences.
#
# The index sequence is indicated in each of the respective variable names.
#
# Additionally, the Epigentek barcode number is indicated in the variable names (e.g. BC1 = barcode 1).
# In[43]:
BC1_ATCACG_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[44]:
#Converts the value in the BC1_ATCACG_full string list at index 0 (BC1_ATCACG_full[0]) to
#an integer value of base 10.
BC1_ATCACG_full = int(BC1_ATCACG_full[0] ,10)
# In[45]:
print BC1_ATCACG_full
# ---
# In[46]:
BC3_TTAGGC_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[47]:
#Converts the value in the BC3_TTAGGC_full string list at index 0 (BC3_TTAGGC_full[0]) to
#an integer value of base 10.
BC3_TTAGGC_full = int(BC3_TTAGGC_full[0], 10)
# In[48]:
print BC3_TTAGGC_full
# ---
# In[49]:
BC4_TGACCA_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[50]:
#Converts the value in the BC4_TGACCA_full string list at index 0 (BC4_TGACCA_full[0]) to
#an integer value of base 10.
BC4_TGACCA_full = int(BC4_TGACCA_full[0], 10)
# In[51]:
print BC4_TGACCA_full
# ---
# In[80]:
BC5_ACAGTG_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[82]:
#Converts the value in the BC5_ACAGTG_full string list at index 0 (BC5_ACAGTG_full[0]) to
#an integer value of base 10.
BC5_ACAGTG_full = int(BC5_ACAGTG_full[0], 10)
# In[83]:
print BC5_ACAGTG_full
# ---
# In[55]:
BC6_GCCAAT_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[56]:
#Converts the value in the BC6_GCCAAT_full string list at index 0 (BC6_GCCAAT_full[0]) to
#an integer value of base 10.
BC6_GCCAAT_full = int(BC6_GCCAAT_full[0], 10)
# In[57]:
print BC6_GCCAAT_full
# ---
# In[58]:
BC7_CAGATC_full = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[59]:
#Converts the value in the BC7_CAGATC_full string list at index 0 (BC7_CAGATC_full[0]) to
#an integer value of base 10.
BC7_CAGATC_full = int(BC7_CAGATC_full[0], 10)
# In[60]:
print BC7_CAGATC_full
# ---
# ###Total Number of Full-length Barcodes Identified
# In[69]:
#Adds all of the counts from each full-length Illumina adaptor/index sequence.
#Saves to variable "sum_full".
sum_full = BC1_ATCACG_full + BC3_TTAGGC_full + BC4_TGACCA_full + BC5_ACAGTG_full + BC6_GCCAAT_full + BC7_CAGATC_full
# In[70]:
print sum_full
# ---
# ###Percentage of Reads Containing Full-length Barcodes
# In[131]:
#Calculates percentage of reads having full-lenght Illumina adaptor/index sequences.
#Uses "float" to convert integer values to floating point decimals. Necessary since
#the calculation on integers would be < 1 & would result in an answer of '0'.
print ((float(sum_full)/TotalSeqs)*100)
# ---
# ##Use bash ```grep``` and ```wc -l``` to count all the instances of each of the TruSeq index sequences.
#
# The index sequence is indicated in each of the respective variable names.
#
# Additionally, the Epigentek barcode number is indicated in the variable names (e.g. BC1 = barcode 1).
# In[84]:
BC1_ATCACG = get_ipython().getoutput("grep -o 'ATCACG' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[85]:
#Converts the value in the BC1_ATCACG string list at index 0 (BC1_ATCACG[0]) to
#an integer value of base 10.
BC1_ATCACG = int(BC1_ATCACG[0] ,10)
# In[86]:
print BC1_ATCACG
# ---
# In[87]:
BC3_TTAGGC = get_ipython().getoutput("grep -o 'TTAGGC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[88]:
#Converts the value in the BC3_TTAGGC string list at index 0 (BC3_TTAGGC[0]) to
#an integer value of base 10.
BC3_TTAGGC = int(BC3_TTAGGC[0] ,10)
# In[89]:
print BC3_TTAGGC
# ---
# In[15]:
BC4_TGACCA = get_ipython().getoutput("grep -o 'TGACCA' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[16]:
#Converts the value in the BC4_TGACCA string list at index 0 (BC4_TGACCA[0]) to
#an integer value of base 10.
BC4_TGACCA = int(BC4_TGACCA[0])
# In[17]:
print BC4_TGACCA
# ---
# In[93]:
BC5_ACAGTG = get_ipython().getoutput("grep -o 'ACAGTG' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[94]:
#Converts the value in the BC5_ACAGTG string list at index 0 (BC5_ACAGTG[0]) to
#an integer value of base 10.
BC5_ACAGTG = int(BC5_ACAGTG[0] ,10)
# In[95]:
print BC5_ACAGTG
# ---
# In[105]:
BC6_GCCAAT = get_ipython().getoutput("grep -o 'GCCAAT' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[106]:
#Converts the value in the BC6_GCCAAT string list at index 0 (BC6_GCCAAT[0]) to
#an integer value of base 10.
BC6_GCCAAT = int(BC6_GCCAAT[0] ,10)
# In[107]:
print BC6_GCCAAT
# ---
# In[108]:
BC7_CAGATC = get_ipython().getoutput("grep -o 'CAGATC' 2112_lane1_NoIndex_L001_R1_001.fastq | wc -l")
# In[109]:
#Converts the value in the BC7_CAGATC string list at index 0 (BC7_CAGATC[0]) to
#an integer value of base 10.
BC7_CAGATC = int(BC7_CAGATC[0] ,10)
# In[110]:
print BC7_CAGATC
# ---
# ###Total Number of Short Barcodes Identified
# In[111]:
#Adds all of the counts from each Illumina adaptor/index sequence.
#Saves to variable "sum_short".
sum_short = BC1_ATCACG + BC3_TTAGGC + BC4_TGACCA + BC5_ACAGTG + BC6_GCCAAT + BC7_CAGATC
# In[112]:
print sum_short
# ###Percentage of Reads Containing Short Barcodes
# In[132]:
#Calculates percentage of reads having full-lenght Illumina adaptor/index sequences.
#Uses "float" to convert integer values to floating point decimals. Necessary since
#the calculation on integers would be < 1 & would result in an answer of '0'.
print ((float(sum_short)/TotalSeqs)*100)
# ---
# ##Use ```fastx_barcode_splitter``` to identify full-length TruSeq adaptor/index sequences.
# ####The ```fastx_barcode_splitter``` is a component of fastx_toolkit-0.0.13.2
# In[138]:
#The full-lengths barcode file used by fastx_barcode_splitter.
get_ipython().system('head TruSeqBarcodesLong.txt')
# ###Look for full-length barcodes at beginning of lines
# In[139]:
#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqBarcodesLong.txt)
#Specify to look for barcode at beginning of file (--bol)
#Specify output location and append a prefix to new file name (--prefix ./bol_)
#Specify new file name suffix (--suffix ".fastq")
get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqBarcodesLong.txt --bol --prefix ./bol_long_ --suffix ".fastq" | tee bol_long_stats.txt')
# ###Look for full-length barcodes at end of lines
# In[140]:
#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqBarcodesLong.txt)
#Specify to look for barcode at beginning of file (--eol)
#Specify output location and append a prefix to new file name (--prefix ./eol_)
#Specify new file name suffix (--suffix ".fastq")
get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqBarcodes.txt --eol --prefix ./eol_long_ --suffix ".fastq" | tee eol_long_stats.txt')
# ##Use ```fastx_barcode_splitter``` to identify TruSeq index sequences.
# In[141]:
#The full-lenghts barcode file used by fastx_barcode_splitter.
get_ipython().system('head TruSeqBarcodesShort.txt')
# ###Look for index sequences at beginning of lines
# In[142]:
#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqBarcodesShort.txt)
#Specify to look for barcode at beginning of file (--bol)
#Specify output location and append a prefix to new file name (--prefix ./bol_)
#Specify new file name suffix (--suffix ".fastq")
get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqBarcodesShort.txt --bol --prefix ./bol_ --suffix ".fastq" | tee bol_short_stats.txt')
# ###Look for index sequences at end of lines
# In[143]:
#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqBarcodesShort.txt)
#Specify to look for barcode at beginning of file (--eol)
#Specify output location and append a prefix to new file name (--prefix ./eol_)
#Specify new file name suffix (--suffix ".fastq")
get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | fastx_barcode_splitter.pl --bcfile TruSeqBarcodesShort.txt --eol --prefix ./eol_ --suffix ".fastq" | tee eol_short_stats.txt')