#!/usr/bin/env python
# coding: utf-8

# ##Count the total number of sequences in the FASTQ file and store in variable

# This command uses bash commands to count the number of lines in the FASTQ file (```wc-l```),
# divides the total number of lines by ```4``` (there are 4 lines per read in Illumina FASTQ files).
# The ```echo``` command is used to print the result to the screen, which gets stored in the variable:
# ```TotalSeqs```

# In[5]:


TotalSeqs = get_ipython().getoutput('echo $((`wc -l < 2112_lane1_NoIndex_L001_R1_001.fastq` / 4))')


# In[6]:


#Prints the value stored in TotalSeqs.
#Notice that this is a Python string list and is not an integer!
print TotalSeqs


# In[7]:


#Converts the value in the TotalSeqs string list at index 0 (TotalSeqs[0]) to 
#an integer value of base 10.
#This conversion will be used repeatedly throughout this notebook to allow 
#mathematical calculations using the numbers generated by bash commands.
TotalSeqs = int(TotalSeqs[0], 10)


# In[13]:


print TotalSeqs


# ---

# ##Use bash ```grep``` and ```wc -l``` to count all the instances of the TruSeq adaptor sequence:
# 
# GATCGGAAGAGCACACGTCTGAACTCCAGTCAC

# In[21]:


TruSeq_adaptor_grep = get_ipython().getoutput("grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' 2112_lane1_NoIndex_L001_R1_001.fastq  | wc -l")


# In[22]:


#Converts the value in the TruSeq_adaptor_grep string list at index 0 (TruSeq_adaptor_grep[0]) to 
#an integer value of base 10.
TruSeq_adaptor_grep = int(TruSeq_adaptor_grep[0])


# In[23]:


print TruSeq_adaptor_grep


# ---

# ###Percentage of Reads Containing TruSeq adaptor sequence

# In[24]:


#Calculates percentage of reads having TruSeq adaptor sequences.
#Uses "float" to convert integer values to floating point decimals. Necessary since 
#the calculation on integers would be < 1 & would result in an answer of '0'.
print ((float(TruSeq_adaptor_grep)/TotalSeqs)*100)


# ---

# ##Use bash ```grep``` and ```wc -l``` to count all the instances of the Epinext universal primer sequence:
# 
# AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT

# In[1]:


EpinextUniversal_grep = get_ipython().getoutput("grep -o 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' 2112_lane1_NoIndex_L001_R1_001.fastq  | wc -l")


# In[2]:


#Converts the value in the EpinextUniversal_grep string list at index 0 (EpinextUniversal_grep[0]) to 
#an integer value of base 10.
EpinextUniversal_grep = int(EpinextUniversal_grep[0])


# In[3]:


print EpinextUniversal_grep


# ---

# ###Percentage of Reads Containing Epinext Universal Primer sequence

# In[8]:


#Calculates percentage of reads having TruSeq adaptor sequences.
#Uses "float" to convert integer values to floating point decimals. Necessary since 
#the calculation on integers would be < 1 & would result in an answer of '0'.
print ((float(EpinextUniversal_grep)/TotalSeqs)*100)


# ---

# ##Use ```fastx_barcode_splitter``` to identify  TruSeq adaptor sequence.

# ####The ```fastx_barcode_splitter``` is a component of fastx_toolkit-0.0.13.2

# In[25]:


#The full-lengths barcode file used by fastx_barcode_splitter.
get_ipython().system('head TruSeqAdaptor.txt')


# ###Look for TruSeq adaptor at beginning of lines

# In[36]:


#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqAdaptor.txt)
#Specify to look for barcode at beginning of file (--bol)
#Specify output location and append a prefix to new file name (--prefix ./bol_)
#Specify new file name suffix (--suffix ".fastq")
#Print data to screen and output file (tee bol_TruSeqAdaptor_stats.txt)
get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz |  fastx_barcode_splitter.pl  --bcfile TruSeqAdaptor.txt  --bol  --prefix ./bol_  --suffix ".fastq" |  tee bol_TruSeqAdaptor_stats.txt')


# In[37]:


#Uses awk to capture the second field (the "Count" column; print $2) from
#the second line (FNR == 2) of the bol_TruSeqAdaptor_stats.txt
#Stores the value in the variable TruSeqAdaptor_fastx_bol as a Python string list.
TruSeqAdaptor_fastx_bol = get_ipython().getoutput("awk 'FNR == 2 {print $2}' bol_TruSeqAdaptor_stats.txt")


# In[38]:


print TruSeqAdaptor_fastx_bol


# In[39]:


#Converts the value in the TruSeqAdaptor_fastx_bol string list at index 0 (TruSeqAdaptor_fastx_bol[0]) to 
#an integer value of base 10.
TruSeqAdaptor_fastx_bol = int(TruSeqAdaptor_fastx_bol[0])


# In[40]:


print TruSeqAdaptor_fastx_bol


# ---

# ###Percentage of Reads Containing TruSeq adaptor sequence at Beginning of Lines

# In[41]:


#Calculates percentage of reads having TruSeq adaptor sequences.
#Uses "float" to convert integer values to floating point decimals. Necessary since 
#the calculation on integers would be < 1 & would result in an answer of '0'.
print ((float(TruSeqAdaptor_fastx_bol)/TotalSeqs)*100)


# ---

# ###Look for TruSeq adaptor at end of lines

# In[43]:


#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqAdaptor.txt)
#Specify to look for barcode at beginning of file (--bol)
#Specify output location and append a prefix to new file name (--prefix ./bol_)
#Specify new file name suffix (--suffix ".fastq")
#Print data to screen and output file (tee bol_TruSeqAdaptor_stats.txt)
get_ipython().system('gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz |  fastx_barcode_splitter.pl  --bcfile TruSeqAdaptor.txt  --eol  --prefix ./eol_  --suffix ".fastq" |  tee eol_TruSeqAdaptor_stats.txt')


# In[44]:


#Uses awk to capture the second field (the "Count" column; print $2) from
#the second line (FNR == 2) of the TruSeqAdaptor_fastx_eol.txt
#Stores the value in the variable TruSeqAdaptor_fastx_eol as a Python string list.
TruSeqAdaptor_fastx_eol = get_ipython().getoutput("awk 'FNR == 2 {print $2}' eol_TruSeqAdaptor_stats.txt")


# In[45]:


#Converts the value in the TruSeqAdaptor_fastx_bol string list at index 0 (TruSeqAdaptor_fastx_bol[0]) to 
#an integer value of base 10.
TruSeqAdaptor_fastx_eol = int(TruSeqAdaptor_fastx_eol[0])


# In[46]:


print TruSeqAdaptor_fastx_eol


# ---

# ###Percentage of Reads Containing TruSeq adaptor sequence at End of Lines

# In[47]:


#Calculates percentage of reads having TruSeq adaptor sequences.
#Uses "float" to convert integer values to floating point decimals. Necessary since 
#the calculation on integers would be < 1 & would result in an answer of '0'.
print ((float(TruSeqAdaptor_fastx_eol)/TotalSeqs)*100)