#!/usr/bin/env python
# coding: utf-8

# ###Quality trim all fastq.gz files using [Trimmomatic (v0.30)](http://www.usadellab.org/cms/?page=trimmomatic)

# ####Code explanation of for loop below:
# 1. ```%%bash``` specifies to use the shell for this Jupyter cell
# 2. ```for file in /Volumes/nightingales/C_virginica/2112_lane1_[^N]*``` initiates a for loop to handle all files beginning with ```2112_lane1_``` and only those that do not have the letter "N" immediately after that prefix in the file name.
# 3. ```do``` tells the for loop what to do with each of the files.
# 4. ```newname=${file##*/}``` takes the value of the ```$file``` variable (which is a path matching ```/Volumes/nightingales/C_virginica/2112_lane1_[^N]*```) and trims the longest match of the pattern from the beginning of that value (the pattern is ```*/```; the ```##``` is a bash parameter-expansion operator that specifies how to trim). The resulting output (which is just the file name without the full path) is then stored in the ```newname``` variable.
# 5. This line initiates Trimmomatic and uses the following arguments to specify order of execution:
#     1. single end reads (```SE```)
#     2. number of threads (```-threads 16```),
#     3. type of quality score (```-phred33```),
#     4. input file location (```"$file"```),
#     5. output file name/location (```/Volumes/Data/Sam/scratch/20150414_trimmed_$newname```),
#     6. single end Illumina TruSeq adaptor trimming (```ILLUMINACLIP:/usr/local/bioinformatics/Trimmomatic-0.30/adapters/TruSeq3-SE.fa:2:30:10```),
#     7. cut number of bases at beginning of read if below quality threshold (```LEADING:3```)
#     8. cut number of bases at end of read if below quality threshold (```TRAILING:3```)
#     9. cut if average quality within 4 base window falls below 15 (```SLIDINGWINDOW:4:15```)
# 6. ```done``` closes for loop.
# In[2]:

# Quality-trim every matching lane-1 file with Trimmomatic. ${file##*/} strips
# the directory part so the trimmed output can be written under the scratch
# directory with a dated prefix.
get_ipython().run_cell_magic(
    'bash',
    '',
    'for file in /Volumes/nightingales/C_virginica/2112_lane1_[^N]*\n'
    'do\n'
    'newname=${file##*/}\n'
    'java -jar /usr/local/bioinformatics/Trimmomatic-0.30/trimmomatic-0.30.jar'
    ' SE -threads 16 -phred33 "$file"'
    ' /Volumes/Data/Sam/scratch/20150414_trimmed_$newname'
    ' ILLUMINACLIP:/usr/local/bioinformatics/Trimmomatic-0.30/adapters/TruSeq3-SE.fa:2:30:10'
    ' LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15;\n'
    'done\n',
)


# ###FASTQC on all trimmed files using [FASTQC (v0.11.2)](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)

# In[23]:

# Run FASTQC once per trimmed file; reports are written to the --outdir path.
get_ipython().run_cell_magic(
    'bash',
    '',
    'for file in /Volumes/Data/Sam/scratch/20150414_trimmed_2112*;'
    ' do fastqc "$file" --outdir=/Volumes/Eagle/Arabidopsis/; done\n',
)


# ###Unzip all FASTQC .zip files

# In[24]:

# `cd` is an IPython line magic, not Python syntax, so it has to go through
# run_line_magic in a script. It changes the working directory that the
# following %%bash cells (which use relative globs) run in.
get_ipython().run_line_magic('cd', '/Volumes/Eagle/Arabidopsis/')


# In[25]:

get_ipython().run_cell_magic(
    'bash',
    '',
    'for file in 20150414_trimmed_2112_lane1_*.zip; do unzip "$file"; done\n',
)


# ###Concatenate groups of sequences into single files

# In[3]:

# All concatenation cells below use relative file names, so switch to the
# scratch directory that holds the trimmed files first.
get_ipython().run_line_magic('cd', '/Volumes/Data/Sam/scratch/')


# NOTE: each concatenation cell appends with `>>`; re-running a cell while its
# output .fastq still exists adds the same reads a second time.

# ####HB2 25,000ppm oil Index - ATCACG

# In[4]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#gunzips all matching files in folder and appends the data to a single file:\n'
    '#20150414_trimmed_2112_lane1_HB2_Oil_25000ppm_ATCACG.fastq\n'
    'for file in 20150414_trimmed_2112_lane1_ATCACG*\n'
    'do\n'
    'gunzip -c "$file" >> 20150414_trimmed_2112_lane1_HB2_Oil_25000ppm_ATCACG.fastq\n'
    'done\n',
)


# In[5]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#Gzip file\n'
    'gzip 20150414_trimmed_2112_lane1_HB2_Oil_25000ppm_ATCACG.fastq\n',
)


# ####HB16 25,000ppm oil Index - TTAGGC

# In[20]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#gunzips all matching files in folder and appends the data to a single file:\n'
    '#20150414_trimmed_2112_lane1_HB16_Oil_25000ppm_TTAGGC.fastq\n'
    'for file in 20150414_trimmed_2112_lane1_TTAGGC*\n'
    'do\n'
    'gunzip -c "$file" >> 20150414_trimmed_2112_lane1_HB16_Oil_25000ppm_TTAGGC.fastq\n'
    'done\n',
)


# In[21]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#Gzip file\n'
    'gzip 20150414_trimmed_2112_lane1_HB16_Oil_25000ppm_TTAGGC.fastq\n',
)


# ####HB30 25,000ppm oil Index - TGACCA

# In[8]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#gunzips all matching files in folder and appends the data to a single file:\n'
    '#20150414_trimmed_2112_lane1_HB30_Oil_25000ppm_TGACCA.fastq\n'
    'for file in 20150414_trimmed_2112_lane1_TGACCA*\n'
    'do\n'
    'gunzip -c "$file" >> 20150414_trimmed_2112_lane1_HB30_Oil_25000ppm_TGACCA.fastq\n'
    'done\n',
)


# In[9]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#Gzip file\n'
    'gzip 20150414_trimmed_2112_lane1_HB30_Oil_25000ppm_TGACCA.fastq\n',
)


# ####NB3 No oil Index - ACAGTG

# In[10]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#gunzips all matching files in folder and appends the data to a single file:\n'
    '#20150414_trimmed_2112_lane1_NB3_NoOil_ACAGTG.fastq\n'
    'for file in 20150414_trimmed_2112_lane1_ACAGTG*\n'
    'do\n'
    'gunzip -c "$file" >> 20150414_trimmed_2112_lane1_NB3_NoOil_ACAGTG.fastq\n'
    'done\n',
)


# In[11]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#Gzip file\n'
    'gzip 20150414_trimmed_2112_lane1_NB3_NoOil_ACAGTG.fastq\n',
)


# ####NB6 No oil Index - GCCAAT

# In[12]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#gunzips all matching files in folder and appends the data to a single file:\n'
    '#20150414_trimmed_2112_lane1_NB6_NoOil_GCCAAT.fastq\n'
    'for file in 20150414_trimmed_2112_lane1_GCCAAT*\n'
    'do\n'
    'gunzip -c "$file" >> 20150414_trimmed_2112_lane1_NB6_NoOil_GCCAAT.fastq\n'
    'done\n',
)


# In[13]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#Gzip file\n'
    'gzip 20150414_trimmed_2112_lane1_NB6_NoOil_GCCAAT.fastq\n',
)


# ####NB11 No oil Index - CAGATC

# In[14]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#gunzips all matching files in folder and appends the data to a single file:\n'
    '#20150414_trimmed_2112_lane1_NB11_NoOil_CAGATC.fastq\n'
    'for file in 20150414_trimmed_2112_lane1_CAGATC*\n'
    'do\n'
    'gunzip -c "$file" >> 20150414_trimmed_2112_lane1_NB11_NoOil_CAGATC.fastq\n'
    'done\n',
)


# In[15]:

get_ipython().run_cell_magic(
    'bash',
    '',
    '#Gzip file\n'
    'gzip 20150414_trimmed_2112_lane1_NB11_NoOil_CAGATC.fastq\n',
)

# ###Copy files to Eagle for web-based access

# In[19]:

# The glob is relative, so it resolves against the notebook's current working
# directory. It matches names that start with "2015" and contain "e1_HB" or
# "e1_NB"; each match is copied to the Eagle share.
get_ipython().run_cell_magic(
    'bash',
    '',
    'for file in 2015*e1_[NH]B*; do cp "$file" /Volumes/Eagle/Arabidopsis/; done\n',
)


# In[ ]: