#!/usr/bin/env python
# coding: utf-8

# FASTQ
# =====
#
# This notebook explores [FASTQ], the most common format for storing sequencing reads.
#
# FASTA and FASTQ are rather similar, but FASTQ is almost always used for storing *sequencing reads* (with associated quality values), whereas FASTA is used for storing all kinds of DNA, RNA or protein sequences (without associated quality values).
#
# Before delving into the format, I should mention that there are great tools and libraries for parsing and manipulating FASTQ, e.g. [FASTX], and [BioPython]'s [SeqIO] module. If your needs are relatively simple, you might try using these tools and libraries and skip reading this document.
#
# [FASTA]: http://en.wikipedia.org/wiki/FASTA_format
# [FASTQ]: http://en.wikipedia.org/wiki/FASTQ_format
# [BioPython]: http://biopython.org/wiki/Main_Page
# [SeqIO]: http://biopython.org/wiki/SeqIO
# [FASTX]: http://hannonlab.cshl.edu/fastx_toolkit/

# ### Basic format

# Here's a single sequencing read in FASTQ format:
#
#     @ERR294379.100739024 HS24_09441:8:2203:17450:94030#42/1
#     AGGGAGTCCACAGCACAGTCCAGACTCCCACCAGTTCTGACGAAATGATGAGAGCTCAGAAGTAACAGTTGCTTTCAGTCCCATAAAAACAGTCCTACAA
#     +
#     BDDEEF?FGFFFHGFFHHGHGGHCH@GHHHGFAHEGFEHGEFGHCCGGGFEGFGFFDFFHBGDGFHGEFGHFGHGFGFFFEHGGFGGDGHGFEEHFFHGE
#
# It's spread across four lines. The four lines are:
#
# 1. "`@`" followed by a read name
# 2. Nucleotide sequence
# 3. "`+`", possibly followed by some info, but ignored by virtually all tools
# 4. Quality sequence (explained below)
#
# Here is a very simple Python function for parsing a file of FASTQ records:

# In[1]:


def parse_fastq(fh):
    """Parse reads from a FASTQ filehandle.

    Records are consumed four lines at a time via ``fh.readline()``:
    a header line, the nucleotide sequence, a separator line, and the
    quality string.  No validation is performed: the first character
    of the header line is dropped without checking that it is ``@``,
    and the separator line is read and discarded whatever it contains.

    Args:
        fh: An object with a ``readline()`` method that returns an
            empty string at end of file (e.g. an open text file).

    Returns:
        A list of ``(name, seq, qual)`` tuples, one per record, in
        file order.  Each field has trailing whitespace (including the
        newline) stripped.
    """
    reads = []
    while True:
        first_line = fh.readline()
        if not first_line:
            break  # readline() returns '' only at end of file
        name = first_line[1:].rstrip()  # drop the leading '@'
        seq = fh.readline().rstrip()
        fh.readline()  # ignore line starting with +
        qual = fh.readline().rstrip()
        reads.append((name, seq, qual))
    return reads


# NOTE(review): the last record's quality string (18 characters) is shorter
# than its sequence (50 characters); part of this literal appears to be
# missing -- confirm against the original notebook.
fastq_string = '''@ERR294379.100739024 HS24_09441:8:2203:17450:94030#42/1
AGGGAGTCCACAGCACAGTCCAGACTCCCACCAGTTCTGACGAAATGATG
+
BDDEEF?FGFFFHGFFHHGHGGHCH@GHHHGFAHEGFEHGEFGHCCGGGF
@ERR294379.136275489 HS24_09441:8:2311:1917:99340#42/1
CTTAAGTATTTTGAAAGTTAACATAAGTTATTCTCAGAGAGACTGCTTTT
+
@@AHFF?EEDEAF?FEEGEFD?GGFEFGECGE?9H?EEABFAG9@CDGGF
@ERR294379.97291341 HS24_09441:8:2201:10397:52549#42/1
GGCTGCCATCAGTGAGCAAGTAAGAATTTGCAGAAATTTATTAGCACACT
+
CDAF@#@=44465HHHHH'''

# NOTE(review): parse_paired_fastq, fastq_string1, fastq_string2 and StringIO
# are not defined in the visible portion of this file -- confirm they are
# defined/imported before this call.
parse_paired_fastq(StringIO(fastq_string1), StringIO(fastq_string2))


# ### Other comments
#
# In all the examples above, the reads in the FASTQ file are all the same length. This is not necessarily the case, though it is usually true for datasets generated by sequencing-by-synthesis instruments. FASTQ files can contain reads of various lengths.
#
# FASTQ files often have extension `.fastq` or `.fq`.

# ### Other resources
#
# * [Wikipedia page for FASTQ format](http://en.wikipedia.org/wiki/Fastq_format)
# * [BioPython], which has [its own ways of parsing FASTA](http://biopython.org/wiki/SeqIO)
# * [FASTX] toolkit
# * [seqtk]
# * [FastQC]
#
# [BioPython]: http://biopython.org/wiki/Main_Page
# [SeqIO]: http://biopython.org/wiki/SeqIO
# [SAMtools]: http://samtools.sourceforge.net/
# [FASTX]: http://hannonlab.cshl.edu/fastx_toolkit/
# [FASTQC]: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/
# [seqtk]: https://github.com/lh3/seqtk
#
# © Copyright [Ben Langmead](http://www.cs.jhu.edu/~langmea) 2014--2019