import pathlib
import gzip
import Bio.SeqIO
# Root folder that holds the downloaded sample data.
directory = pathlib.Path('samples/ubiome')

# Recursively collect every gzip-compressed FASTQ file under the root, then
# order the paths in place so the listing is stable from run to run.
fastq_paths = list(directory.glob('**/*.fastq.gz'))
fastq_paths.sort()

# Bare expression: shows the collected paths as the notebook cell's output.
fastq_paths
[PosixPath('samples/ubiome/ssr_178900/ssr_178900__R1__L001.fastq.gz'), PosixPath('samples/ubiome/ssr_178900/ssr_178900__R1__L002.fastq.gz'), PosixPath('samples/ubiome/ssr_178900/ssr_178900__R1__L003.fastq.gz'), PosixPath('samples/ubiome/ssr_178900/ssr_178900__R1__L004.fastq.gz'), PosixPath('samples/ubiome/ssr_178900/ssr_178900__R2__L001.fastq.gz'), PosixPath('samples/ubiome/ssr_178900/ssr_178900__R2__L002.fastq.gz'), PosixPath('samples/ubiome/ssr_178900/ssr_178900__R2__L003.fastq.gz'), PosixPath('samples/ubiome/ssr_178900/ssr_178900__R2__L004.fastq.gz')]
%%time
# Load every read from the first FASTQ file into memory so the records can
# be inspected interactively in the following cells.
path = fastq_paths[0]

# Mode 'rt' makes gzip decompress to text rather than bytes, which is the
# handle type Bio.SeqIO's FASTQ parser expects.
with gzip.open(path, 'rt') as file_handle:
    # Bio.SeqIO.parse is consumed into a list here, inside the `with`, so all
    # records are read before the file handle is closed.
    sequences = list(Bio.SeqIO.parse(file_handle, "fastq"))
CPU times: user 220 ms, sys: 8 ms, total: 228 ms Wall time: 236 ms
# Number of records that were parsed into `sequences`.
len(sequences)
10675
# Display the first parsed record to inspect its fields.
sequences[0]
SeqRecord(seq=Seq('AGTGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATGACT...GAG', SingleLetterAlphabet()), id='NB501532:123:HMH2CAFXX:1:11101:8553:1152', name='NB501532:123:HMH2CAFXX:1:11101:8553:1152', description='NB501532:123:HMH2CAFXX:1:11101:8553:1152 1:N:0:GATGATGA+CTACCTCG', dbxrefs=[])