import glob
import os
import yaml
import re
import copy
from io import StringIO
import pandas as pd
import numpy as np
fps = ['Bs_S106_L001_R1_001.fastq.gz',
'Bs_S106_L001_R2_001.fastq.gz',
'Bs_S106_L002_R1_001.fastq.gz',
'Bs_S106_L002_R2_001.fastq.gz',
'Vf_S104_L001_R1_001.fastq.gz',
'Vf_S104_L001_R2_001.fastq.gz',
'Vf_S104_L002_R1_001.fastq.gz',
'Vf_S104_L002_R2_001.fastq.gz']
exp_df = pd.DataFrame.from_dict({'Extension': {0: 'fastq.gz',
1: 'fastq.gz',
2: 'fastq.gz',
3: 'fastq.gz',
4: 'fastq.gz',
5: 'fastq.gz',
6: 'fastq.gz',
7: 'fastq.gz'},
'File': {0: 'Bs_S106_L001_R1_001.fastq.gz',
1: 'Bs_S106_L001_R2_001.fastq.gz',
2: 'Bs_S106_L002_R1_001.fastq.gz',
3: 'Bs_S106_L002_R2_001.fastq.gz',
4: 'Vf_S104_L001_R1_001.fastq.gz',
5: 'Vf_S104_L001_R2_001.fastq.gz',
6: 'Vf_S104_L002_R1_001.fastq.gz',
7: 'Vf_S104_L002_R2_001.fastq.gz'},
'Index': {0: 'S106',
1: 'S106',
2: 'S106',
3: 'S106',
4: 'S104',
5: 'S104',
6: 'S104',
7: 'S104'},
'Lane': {0: 'L001',
1: 'L001',
2: 'L002',
3: 'L002',
4: 'L001',
5: 'L001',
6: 'L002',
7: 'L002'},
'Read': {0: 'R1',
1: 'R2',
2: 'R1',
3: 'R2',
4: 'R1',
5: 'R2',
6: 'R1',
7: 'R2'},
'Run': {0: '001',
1: '001',
2: '001',
3: '001',
4: '001',
5: '001',
6: '001',
7: '001'},
'Sample': {0: 'Bs',
1: 'Bs',
2: 'Bs',
3: 'Bs',
4: 'Vf',
5: 'Vf',
6: 'Vf',
7: 'Vf'}})
def parse_ilm_fps_to_df(fps,
pattern = '^((.+?)_(S\d+)_(L\d+)_(R[12])_(\d+)\.(.+))$',
pattern_names = ['File','Sample','Index','Lane','Read','Run','Extension']):
p = re.compile(pattern)
df = pd.DataFrame(columns = pattern_names)
for f in fps:
m = p.match(f)
if m:
df = df.append(dict(zip(pattern_names, m.groups())), ignore_index = True)
return(df)
obs_df = parse_ilm_fps_to_df(fps)
pd.util.testing.assert_frame_equal(obs_df.sort_index(axis=1), exp_df.sort_index(axis=1))
df = exp_df
seq_dir = './example/reads'
exp_dict = {'Bs': {'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
def get_sample_reads_df(df, seq_dir):
sample_reads_dict = {}
samples = list(set(df['Sample']))
for s in samples:
f_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R1'),'File'].values))
r_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R2'),'File'].values))
sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
return(sample_reads_dict)
obs_dict = get_sample_reads_df(df, seq_dir)
assert(exp_dict == obs_dict)
def get_sample_paths(seq_dir):
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
sample_reads_dict = get_sample_reads_df(files_df, seq_dir)
return(sample_reads_dict)
exp_db_dict = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
exp_db_dict_update = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
exp_db_dict_partial = {'Bs': {'filter_db': None,
'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
'./example/reads/Bs_S106_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
'./example/reads/Bs_S106_L002_R2_001.fastq.gz']},
'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
'./example/reads/Vf_S104_L002_R1_001.fastq.gz'],
'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
'./example/reads/Vf_S104_L002_R2_001.fastq.gz']}}
def add_filter_db(sample_fp_dict, db_fp, samples = None):
if samples is None:
samples = sample_fp_dict.keys()
samples_dict = copy.deepcopy(sample_fp_dict)
for s in samples_dict:
if s in samples:
samples_dict[s]['filter_db'] = db_fp
elif 'filter_db' in samples_dict[s]:
continue
else:
samples_dict[s]['filter_db'] = None
return(samples_dict)
obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens')
assert(obs_dict == exp_db_dict)
obs_dict = add_filter_db(exp_db_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])
assert(obs_dict == exp_db_dict_update)
obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])
assert(obs_dict == exp_db_dict_partial)
Steps:
First, we need to read in a set of samples for analysis.
There are two options for this: guess the sample names, or read them in from a sample sheet or manifest.
Enter the correct sequencing directory below, and a list of some example filenames should pop up. Make sure these look correct.
seq_dir = './test_data/test_reads/'
!ls -l {seq_dir} | head
total 219216 -rw-r--r--+ 1 jonsanders staff 8797141 Sep 12 16:52 S22205_S104_L001_R1_001.fastq.gz -rw-r--r--+ 1 jonsanders staff 9666850 Sep 12 16:52 S22205_S104_L001_R2_001.fastq.gz -rw-r--r--+ 1 jonsanders staff 8982873 Sep 12 16:52 S22207_S103_L001_R1_001.fastq.gz -rw-r--r--+ 1 jonsanders staff 9939119 Sep 12 16:52 S22207_S103_L001_R2_001.fastq.gz -rw-r--r--+ 1 jonsanders staff 9012384 Sep 12 16:52 S22282_S102_L001_R1_001.fastq.gz -rw-r--r--+ 1 jonsanders staff 9930763 Sep 12 16:52 S22282_S102_L001_R2_001.fastq.gz -rw-r--r--+ 1 jonsanders staff 8901974 Sep 12 16:52 S22400_S101_L001_R1_001.fastq.gz -rw-r--r--+ 1 jonsanders staff 9781744 Sep 12 16:52 S22400_S101_L001_R2_001.fastq.gz -rw-r--r--+ 1 jonsanders staff 8864399 Sep 12 16:52 S22401_S100_L001_R1_001.fastq.gz
If that looks correct, execute the following cell. It should result in a dictionary associating samples to filepaths.
To validate, a portion of the dictionary is converted to yaml and printed below:
sample_paths = get_sample_paths(seq_dir)
print(yaml.dump(sample_paths, default_flow_style = False))
S22205: forward: - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz S22207: forward: - ./test_data/test_reads/S22207_S103_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22207_S103_L001_R2_001.fastq.gz S22282: forward: - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz S22400: forward: - ./test_data/test_reads/S22400_S101_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22400_S101_L001_R2_001.fastq.gz S22401: forward: - ./test_data/test_reads/S22401_S100_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22401_S100_L001_R2_001.fastq.gz S22402: forward: - ./test_data/test_reads/S22402_S105_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22402_S105_L001_R2_001.fastq.gz
Read in samples from the sample sheet used to demultiplex them.
This is useful if, for example, you have nonstandard file naming, only want to process a subset of sequences in a run, or want to associate other names to the sequences.
# read in sample sheet
def read_sample_sheet(f, sep='\t'):
data = False
data_lines = ''
for line in f:
if data:
if line.startswith('[') or line.strip() == '':
data = False
continue
data_lines += line
elif line.startswith('[Data]'):
data = True
continue
else:
continue
data_df = pd.read_csv(StringIO(data_lines), sep = '\t', comment='#')
return(data_df)
with open('./test_data/example_sample_sheet.txt', 'r') as f:
ss_df = read_sample_sheet(f)
ss_df.head()
Lane | Sample_ID | Sample_Name | Sample_Plate | Sample_Well | I7_Index_ID | index | I5_Index_ID | index2 | Sample_Project | Description | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | S22205 | S22205 | Example Plate 1 | A1 | iTru7_101_01 | ACGTTACC | iTru5_01_A | ACCGACAA | Example Project | sample_S22205 |
1 | 1 | S22282 | S22282 | Example Plate 1 | B1 | iTru7_101_02 | CTGTGTTG | iTru5_01_B | AGTGGCAA | Example Project | sample_S22282 |
seq_dir = './test_data/test_reads/'
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
files_df
File | Sample | Index | Lane | Read | Run | Extension | |
---|---|---|---|---|---|---|---|
0 | S22205_S104_L001_R1_001.fastq.gz | S22205 | S104 | L001 | R1 | 001 | fastq.gz |
1 | S22205_S104_L001_R2_001.fastq.gz | S22205 | S104 | L001 | R2 | 001 | fastq.gz |
2 | S22207_S103_L001_R1_001.fastq.gz | S22207 | S103 | L001 | R1 | 001 | fastq.gz |
3 | S22207_S103_L001_R2_001.fastq.gz | S22207 | S103 | L001 | R2 | 001 | fastq.gz |
4 | S22282_S102_L001_R1_001.fastq.gz | S22282 | S102 | L001 | R1 | 001 | fastq.gz |
5 | S22282_S102_L001_R2_001.fastq.gz | S22282 | S102 | L001 | R2 | 001 | fastq.gz |
6 | S22400_S101_L001_R1_001.fastq.gz | S22400 | S101 | L001 | R1 | 001 | fastq.gz |
7 | S22400_S101_L001_R2_001.fastq.gz | S22400 | S101 | L001 | R2 | 001 | fastq.gz |
8 | S22401_S100_L001_R1_001.fastq.gz | S22401 | S100 | L001 | R1 | 001 | fastq.gz |
9 | S22401_S100_L001_R2_001.fastq.gz | S22401 | S100 | L001 | R2 | 001 | fastq.gz |
10 | S22402_S105_L001_R1_001.fastq.gz | S22402 | S105 | L001 | R1 | 001 | fastq.gz |
11 | S22402_S105_L001_R2_001.fastq.gz | S22402 | S105 | L001 | R2 | 001 | fastq.gz |
def get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID'):
sample_reads_dict = {}
samples = list(set(ss_df[sample_col]))
fps = os.listdir(seq_dir)
files_df = parse_ilm_fps_to_df(fps)
for s in samples:
# get the subset of sample sheet rows for that sample
sample_rows = ss_df.loc[ss_df[sample_col] == s,]
f_fps = []
r_fps = []
# iter across subset table and find file corresponding to each row
for idx, row in sample_rows.iterrows():
lane = 'L{0:03d}'.format(row['Lane'])
f_fps.extend(files_df.loc[(files_df['Sample'] == row['Sample_ID']) &
(files_df['Lane'] == lane) &
(files_df['Read'] == 'R1'), 'File'].values)
r_fps.extend(files_df.loc[(files_df['Sample'] == row['Sample_ID']) &
(files_df['Lane'] == lane) &
(files_df['Read'] == 'R2'), 'File'].values)
f_fps = sorted(f_fps)
r_fps = sorted(r_fps)
sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
return(sample_reads_dict)
sample_paths = get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID')
print(yaml.dump(sample_paths, default_flow_style = False))
sample_S22205: forward: - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz sample_S22282: forward: - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz
- filter db per sample
- samples for binning
- samples for abundance profiling
- tmp dir
- envs
- software
- assembler
- trimmer
- params
* atropos
* maxbin
* humann2
* metaphlan
* kraken
* shogun?
config_dict = {}
These are the filepaths to the databases used for host filtering.
The method 'add host filter db' adds the provided filepath to the sample dictionary you created above.
By default, all samples get same database.
You can also provide a list of specific sample names to update if you want to set different filter dbs for some samples. To do this, just re-execute add_filter_db
providing a list of samples to update; the existing filter_db
paths will remain unchanged if they are not in this list.
Example:
filter_db1 = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'
filter_db2 = '/home/jgsanders/ref_data/genomes/mouse/mouse'
samples_dict = add_filter_db(sample_paths, filter_db1)
samples_dict = add_filter_db(samples_dict, filter_db2, samples = ['Vf'])
filter_db = './test_data/phix'
samples_dict = add_filter_db(sample_paths, filter_db)
print(yaml.dump(samples_dict, default_flow_style = False))
sample_S22205: filter_db: ./test_data/phix forward: - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz sample_S22282: filter_db: ./test_data/phix forward: - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz
These are the samples from which assembled contigs will be binned into putative draft genomes.
By default, bin all samples.
binning_samples = list(samples_dict.keys())
config_dict['binning_samples'] = binning_samples
These are the samples from which abundance information will be used for binning of binning_samples
By default, use all samples.
# This line filters out BLANK* samples from abundance mapping
abundance_samples = [x for x in samples_dict if not x.startswith('BLANK')]
# This line uses all samples for abundance mapping
abundance_samples = list(samples_dict.keys())
config_dict['abundance_samples'] = abundance_samples
These are the read trimmers to use for qc.
Currently, the available options are:
- atropos
- skewer
config_dict['trimmer'] = 'atropos'
These are the assemblers to use for assembly.
Currently, the available options are:
- spades
- metaspades
- megahit
assemblers = ['megahit','metaspades']
config_dict['assemblers'] = assemblers
This is the assembler to use for binning.
Currently, the available options are:
- spades
- metaspades
- megahit
config_dict['mapping_assembler'] = 'metaspades'
This is the directory used during execution of rules. Should ideally be local scratch storage.
tmp_dir_root = './'
config_dict['tmp_dir_root'] = tmp_dir_root
These are the conda environments to use for various rules.
env
is the primary snakemake_assemble
environment. Change others as necessary for other rules.
envs = {'anvi': 'source activate oecophylla-anvi',
'humann2': 'source activate oecophylla-humann2',
'kraken': 'source activate oecophylla-kraken',
'shogun': 'source activate oecophylla-shogun',
'qc': 'source activate oecophylla-qc',
'raw': 'source activate oecophylla-qc'}
Sets paths to resources for anvio
resources = {'centrifuge_base': '/home/jgsanders/miniconda/envs/anvio2/centrifuge',
'centrifuge_models': '/home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v'}
config_dict['resources'] = resources
These are parameters passed to the various tools used.
params = {}
Sets quality and adapter trimming parameters.
Some suggested defaults:
Nextera: -a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT -q 15 --minimum-length 100 --pair-filter any
KapaHyperPlus: -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any
params['atropos'] = ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any'
Sets binning parameters.
params['maxbin'] = '-plotmarker'
Sets parameters for Kraken taxonomic profiling.
params['kraken'] = {'kraken_db': '/home/qiz173/Databases/Kraken/stdb'}
Sets parameters for Shogun taxonomic profiling.
params['shogun'] = "--utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr"
Sets parameters for MetaPhlAn2 taxonomic profilling.
params['metaphlan'] = {'metaphlan_dir': '/home/jgsanders/git_sw/metaphlan2'}
Sets various parameters for HUMAnN2 funcitonal profiling.
humann2_aa_db
: Amino acid database for translated amino acid search
humann2_nt_db
: ChocoPhlAn database for nucleotide search
metaphan_dir
: path to metaphlan2 directory install, should also have the default metaphlan2 db
norms
: normalizations for which to output tables
other
: any other HUMAnN2 parameters to use
humann2 = {'humann2_aa_db': 'humann2_aa_db: /home/jgsanders/ref_data/humann2/uniref',
'humann2_nt_db': '/home/jgsanders/ref_data/humann2/chocophlan',
'metaphlan_dir': '/home/jgsanders/share/metaphlan2',
'norms': ['cpm','relab'],
'other': ''}
params['humann2'] = humann2
parameters for mash
other
: mash command-line parameters for sketchingrefseq_db
: path to refseq db sketch; default /home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.mshdepth
: number of reads to make sketch from. forward and reverse reads will be independently subsampled to reach total. Leave blank (''
) to skip subsampling.mash = {'other': '-r -m 2 -k 21 -s 1000',
'refseq_db': '/home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.msh',
'depth': '100000'}
Now the entire configuration dictionary can be formatted as a yaml file and saved for use in the pipeline.
config_dict['params'] = params
config_dict['samples'] = samples_dict
config_dict['envs'] = envs
Format as yaml and write to filepath:
config_yaml = yaml.dump(config_dict, default_flow_style = False)
with open('config.yaml', 'w') as f:
f.write(config_yaml)
This is the complete config yaml:
print(config_yaml)
abundance_samples: - sample_S22282 - sample_S22205 assemblers: - megahit - metaspades binning_samples: - sample_S22282 - sample_S22205 envs: anvi: source activate oecophylla-anvi humann2: source activate oecophylla-humann2 kraken: source activate oecophylla-kraken qc: source activate oecophylla-qc raw: source activate oecophylla-qc shogun: source activate oecophylla-shogun mapping_assembler: metaspades params: atropos: ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any' humann2: humann2_aa_db: 'humann2_aa_db: /home/jgsanders/ref_data/humann2/uniref' humann2_nt_db: /home/jgsanders/ref_data/humann2/chocophlan metaphlan_dir: /home/jgsanders/share/metaphlan2 norms: - cpm - relab other: '' kraken: kraken_db: /home/qiz173/Databases/Kraken/stdb maxbin: -plotmarker metaphlan: metaphlan_dir: /home/jgsanders/git_sw/metaphlan2 shogun: --utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr resources: centrifuge_base: /home/jgsanders/miniconda/envs/anvio2/centrifuge centrifuge_models: /home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v samples: sample_S22205: filter_db: ./test_data/phix forward: - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz sample_S22282: filter_db: ./test_data/phix forward: - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz reverse: - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz tmp_dir_root: ./ trimmer: atropos