In [1]:

import glob
import os
import yaml
import re
import copy
from io import StringIO
import pandas as pd
import numpy as np

In [2]:

fps = ['Bs_S106_L001_R1_001.fastq.gz',
       'Bs_S106_L001_R2_001.fastq.gz',
       'Bs_S106_L002_R1_001.fastq.gz',
       'Bs_S106_L002_R2_001.fastq.gz',
       'Vf_S104_L001_R1_001.fastq.gz',
       'Vf_S104_L001_R2_001.fastq.gz',
       'Vf_S104_L002_R1_001.fastq.gz',
       'Vf_S104_L002_R2_001.fastq.gz']

exp_df = pd.DataFrame.from_dict({'Extension': {0: 'fastq.gz',
  1: 'fastq.gz',
  2: 'fastq.gz',
  3: 'fastq.gz',
  4: 'fastq.gz',
  5: 'fastq.gz',
  6: 'fastq.gz',
  7: 'fastq.gz'},
 'File': {0: 'Bs_S106_L001_R1_001.fastq.gz',
  1: 'Bs_S106_L001_R2_001.fastq.gz',
  2: 'Bs_S106_L002_R1_001.fastq.gz',
  3: 'Bs_S106_L002_R2_001.fastq.gz',
  4: 'Vf_S104_L001_R1_001.fastq.gz',
  5: 'Vf_S104_L001_R2_001.fastq.gz',
  6: 'Vf_S104_L002_R1_001.fastq.gz',
  7: 'Vf_S104_L002_R2_001.fastq.gz'},
 'Index': {0: 'S106',
  1: 'S106',
  2: 'S106',
  3: 'S106',
  4: 'S104',
  5: 'S104',
  6: 'S104',
  7: 'S104'},
 'Lane': {0: 'L001',
  1: 'L001',
  2: 'L002',
  3: 'L002',
  4: 'L001',
  5: 'L001',
  6: 'L002',
  7: 'L002'},
 'Read': {0: 'R1',
  1: 'R2',
  2: 'R1',
  3: 'R2',
  4: 'R1',
  5: 'R2',
  6: 'R1',
  7: 'R2'},
 'Run': {0: '001',
  1: '001',
  2: '001',
  3: '001',
  4: '001',
  5: '001',
  6: '001',
  7: '001'},
 'Sample': {0: 'Bs',
  1: 'Bs',
  2: 'Bs',
  3: 'Bs',
  4: 'Vf',
  5: 'Vf',
  6: 'Vf',
  7: 'Vf'}})

def parse_ilm_fps_to_df(fps,
                        pattern = '^((.+?)_(S\d+)_(L\d+)_(R[12])_(\d+)\.(.+))$',
                        pattern_names = ['File','Sample','Index','Lane','Read','Run','Extension']):
    
    p = re.compile(pattern)

    df = pd.DataFrame(columns = pattern_names)

    for f in fps:
        m = p.match(f)

        if m:
            df = df.append(dict(zip(pattern_names, m.groups())), ignore_index = True)

    return(df)

obs_df = parse_ilm_fps_to_df(fps)
pd.util.testing.assert_frame_equal(obs_df.sort_index(axis=1), exp_df.sort_index(axis=1))

df = exp_df
seq_dir = './example/reads'
exp_dict = {'Bs': {'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
                               './example/reads/Bs_S106_L002_R1_001.fastq.gz'],
                   'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
                               './example/reads/Bs_S106_L002_R2_001.fastq.gz']},
            'Vf': {'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
                               './example/reads/Vf_S104_L002_R1_001.fastq.gz'],
                   'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
                               './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}

def get_sample_reads_df(df, seq_dir):
    sample_reads_dict = {}
    
    samples = list(set(df['Sample']))
    
    for s in samples:
        f_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R1'),'File'].values))
        r_fps = sorted(list(df.loc[(df['Sample'] == s) & (df['Read'] == 'R2'),'File'].values))

        sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
                                'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
    
    return(sample_reads_dict)

obs_dict = get_sample_reads_df(df, seq_dir)
assert(exp_dict == obs_dict)

def get_sample_paths(seq_dir):
    fps = os.listdir(seq_dir)
    
    files_df = parse_ilm_fps_to_df(fps)
    sample_reads_dict = get_sample_reads_df(files_df, seq_dir)
    
    return(sample_reads_dict)

exp_db_dict = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
                      'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R2_001.fastq.gz']},
               'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
                      'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}

exp_db_dict_update = {'Bs': {'filter_db': '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens',
                      'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R2_001.fastq.gz']},
               'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
                      'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}

exp_db_dict_partial = {'Bs': {'filter_db': None,
                      'forward': ['./example/reads/Bs_S106_L001_R1_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Bs_S106_L001_R2_001.fastq.gz',
                                  './example/reads/Bs_S106_L002_R2_001.fastq.gz']},
               'Vf': {'filter_db': '/home/jgsanders/ref_data/genomes/mouse/mouse',
                      'forward': ['./example/reads/Vf_S104_L001_R1_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R1_001.fastq.gz'],
                      'reverse': ['./example/reads/Vf_S104_L001_R2_001.fastq.gz',
                                  './example/reads/Vf_S104_L002_R2_001.fastq.gz']}}

def add_filter_db(sample_fp_dict, db_fp, samples = None):
    
    if samples is None:
        samples = sample_fp_dict.keys()
    
    samples_dict = copy.deepcopy(sample_fp_dict)
    
    for s in samples_dict:
        if s in samples:
            samples_dict[s]['filter_db'] = db_fp
        elif 'filter_db' in samples_dict[s]:
            continue
        else:
            samples_dict[s]['filter_db'] = None
        

        
    return(samples_dict)

obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens')

assert(obs_dict == exp_db_dict)

obs_dict = add_filter_db(exp_db_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])

assert(obs_dict == exp_db_dict_update)

obs_dict = add_filter_db(exp_dict, '/home/jgsanders/ref_data/genomes/mouse/mouse', samples = ['Vf'])

assert(obs_dict == exp_db_dict_partial)

Steps:

Get samples
- read in sample sheet
- guess samples from sequence folder
Get params
- filter db per sample
- samples for binning
- samples for abundance profiling
- tmp dir
- envs
- software
- assembler
- trimmer
- params
  - atropos
  - maxbin
  - humann2
  - metaphlan
  - kraken
  - shogun?
format yaml

Step 1: get samples¶

First, we need to read in a set of samples for analysis.

There are two options for this: guess the sample names, or read them in from a sample sheet or manifest.

Option 1: guess sample names from sequence output folder.¶

Enter the correct sequencing directory below, and a list of some example filenames should pop up. Make sure these look correct.

In [3]:

seq_dir = './test_data/test_reads/'
!ls -l {seq_dir} | head

total 219216
-rw-r--r--+ 1 jonsanders  staff  8797141 Sep 12 16:52 S22205_S104_L001_R1_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  9666850 Sep 12 16:52 S22205_S104_L001_R2_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  8982873 Sep 12 16:52 S22207_S103_L001_R1_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  9939119 Sep 12 16:52 S22207_S103_L001_R2_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  9012384 Sep 12 16:52 S22282_S102_L001_R1_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  9930763 Sep 12 16:52 S22282_S102_L001_R2_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  8901974 Sep 12 16:52 S22400_S101_L001_R1_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  9781744 Sep 12 16:52 S22400_S101_L001_R2_001.fastq.gz
-rw-r--r--+ 1 jonsanders  staff  8864399 Sep 12 16:52 S22401_S100_L001_R1_001.fastq.gz

If that looks correct, execute the following cell. It should result in a dictionary associating samples to filepaths.

To validate, a portion of the dictionary is converted to yaml and printed below:

In [4]:

sample_paths = get_sample_paths(seq_dir)

print(yaml.dump(sample_paths, default_flow_style = False))

S22205:
  forward:
  - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz
S22207:
  forward:
  - ./test_data/test_reads/S22207_S103_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22207_S103_L001_R2_001.fastq.gz
S22282:
  forward:
  - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz
S22400:
  forward:
  - ./test_data/test_reads/S22400_S101_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22400_S101_L001_R2_001.fastq.gz
S22401:
  forward:
  - ./test_data/test_reads/S22401_S100_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22401_S100_L001_R2_001.fastq.gz
S22402:
  forward:
  - ./test_data/test_reads/S22402_S105_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22402_S105_L001_R2_001.fastq.gz

Option 2: read sample names and prefixes from sample sheet¶

Read in samples from the sample sheet used to demultiplex them.

This is useful if, for example, you have nonstandard file naming, only want to process a subset of sequences in a run, or want to associate other names to the sequences.

In [5]:

# read in sample sheet

def read_sample_sheet(f, sep='\t'):
    data = False
    data_lines = ''
    for line in f:
        if data:
            if line.startswith('[') or line.strip() == '':
                data = False
                continue
            data_lines += line
        elif line.startswith('[Data]'):
            data = True
            continue
        else:
            continue
            
    data_df = pd.read_csv(StringIO(data_lines), sep = '\t', comment='#')
    return(data_df)

with open('./test_data/example_sample_sheet.txt', 'r') as f:
    ss_df = read_sample_sheet(f)

ss_df.head()

Out[5]:

	Lane	Sample_ID	Sample_Name	Sample_Plate	Sample_Well	I7_Index_ID	index	I5_Index_ID	index2	Sample_Project	Description
0	1	S22205	S22205	Example Plate 1	A1	iTru7_101_01	ACGTTACC	iTru5_01_A	ACCGACAA	Example Project	sample_S22205
1	1	S22282	S22282	Example Plate 1	B1	iTru7_101_02	CTGTGTTG	iTru5_01_B	AGTGGCAA	Example Project	sample_S22282

In [6]:

seq_dir = './test_data/test_reads/'

fps = os.listdir(seq_dir)

files_df = parse_ilm_fps_to_df(fps)

files_df

Out[6]:

	File	Sample	Index	Lane	Read	Run	Extension
0	S22205_S104_L001_R1_001.fastq.gz	S22205	S104	L001	R1	001	fastq.gz
1	S22205_S104_L001_R2_001.fastq.gz	S22205	S104	L001	R2	001	fastq.gz
2	S22207_S103_L001_R1_001.fastq.gz	S22207	S103	L001	R1	001	fastq.gz
3	S22207_S103_L001_R2_001.fastq.gz	S22207	S103	L001	R2	001	fastq.gz
4	S22282_S102_L001_R1_001.fastq.gz	S22282	S102	L001	R1	001	fastq.gz
5	S22282_S102_L001_R2_001.fastq.gz	S22282	S102	L001	R2	001	fastq.gz
6	S22400_S101_L001_R1_001.fastq.gz	S22400	S101	L001	R1	001	fastq.gz
7	S22400_S101_L001_R2_001.fastq.gz	S22400	S101	L001	R2	001	fastq.gz
8	S22401_S100_L001_R1_001.fastq.gz	S22401	S100	L001	R1	001	fastq.gz
9	S22401_S100_L001_R2_001.fastq.gz	S22401	S100	L001	R2	001	fastq.gz
10	S22402_S105_L001_R1_001.fastq.gz	S22402	S105	L001	R1	001	fastq.gz
11	S22402_S105_L001_R2_001.fastq.gz	S22402	S105	L001	R2	001	fastq.gz

In [7]:

def get_sample_reads_df_from_sample_sheet(ss_df, seq_dir, sample_col='Description', prefix_col='Sample_ID'):
    sample_reads_dict = {}
    
    samples = list(set(ss_df[sample_col]))
    
    fps = os.listdir(seq_dir)

    files_df = parse_ilm_fps_to_df(fps)
    
    for s in samples:
        # get the subset of sample sheet rows for that sample
        sample_rows = ss_df.loc[ss_df[sample_col] == s,]
        
        f_fps = []
        r_fps = []
        
        # iter across subset table and find file corresponding to each row
        for idx, row in sample_rows.iterrows():
            lane = 'L{0:03d}'.format(row['Lane']) 
            
            f_fps.extend(files_df.loc[(files_df['Sample'] == row['Sample_ID']) & 
                                      (files_df['Lane'] == lane) &
                                      (files_df['Read'] == 'R1'), 'File'].values)
            
            r_fps.extend(files_df.loc[(files_df['Sample'] == row['Sample_ID']) & 
                                      (files_df['Lane'] == lane) &
                                      (files_df['Read'] == 'R2'), 'File'].values)
        
        f_fps = sorted(f_fps)
        r_fps = sorted(r_fps)
        
        sample_reads_dict[s] = {'forward': [os.path.join(seq_dir, x) for x in f_fps],
                                'reverse': [os.path.join(seq_dir, x) for x in r_fps]}
    
    return(sample_reads_dict)

sample_paths = get_sample_reads_df_from_sample_sheet(ss_df, seq_dir,  sample_col='Description', prefix_col='Sample_ID')
print(yaml.dump(sample_paths, default_flow_style = False))

sample_S22205:
  forward:
  - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz
sample_S22282:
  forward:
  - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz

Step 2: define parameters¶

- filter db per sample
- samples for binning
- samples for abundance profiling
- tmp dir
- envs
- software
- assembler
- trimmer
- params
    * atropos
    * maxbin 
    * humann2
    * metaphlan
    * kraken
    * shogun?

In [8]:

config_dict = {}

Add filter databases per sample¶

These are the filepaths to the databases used for host filtering.

The method 'add host filter db' adds the provided filepath to the sample dictionary you created above.

By default, all samples get same database.

You can also provide a list of specific sample names to update if you want to set different filter dbs for some samples. To do this, just re-execute add_filter_db providing a list of samples to update; the existing filter_db paths will remain unchanged if they are not in this list.

Example:

filter_db1 = '/home/jgsanders/ref_data/genomes/Homo_sapiens_Bowtie2_v0.1/Homo_sapiens'
filter_db2 = '/home/jgsanders/ref_data/genomes/mouse/mouse'

samples_dict = add_filter_db(sample_paths, filter_db1)
samples_dict = add_filter_db(samples_dict, filter_db2, samples = ['Vf'])

In [9]:

filter_db = './test_data/phix'

samples_dict = add_filter_db(sample_paths, filter_db)

print(yaml.dump(samples_dict, default_flow_style = False))

sample_S22205:
  filter_db: ./test_data/phix
  forward:
  - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz
sample_S22282:
  filter_db: ./test_data/phix
  forward:
  - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz
  reverse:
  - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz

Add samples for binning¶

These are the samples from which assembled contigs will be binned into putative draft genomes.

By default, bin all samples.

In [10]:

binning_samples = list(samples_dict.keys())

config_dict['binning_samples'] = binning_samples

Add samples for abundance profiling¶

These are the samples from which abundance information will be used for binning of binning_samples

By default, use all samples.

In [11]:

# This line filters out BLANK* samples from abundance mapping
abundance_samples = [x for x in samples_dict if not x.startswith('BLANK')]

# This line uses all samples for abundance mapping
abundance_samples = list(samples_dict.keys())

config_dict['abundance_samples'] = abundance_samples

Add trimmer to use¶

These are the read trimmers to use for qc.

Currently, the available options are:

- atropos
- skewer

In [12]:

config_dict['trimmer'] = 'atropos'

Add assemblers to use¶

These are the assemblers to use for assembly.

Currently, the available options are:

- spades
- metaspades
- megahit

In [13]:

assemblers = ['megahit','metaspades']

config_dict['assemblers'] = assemblers

Add assembly to use for binning¶

This is the assembler to use for binning.

Currently, the available options are:

- spades
- metaspades
- megahit

In [14]:

config_dict['mapping_assembler'] = 'metaspades'

Add temporary directory¶

This is the directory used during execution of rules. Should ideally be local scratch storage.

In [15]:

tmp_dir_root = './'

config_dict['tmp_dir_root'] = tmp_dir_root

Add environments¶

These are the conda environments to use for various rules.

env is the primary snakemake_assemble environment. Change others as necessary for other rules.

In [16]:

envs = {'anvi': 'source activate oecophylla-anvi',
        'humann2': 'source activate oecophylla-humann2',
        'kraken': 'source activate oecophylla-kraken',
        'shogun': 'source activate oecophylla-shogun',
        'qc': 'source activate oecophylla-qc',
        'raw': 'source activate oecophylla-qc'}

anvio resources¶

Sets paths to resources for anvio

In [17]:

resources =  {'centrifuge_base': '/home/jgsanders/miniconda/envs/anvio2/centrifuge',
              'centrifuge_models': '/home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v'}
config_dict['resources'] = resources

Add parameters definitions¶

These are parameters passed to the various tools used.

In [18]:

params = {}

Atropos params¶

Sets quality and adapter trimming parameters.

Some suggested defaults: Nextera: -a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT -q 15 --minimum-length 100 --pair-filter any KapaHyperPlus: -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any

In [19]:

params['atropos'] = ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT -q 15 --minimum-length 100 --pair-filter any'

maxbin params¶

Sets binning parameters.

In [20]:

params['maxbin'] = '-plotmarker'

Kraken params¶

Sets parameters for Kraken taxonomic profiling.

In [21]:

params['kraken'] = {'kraken_db': '/home/qiz173/Databases/Kraken/stdb'}

SHOGUN params¶

Sets parameters for Shogun taxonomic profiling.

In [22]:

params['shogun'] = "--utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr"

MetaPhlAn2 params¶

Sets parameters for MetaPhlAn2 taxonomic profilling.

In [23]:

params['metaphlan'] = {'metaphlan_dir': '/home/jgsanders/git_sw/metaphlan2'}

humann2¶

Sets various parameters for HUMAnN2 funcitonal profiling.

humann2_aa_db: Amino acid database for translated amino acid search humann2_nt_db: ChocoPhlAn database for nucleotide search metaphan_dir: path to metaphlan2 directory install, should also have the default metaphlan2 db norms: normalizations for which to output tables other: any other HUMAnN2 parameters to use

In [24]:

humann2 = {'humann2_aa_db': 'humann2_aa_db: /home/jgsanders/ref_data/humann2/uniref',
           'humann2_nt_db': '/home/jgsanders/ref_data/humann2/chocophlan',
           'metaphlan_dir': '/home/jgsanders/share/metaphlan2',
           'norms': ['cpm','relab'],
           'other': ''}

params['humann2'] = humann2

mash¶

parameters for mash

other: mash command-line parameters for sketching
refseq_db: path to refseq db sketch; default /home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.msh
depth: number of reads to make sketch from. forward and reverse reads will be independently subsampled to reach total. Leave blank ('') to skip subsampling.

In [25]:

mash = {'other': '-r -m 2 -k 21 -s 1000',
        'refseq_db': '/home/jgsanders/ref_data/mash/RefSeqSketchesDefaults.msh',
        'depth': '100000'}

Step 3: format yaml¶

Now the entire configuration dictionary can be formatted as a yaml file and saved for use in the pipeline.

In [29]:

config_dict['params'] = params
config_dict['samples'] = samples_dict
config_dict['envs'] = envs

Format as yaml and write to filepath:

In [30]:

config_yaml = yaml.dump(config_dict, default_flow_style = False)

with open('config.yaml', 'w') as f:
    f.write(config_yaml)

This is the complete config yaml:

In [31]:

print(config_yaml)

abundance_samples:
- sample_S22282
- sample_S22205
assemblers:
- megahit
- metaspades
binning_samples:
- sample_S22282
- sample_S22205
envs:
  anvi: source activate oecophylla-anvi
  humann2: source activate oecophylla-humann2
  kraken: source activate oecophylla-kraken
  qc: source activate oecophylla-qc
  raw: source activate oecophylla-qc
  shogun: source activate oecophylla-shogun
mapping_assembler: metaspades
params:
  atropos: ' -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT
    -q 15 --minimum-length 100 --pair-filter any'
  humann2:
    humann2_aa_db: 'humann2_aa_db: /home/jgsanders/ref_data/humann2/uniref'
    humann2_nt_db: /home/jgsanders/ref_data/humann2/chocophlan
    metaphlan_dir: /home/jgsanders/share/metaphlan2
    norms:
    - cpm
    - relab
    other: ''
  kraken:
    kraken_db: /home/qiz173/Databases/Kraken/stdb
  maxbin: -plotmarker
  metaphlan:
    metaphlan_dir: /home/jgsanders/git_sw/metaphlan2
  shogun: --utree_indx /home/qiz173/Databases/SHOGUN/annotated/utree/stdb.ctr
resources:
  centrifuge_base: /home/jgsanders/miniconda/envs/anvio2/centrifuge
  centrifuge_models: /home/jgsanders/miniconda/envs/anvio2/centrifuge/b+h+v/b+h+v
samples:
  sample_S22205:
    filter_db: ./test_data/phix
    forward:
    - ./test_data/test_reads/S22205_S104_L001_R1_001.fastq.gz
    reverse:
    - ./test_data/test_reads/S22205_S104_L001_R2_001.fastq.gz
  sample_S22282:
    filter_db: ./test_data/phix
    forward:
    - ./test_data/test_reads/S22282_S102_L001_R1_001.fastq.gz
    reverse:
    - ./test_data/test_reads/S22282_S102_L001_R2_001.fastq.gz
tmp_dir_root: ./
trimmer: atropos

In [ ]: