#!/usr/bin/env python # coding: utf-8 # # Reference sequence mapping horserace # # ipyrad is capable of incorporating a reference sequence to aid in the assembly. There are actually 4 different assembly methods, 3 of which use reference sequence in some way. Here we test reference assisted assembly for ipyrad and stacks, as ddocent and aftrrad do not allow for . Though aftrRAD performs nicely on empirical data and does allow for reference assisted assembly, we consider runtimes to be prohibitive and so exclude it from analysis here. # # Ideas for datasets (all have data in SRA): # # Selection and sex-biased dispersal in a coastal shark: the influence of philopatry on adaptive variation # - 134 individuals (paper assembled w/ ddocent) # # Genome-wide data reveal cryptic diversity and genetic introgression in an Oriental cynopterine fruit bat radiation # - < 45 samples, 2 reference genomes in the same family # # Beyond the Coral Triangle: high genetic diversity and near panmixia in Singapore's populations of the broadcast spawning sea star Protoreaster nodosus # - 77 samples, it's a passerine, so there must be something reasonably close # # PSMC (pairwise sequentially Markovian coalescent) analysis of RAD (restriction site associated DNA) sequencing data # - 17 sticklebacks, they used stacks # # For this analysis we chose the paired-end ddRAD dataset from Lah et al 2016 (http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0162792#sec002): # Spatially Explicit Analysis of Genome-Wide SNPs Detects Subtle Population Structure in a Mobile Marine Mammal, the Harbor Porpoise # - 49 samples from 3 populations of European Harbor Porpoise # # In[28]: import subprocess import ipyrad as ip import shutil import glob import sys import os ## Set the default directories for exec and data. WORK_DIR="/home/iovercast/manuscript-analysis/" REFMAP_EMPIRICAL_DIR=os.path.join(WORK_DIR, "Phocoena_empirical/") REFMAP_FASTQS=os.path.join(REFMAP_EMPIRICAL_DIR, "Final_Files_forDryad/Bbif_ddRADseq/fastq/") IPYRAD_DIR=os.path.join(WORK_DIR, "ipyrad/") STACKS_DIR=os.path.join(WORK_DIR, "stacks/") for dir in [WORK_DIR, REFMAP_EMPIRICAL_DIR, IPYRAD_DIR, STACKS_DIR]: if not os.path.exists(dir): os.makedirs(dir) # # Simulated reference sequence mapping # To get some idea of how ipyrad and stacks perform with reference sequence mapping we'll # first assemble a simulated dataset. # # Right now i'm just grabbing the simulated data from the ipyrad ipsimdata directory cuz it's quick and dirty # but if you want to get crazy you can simulate new seqs by ripping code from here: https://github.com/dereneaton/ipyrad/blob/master/tests/cookbook-making-sim-data.ipynb # ### Make directories and fetch the simulated data # The toy simulated data that lives in the ipyrad git repo consists of 1000 simulated loci, 500 of which are present # in the simulated reference sequence. # In[66]: REFMAP_SIM_DIR = os.path.join(WORK_DIR, "REFMAP_SIM/") REFMAP_DAT_DIR = os.path.join(REFMAP_SIM_DIR, "ipsimdata/") IPYRAD_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ipyrad/") STACKS_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "stacks/") DDOCENT_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ddocent/") ## REFMAP_DAT_DIR will be created when we untar ipsimdata.tar.gz for d in [REFMAP_SIM_DIR, IPYRAD_SIM_DIR, STACKS_SIM_DIR, DDOCENT_SIM_DIR]: if not os.path.exists(d): os.makedirs(d) os.chdir(REFMAP_SIM_DIR) get_ipython().system('wget https://github.com/dereneaton/ipyrad/raw/master/tests/ipsimdata.tar.gz') get_ipython().system('tar -xzf ipsimdata.tar.gz') # ## Do ipyrad simulated reference mapping # For the first analysis we'll use the `reference` assembly method which will just toss out # all reads that don't map the the reference sequence. We should expect to recover 500 reads per # sample. # # The toy data runs in ~2 minutes. # In[95]: os.chdir(IPYRAD_SIM_DIR) ## Make a new assembly and set some assembly parameters data = ip.Assembly("refmap-sim") data.set_params("raw_fastq_path", REFMAP_DAT_DIR + "pairddrad_wmerge_example_R*_.fastq.gz") data.set_params("barcodes_path", REFMAP_DAT_DIR + "pairddrad_wmerge_example_barcodes.txt") data.set_params("project_dir", "reference-assembly") data.set_params("assembly_method", "reference") data.set_params("reference_sequence", REFMAP_DAT_DIR + "pairddrad_wmerge_example_genome.fa") data.set_params("datatype", "pairddrad") data.set_params("restriction_overhang", ("TGCAG", "CGG")) data.write_params(force=True) cmd = "ipyrad -p params-refmap-sim.txt -s 1234567 -c 40".format(dir) print(cmd) get_ipython().system('time $cmd') # ## Do ipyrad denovo+reference # Create a new branch and set the assembly method to `denovo+reference`. Now we will expect to recover # 1000 loci per sample (500 from the reference mapping and 500 from de novo). # # Again, the toy data runs in slighly less than 3 minutes. # In[97]: data2 = data.branch("denovo_plus_reference-sim") data2.set_params("assembly_method", "denovo+reference") data2.write_params(force=True) cmd = "ipyrad -p params-denovo_plus_reference-sim.txt -s 1234567 -c 40".format(dir) print(cmd) get_ipython().system('time $cmd') # ## Do ipyrad denovo-reference # Create a new branch and set the assembly method to `denovo+reference`. Now we will expect to recover # 1000 loci per sample (500 from the reference mapping and 500 from de novo). # # Again, the toy data runs in slighly less than 2 minutes. # In[98]: data2 = data.branch("denovo_minus_reference-sim") data2.set_params("assembly_method", "denovo-reference") data2.write_params(force=True) cmd = "ipyrad -p params-denovo_minus_reference-sim.txt -s 1234567 -c 40".format(dir) print(cmd) get_ipython().system('time $cmd') # ## Stacks simulated reference sequence assembly # Stacks needs reference mapped sequences in .bam or .sam format. ~~Since we already did the mapping # in ipyrad we'll just pluck the `*-mapped-sorted.bam` files out of the ipyrad \_refmapping directory.~~ Nope! That would certainly be nice, but we `dereplicate` reads before we map. This is obviously important information # (which ipyrad records and uses downstream), but stacks doesn't know about this, so you get bad quality # mapping. I learned this the hard way. We need to map each sample by hand from the original ipyrad \_edits files. # # On the toy simulated data stacks runs in an impressive 5 seconds. # ### bwa mapping # We are going to poach bwa and samtools from the dDocent install, since we already installed them there. ugh. # # This proceeds in 3 steps. First bwa maps to the reference (-t is # of cores, -v shuts down verbosity). Then samtools pulls out mapped and properly paired reads (-b outputs .bam format, -F 0x804 filters on mapping/pairing). Then it sorts the bam file, and cleans up the dangling sam file. Mapping is quick for the simulated data (<2 minutes). # In[14]: IPYRAD_SIMEDITS_DIR = IPYRAD_SIM_DIR + "reference-assembly/refmap-sim_edits/" REF_SEQ = REFMAP_DAT_DIR + "pairddrad_wmerge_example_genome.fa" ## Sim sample names pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"] pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"] pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"] sim_sample_names = pop1 + pop2 + pop3 for samp in sim_sample_names: R1 = IPYRAD_SIMEDITS_DIR + samp + ".trimmed_R1_.fastq.gz" R2 = IPYRAD_SIMEDITS_DIR + samp + ".trimmed_R2_.fastq.gz" samout = STACKS_SIM_DIR + samp + ".sam" bamout = STACKS_SIM_DIR + samp + ".bam" export_cmd = "export PATH=~/manuscript-analysis/dDocent:$PATH" bwa_cmd = "bwa mem -t 40 -v 1 " + REF_SEQ\ + " " + R1\ + " " + R2\ + " > " + samout samtools_cmd = "samtools view -b -F 0x804 " + samout\ + " | samtools sort -T /tmp/{}.sam -O bam -o {}".format(samp, bamout) cleanup_cmd = "rm {}".format(samout) cmd = ";".join([export_cmd, bwa_cmd, samtools_cmd, cleanup_cmd]) get_ipython().system('$cmd') # In[22]: ## This is how we'd do it since we weren't using a popmap file infiles = ["-s "+ff+" " for ff in glob.glob(STACKS_SIM_DIR + "*.bam")] ## Toggle the dryrun flag for testing DRYRUN="" DRYRUN="-d" ## Options ## -T The number of threads to use ## -O The popmap file specifying individuals and populations ## -S Disable database business ## -o Output directory. Just write to the empirical stacks directory ## -X Tell populations to create the output formats specified ## -X and use `-m 6` which sets min depth per locus OUTPUT_FORMATS = "--vcf --genepop --structure --phylip " cmd = "ref_map.pl -T 40 -b 1 -S " + DRYRUN\ + " -X \'populations:" + OUTPUT_FORMATS + "\'"\ + " -X \'populations:-m 6\'"\ + " -o " + STACKS_SIM_DIR + " "\ + " ".join(infiles) print("\nCommand to run - {}".format(cmd)) # In[21]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$STACKS_SIM_DIR" "$cmd"', 'export PATH="$1/miniconda/bin:$PATH"; export "STACKS_SIM_DIR=$2"; export "cmd=$3"\n\n## We have to play a little cat and mouse game here because of quoting in some of the args\n## and how weird bash is we have to write the cmd to a file and then exec it.\n## If you try to just run $cmd it truncates the command at the first single tic. Hassle.\ncd $STACKS_SIM_DIR\necho $cmd > stacks.sh; chmod 777 stacks.sh\ntime ./stacks.sh\n') # ## dDocent simulated reference assembly # dDocent reference sequence mapping is not clearly documented. It turns out you can skip the initial assembly and go right to mapping/snp calling, but it requires you to copy the reference sequence to the dDocent workind directory as "reference.fasta". This uses the trimmed and QC'd fastq files from the ipyrad simulated run, so it assumes you have already done that and the fq files are still in place. # In[92]: IPYRAD_SIMEDITS_DIR = IPYRAD_SIM_DIR + "reference-assembly/refmap-sim_edits/" REF_SEQ = REFMAP_DAT_DIR + "pairddrad_wmerge_example_genome.fa" DDOCENT_DIR = "/home/iovercast/manuscript-analysis/dDocent/" os.chdir(DDOCENT_SIM_DIR) ## Create a simlink to the reference sequence in the current directory cmd = "ln -sf {} reference.fasta".format(REF_SEQ) get_ipython().system('$cmd') ## Sim sample names pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"] pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"] pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"] sim_sample_names = pop1 + pop2 + pop3 sim_mapping_dict = {} for pop_num, samps in enumerate([pop1, pop2, pop3]): for samp_num, samp_name in enumerate(samps): sim_mapping_dict[samp_name] = "Pop{}_{:03d}".format(pop_num+1, samp_num+1) ## Now we have to rename all the files in the way dDocent expects them: ## 1A_0_R1_.fastq.gz -> Pop1_001.F.fq.gz for k, v in sim_mapping_dict.items(): ## Symlink R1 and R2 for i in ["1", "2"]: source = os.path.join(IPYRAD_SIMEDITS_DIR, k + ".trimmed_R{}_.fastq.gz".format(i)) ## This is the way the current documentation says to name imported trimmed ## files, but it doesn't work. ## dest = os.path.join(DDOCENT_SIM_DIR, v + ".R{}.fq.gz".format(i)) if i == "1": dest = os.path.join(DDOCENT_SIM_DIR, v + ".F.fq.gz".format(i)) else: dest = os.path.join(DDOCENT_SIM_DIR, v + ".R.fq.gz".format(i)) cmd = "ln -sf {} {}".format(source, dest) get_ipython().system('$cmd') config_file = "{}/sim-config.txt".format(DDOCENT_SIM_DIR) with open(config_file, 'w') as outfile: outfile.write('Number of Processors\n40\nMaximum Memory\n0\nTrimming\nno\nAssembly?\nno\nType_of_Assembly\nPE\nClustering_Similarity%\n0.85\nMapping_Reads?\nyes\nMapping_Match_Value\n1\nMapping_MisMatch_Value\n3\nMapping_GapOpen_Penalty\n5\nCalling_SNPs?\nyes\nEmail\nwatdo@mailinator.com\n') cmd = "export LD_LIBRARY_PATH={}/freebayes-src/vcflib/tabixpp/htslib/:$LD_LIBRARY_PATH; ".format(DDOCENT_DIR) cmd += "export PATH={}:$PATH; time dDocent {}".format(DDOCENT_DIR, config_file) print(cmd) with open("ddocent.sh", 'w') as outfile: outfile.write("#!/bin/bash\n") outfile.write(cmd) get_ipython().system('chmod 777 ddocent.sh') # In[93]: ## You have to post-process the vcf files to decompose complex genotypes and remove indels os.chdir(DDOCENT_SIM_DIR) exports = "export LD_LIBRARY_PATH=/home/iovercast/manuscript-analysis/dDocent//freebayes-src/vcflib/tabixpp/htslib/:$LD_LIBRARY_PATH; export PATH=/home/iovercast/manuscript-analysis/dDocent/:$PATH" fullvcf = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.vcf") filtvcf = os.path.join(DDOCENT_SIM_DIR, "Final.recode.vcf") for f in [fullvcf, filtvcf]: print("Finalizing - {}".format(f)) ## Rename the samples to make them agree with the ipyrad/stacks names so ## the results analysis will work. vcffile = os.path.join(DDOCENT_SIM_DIR, f) infile = open(vcffile,'r') filedata = infile.readlines() infile.close() outfile = open(vcffile,'w') for line in filedata: if "CHROM" in line: for ipname, ddname in sim_mapping_dict.items(): line = line.replace(ddname, ipname) outfile.write(line) outfile.close() ## Naming the new outfiles as .snps.vcf ## Decompose complex genotypes and remove indels outfile = os.path.join(DDOCENT_SIM_DIR, f.split("/")[-1].split(".vcf")[0] + ".snps.vcf") cmd = "{}; vcfallelicprimitives {} > ddoc-tmp.vcf".format(exports, f) print(cmd) get_ipython().system('$cmd') cmd = "{}; vcftools --vcf ddoc-tmp.vcf --remove-indels --recode --recode-INFO-all --out {}".format(exports, outfile) print(cmd) get_ipython().system('$cmd') get_ipython().system('rm ddoc-tmp.vcf') # # Empirical reference sequence mapping # ## Fetch the Phocoena raw sequence data # We will use the sra-toolkit command `fastq-dump` to pull the PE reads out of SRA. # This maybe isn't the best way, or the quickest, but it'll get the job done. Takes # ~30 minutes and requires ~70GB of space. After I downloaded the fq I looked at a # couple random samples in FastQC to get an idea where to trim in step2. # In[ ]: os.chdir(REFMAP_EMPIRICAL_DIR) get_ipython().system('mkdir raws') get_ipython().system('cd raws') ## Grab the sra-toolkit pre-built binaries to download from SRA ## This works, but commented for now so it doesn't keep redownloading get_ipython().system('wget http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.8.0/sratoolkit.2.8.0-ubuntu64.tar.gz') get_ipython().system('tar -xvzf sratoolkit*') FQ_DUMP = os.path.join(REFMAP_EMPIRICAL_DIR, "sratoolkit.2.8.0-ubuntu64/bin/fastq-dump") res = subprocess.check_output(FQ_DUMP + " -version", shell=True) ## The SRR numbers for the samples from this bioproject range from SRR4291662 to SRR4291705 ## so go fetch them one by one for samp in range(662, 706): print("Doing {}\t".format(samp)), res = subprocess.check_output(FQ_DUMP + " --split-files SRR4291" + str(samp), shell=True) # In[48]: ## The SRA download files have wonky names, like SRR1234_R1.fastq.gz, but ipyrad expects SRR1234_R1_.fastq.gz, ## so we have to fix the filenames. Filename hax... import glob for f in glob.glob(REFMAP_EMPIRICAL_DIR + "raws/*.fastq.gz"): splits = f.split("/")[-1].split("_") newf = REFMAP_EMPIRICAL_DIR + "raws/" + splits[0] + "_R" + splits[1].split(".")[0] + "_.fastq.gz" os.rename(f, newf) # ## Fetch the bottlenose dolphin genome # Tursiops truncatus reference genome. Divergence time between dolphin and porpoise is approximately 15Mya, which is on the order of divergence between humans and orang. There is also a genome for the Minke whale, which is much more deeply diverged (~30Mya), could be interesting to try both to see how it works. # # Minke whale - http://www.nature.com/ng/journal/v46/n1/full/ng.2835.html#accessions # # SRA Data table for converting fq files to sample name as used in the paper: file:///home/chronos/u-b20882bda0f801c7265d4462f127d0cb4376d46d/Downloads/Tune/SraRunTable.txt # In[30]: os.chdir(REFMAP_EMPIRICAL_DIR) get_ipython().system('mkdir TurtrunRef') get_ipython().system('cd TurtrunRef') get_ipython().system('wget ftp://ftp.ensembl.org/pub/release-87/fasta/tursiops_truncatus/dna/Tursiops_truncatus.turTru1.dna_rm.toplevel.fa.gz') ## Ensembl distributes gzip'd reference sequence files, but samtools really wants it to be bgzipped or uncompressed get_ipython().system('gunzip Tursiops_truncatus.turTru1.dna_rm.toplevel.fa.gz') # ## Trim reads w/ cutadapt # To reduce any potential bias introduced by differences in trimming and filtering methods we will trim reads w/ cutadapt, and QC w/ ipyrad and use this dataset as the starting point for assembly. If you are wondering, you __can__ run fastqc from the command line, like this # # fastqc -o fastqc_out/ SRR4291662_1.fastq SRR4291662_2.fastq # # We'll trim R1 and R2 to 85bp following Lah et al 2016. The `-l` flag for cutadapt specifies the length to which each read will be trimmed. # In[54]: get_ipython().run_cell_magic('bash', '-s "$REFMAP_EMPIRICAL_DIR"', 'cd $1\nmkdir trimmed\nfor i in `ls raws`; do echo $i; cutadapt -l 85 raws/$i | gzip > trimmed/$i; done\n## Housekeeping\nrm -rf raws\nmv trimmed raws\n') # ## Import reads into ipyrad and do QC # Now take the trimmed reads and filter for adapters, minimum sequence length, and max low quality bases. # In[29]: IPYRAD_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "ipyrad/") if not os.path.exists(IPYRAD_REFMAP_DIR): os.makedirs(IPYRAD_REFMAP_DIR) os.chdir(IPYRAD_REFMAP_DIR) # In[57]: ## Make a new assembly and set some assembly parameters data = ip.Assembly("refmap-empirical") data.set_params("sorted_fastq_path", REFMAP_EMPIRICAL_DIR + "raws/*.fastq.gz") data.set_params("project_dir", "reference-assembly") data.set_params("assembly_method", "reference") data.set_params("reference_sequence", REFMAP_EMPIRICAL_DIR + "TurtrunRef/Tursiops_truncatus.turTru1.dna_rm.toplevel.fa") data.set_params("datatype", "pairddrad") data.set_params("restriction_overhang", ("TGCAG", "CGG")) data.set_params('max_low_qual_bases', 5) data.set_params('filter_adapters', 2) data.write_params(force=True) cmd = "ipyrad -p params-refmap-empirical.txt -s 1 --force".format(dir) print(cmd) get_ipython().system('time $cmd') cmd = "ipyrad -p params-refmap-empirical.txt -s 2 --force".format(dir) print(cmd) get_ipython().system('time $cmd') # ## Do ipyrad refmap empirical # Here we will use the ipyrad 'reference' assembly method which will only use reads that successfully map to the reference sequence. This is similar and should be comparable to the way stacks incorporates reference sequences. # # Steps 1 & 2 run in ~15 minutes. # In[58]: ## Oops. If you run some other cell while this is running it steals stdout, so you lose track of progress. cmd = "ipyrad -p params-refmap-empirical.txt -s 34567".format(dir) print(cmd) get_ipython().system('time $cmd') # ## Do ipyrad denovo+reference # Create a new branch and set the assembly method to `denovo+reference`. # In[174]: data2 = data.branch("denovo_ref-empirical") data2.set_params("assembly_method", "denovo+reference") data2.write_params(force=True) cmd = "ipyrad -p params-denovo_ref-empirical.txt -s 34567 -c 40".format(dir) print(cmd) get_ipython().system('time $cmd') # ## Do Stacks refmap empirical # Lah et al trim R1 and R2 to 85bp and then concatenate them for stacks analysis. I thought this was weird, but the stacks manual says "For double-digest RAD data that has been paired-end sequenced, Stacks supports this type of data by treating the loci built from the single-end and paired-end as two independent loci", which I guess is effectively the same thing as concatenating. # # For reference sequence assembly stacks doesn't actually handle the mapping, you need to Do It Yourself and then pull in .sam or .bam files. Since we already do the bwa mapping in ipyrad lets just use those files. The question is do we use the full .sam file or the \*-mapped.bam (which excludes unmapped and unpaired reads). We'll use the \*-mapped.bam files because the stacks manual says the mapped reads should be clean and tidy, so no junk. # # __NB__: Stacks makes the __strong__ assumption that the "left" edge of all reads within a stack line up. It looks like it just uses the StartPos to define a stack (assuming the cut site is the same). We will clean up soft-clipped sequences with ngsutils (https://github.com/ngsutils/ngsutils.git). # # __NB__: Stacks treats R1 and R2 as independent. # # __NB__: The other consequence of being forced to remove soft clipped sequence is that you are bound to be losing some real variation. # # For the empirical data stacks runs in ~2 hours (~8 if you include the time it takes to map the sequences). # In[38]: ## Set directories and make the popmap file STACKS_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "stacks/") if not os.path.exists(STACKS_REFMAP_DIR): os.makedirs(STACKS_REFMAP_DIR) os.chdir(STACKS_REFMAP_DIR) make_stacks_popmap(STACKS_REFMAP_DIR) # ## Map ipyrad trimmed reads to the reference sequence # Based on the same logic as above with the simulated data, we need to remap the raw reads from the empirical ipyrad \_edits directory to make stacks happy. This step consumes a __ton__ of ram and takes a non-negligible amount of time. See docs for sim mapping above for the meaning of all the bwa/samtools flags. # # Mapping the real data will take several hours (5-6 hours if you're lucky). # In[ ]: IPYRAD_EDITS_DIR = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/refmap-empirical_edits/") REF_SEQ = REFMAP_EMPIRICAL_DIR + "TurtrunRef/Tursiops_truncatus.turTru1.dna_rm.toplevel.fa" ## Just get the sample_names = glob.glob(IPYRAD_EDITS_DIR + "*.trimmed_R1_.fastq.gz") sample_names = [x.split(".")[0].split("/")[-1] for x in sample_names] for samp in sample_names: R1 = IPYRAD_EDITS_DIR + samp + ".trimmed_R1_.fastq.gz" R2 = IPYRAD_EDITS_DIR + samp + ".trimmed_R2_.fastq.gz" samout = STACKS_REFMAP_DIR + samp + ".sam" bamout = STACKS_REFMAP_DIR + samp + ".bam" export_cmd = "export PATH=~/manuscript-analysis/dDocent:$PATH" bwa_cmd = "bwa mem -t 40 -v 0 " + REF_SEQ\ + " " + R1\ + " " + R2\ + " > " + samout samtools_cmd = "samtools view -b -F 0x804 " + samout\ + " | samtools sort -T /tmp/{}.sam -O bam -o {}".format(samp, bamout) cleanup_cmd = "rm {}".format(samout) cmd = "; ".join([export_cmd, bwa_cmd, samtools_cmd, cleanup_cmd]) get_ipython().system('time $cmd') # ### Stacks prereqs - install ngsutils # Install ngsutils and use it to remove soft-clipped sequences from each read. We could do this # by rerunning bwa with the `-H` flag, but that would be slightly more annoying. This is not # guaranteed to 'work' because i had a hell of a time getting ngsutils to install on my system. # In[ ]: get_ipython().run_cell_magic('bash', '-s "$REFMAP_EMPIRICAL_DIR"', 'cd $1/stacks\ngit clone https://github.com/ngsutils/ngsutils.git\ncd ngsutils\nmake\n') # ### Now do the filtering # This will apply the `removeclipping` method of bamutils to each mapped bam file in the stacks refmap directory. Then it deletes the old bam file and moves the new bam file to have the name of the old one. On a nice system this takes ~2 minutes per sample (so ~2 hours total). # In[56]: infiles = glob.glob(STACKS_REFMAP_DIR + "SRR*.bam") for f in infiles: outfile = f + ".tmp" print(f, outfile) subprocess.call("bamutils removeclipping {} {}".format(f, outfile), shell=True) subprocess.call("rm {}".format(f), shell=True) subprocess.call("mv {} {}".format(outfile, f), shell=True) # ## Now run stacks ref_map pipeline # We build the stacks command w/ python because then we call the command inside a bash cell because denovo_map expects all the submodules to be in the PATH. The default command line created will include the `-d` flag to do the dry run, so the bash cell won't actually do anything real. But it does create a `stacks.sh` file in the stacks directory that includes the command to run. # In[57]: ## This is how we'd do it if we weren't using a popmap file #infiles = ["-s "+ff+" " for ff in glob.glob(IPYRAD_REFMAP_DIR+"*-mapped-sorted.bam")] ## Toggle the dryrun flag for testing DRYRUN="" DRYRUN="-d" ## Options ## -T The number of threads to use ## -O The popmap file specifying individuals and populations ## -S Disable database business ## -o Output directory. Just write to the empirical stacks directory ## -X The first -X tells populations to create the output formats sepcified ## -X The second one passes `-m 6` which sets min depth per locus OUTPUT_FORMATS = "--vcf --genepop --structure --phylip " cmd = "ref_map.pl -T 40 -b 1 -S " + DRYRUN\ + " -O {}/popmap.txt".format(STACKS_REFMAP_DIR)\ + " --samples {}".format(STACKS_REFMAP_DIR)\ + " -X \'populations:" + OUTPUT_FORMATS + "\'"\ + " -X \'populations:-m 6\'"\ + " -o " + STACKS_REFMAP_DIR print("\nCommand to run - {}".format(cmd)) # In[58]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$STACKS_REFMAP_DIR" "$cmd"', 'export PATH="$1/miniconda/bin:$PATH"; export "STACKS_REFMAP_DIR=$2"; export "cmd=$3"\n\n## We have to play a little cat and mouse game here because of quoting in some of the args\n## and how weird bash is we have to write the cmd to a file and then exec it.\n## If you try to just run $cmd it truncates the command at the first single tic. Hassle.\ncd $STACKS_REFMAP_DIR\necho $cmd > stacks.sh; chmod 777 stacks.sh\ntime ./stacks.sh\n') # ## Do dDocent refmap empirical # dDocent requires a __lot__ of pre-flight housekeeping. You can't use an arbitrary directory for raw files, # you have to puth the raw reads in a __specific__ directory. You also can't use arbitrary file names, you have # to use __specific__ filenames that dDocent requires. zomg. I wonder if we can trick it with symlinks. # # This next cell will do all the housekeeping and then print the nicely formatted command to run. You have to copy # this command and run it by hand in a terminal on the machine you want to run it on bcz it doesn't like running # inside the notebook. # # ~1 hour to munge reads and index the reference sequence. ~3hrs to map the reads and build the vcf. Several hours getting the .fq.gz files to have a naming format that dDocent agreed with. # In[59]: ## A housekeeping function for getting a dictionary to map SRR* filenames in the ipyrad edits directory ## to ddocent style. ## ## Gotcha: Nice 1-based indexing for the dDocent format. ## ## For raw reads the format (for R1) is pop1_sample1.F.fq.gz format a la: ## 1A_0_R1_.fastq.gz -> Pop1_Sample1.F.fq.gz ## ## For trimmed reads the format is pop1_001.R1.fq.gz a la: ## 1A_0_R1_.fastq.gz -> Pop1_001.R1.fq.gz ## So annoying because we have to translate across a bunch of different mappings. ugh. def get_ddocent_filename_mapping(): mapping_dict = {} ## Maps sample name to population pop_dict = get_popdict() pops = set(pop_dict.values()) ## For each population go through and add items to the dict per sample ## So we have to map the sample name to the SRR and then make an entry ## mapping SRR file name to ddocent format for i, pop in enumerate(pops): ## Get a list of all the samples in this population. This is probably a stupid way but it works. samps = [item[0] for item in pop_dict.items() if item[1] == pop] for j, samp in enumerate(samps): mapping_dict[samp] = "Pop{}_{:03d}".format(i+1, j+1) ## For the untrimmed format, if you want dDocent to do the trimming ## mapping_dict[samp] = "Pop{}_Sample{}".format(i, j) return mapping_dict print(get_ddocent_filename_mapping()) # In[87]: ## Set up directory structures. change the force flag if you want to ## blow everything away and restart # force = True force = "" DDOCENT_DIR = "/home/iovercast/manuscript-analysis/dDocent/" DDOCENT_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "ddocent/") if force and os.path.exists(DDOCENT_REFMAP_DIR): shutil.rmtree(DDOCENT_REFMAP_DIR) if not os.path.exists(DDOCENT_REFMAP_DIR): os.makedirs(DDOCENT_REFMAP_DIR) os.chdir(DDOCENT_REFMAP_DIR) ## Create a simlink to the reference sequence in the current directory REF_SEQ = REFMAP_EMPIRICAL_DIR + "TurtrunRef/Tursiops_truncatus.turTru1.dna_rm.toplevel.fa" cmd = "ln -s {} reference.fasta".format(REF_SEQ) get_ipython().system('$cmd') ## Now we have to rename all the files in the way dDocent expects them: ## 1A_0_R1_.fastq.gz -> Pop1_Sample1.F.fq.gz ## Make symlinks to the trimmed data files in the ipyrad directory. It _should_ work. ## Trimmed reads in the ipyrad directory are of the format: SRR4291681.trimmed_R1_.fastq.gz IPYRAD_EDITS_DIR = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/refmap-empirical_edits/") name_mapping = get_ddocent_filename_mapping() for k,v in name_mapping.items(): ## Symlink R1 and R2 for i in ["1", "2"]: source = os.path.join(IPYRAD_EDITS_DIR, k + ".trimmed_R{}_.fastq.gz".format(i)) ##dest = os.path.join(DDOCENT_REFMAP_DIR, v + ".R{}.fq.gz".format(i)) if i == "1": dest = os.path.join(DDOCENT_REFMAP_DIR, v + ".R1.fq.gz".format(i)) else: dest = os.path.join(DDOCENT_REFMAP_DIR, v + ".R2.fq.gz".format(i)) cmd = "ln -sf {} {}".format(source, dest) get_ipython().system('$cmd') ## Write out the config file for this run. ## Compacted the config file into one long line here to make it not take up so much room ## Trimming = no because we trimmed in ipyrad ## Assembly = no because we are providing a reverence sequence ## Type of Assembly = PE for paired-end config_file = "{}/empirical-config.txt".format(DDOCENT_REFMAP_DIR) with open(config_file, 'w') as outfile: outfile.write('Number of Processors\n40\nMaximum Memory\n0\nTrimming\nno\nAssembly?\nno\nType_of_Assembly\nPE\nClustering_Similarity%\n0.85\nMapping_Reads?\nyes\nMapping_Match_Value\n1\nMapping_MisMatch_Value\n3\nMapping_GapOpen_Penalty\n5\nCalling_SNPs?\nyes\nEmail\nwatdo@mailinator.com\n') cmd = "export LD_LIBRARY_PATH={}/freebayes-src/vcflib/tabixpp/htslib/:$LD_LIBRARY_PATH; ".format(DDOCENT_DIR) cmd += "export PATH={}:$PATH; time dDocent {}".format(DDOCENT_DIR, config_file) print(cmd) with open("ddocent.sh", 'w') as outfile: outfile.write("#!/bin/bash\n") outfile.write(cmd) get_ipython().system('chmod 777 ddocent.sh') ## Have to run the printed command by hand from the ddocent REALDATA dir bcz it doesn't like running in the notebook #!$cmd ## NB: Must rename all the samples in the output vcf and then use vcf-shuffle-cols ## perl script in the vcf/perl directory to reorder the vcf file to match ## the output of stacks and ipyrad for pca/heatmaps to work. # In[ ]: ## You have to post-process the vcf files to decompose complex genotypes and remove indels os.chdir(DDOCENT_REFMAP_DIR) exports = "export LD_LIBRARY_PATH=/home/iovercast/manuscript-analysis/dDocent//freebayes-src/vcflib/tabixpp/htslib/:$LD_LIBRARY_PATH; export PATH=/home/iovercast/manuscript-analysis/dDocent/:$PATH" fullvcf = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.vcf") filtvcf = os.path.join(DDOCENT_REFMAP_DIR, "Final.recode.vcf") for f in [fullvcf, filtvcf]: print("Finalizing - {}".format(f)) ## Rename the samples to make them agree with the ipyrad/stacks names so ## the results analysis will work. vcffile = f infile = open(vcffile,'r') filedata = infile.readlines() infile.close() outfile = open(vcffile,'w') for line in filedata: if "CHROM" in line: for ipname, ddname in name_mapping.items(): line = line.replace(ddname, ipname) outfile.write(line) outfile.close() ## Rename columns to match ipyrad and then resort columns to be in same order IPYRAD_VCF = os.path.join(IPYRAD_REFMAP_DIR, "refmap-empirical_outfiles/refmap-empirical.vcf") os.chdir(os.path.join(DDOCENT_DIR, "vcftools_0.1.11/perl")) tmpvcf = os.path.join(DDOCENT_REFMAP_DIR, "ddocent-tmp.vcf") cmd = "perl vcf-shuffle-cols -t {} {} > {}".format(IPYRAD_VCF, vcffile, tmpvcf) print(cmd) #!$cmd os.chdir(DDOCENT_REFMAP_DIR) ## Naming the new outfiles as .snps.vcf ## Decompose complex genotypes and remove indels outfile = os.path.join(DDOCENT_REFMAP_DIR, f.split("/")[-1].split(".vcf")[0] + ".snps.vcf") cmd = "{}; vcfallelicprimitives {} > ddoc-tmp.vcf".format(exports, f) print(cmd) get_ipython().system('$cmd') cmd = "{}; vcftools --vcf ddoc-tmp.vcf --remove-indels --recode --recode-INFO-all --out {}".format(exports, outfile) print(cmd) get_ipython().system('$cmd') get_ipython().system('rm ddoc-tmp.vcf') # # Housekeeping # This should be at the top, but i'm leaving it here so it's out of the way. Just some housekeeping for translating between SRA sequence file name and sample name from the paper. This is only for the benefit of stacks populations script. # ## Fetch info from SRA for mapping between SRR accession numbers and sample names # The files that come down from SRA are not helpfully named. We have to create a mapping between sample # names from the paper and SRR numbers. # # Get the RunInfo table from here: https://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP090334 # # In[23]: def get_sampsdict(): info_header = "BioSample_s Experiment_s Library_Name_s MBases_l MBytes_l Run_s SRA_Sample_s Sample_Name_s dev_stage_s ecotype_s lat_lon_s sex_s tissue_s Assay_Type_s AssemblyName_s BioProject_s BioSampleModel_s Center_Name_s Consent_s InsertSize_l LibraryLayout_s LibrarySelection_s LibrarySource_s LoadDate_s Organism_s Platform_s ReleaseDate_s SRA_Study_s g1k_analysis_group_s g1k_pop_code_s source_s" info = """SAMN05806468 SRX2187156 Pp01 595 395 SRR4291662 SRS1709994 Pp01 relicta 44.09 N 29.81 E female muscle OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806469 SRX2187157 Pp02 478 318 SRR4291663 SRS1709995 Pp02 relicta 41.42 N 28.92 E female muscle OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806478 SRX2187158 Pp11 242 162 SRR4291664 SRS1709996 Pp11 adult phocoena 54.96 N 8.32 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806479 SRX2187159 Pp12 261 174 SRR4291665 SRS1709997 Pp12 adult phocoena 54.95 N 8.32 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806480 SRX2187160 Pp13 595 397 SRR4291666 SRS1709998 Pp13 juvenile phocoena 54.16 N 8.82 E male skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806481 SRX2187161 Pp14 769 511 SRR4291667 SRS1709999 Pp14 phocoena 57.00 N 11.00 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806482 SRX2187162 Pp15 624 414 SRR4291668 SRS1710000 Pp15 phocoena 56.89 N 12.50 E male skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806483 SRX2187163 Pp16 665 446 SRR4291669 SRS1710001 Pp16 phocoena 57.37 N 9.68 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806484 SRX2187164 Pp17 264 177 SRR4291670 SRS1710002 Pp17 phocoena 57.59 N 10.10 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806485 SRX2187165 Pp18 684 453 SRR4291671 SRS1710003 Pp18 phocoena 58.93 N 11.15 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806486 SRX2187166 Pp19 601 398 SRR4291672 SRS1710004 Pp19 phocoena 55.43 N 107.0 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806487 SRX2187167 Pp20 392 261 SRR4291673 SRS1710005 Pp20 phocoena 55.97 N 11.18 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806470 SRX2187168 Pp03 471 316 SRR4291674 SRS1710006 Pp03 relicta 41.48 N 28.31 E female muscle OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806488 SRX2187169 Pp21 592 397 SRR4291675 SRS1710007 Pp21 phocoena 55.43 N 10.70 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806489 SRX2187170 Pp22 446 300 SRR4291676 SRS1710008 Pp22 phocoena 56.25 N 12.82 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806490 SRX2187171 Pp23 617 409 SRR4291677 SRS1710009 Pp23 phocoena 56.65 N 12.85 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806491 SRX2187172 Pp24 554 367 SRR4291678 SRS1710010 Pp24 phocoena 56.00 N 12.00 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806492 SRX2187173 Pp25 753 500 SRR4291679 SRS1710011 Pp25 juvenile phocoena 55.00 N 10.23 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806493 SRX2187174 Pp26 530 353 SRR4291680 SRS1710012 Pp26 phocoena 54.38 N 10.99 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806494 SRX2187175 Pp27 639 426 SRR4291681 SRS1710013 Pp27 juvenile phocoena 54.83 N 9.62 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806495 SRX2187176 Pp28 646 430 SRR4291682 SRS1710014 Pp28 juvenile phocoena 54.59 N 10.03 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806496 SRX2187177 Pp29 374 247 SRR4291683 SRS1710015 Pp29 juvenile phocoena 54.42 N 11.55 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806497 SRX2187178 Pp30 569 376 SRR4291684 SRS1710016 Pp30 juvenile phocoena 54.53 N 11.12 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806471 SRX2187179 Pp04 451 303 SRR4291685 SRS1710017 Pp04 relicta 41.65 N 28.27 E female muscle OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806498 SRX2187180 Pp31 578 384 SRR4291686 SRS1710018 Pp31 adult phocoena 54.53 N 11.11 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806499 SRX2187181 Pp32 586 392 SRR4291687 SRS1710019 Pp32 juvenile phocoena 54.32 N 13.09 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806500 SRX2187182 Pp33 288 189 SRR4291688 SRS1710020 Pp33 juvenile phocoena 54.46 N 12.54 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806501 SRX2187183 Pp34 587 389 SRR4291689 SRS1710021 Pp34 phocoena 54.32 N 13.09 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806502 SRX2187184 Pp35 496 330 SRR4291690 SRS1710022 Pp35 phocoena 55.00 N 14.00 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806503 SRX2187185 Pp36 1085 720 SRR4291691 SRS1710023 Pp36 juvenile phocoena 56.00 N 15.00 E male skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806504 SRX2187186 Pp37 214 141 SRR4291692 SRS1710024 Pp37 phocoena 55.56 N 17.63 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806505 SRX2187187 Pp38 397 263 SRR4291693 SRS1710025 Pp38 phocoena 55.50 N 17.00 E female muscle OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806506 SRX2187188 Pp39 670 447 SRR4291694 SRS1710026 Pp39 juvenile phocoena 56.00 N 16.00 E male muscle OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806507 SRX2187189 Pp40 342 226 SRR4291695 SRS1710027 Pp40 phocoena 54.73 N 18.58 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806472 SRX2187190 Pp05 611 406 SRR4291696 SRS1710028 Pp05 phocoena 64.78 N 13.22 E male skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806508 SRX2187191 Pp41 586 389 SRR4291697 SRS1710029 Pp41 phocoena 54.80 N 18.44 E female muscle OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806509 SRX2187192 Pp42 329 219 SRR4291698 SRS1710030 Pp42 phocoena 54.67 N 18.59 E male muscle OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806510 SRX2187193 Pp43 517 343 SRR4291699 SRS1710031 Pp43 juvenile phocoena 57.00 N 20.00 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806511 SRX2187194 Pp44 491 326 SRR4291700 SRS1710032 Pp44 adult phocoena 57.01 N 20.00 E male skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806473 SRX2187195 Pp06 632 423 SRR4291701 SRS1710033 Pp06 phocoena 64.58 N 13.58 E male skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806474 SRX2187196 Pp07 905 602 SRR4291702 SRS1710034 Pp07 phocoena 64.31 N 14.00 E male skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806475 SRX2187197 Pp08 585 390 SRR4291703 SRS1710035 Pp08 adult phocoena 54.70 N 8.33 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806476 SRX2187198 Pp09 590 392 SRR4291704 SRS1710036 Pp09 phocoena 54.30 N 8.93 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 SAMN05806477 SRX2187199 Pp10 625 414 SRR4291705 SRS1710037 Pp10 adult phocoena 55.47 N 8.38 E female skin OTHER PRJNA343959 Model organism or animal public 0 PAIRED Restriction Digest GENOMIC 2016-09-22 Phocoena phocoena ILLUMINA 2016-09-27 SRP090334 """.split("\n") samps_dict = {} for i in info: line = i.split("\t") samps_dict[line[2]] = line[5] return(samps_dict) # ## Create a population map # Supplementary table S1 (http://journals.plos.org/plosone/article/file?type=supplementary&id=info:doi/10.1371/journal.pone.0162792.s006) contains detailed sample information. We can just extract the sample names and populations they belong to. # In[24]: def get_popdict(): samps_dict = get_sampsdict() popmap = \ """01 WBS 02 WBS 03 WBS 04 WBS 05 IS 06 IS 07 IS 08 NOS 09 NOS 10 NOS 11 NOS 12 NOS 13 NOS 14 SK1 15 SK1 16 SK1 17 SK1 18 SK1 19 KB1 20 KB1 21 KB1 22 KB1 23 KB1 24 KB1 25 BES2 26 BES2 27 BES2 28 BES2 29 BES2 30 BES2 31 BES2 32 BES2 33 BES2 34 BES2 35 IBS 36 IBS 37 IBS 38 IBS 39 IBS 40 IBS 41 IBS 42 IBS 43 IBS 44 IBS""".split("\n") pop_dict = {} for i in popmap: line = i.split("\t") pop_dict[samps_dict["Pp"+line[0]]] = line[1] return(pop_dict) # In[25]: ## Adding "-mapped-sorted" to each individual name to avoid having to rename the .bam files created by ipyrad def make_stacks_popmap(OUTDIR): pop_dict = get_popdict() out = os.path.join(OUTDIR, "popmap.txt") print("Writing popmap file to {}".format(out)) with open(out, 'w') as outfile: for k,v in pop_dict.items(): outfile.write(k + "\t" + v + "\n") # # Ignore all below here # ~~Oh man. I thought dDocent could use a reference sequence for assembly, but it totally can't. I did all this work # to get the housekeeping in order, but then finally figured out at the last minute while setting the config # that it just can't use an external reference sequence. zomg. Keeping this here cuz it might be useful some day # in the event that dDocent makes this possible.~~ Nvm, it _can_ do reference sequence mapping, it's just not well documented. # In[ ]: