#!/usr/bin/env python
# coding: utf-8

# ## Use taxonomic read classifications from MEGAN6 to extract day and treatment Phylum-specific FastQs from Arthropoda and Alveolata

# In[1]:


get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n')


# ### Set variables

# In[2]:


# Set data directories
get_ipython().run_line_magic('env', 'crab_data=/home/sam/data/C_bairdi/RNAseq')
get_ipython().run_line_magic('env', 'hemat_data=/home/sam/data/Hematodinium/RNAseq')
get_ipython().run_line_magic('env', 'wd=/home/sam/analyses')
get_ipython().run_line_magic('env', 'line=------------------------------------------------------------------------')

# Programs
get_ipython().run_line_magic('env', 'seqtk=/home/sam/programs/seqtk-1.3/seqtk')


# #### Input data are here:
# 
# FastAs:
# - https://gannet.fish.washington.edu/Atumefaciens/20200114_cbai_MEGAN_read_extractions/
# 
# - https://gannet.fish.washington.edu/Atumefaciens/20200323_cbai_MEGAN_read_extractions/
# 
# Trimmed-FastQs:
# 
# - https://gannet.fish.washington.edu/Atumefaciens/20191218_cbai_fastp_RNAseq_trimming/
# 
# - https://gannet.fish.washington.edu/Atumefaciens/20200318_cbai_RNAseq_fastp_trimming/

# ### Extract phylum-specific reads from trimmed FastQ files
# 
# 
# Use FastA IDs from MEGAN6 taxonomic read extraction FastAs to pull out appropriate reads from each phylum (Arthropoda and Alveolata). This is performed because MEGAN6 strips paired read ID after the first space. As such, the resulting read extractions using MEGAN end up with a FastA file containing two reads with identicial headers. Not sure if this will cause any downstream issues (i.e. with Trinity) where paired end data is used, so playing it safe and using the truncated IDs to pull FastQs with complete sequence headers for use in subsequent data wrangling.

# In[3]:


get_ipython().run_cell_magic('bash', '', '\ntimestamp="20200413"\n\n# Create associative arrays\n# NOTE: These will require Bash >4.0\n\n## Infection status\ndeclare -A inf_status_array\n## Sampling day\ndeclare -A sample_day_array\n## Sample temperature array\ndeclare -A sample_temp_array\n\n# Create sample list\n{\necho "113_D9_uninfected_cold"\necho "118_D9_infected_ambient"\necho "127_D9_infected_warm"\necho "132_D9_infected_ambient"\necho "151_D9_infected_cold"\necho "173_D9_infected_warm"\necho "178_D9_infected_ambient"\necho "221_D12_uninfected_cold"\necho "222_D12_uninfected_cold"\necho "254_D12_infected_cold"\necho "272_D12_infected_warm"\necho "280_D12_infected_warm"\necho "294_D12_infected_warm"\necho "334_D12_infected_ambient"\necho "349_D12_infected_ambient"\necho "359_D12_infected_ambient"\necho "425_D26_uninfected_cold"\necho "427_D26_uninfected_cold"\necho "445_D26_infected_cold"\necho "463_D26_infected_ambient"\necho "481_D26_infected_ambient"\necho "485_D26_infected_ambient"\necho "72_D9_infected_warm"\necho "73_D9_uninfected_cold"\necho "329774_D12_infected"\necho "329775_D12_uninfected"\necho "329776_D26_infected"\necho "329777_D26_uninfected"\n} >> crab-sample-list.txt\n\n\n# Populate arrays\n# Uses underscore as internal field separator (IFS)\n# Reads each field in as a variable name (e.g. sampl)\nwhile IFS="_" read -r sample day infection temp\ndo\n  inf_status_array[$sample]=$infection\n  sample_day_array[$sample]=$day\n  sample_temp_array[$sample]=$temp\ndone < crab-sample-list.txt\n\n# Remove the sample list file\nrm crab-sample-list.txt\n\n# Check the arrays\n\necho "Printing array values:"\necho ""\n\nfor key in "${!inf_status_array[@]}"\ndo\n  printf "[%s]=%s,%s,%s\\n" \\\n  "$key" \\\n  "${inf_status_array[$key]}" \\\n  "${sample_day_array[$key]}" \\\n  "${sample_temp_array[$key]}"\ndone\n\necho ""\necho "${line}"\necho ""\n\nfor directory in ${crab_data} ${hemat_data}\ndo\n\t# Get species name\n\tspecies=$(echo "${directory}" | awk -F"/" \'{print $5}\')\n    \n    # Make new directory and change to that directory ("$_" means use previous command\'s argument)\n    mkdir --parents "${wd}"/"${timestamp}"_"${species}"_megan_reads \\\n    && cd "$_" || exit\n\n\t# Set seqtk list filename\n\tseqtk_list=${timestamp}.${species}.seqtk.read_id.list\n\n\t# Set output FastQ filenames\n    prefix=${timestamp}.${species}\n    R1_suffix=megan_R1.fq\n    R2_suffix=megan_R2.fq\n\n\t######################################################\n\t# Create FastA IDs list to use for sequence extraction\n\t######################################################\n    for fasta in "${directory}"/*.fasta\n\tdo\n      echo "${fasta}" >> fasta-list.txt\n    done\n    \n    for fasta in "${directory}"/*.fasta\n\tdo\n      grep ">" "${fasta}" | awk \'sub(/^>/, "")\'\n\tdone | sort -u >> "${seqtk_list}"\n    \n    \n    echo ""\n    echo "Finished with FastA ID extraction."\n    echo ""\n    echo "Moving on to read extractions..." \n    echo ""\n    echo ""\n    \n    \n    ######################################################\n\t# Extract corresponding R1 and R2 reads using seqtk FastA ID list\n    ######################################################\n\tfor fastq in "${directory}"/*R1*.gz\n    do\n      # Strip path from filename\n      fastq_nopath=${fastq##*/}\n                     \n      # Get sample ID from FastQ filename\n      sample=$(echo "${fastq_nopath}" | awk -F "_" \'{print $1}\')\n\n      # Ignore sample 304428 - it\'s a general pool of various sample types\n      if [ "${sample}" != "304428" ]\n      then\n\n        # Pull infection status, sample day and temp from associative arrays\n        inf_status=${inf_status_array[$sample]}\n        sample_day=${sample_day_array[$sample]}\n        temp=${sample_temp_array[$sample]}\n          \n        # Set output filename\n        ## Does not set temp value in filename when the temp value is empty in array\n        if [[ ${sample_temp_array[$sample]} ]]; then\n          R1_out="${prefix}.${sample}.${sample_day}.${inf_status}.${temp}.${R1_suffix}"\n        else\n          R1_out="${prefix}.${sample}.${sample_day}.${inf_status}.${R1_suffix}"\n        fi\n        \n   \n        echo "Extracting R1 reads from ${fastq}."\n        echo ""\n        echo "Writing R1 reads to ${R1_out}"\n        echo ""\n        echo ""\n      \n        # Use seqtk to pull out desired FastQ reads\n  \t    ${seqtk} subseq "${fastq}" "${seqtk_list}" >> "${R1_out}"\n      fi\n    done\n    \n    echo ""\n    echo "Done with R1 read extractions"\n    echo ""\n    echo "${line}"\n    echo ""\n\n\tfor fastq in "${directory}"/*R2*.gz\n\tdo\n       # Strip path from filename\n      fastq_nopath=${fastq##*/}\n                     \n      # Get sample ID from FastQ filename\n      sample=$(echo "${fastq_nopath}" | awk -F "_" \'{print $1}\')\n\n\t  # Ignore sample 304428 - it\'s a general pool of various sample types\n      if [ "${sample}" != "304428" ]\n      then\n\t\t\t\n        # Pull infection status and sample day from associative array  \n        inf_status=${inf_status_array[$sample]}\n        sample_day=${sample_day_array[$sample]}\n        temp=${sample_temp_array[$sample]}\n          \n        # Set output filename\n        ## Does not set temp value in filename when the temp value is empty in array\n        if [[ ${sample_temp_array[$sample]} ]]; then\n          R2_out="${prefix}.${sample}.${sample_day}.${inf_status}.${temp}.${R2_suffix}"\n        else\n          R2_out="${prefix}.${sample}.${sample_day}.${inf_status}.${R2_suffix}"\n        fi\n   \n        echo "Extracting R2 reads from ${fastq}."\n        echo ""\n        echo "Writing R2 reads to ${R2_out}"\n        echo ""\n        echo ""\n                     \n  \t   ${seqtk} subseq "${fastq}" "${seqtk_list}" >> "${R2_out}"\n     fi\n\tdone\n    \n    echo "${line}"\n    echo ""\n    # Print working directory and list files\n    pwd\n\tls -lh\n    echo ""\n    echo "${line}"\n    echo ""\ndone\n')


# In[ ]: