#queryID #taxID #score
query1 9606 0.9
query2 9534 0.8
%%bash
# Record when and where this notebook was run: date, OS release, hostname,
# CPU details and memory. Sections are separated by a short dashed rule.
rule="------------"

printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "${rule}"

# Display operating system info
lsb_release -a
printf '\n%s\n' "${rule}"

printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "${rule}"

printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "${rule}"

# Memory totals in human-readable units
printf '%s\n\n' "Memory Specs"
free -mh
# Working directory for this analysis.
# %env exports the value as an environment variable, which is how the bash
# cells further down read it as ${work_dir}.
%env work_dir = /home/sam/analyses/20190612_metagenomics_ind_coverage_krona
# Same path again, as a Python variable.
# NOTE(review): presumably kept for IPython-side expansion ($work_dir / {work_dir},
# e.g. in a `cd` magic) - confirm, and keep it in sync with the %env value above.
work_dir = "/home/sam/analyses/20190612_metagenomics_ind_coverage_krona"
# Path to the KronaTools ktImportTaxonomy executable, exported with %env so a
# bash cell can invoke it as "${krona}".
%env krona = /home/sam/programs/KronaTools-2.7/bin/ktImportTaxonomy
%%bash
# Create the working directory, including any missing parent directories.
mkdir --parents "${work_dir}"
# Change into the working directory so the relative paths used later resolve there.
# NOTE(review): a `cd` executed inside a %%bash cell only affects that cell's own
# shell process. Confirm this line runs as an IPython `cd` (expanding the Python
# variable work_dir) rather than as bash; otherwise later cells keep the old cwd.
cd $work_dir
%%bash
# Fetch the input data into the current directory.
# Both downloads share these options: crawl the remote directory recursively,
# never ascend to its parent, save files flat (no directory tree), no output.
common_wget_opts=(
  --no-directories
  --recursive
  --no-parent
  --quiet
)

# Download BLAST output files (only names accepted by the "outfmt6" filter)
wget "${common_wget_opts[@]}" \
  --accept outfmt6 \
  http://gannet.fish.washington.edu/Atumefaciens/20190516_metagenomics_pgen_blastx/

# Download coverage files (only names accepted by the "coverage.txt" filter)
wget "${common_wget_opts[@]}" \
  --accept coverage.txt \
  https://gannet.fish.washington.edu/Atumefaciens/20190327_metagenomics_pgen_megahit/

# List what is now in the directory, oldest first
ls -ltrh
In the BLASTx output files (outfmt6), column 1 contains the query ID and column 13 contains the NCBI Taxonomy ID.
%%bash
# Preview the first 10 lines of one raw BLASTx table
head -n 10 MG1_pH82.blastx.outfmt6
%%bash
# Preview the first 10 lines of one coverage table, with its tab-separated
# fields aligned into columns for readability
head -n 10 MG1.coverage.txt \
  | column -s $'\t' -t
%%bash
#######################################
# For every *.coverage.txt file in the current directory, write a
# two-column, tab-separated table (ID, coverage) sorted on the ID column.
# Globals:   none
# Arguments: none
# Outputs:   writes <sample>.ID.coverage.sorted.txt next to each input,
#            where <sample> is the file name up to its first "."
# Returns:   status of the last command executed
#######################################
extract_sorted_coverage() {
  local file sample

  for file in *.coverage.txt; do
    # When nothing matches, bash leaves the pattern as a literal string;
    # skip it so no bogus "*.ID.coverage.sorted.txt" file is created.
    [[ -e "${file}" ]] || continue

    # Parses sample name (everything before the first ".")
    sample="${file%%.*}"

    # Skips header line and prints ID and coverage.
    # awk splits on any whitespace by default, so per the original author's
    # note column 1 is the Query ID and column 5 is the Avg_fold coverage.
    # Sort by first column only, as required for the later join.
    awk 'NR>1 {print $1 "\t" $5}' "${file}" \
      | sort -k1,1 \
      > "${sample}.ID.coverage.sorted.txt"
  done
}

extract_sorted_coverage

ls -ltrh
echo ""
# Check output format
head MG1.ID.coverage.sorted.txt
%%bash
#######################################
# Sort every BLASTx outfmt6 table in the current directory on its first
# column so it can be joined against the sorted coverage tables.
# Globals:   none
# Arguments: none
# Outputs:   writes <sample>.blastx.sorted.outfmt6 for each input, where
#            <sample> is the file name up to its first "."
# Returns:   status of the last command executed
#######################################
sort_blastx_outputs() {
  local file sample

  for file in *outfmt6; do
    # Unmatched glob stays a literal pattern; nothing to do in that case.
    [[ -e "${file}" ]] || continue

    # The glob also matches this function's own outputs. Sorting one of those
    # would redirect onto the file being read ("sort f > f"), truncating it
    # to empty - so re-running the cell must skip already-sorted files.
    if [[ "${file}" == *.blastx.sorted.outfmt6 ]]; then
      continue
    fi

    # Parse sample name (everything before the first ".")
    sample="${file%%.*}"

    sort -k1,1 "${file}" \
      > "${sample}.blastx.sorted.outfmt6"
  done
}

sort_blastx_outputs

ls -ltrh
echo ""
head MG1_pH82.blastx.sorted.outfmt6
%%bash
#######################################
# Print a rule of 100 dashes followed by a newline, to visually separate
# sections of the cell output.
#######################################
print_rule() {
  printf '%.0s-' {1..100}
  printf -- "\n"
}

#######################################
# Join each sorted coverage table with the sorted BLASTx table at the same
# position in the (glob-ordered) file lists, producing one tab-separated
# table per sample for Krona.
# Output columns: column 1 of the coverage file (ID), column 13 of the
# BLASTx file, column 2 of the coverage file (coverage).
# Globals:   none
# Arguments: none (reads MG*.blastx.sorted.outfmt6 and
#            MG*.ID.coverage.sorted.txt in the current directory)
# Outputs:   progress on stdout, errors on stderr; writes
#            <sample>.krona-coverage.tsv, where <sample> is the BLASTx file
#            name up to its first "."
# Returns:   0 on success (or when there is nothing to join),
#            1 if the two file lists differ in length
#######################################
build_krona_inputs() {
  local -a blastx_array coverage_array
  local index sample

  # Array of all sorted blastx output files
  blastx_array=(MG*.blastx.sorted.outfmt6)
  # An unmatched glob is left as the literal pattern; treat that as "no files"
  [[ -e "${blastx_array[0]}" ]] || blastx_array=()
  printf -- "BLASTx array:\n"
  echo "${blastx_array[@]}"
  print_rule
  echo ""

  # Array of all sorted coverage files
  coverage_array=(MG*.ID.coverage.sorted.txt)
  [[ -e "${coverage_array[0]}" ]] || coverage_array=()
  printf -- "Coverage array:\n"
  echo "${coverage_array[@]}"
  print_rule
  echo ""

  # Files are paired purely by position, so a missing or extra file on either
  # side would silently join the wrong samples. Refuse to continue instead.
  if (( ${#blastx_array[@]} != ${#coverage_array[@]} )); then
    printf 'ERROR: found %d BLASTx file(s) but %d coverage file(s); refusing to pair them by position.\n' \
      "${#blastx_array[@]}" "${#coverage_array[@]}" >&2
    return 1
  fi

  # Join with tab-delimiter on the first column of both files.
  # Output column 1 from the first file, column 13 from 2nd file,
  # column 2 from first file.
  for index in "${!blastx_array[@]}"; do
    sample="${blastx_array[index]%%.*}"

    join -t $'\t' \
      -o 1.1,2.13,1.2 \
      "${coverage_array[index]}" "${blastx_array[index]}" \
      > "${sample}.krona-coverage.tsv"

    # Report which pair was just joined. File names are passed as arguments,
    # not embedded in the format string, so a "%" in a name is printed as-is.
    printf 'Joining %s and %s\n\n' "${coverage_array[index]}" "${blastx_array[index]}"
  done
}

build_krona_inputs || exit 1

echo ""
print_rule
ls -ltrh
print_rule
echo ""
# Check output format
head MG1_pH82.krona-coverage.tsv | column -t -s $'\t'
%%bash
#######################################
# Remove all files except .tsv, recursively, from the current directory -
# but only if the current directory really is the expected analysis
# directory. The deletion is recursive and irreversible, so running it from
# any other location (e.g. because an earlier `cd` did not take effect) must
# not delete anything.
# Globals:   none
# Arguments: $1 - directory the caller expects to be standing in
# Outputs:   error message on stderr when the deletion is refused
# Returns:   0 if the cleanup ran, 1 if it was refused
#######################################
remove_non_tsv_files() {
  local expected_dir="${1:-}"
  local expected_real current_real

  if [[ -z "${expected_dir}" || ! -d "${expected_dir}" ]]; then
    printf 'ERROR: expected directory "%s" is unset or missing; nothing deleted.\n' \
      "${expected_dir}" >&2
    return 1
  fi

  # Compare physical paths so symlinks or trailing slashes do not matter
  expected_real="$(cd -- "${expected_dir}" > /dev/null && pwd -P)" || return 1
  current_real="$(pwd -P)"

  if [[ "${current_real}" != "${expected_real}" ]]; then
    printf 'ERROR: current directory %s is not %s; nothing deleted.\n' \
      "${current_real}" "${expected_real}" >&2
    return 1
  fi

  find . ! -name "*.tsv" -type f -exec rm -f -- {} +
}

# work_dir is the environment variable exported earlier with %env
remove_non_tsv_files "${work_dir:-}"
ls -ltrh
%%bash
# Per-sample tables handed to the Krona taxonomy importer, in this order
krona_inputs=(
  MG1_pH82.krona-coverage.tsv
  MG2_pH82.krona-coverage.tsv
  MG3_pH71.krona-coverage.tsv
  MG5_pH82.krona-coverage.tsv
  MG6_pH71.krona-coverage.tsv
  MG7_pH71.krona-coverage.tsv
)

# ${krona} is the ktImportTaxonomy path exported earlier with %env
"${krona}" "${krona_inputs[@]}"

# Dashed rule to separate the tool's output from the directory listing
printf -- '-%.0s' {1..100}
printf '\n'

ls -ltrh