#queryID #taxID #score
query1 9606 0.9
query2 9534 0.8
%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Wed Jun 12 10:48:41 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2926.094 BogoMIPS: 5851.96 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 4.5G 61G 297M 5.0G 65G Swap: 4.7G 0B 4.7G
No LSB modules are available.
# Set working directory - %env is useful for bash
%env work_dir = /home/sam/analyses/20190612_metagenomics_ind_coverage_krona
work_dir = "/home/sam/analyses/20190612_metagenomics_ind_coverage_krona"
env: work_dir=/home/sam/analyses/20190612_metagenomics_ind_coverage_krona
# Set Krona Tools Taxonomy path - %env is useful for bash
%env krona = /home/sam/programs/KronaTools-2.7/bin/ktImportTaxonomy
env: krona=/home/sam/programs/KronaTools-2.7/bin/ktImportTaxonomy
%%bash
mkdir --parents "${work_dir}"
cd $work_dir
/home/sam/analyses/20190612_metagenomics_ind_coverage_krona
%%bash
# Download BLAST output files
wget \
--no-directories \
--recursive \
--no-parent \
--quiet \
--accept outfmt6 \
http://gannet.fish.washington.edu/Atumefaciens/20190516_metagenomics_pgen_blastx/
# Download coverage files
wget \
--no-directories \
--recursive \
--no-parent \
--quiet \
--accept coverage.txt \
https://gannet.fish.washington.edu/Atumefaciens/20190327_metagenomics_pgen_megahit/
ls -ltrh
total 505M -rw-rw-r-- 1 sam sam 65M Mar 28 11:36 MG1.coverage.txt -rw-rw-r-- 1 sam sam 64M Mar 28 15:32 MG2.coverage.txt -rw-rw-r-- 1 sam sam 73M Mar 28 19:55 MG3.coverage.txt -rw-rw-r-- 1 sam sam 43M Mar 28 22:53 MG5.coverage.txt -rw-rw-r-- 1 sam sam 49M Mar 29 02:02 MG6.coverage.txt -rw-rw-r-- 1 sam sam 61M Mar 29 05:43 MG7.coverage.txt -rw-rw-r-- 1 sam sam 30M May 17 15:41 MG1_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 28M May 18 14:21 MG2_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 16M May 19 21:26 MG3_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 22M May 20 14:07 MG5_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 26M May 21 09:58 MG6_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 33M May 22 09:36 MG7_pH71.blastx.outfmt6
Column one contains the "Query ID" and column 13 contains the NCBI Taxonomy ID
%%bash
head MG1_pH82.blastx.outfmt6
k141_3 P20315.1 39.286 112 64 1 324 1 211 322 2.07e-15 73.6 10759 Enterobacteria phage T3 k141_7 P35675.1 49.587 121 61 0 6 368 20 140 6.22e-38 128 552 Erwinia amylovora k141_9 P37247.1 45.312 128 70 0 384 1 175 302 4.74e-30 112 817 Bacteroides fragilis k141_10 O86428.2 57.724 123 52 0 370 2 44 166 1.18e-49 162 208964 Pseudomonas aeruginosa PAO1 k141_16 P23883.2 52.174 184 81 3 4 543 124 304 1.48e-59 196 83333 Escherichia coli K-12 k141_18 P77810.1 54.783 115 52 0 202 546 16 130 8.58e-37 137 192 Azospirillum brasilense k141_23 Q6L5C4.1 33.218 289 107 9 2153 1296 236 441 3.69e-28 122 39947 Oryza sativa Japonica Group k141_25 P24918.2 74.869 191 46 1 2 568 81 271 5.69e-102 312 367110 Neurospora crassa OR74A k141_27 P23883.2 51.948 154 74 0 464 3 107 260 3.31e-53 178 83333 Escherichia coli K-12 k141_30 Q2LVI3.1 35.811 148 95 0 452 9 102 249 8.51e-31 117 56780 Syntrophus aciditrophicus SB
%%bash
head MG1.coverage.txt | column -t -s $'\t'
#ID Avg_fold Length Ref_GC Covered_percent Covered_bases Plus_reads Minus_reads Read_GC Median_fold Std_Dev k141_2 flag=1 multi=3.0000 len=304 3.8257 304 0.0000 100.0000 304 6 3 0.4084 4 1.03 k141_3 flag=1 multi=3.0000 len=325 3.9569 325 0.0000 100.0000 325 5 5 0.5342 5 1.70 k141_4 flag=1 multi=3.0000 len=334 3.5749 334 0.0000 100.0000 334 4 6 0.3836 3 1.34 k141_5 flag=1 multi=11.0000 len=315 8.6984 315 0.0000 100.0000 315 12 11 0.3119 9 3.08 k141_6 flag=1 multi=3.0000 len=367 4.2480 367 0.0000 100.0000 367 7 9 0.3532 3 3.26 k141_7 flag=1 multi=7.0000 len=370 8.1216 370 0.0000 100.0000 370 13 14 0.3438 7 3.90 k141_8 flag=3 multi=2.0286 len=211 1.6303 211 0.0000 82.9384 175 2 2 0.5814 2 0.95 k141_9 flag=1 multi=3.0000 len=386 3.9819 386 0.0000 100.0000 386 6 6 0.3591 4 1.60 k141_10 flag=1 multi=1.0000 len=370 19.6270 370 0.0000 100.0000 370 51 31 0.4142 21 9.22
%%bash
for file in *.coverage.txt
do
# Parses sample name
sample=$(echo ${file} | awk -F'.' '{print $1}')
# Skips header line and prints ID and coverage
# Default awk delimiter is spaces, so
# Column 1 is Query ID and column five is Avg_fold coverage
# Sort by first column only
awk 'NR>1 {print $1 "\t" $5}' "${file}" \
| sort -k1,1 \
> "${sample}".ID.coverage.sorted.txt
done
ls -ltrh
echo ""
# Check output format
head MG1.ID.coverage.sorted.txt
total 578M -rw-rw-r-- 1 sam sam 65M Mar 28 11:36 MG1.coverage.txt -rw-rw-r-- 1 sam sam 64M Mar 28 15:32 MG2.coverage.txt -rw-rw-r-- 1 sam sam 73M Mar 28 19:55 MG3.coverage.txt -rw-rw-r-- 1 sam sam 43M Mar 28 22:53 MG5.coverage.txt -rw-rw-r-- 1 sam sam 49M Mar 29 02:02 MG6.coverage.txt -rw-rw-r-- 1 sam sam 61M Mar 29 05:43 MG7.coverage.txt -rw-rw-r-- 1 sam sam 30M May 17 15:41 MG1_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 28M May 18 14:21 MG2_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 16M May 19 21:26 MG3_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 22M May 20 14:07 MG5_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 26M May 21 09:58 MG6_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 33M May 22 09:36 MG7_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG1.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG2.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 16M Jun 12 10:48 MG3.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 8.9M Jun 12 10:48 MG5.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 11M Jun 12 10:49 MG6.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 13M Jun 12 10:49 MG7.ID.coverage.sorted.txt k141_10 19.6270 k141_100 6.4182 k141_1000 191.3530 k141_10000 6.3785 k141_100000 11.8115 k141_100001 19.4566 k141_100002 5.6765 k141_100003 8.6479 k141_100004 2.4945 k141_100005 53.2919
%%bash
for file in *outfmt6
do
# Parse sample name
sample=$(echo ${file} | awk -F'.' '{print $1}')
sort -k1,1 "${file}" \
> "${sample}".blastx.sorted.outfmt6
done
ls -ltrh
echo ""
head MG1_pH82.blastx.sorted.outfmt6
total 731M -rw-rw-r-- 1 sam sam 65M Mar 28 11:36 MG1.coverage.txt -rw-rw-r-- 1 sam sam 64M Mar 28 15:32 MG2.coverage.txt -rw-rw-r-- 1 sam sam 73M Mar 28 19:55 MG3.coverage.txt -rw-rw-r-- 1 sam sam 43M Mar 28 22:53 MG5.coverage.txt -rw-rw-r-- 1 sam sam 49M Mar 29 02:02 MG6.coverage.txt -rw-rw-r-- 1 sam sam 61M Mar 29 05:43 MG7.coverage.txt -rw-rw-r-- 1 sam sam 30M May 17 15:41 MG1_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 28M May 18 14:21 MG2_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 16M May 19 21:26 MG3_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 22M May 20 14:07 MG5_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 26M May 21 09:58 MG6_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 33M May 22 09:36 MG7_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG1.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG2.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 16M Jun 12 10:48 MG3.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 8.9M Jun 12 10:48 MG5.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 11M Jun 12 10:49 MG6.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 13M Jun 12 10:49 MG7.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 30M Jun 12 10:49 MG1_pH82.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 28M Jun 12 10:49 MG2_pH82.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 16M Jun 12 10:49 MG3_pH71.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 22M Jun 12 10:49 MG5_pH82.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 26M Jun 12 10:49 MG6_pH71.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 33M Jun 12 10:49 MG7_pH71.blastx.sorted.outfmt6 k141_10 O86428.2 57.724 123 52 0 370 2 44 166 1.18e-49 162 208964 Pseudomonas aeruginosa PAO1 k141_100001 Q98G09.1 59.223 103 42 0 3 311 75 177 1.25e-46 156 266835 Mesorhizobium japonicum MAFF 303099 k141_100006 Q0TUZ2.1 45.946 111 60 0 398 66 271 381 9.09e-26 104 195103 Clostridium perfringens ATCC 13124 k141_100010 P0A0Z5.1 29.801 151 100 3 460 11 300 445 6.75e-14 71.2 122587 Neisseria meningitidis Z2491 k141_100015 O67178.1 26.904 394 272 6 148 1290 331 721 6.37e-39 154 224324 Aquifex aeolicus VF5 k141_100016 G3XD46.1 50.588 85 41 1 529 278 474 558 5.06e-20 89.7 208964 Pseudomonas aeruginosa PAO1 k141_100017 A1TYU8.1 49.042 261 122 2 759 1 1 258 1.21e-82 261 351348 Marinobacter hydrocarbonoclasticus VT8 k141_100020 P0AE52.1 34.586 133 79 2 761 384 4 135 1.90e-14 72.4 83333 Escherichia coli K-12 k141_100022 O85133.1 56.000 100 44 0 302 3 144 243 8.22e-34 120 1063 Rhodobacter sphaeroides k141_100023 Q9M2U3.1 29.259 270 165 7 3744 4517 72 327 5.11e-21 101 3702 Arabidopsis thaliana
%%bash
# Array of all sorted blastx output files
blastx_array=(MG*.blastx.sorted.outfmt6)
printf -- "BLASTx array:\n"
echo "${blastx_array[@]}"
# Insert some dashes to improve viewing of output in cell below
printf '%.0s-' {1..100}
printf -- "\n"
echo ""
# Array of all sorted coverage files
coverage_array=(MG*.ID.coverage.sorted.txt)
printf -- "Coverage array:\n"
echo "${coverage_array[@]}"
# Insert some dashes to improve viewing of output in cell below
printf '%.0s-' {1..100}
printf -- "\n"
echo ""
# Join with tab-delimiter
# Output column 1 from the first file, column 13 from 2nd file, column 2 from first file
for index in "${!blastx_array[@]}"
do
sample=$(echo "${blastx_array[index]}" | awk -F'.' '{print $1}')
join -t $'\t' \
-o 1.1,2.13,1.2 \
"${coverage_array[index]}" "${blastx_array[index]}" \
> "${sample}".krona-coverage.tsv
# Insert some dashes to improve viewing of output in cell below
printf -- "Joining ${coverage_array[index]} and ${blastx_array[index]}\n\n"
done
echo ""
# Insert some dashes to improve viewing of output in cell below
printf '%.0s-' {1..100}
printf -- "\n"
ls -ltrh
# Insert some dashes to improve viewing of output in cell below
printf '%.0s-' {1..100}
printf -- "\n"
echo ""
head MG1_pH82.krona-coverage.tsv | column -t -s $'\t'
BLASTx array: MG1_pH82.blastx.sorted.outfmt6 MG2_pH82.blastx.sorted.outfmt6 MG3_pH71.blastx.sorted.outfmt6 MG5_pH82.blastx.sorted.outfmt6 MG6_pH71.blastx.sorted.outfmt6 MG7_pH71.blastx.sorted.outfmt6 ---------------------------------------------------------------------------------------------------- Coverage array: MG1.ID.coverage.sorted.txt MG2.ID.coverage.sorted.txt MG3.ID.coverage.sorted.txt MG5.ID.coverage.sorted.txt MG6.ID.coverage.sorted.txt MG7.ID.coverage.sorted.txt ---------------------------------------------------------------------------------------------------- Joining MG1.ID.coverage.sorted.txt and MG1_pH82.blastx.sorted.outfmt6 Joining MG2.ID.coverage.sorted.txt and MG2_pH82.blastx.sorted.outfmt6 Joining MG3.ID.coverage.sorted.txt and MG3_pH71.blastx.sorted.outfmt6 Joining MG5.ID.coverage.sorted.txt and MG5_pH82.blastx.sorted.outfmt6 Joining MG6.ID.coverage.sorted.txt and MG6_pH71.blastx.sorted.outfmt6 Joining MG7.ID.coverage.sorted.txt and MG7_pH71.blastx.sorted.outfmt6 ---------------------------------------------------------------------------------------------------- total 769M -rw-rw-r-- 1 sam sam 65M Mar 28 11:36 MG1.coverage.txt -rw-rw-r-- 1 sam sam 64M Mar 28 15:32 MG2.coverage.txt -rw-rw-r-- 1 sam sam 73M Mar 28 19:55 MG3.coverage.txt -rw-rw-r-- 1 sam sam 43M Mar 28 22:53 MG5.coverage.txt -rw-rw-r-- 1 sam sam 49M Mar 29 02:02 MG6.coverage.txt -rw-rw-r-- 1 sam sam 61M Mar 29 05:43 MG7.coverage.txt -rw-rw-r-- 1 sam sam 30M May 17 15:41 MG1_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 28M May 18 14:21 MG2_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 16M May 19 21:26 MG3_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 22M May 20 14:07 MG5_pH82.blastx.outfmt6 -rw-rw-r-- 1 sam sam 26M May 21 09:58 MG6_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 33M May 22 09:36 MG7_pH71.blastx.outfmt6 -rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG1.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 14M Jun 12 10:48 MG2.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 16M Jun 12 10:48 MG3.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 8.9M Jun 12 10:48 MG5.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 11M Jun 12 10:49 MG6.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 13M Jun 12 10:49 MG7.ID.coverage.sorted.txt -rw-rw-r-- 1 sam sam 30M Jun 12 10:49 MG1_pH82.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 28M Jun 12 10:49 MG2_pH82.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 16M Jun 12 10:49 MG3_pH71.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 22M Jun 12 10:49 MG5_pH82.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 26M Jun 12 10:49 MG6_pH71.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 33M Jun 12 10:49 MG7_pH71.blastx.sorted.outfmt6 -rw-rw-r-- 1 sam sam 7.6M Jun 12 10:49 MG1_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 7.1M Jun 12 10:49 MG2_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 4.1M Jun 12 10:49 MG3_pH71.krona-coverage.tsv -rw-rw-r-- 1 sam sam 5.4M Jun 12 10:49 MG5_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 6.6M Jun 12 10:49 MG6_pH71.krona-coverage.tsv -rw-rw-r-- 1 sam sam 8.3M Jun 12 10:49 MG7_pH71.krona-coverage.tsv ---------------------------------------------------------------------------------------------------- k141_10 208964 19.6270 k141_100001 266835 19.4566 k141_100006 195103 5.5274 k141_100010 122587 11.3650 k141_100015 224324 20.2997 k141_100016 208964 4.1049 k141_100017 351348 157.6364 k141_100020 83333 7.2350 k141_100022 1063 2.7500 k141_100023 3702 33.4936
%%bash
# Remove all files except .tsv
find . ! -name "*.tsv" -type f -exec rm -f {} +
ls -ltrh
total 39M -rw-rw-r-- 1 sam sam 7.6M Jun 12 10:49 MG1_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 7.1M Jun 12 10:49 MG2_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 4.1M Jun 12 10:49 MG3_pH71.krona-coverage.tsv -rw-rw-r-- 1 sam sam 5.4M Jun 12 10:49 MG5_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 6.6M Jun 12 10:49 MG6_pH71.krona-coverage.tsv -rw-rw-r-- 1 sam sam 8.3M Jun 12 10:49 MG7_pH71.krona-coverage.tsv
%%bash
"${krona}" \
MG1_pH82.krona-coverage.tsv \
MG2_pH82.krona-coverage.tsv \
MG3_pH71.krona-coverage.tsv \
MG5_pH82.krona-coverage.tsv \
MG6_pH71.krona-coverage.tsv \
MG7_pH71.krona-coverage.tsv
# Insert some dashes to improve viewing of output in cell below
printf '%.0s-' {1..100}
printf -- "\n"
ls -ltrh
Loading taxonomy... Importing MG1_pH82.krona-coverage.tsv... Importing MG2_pH82.krona-coverage.tsv... Importing MG3_pH71.krona-coverage.tsv... Importing MG5_pH82.krona-coverage.tsv... Importing MG6_pH71.krona-coverage.tsv... Importing MG7_pH71.krona-coverage.tsv... Writing taxonomy.krona.html... ---------------------------------------------------------------------------------------------------- total 43M -rw-rw-r-- 1 sam sam 7.6M Jun 12 10:49 MG1_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 7.1M Jun 12 10:49 MG2_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 4.1M Jun 12 10:49 MG3_pH71.krona-coverage.tsv -rw-rw-r-- 1 sam sam 5.4M Jun 12 10:49 MG5_pH82.krona-coverage.tsv -rw-rw-r-- 1 sam sam 6.6M Jun 12 10:49 MG6_pH71.krona-coverage.tsv -rw-rw-r-- 1 sam sam 8.3M Jun 12 10:49 MG7_pH71.krona-coverage.tsv drwxrwxr-x 2 sam sam 556K Jun 12 10:50 taxonomy.krona.html.files -rw-rw-r-- 1 sam sam 3.2M Jun 12 10:50 taxonomy.krona.html
[ WARNING ] Too many query IDs to store in chart; storing supplemental files in 'taxonomy.krona.html.files'.