#!/usr/bin/env python # coding: utf-8 # In[5]: get_ipython().run_cell_magic('bash', '', 'date\n') # In[6]: get_ipython().run_cell_magic('bash', '', 'hostname\n') # In[7]: get_ipython().run_cell_magic('bash', '', 'lscpu\n') # ### Format Ava's Master Sheet # ##### Look at beginning of file # In[8]: get_ipython().run_cell_magic('bash', '', 'head ava_master_ab_list.tsv\n') # #### Want to create new header line that does not contain ```ID #``` column. # Use awk to extract the first record of the file (i.e. the existing header row; ```NR==```) and exclude the second column. # # Also, tell awk that input file is tab-delimited (```-F "\t"```) and that output should also be tab-delimited (```BEGIN { OFS = "\t" }```). # In[9]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\n{print $1, $3, $4, $5, $6, $7}\' ava_master_ab_list.tsv\n') # Looks good, but how many columns are there? # # Use awk to print the number of fields (i.e. columns; ```{print NF}```) in the first record (```NR==1```) # In[10]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\n{print NF}\' ava_master_ab_list.tsv\n') # Use ```cut``` to print all column names, except column two. # In[11]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\nava_master_ab_list.tsv | \\\ncut -f1,3-\n') # In[12]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1\' \\\nava_master_ab_list.tsv | \\\ncut -f1,3-\n') # Looks good. # # Write the output to a file. # In[13]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1\' \\\nava_master_ab_list.tsv | \\\ncut -f1,3- > ava_master_ab_list_formatted.tsv\n') # In[14]: get_ipython().run_cell_magic('bash', '', 'head ava_master_ab_list_formatted.tsv\n') # In[15]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\n{print NF}\' ava_master_ab_list_formatted.tsv\n') # Use awk to combine the accession number column and the ID# column data. Also format the ID# data into a three digit number. # In[16]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2 >> \\\nava_master_ab_list_formatted.tsv\n') # In[17]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2\n') # In[18]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1\' \\\nava_master_ab_list.tsv | \\\ncut -f1,3- > ava_master_ab_list_formatted.tsv\n') # In[19]: get_ipython().run_cell_magic('bash', '', 'head ava_master_ab_list_formatted.tsv\n') # In[20]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2 | awk \'NF\'\n') # In[21]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2 | awk \'{print NF}\'\n') # In[22]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2 | \\\nhead -1 | \\\nawk -F "\\t" \'{print NF}\'\n') # Whoops! Finally noticed I left the ```head``` command in there! That's why the above steps were producing the wrong result. # In[23]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\ntail -n +2 >> \\\nava_master_ab_list_formatted.tsv\n') # In[24]: get_ipython().run_cell_magic('bash', '', 'head ava_master_ab_list_formatted.tsv\n') # --- # ### Format Extraction List from 20170502 # Perform same actions to this file as I did to Ava's Master List above. # In[25]: get_ipython().run_cell_magic('bash', '', 'head 20170502_Ava_Ab_List.tsv\n') # In[26]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\n{print NF}\' 20170502_Ava_Ab_List.tsv\n') # In[27]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1\' \\\nava_master_ab_list.tsv | \\\ncut -f1,3- > 20170502_Ava_Ab_List_formatted.tsv\n') # In[28]: get_ipython().run_cell_magic('bash', '', 'head 20170502_Ava_Ab_List_formatted.tsv\n') # In[29]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\n{print NF}\' 20170502_Ava_Ab_List_formatted.tsv\n') # In[30]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\n20170502_Ava_Ab_List.tsv | \\\nhead | \\\ntail -n +2\n') # In[31]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\n20170502_Ava_Ab_List.tsv | \\\nhead | \\\ntail -n +2 | \\\nawk -F "\\t" \'{print NF}\'\n') # In[32]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\n20170502_Ava_Ab_List.tsv | \\\ntail -n +2 >> \\\n20170502_Ava_Ab_List_formatted.tsv\n') # In[33]: get_ipython().run_cell_magic('bash', '', 'head 20170502_Ava_Ab_List_formatted.tsv\n') # --- # ### Format My Other Extraction File # In[34]: get_ipython().run_cell_magic('bash', '', 'head Ava_WS_Transmission_DNA_Extractions.tsv\n') # In[35]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'NR==1 {print NF}\' Ava_WS_Transmission_DNA_Extractions.tsv\n') # In[36]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'NR==1\' Ava_WS_Transmission_DNA_Extractions.tsv\n') # Header looks good; write the header to a new file. # In[37]: get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'NR==1\' Ava_WS_Transmission_DNA_Extractions.tsv > Ava_WS_Transmission_DNA_Extractions_formatted.tsv\n') # In[38]: get_ipython().run_cell_magic('bash', '', 'head Ava_WS_Transmission_DNA_Extractions_formatted.tsv\n') # In[39]: get_ipython().run_cell_magic('bash', '', "awk -F'[-:]' 'NR==12' Ava_WS_Transmission_DNA_Extractions.tsv\n") # Use regular expression in field separator to select hyphen, colon, or tab (```-F'[-:\t]'```) and count the number of fields. Original number fields, just using tab delimiter, was 18. # In[40]: get_ipython().run_cell_magic('bash', '', "awk -F'[-:\\t]' 'NR==12 {print NF}' Ava_WS_Transmission_DNA_Extractions.tsv\n") # Combine first three fields to read like this: nn:nn:nnn # # Print out each remaining field in a tab-delimited fashion, followed by a newline. # In[41]: get_ipython().run_cell_magic('bash', '', 'awk -F\'[-:\\t]\' \'{ printf "%s:%02d-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18, $19, $20}\' \\\nAva_WS_Transmission_DNA_Extractions.tsv | \\\nhead -12 | tail -n +2\n') # Append the result to the previously made file that contains the header. # In[42]: get_ipython().run_cell_magic('bash', '', 'awk -F\'[-:\\t]\' \'{ printf "%s:%02d-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18, $19, $20}\' \\\nAva_WS_Transmission_DNA_Extractions.tsv | \\\ntail -n +2 >> Ava_WS_Transmission_DNA_Extractions_formatted.tsv\n') # In[43]: get_ipython().run_cell_magic('bash', '', 'head Ava_WS_Transmission_DNA_Extractions_formatted.tsv\n') # --- # In[44]: get_ipython().run_cell_magic('bash', '', 'cp Ava_WS_Transmission_DNA_Extractions_formatted.tsv > Ava_WS_Transmission_DNA_Extractions_all.tsv\n') # In[45]: get_ipython().run_cell_magic('bash', '', 'cp Ava_WS_Transmission_DNA_Extractions_formatted.tsv Ava_WS_Transmission_DNA_Extractions_all.tsv\n') # In[46]: get_ipython().run_cell_magic('bash', '', 'head Ava_WS_Transmission_DNA_Extractions_all.tsv\n') # In[47]: get_ipython().run_cell_magic('bash', '', 'wc -l Ava_WS_Transmission_DNA_Extractions_all.tsv\nwc -l 20170502_Ava_Ab_List_formatted.tsv\n') # Append the contents, sans header (i.e. all rows after the first row; ```'NR>1'```), of the one file to the other. # In[48]: get_ipython().run_cell_magic('bash', '', "awk 'NR>1' 20170502_Ava_Ab_List_formatted.tsv >> Ava_WS_Transmission_DNA_Extractions_all.tsv\n") # Quick line count to verify that the two files have been combined into one. # In[49]: get_ipython().run_cell_magic('bash', '', 'wc -l Ava_WS_Transmission_DNA_Extractions_all.tsv\n') # --- # ### Join the master list with the extraction list to identify samples that still need to be processed. # In[50]: get_ipython().run_cell_magic('bash', '', 'join <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n') # In[51]: get_ipython().run_cell_magic('bash', '', 'sort ava_master_ab_list_formatted.tsv | head\n') # In[52]: get_ipython().run_cell_magic('bash', '', 'sort Ava_WS_Transmission_DNA_Extractions_all.tsv\n') # In[53]: get_ipython().run_cell_magic('bash', '', "join -t $'\\t' <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n") # In[54]: get_ipython().run_cell_magic('bash', '', "join -a1 -t $'\\t' <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n") # In[55]: get_ipython().run_cell_magic('bash', '', "join -a1 -a2 -t $'\\t' <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n") # In[56]: get_ipython().run_cell_magic('bash', '', "join -t $'\\t' -a1 -a2 -1 1 -2 1 <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n") # In[ ]: