#!/usr/bin/env python
# coding: utf-8
# In[5]:
get_ipython().run_cell_magic('bash', '', 'date\n')
# In[6]:
get_ipython().run_cell_magic('bash', '', 'hostname\n')
# In[7]:
get_ipython().run_cell_magic('bash', '', 'lscpu\n')
# ### Format Ava's Master Sheet
# ##### Look at beginning of file
# In[8]:
get_ipython().run_cell_magic('bash', '', 'head ava_master_ab_list.tsv\n')
# #### Want to create new header line that does not contain ```ID #``` column.
# Use awk to extract the first record of the file (i.e. the existing header row; ```NR==```) and exclude the second column.
#
# Also, tell awk that input file is tab-delimited (```-F "\t"```) and that output should also be tab-delimited (```BEGIN { OFS = "\t" }```).
# In[9]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\n{print $1, $3, $4, $5, $6, $7}\' ava_master_ab_list.tsv\n')
# Looks good, but how many columns are there?
#
# Use awk to print the number of fields (i.e. columns; ```{print NF}```) in the first record (```NR==1```)
# In[10]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\n{print NF}\' ava_master_ab_list.tsv\n')
# Use ```cut``` to print all column names, except column two.
# In[11]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\nava_master_ab_list.tsv | \\\ncut -f1,3-\n')
# In[12]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1\' \\\nava_master_ab_list.tsv | \\\ncut -f1,3-\n')
# Looks good.
#
# Write the output to a file.
# In[13]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1\' \\\nava_master_ab_list.tsv | \\\ncut -f1,3- > ava_master_ab_list_formatted.tsv\n')
# In[14]:
get_ipython().run_cell_magic('bash', '', 'head ava_master_ab_list_formatted.tsv\n')
# In[15]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\n{print NF}\' ava_master_ab_list_formatted.tsv\n')
# Use awk to combine the accession number column and the ID# column data. Also format the ID# data into a three digit number.
# In[16]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2 >> \\\nava_master_ab_list_formatted.tsv\n')
# In[17]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2\n')
# In[18]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1\' \\\nava_master_ab_list.tsv | \\\ncut -f1,3- > ava_master_ab_list_formatted.tsv\n')
# In[19]:
get_ipython().run_cell_magic('bash', '', 'head ava_master_ab_list_formatted.tsv\n')
# In[20]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2 | awk \'NF\'\n')
# In[21]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2 | awk \'{print NF}\'\n')
# In[22]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\nhead | \\\ntail -n +2 | \\\nhead -1 | \\\nawk -F "\\t" \'{print NF}\'\n')
# Whoops! Finally noticed I left the ```head``` command in there! That's why the above steps were producing the wrong result.
# In[23]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\nava_master_ab_list.tsv | \\\ntail -n +2 >> \\\nava_master_ab_list_formatted.tsv\n')
# In[24]:
get_ipython().run_cell_magic('bash', '', 'head ava_master_ab_list_formatted.tsv\n')
# ---
# ### Format Extraction List from 20170502
# Perform same actions to this file as I did to Ava's Master List above.
# In[25]:
get_ipython().run_cell_magic('bash', '', 'head 20170502_Ava_Ab_List.tsv\n')
# In[26]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1 \\\n{print NF}\' 20170502_Ava_Ab_List.tsv\n')
# In[27]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\nNR==1\' \\\nava_master_ab_list.tsv | \\\ncut -f1,3- > 20170502_Ava_Ab_List_formatted.tsv\n')
# In[28]:
get_ipython().run_cell_magic('bash', '', 'head 20170502_Ava_Ab_List_formatted.tsv\n')
# In[29]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'BEGIN { OFS = "\\t" } \\\n{print NF}\' 20170502_Ava_Ab_List_formatted.tsv\n')
# In[30]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\n20170502_Ava_Ab_List.tsv | \\\nhead | \\\ntail -n +2\n')
# In[31]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\n20170502_Ava_Ab_List.tsv | \\\nhead | \\\ntail -n +2 | \\\nawk -F "\\t" \'{print NF}\'\n')
# In[32]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'{ printf "%s-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18 ,$19 }\' \\\n20170502_Ava_Ab_List.tsv | \\\ntail -n +2 >> \\\n20170502_Ava_Ab_List_formatted.tsv\n')
# In[33]:
get_ipython().run_cell_magic('bash', '', 'head 20170502_Ava_Ab_List_formatted.tsv\n')
# ---
# ### Format My Other Extraction File
# In[34]:
get_ipython().run_cell_magic('bash', '', 'head Ava_WS_Transmission_DNA_Extractions.tsv\n')
# In[35]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'NR==1 {print NF}\' Ava_WS_Transmission_DNA_Extractions.tsv\n')
# In[36]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'NR==1\' Ava_WS_Transmission_DNA_Extractions.tsv\n')
# Header looks good; write the header to a new file.
# In[37]:
get_ipython().run_cell_magic('bash', '', 'awk -F "\\t" \'NR==1\' Ava_WS_Transmission_DNA_Extractions.tsv > Ava_WS_Transmission_DNA_Extractions_formatted.tsv\n')
# In[38]:
get_ipython().run_cell_magic('bash', '', 'head Ava_WS_Transmission_DNA_Extractions_formatted.tsv\n')
# In[39]:
get_ipython().run_cell_magic('bash', '', "awk -F'[-:]' 'NR==12' Ava_WS_Transmission_DNA_Extractions.tsv\n")
# Use regular expression in field separator to select hyphen, colon, or tab (```-F'[-:\t]'```) and count the number of fields. Original number fields, just using tab delimiter, was 18.
# In[40]:
get_ipython().run_cell_magic('bash', '', "awk -F'[-:\\t]' 'NR==12 {print NF}' Ava_WS_Transmission_DNA_Extractions.tsv\n")
# Combine first three fields to read like this: nn:nn:nnn
#
# Print out each remaining field in a tab-delimited fashion, followed by a newline.
# In[41]:
get_ipython().run_cell_magic('bash', '', 'awk -F\'[-:\\t]\' \'{ printf "%s:%02d-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18, $19, $20}\' \\\nAva_WS_Transmission_DNA_Extractions.tsv | \\\nhead -12 | tail -n +2\n')
# Append the result to the previously made file that contains the header.
# In[42]:
get_ipython().run_cell_magic('bash', '', 'awk -F\'[-:\\t]\' \'{ printf "%s:%02d-%03d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", \\\n$1 ,$2 ,$3 ,$4 ,$5 ,$6 ,$7 ,$8 ,$9 ,$10 ,$11 ,$12 ,$13 ,$14 ,$15 ,$16 ,$17 ,$18, $19, $20}\' \\\nAva_WS_Transmission_DNA_Extractions.tsv | \\\ntail -n +2 >> Ava_WS_Transmission_DNA_Extractions_formatted.tsv\n')
# In[43]:
get_ipython().run_cell_magic('bash', '', 'head Ava_WS_Transmission_DNA_Extractions_formatted.tsv\n')
# ---
# In[44]:
get_ipython().run_cell_magic('bash', '', 'cp Ava_WS_Transmission_DNA_Extractions_formatted.tsv > Ava_WS_Transmission_DNA_Extractions_all.tsv\n')
# In[45]:
get_ipython().run_cell_magic('bash', '', 'cp Ava_WS_Transmission_DNA_Extractions_formatted.tsv Ava_WS_Transmission_DNA_Extractions_all.tsv\n')
# In[46]:
get_ipython().run_cell_magic('bash', '', 'head Ava_WS_Transmission_DNA_Extractions_all.tsv\n')
# In[47]:
get_ipython().run_cell_magic('bash', '', 'wc -l Ava_WS_Transmission_DNA_Extractions_all.tsv\nwc -l 20170502_Ava_Ab_List_formatted.tsv\n')
# Append the contents, sans header (i.e. all rows after the first row; ```'NR>1'```), of the one file to the other.
# In[48]:
get_ipython().run_cell_magic('bash', '', "awk 'NR>1' 20170502_Ava_Ab_List_formatted.tsv >> Ava_WS_Transmission_DNA_Extractions_all.tsv\n")
# Quick line count to verify that the two files have been combined into one.
# In[49]:
get_ipython().run_cell_magic('bash', '', 'wc -l Ava_WS_Transmission_DNA_Extractions_all.tsv\n')
# ---
# ### Join the master list with the extraction list to identify samples that still need to be processed.
# In[50]:
get_ipython().run_cell_magic('bash', '', 'join <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n')
# In[51]:
get_ipython().run_cell_magic('bash', '', 'sort ava_master_ab_list_formatted.tsv | head\n')
# In[52]:
get_ipython().run_cell_magic('bash', '', 'sort Ava_WS_Transmission_DNA_Extractions_all.tsv\n')
# In[53]:
get_ipython().run_cell_magic('bash', '', "join -t $'\\t' <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n")
# In[54]:
get_ipython().run_cell_magic('bash', '', "join -a1 -t $'\\t' <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n")
# In[55]:
get_ipython().run_cell_magic('bash', '', "join -a1 -a2 -t $'\\t' <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n")
# In[56]:
get_ipython().run_cell_magic('bash', '', "join -t $'\\t' -a1 -a2 -1 1 -2 1 <(sort ava_master_ab_list_formatted.tsv) <(sort Ava_WS_Transmission_DNA_Extractions_all.tsv) | head\n")
# In[ ]: