#!/usr/bin/env python
# coding: utf-8

# ### Archiving of Panopea generosa (geoduck) genome data from BGI

# ##### Print system info

# In[1]:

# Record the software configuration of the machine the notebook ran on.
get_ipython().run_cell_magic(
    "bash",
    "",
    "system_profiler SPSoftwareDataType\n",
)

# ##### List of files from BGI

# In[3]:

# Show what is in the BGI delivery directory.
get_ipython().run_cell_magic(
    "bash",
    "",
    "ls /Volumes/owl_home/F15FTSUSAT0328_PANwalD/\n",
)

# ##### Generate checksums file to verify downloaded data remained intact.

# In[4]:

# Run md5 on every .gz file in the delivery directory and append each result
# to checksums.md5 in that same directory; `time` reports how long the loop took.
# The script is written as one literal per bash line (adjacent literals are
# concatenated by Python).
get_ipython().run_cell_magic(
    "bash",
    "",
    "\n"
    "#For loop generates a md5 checksum has value for each file\n"
    "#and appends the output to the checksums.md5 file.\n"
    "time for file in /Volumes/owl_home/F15FTSUSAT0328_PANwalD/*.gz\n"
    " do\n"
    ' md5 "$file" >> /Volumes/owl_home/F15FTSUSAT0328_PANwalD/checksums.md5\n'
    " done\n",
)

# ##### View checksums.md5 file

# In[5]:

# Print the freshly generated checksums.
get_ipython().run_cell_magic(
    "bash",
    "",
    "cat /Volumes/owl_home/F15FTSUSAT0328_PANwalD/checksums.md5\n",
)

# ##### View md5.txt file from BGI

# In[6]:

# Print the checksum list supplied with the data.
get_ipython().run_cell_magic(
    "bash",
    "",
    "cat /Volumes/owl_home/F15FTSUSAT0328_PANwalD/md5.txt\n",
)

# ##### Save hash values from checksums.md5 and md5.txt to variables. Compare hash values from each to ensure they match (i.e. downloaded data is intact).
# In[16]:

# Pull the hash column out of each checksum file and diff the two lists.
# checksums.md5 keeps the hash in field 4, md5.txt in field 1.
# The final `echo $?` prints diff's exit status (0 means no differences).
get_ipython().run_cell_magic(
    "bash",
    "",
    "\n"
    "#Use awk to print the 4th field (the checksum hash values) and saves the values to the new_checksums variable.\n"
    "new_checksums=$( awk '{print $4}' /Volumes/owl_home/F15FTSUSAT0328_PANwalD/checksums.md5 )\n"
    "\n"
    "#Use awk to print the 1st field (the checksum hash values) and saves the values to the original_checksums variable.\n"
    "original_checksums=$( awk '{print $1}' /Volumes/owl_home/F15FTSUSAT0328_PANwalD/md5.txt)\n"
    "\n"
    "#Print contents of new_checksums variable to verify the contents.\n"
    "echo $new_checksums\n"
    "\n"
    "#Print an empty line for improved readibility in the output.\n"
    'echo ""\n'
    "\n"
    "#Print contents of original_checksums variable to verify the contents.\n"
    "echo $original_checksums\n"
    "\n"
    "#Prints an empty line for improved readibility in the output.\n"
    'echo ""\n'
    "\n"
    "\n"
    "#Compare differences between the checksum hash values in each of the two variables.\n"
    '#Uses the bash concept of process substitution "<(" to accomplish task using variable contents instead of two files.\n'
    'diff <(echo "$new_checksums") <(echo "$original_checksums")\n'
    "\n"
    "#Check exit status of last command executed (i.e. if diff command found no differences, then the output should be 0)\n"
    "echo $?\n",
)

# ##### Calculate total number of reads generated by this project

# In[17]:

# Sum (line count / 4) over every .gz file and print the grand total.
get_ipython().run_cell_magic(
    "bash",
    "",
    "\n"
    "\n"
    "totalreads=0 #Initializes variable.\n"
    "\n"
    "#For loop counts the lines in each file and divides them by four. This is performed because\n"
    "#Illumina sequencing files are composed of four lines per read.\n"
    "#A running total of the total number of reads is generated [totalreads=$((readcount+totalreads))]\n"
    "#and is printed after the for loop completes.\n"
    "time for file in /Volumes/owl_home/F15FTSUSAT0328_PANwalD/*.gz\n"
    ' do linecount=`gunzip -c "$file" | wc -l`\n'
    " readcount=$((linecount/4))\n"
    " totalreads=$((readcount+totalreads))\n"
    "done\n"
    "echo $totalreads\n",
)

# ##### Calculate number of reads per file and, append filename and corresponding number of reads to readme file

# In[1]:

# Print "<file name>\t<read count>" for every .gz file and append the same
# text to the P_generosa readme.md via `tee -a`.
get_ipython().run_cell_magic(
    "bash",
    "",
    "\n"
    "#For loop counts the lines in each file and divides them by four. This is performed because\n"
    "#Illumina sequencing files are composed of four lines per read.\n"
    "#Format the output (printf) to print the filename, followed by a tab, followed by the readcount.\n"
    '#The command "tee -a" is used to both print the output to the screen and append the output to the readme.md file.\n'
    "time for file in /Volumes/owl_home/F15FTSUSAT0328_PANwalD/*.gz\n"
    ' do linecount=`gunzip -c "$file" | wc -l`\n'
    " readcount=$(($linecount/4))\n"
    ' printf "%s\\t%s\\n\\n" "${file##*/}" "$readcount" | tee -a /Volumes/owl_web/nightingales/P_generosa/readme.md\n'
    "done\n",
)

# ##### Move .gz files to web accessible location on Owl

# In[20]:

# `mv -n` does not overwrite a file that already exists at the destination.
get_ipython().run_cell_magic(
    "bash",
    "",
    "\n"
    "mv -n /Volumes/owl_home/F15FTSUSAT0328_PANwalD/*.gz /Volumes/owl_web/nightingales/P_generosa/\n",
)

# ##### List files in Owl/web/nightingales/P_generosa

# In[25]:

# The grep pattern is a regular expression, not a shell glob: 'FCH3*' means
# "FCH followed by zero or more 3s" and therefore matches any name containing
# "FCH". The literal 'FCH3' selects the names the original comment describes.
get_ipython().run_cell_magic(
    "bash",
    "",
    "\n"
    "#List the files from this project (grep FCH3), showing human-readable file sizes,\n"
    "#and get rid of file permission info in output (cut -d ' ' -f7-)\n"
    "ls -lh /Volumes/owl_web/nightingales/P_generosa/ | grep 'FCH3' | cut -d ' ' -f7-\n",
)

# ##### Append new checksums to existing checksums file in Owl/web/nightingales/P_generosa

# In[22]:

# This cell was a bare shell command, which is a syntax error in an exported
# .py file; hand it to the system shell through IPython instead.
get_ipython().system(
    "cat /Volumes/owl_home/F15FTSUSAT0328_PANwalD/checksums.md5"
    " >> /Volumes/owl_web/nightingales/P_generosa/checksums.md5"
)

#
# ##### Verify checksums were added to Owl/web/nightingales/P_generosa/checksums.md5

# In[23]:

# The new checksums are appended to the P_generosa checksums file, so that is
# the file to inspect (the original cell read the O_lurida file instead).
# The command is passed to the system shell through IPython because a bare
# shell line is not valid Python in an exported script. The grep pattern is a
# regular expression: 'FCH3*' would match any line containing "FCH", so the
# literal 'FCH3' is used.
get_ipython().system(
    "cat /Volumes/owl_web/nightingales/P_generosa/checksums.md5 | grep 'FCH3'"
)

# In[ ]: