#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Runs the shell `date` command; its output timestamps this notebook run.
get_ipython().system('date')


# In[2]:


# Runs `system_profiler SPSoftwareDataType` in a bash cell to report the
# software overview of the host (system_profiler is the macOS system-report
# utility, so this cell presumably only works on macOS).
get_ipython().run_cell_magic('bash', '', 'system_profiler SPSoftwareDataType\n')


# In[6]:


# Reports the hardware overview of the host while hiding identifying fields.
# The grep patterns are quoted fixed strings: the previous unquoted bracket
# expression ([SH][ea]) was subject to shell filename expansion and also
# dropped unrelated lines such as the "Hardware:" section headers.
get_ipython().run_cell_magic(
    'bash',
    '',
    '#Uses grep to exclude only the lines that display the serial number and hardware UUID.\n'
    '#-F treats each pattern as a fixed string; quoting prevents shell glob expansion.\n'
    'system_profiler SPHardwareDataType | grep -v -F -e "Serial Number" -e "Hardware UUID"\n',
)


# #### List files provided by BGI

# In[3]:


# Lists the contents of the 20160512 genome assembly directory on the
# mounted volume in a bash cell.
get_ipython().run_cell_magic('bash', '', 'ls /Volumes/web/O_lurida_genome_assemblies_BGI/20160512/\n')


# #### Create checksums file

# In[5]:


# Generates checksums.md5 for every .gz file in the delivery directory.
# The loop's output is redirected once with ">" (truncate) instead of
# appending per file with ">>", so re-running the cell rebuilds the file
# rather than accumulating duplicate checksum entries.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    '#For loop generates an md5 checksum hash value for each file.\n'
    '#The output of the entire loop is written to the checksums.md5 file in one\n'
    '#redirection, replacing any previous contents so the cell can be safely re-run.\n'
    '#The "time" report is printed by the shell to stderr and is not written to the file.\n'
    'time for file in /Volumes/web/O_lurida_genome_assemblies_BGI/20160512/*.gz\n'
    '    do\n'
    '        md5 "$file"\n'
    '    done > /Volumes/web/O_lurida_genome_assemblies_BGI/20160512/checksums.md5\n',
)


# #### Calculate total number of reads generated by this project.
# #### Calculate number of reads per file, append filename and corresponding number of reads to readme file.

# In[7]:


# Counts reads in each gzipped file (lines / 4), appends "filename<TAB>count"
# to readme.md, and prints the grand total.
# pipefail plus an explicit check ensures a gunzip failure stops the cell
# instead of silently recording a truncated count in the readme.
get_ipython().run_cell_magic(
    'bash',
    '',
    '\n'
    '#Make a pipeline fail when any stage fails, so that a gunzip error is detected\n'
    '#instead of silently producing a truncated line count.\n'
    'set -o pipefail\n'
    '\n'
    '#Initializes variable.\n'
    'totalreads=0\n'
    '\n'
    '#For loop counts the lines in each file and divides them by four. This is performed because\n'
    '#Illumina sequencing files are composed of four lines per read.\n'
    '#A running total of the total number of reads is generated [totalreads=$((readcount+totalreads))]\n'
    '#and is printed after the for loop completes.\n'
    '\n'
    '#Format the output (printf) to print the filename, followed by a tab, followed by the readcount.\n'
    '#The command "tee -a" is used to both print the output to the screen and append the output to the readme.md file.\n'
    'time for file in /Volumes/web/O_lurida_genome_assemblies_BGI/20160512/*.gz\n'
    '    do\n'
    '    #Stop if the file cannot be decompressed, rather than recording a wrong count.\n'
    '    if ! linecount=$(gunzip -c "$file" | wc -l)\n'
    '        then\n'
    '        echo "ERROR: failed to decompress $file" >&2\n'
    '        exit 1\n'
    '    fi\n'
    '    readcount=$((linecount/4))\n'
    '    totalreads=$((readcount+totalreads))\n'
    '    printf "%s\\t%s\\n" "${file##*/}" "$readcount" | tee -a /Volumes/web/O_lurida_genome_assemblies_BGI/20160512/readme.md\n'
    'done\n'
    'echo "$totalreads"\n',
)


# #### Count the number of sequences in the scafSeq (FASTA format) file

# In[8]:


# Counts the sequences in the scafSeq FASTA file by counting header lines.
# grep -c with an anchored pattern counts only lines that START with ">",
# and avoids piping every matching line through a separate wc process.
get_ipython().run_cell_magic(
    'bash',
    '',
    '#Each FASTA record begins with a header line whose first character is ">".\n'
    '#grep -c prints the number of matching lines.\n'
    'time grep -c "^>" /Volumes/web/O_lurida_genome_assemblies_BGI/20160512/Ostrea_lurida.scafSeq\n',
)


# In[ ]: