#!/usr/bin/env python # coding: utf-8 # # Set up a WQ-MAKER run using the Jupyter Notebook # ## Step 1. Get oriented. # You will find staged example data in "/opt/WQ-MAKER_example_data/" within the MASTER instance. List its contents with the `ls` command: # In[1]: get_ipython().system('ls -alh /opt/WQ-MAKER_example_data/') # In[2]: get_ipython().system('ls /opt/WQ-MAKER_example_data/test_data') # * maker_*.ctl file are a set of configuration files that can be used for this exercise or generated as described below. # * .ansible.cfg, worker-launch.yml and maker-hosts are ansible-playbook and host file for luanching jobs on WORKERS (optional for WQ-MAKER) # * fasta files include a scaled-down genome (test_genome.fasta) which is comprised of the first 300kb of 12 chromosomes of rice and scaled-down genome (test_genome_chr1.fasta) which is comprised of the first 300kb of first chromosome of rice # * mRNA sequences from NCBI (mRNA.fasta) # * publicly available annotated protein sequences of rice (MSU7.0 and IRGSP1.0) - msu-irgsp-proteins.fasta collection of plant repeats (plant_repeats.fasta) # * ribosomal RNAsequence of rice (Os-rRNA.fa) # * WQ-MAKER-Jupyter-notebooks for running WQ-MAKER in Jupyter-notebooks # Executables for running MAKER are located in /opt/maker/bin and /opt/maker/exe: # In[3]: get_ipython().system('ls /opt/maker/bin/') # As the names suggest the **/opt/maker/bin** directory includes many useful auxiliary scripts. For example cufflinks2gff3 will convert output from an RNA-seq analysis into a GFF3 file that can be used for input as evidence for WQ-MAKER. RepeatMasker, augustus, blast, exonerate, and snap are programs that MAKER uses in its pipeline. We recommend reading [MAKER Tutorial](http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/MAKER_Tutorial_for_GMOD_Online_Training_2014) at GMOD for more information about these. # ### Step 2. Set up a WQ-MAKER run. Create a working directory called "maker_run" on your home directory using the mkdir command and use cd to move into that directory: # Navigate to the mounted volume for creating test directory. **This command assumes that you have already created and attached the volume to your MASTER instance.** # In[4]: get_ipython().run_line_magic('cd', '/vol_b') # In[5]: get_ipython().system('mkdir wq_maker_run') get_ipython().run_line_magic('cd', 'wq_maker_run') # ### Step 3. Copy the contents of "WQ-MAKER_example_data" into the current directory using cp -r command. Verify using the ls command. Change the permissions on that directory # In[6]: get_ipython().system('sudo cp -r /opt/WQ-MAKER_example_data/test_data .') get_ipython().system('sudo chown -hR $USER test_data') get_ipython().system('sudo chgrp -hR $USER test_data') # Run the maker help function # In[7]: get_ipython().system('maker -h') # ### Step 5. Create control files that tell MAKER what to do. Three files are required: # - maker_opts.ctl - gives location of input files (genome and evidence) and sets options that affect MAKER behavior # - maker_exe.ctl - gives path information for the underlying executables. # - maker_bopt.ctl - sets parameters for filtering BLAST and Exonerate alignment results # In[8]: get_ipython().system('maker -CTL') get_ipython().system('ls') # - The "maker_exe.ctl" is automatically generated with the correct paths to executables and does not need to be modified. # - The "maker_bopt.ctl" is automatically generated with reasonable default parameters and also does not need to be modified unless you want to experiment with optimization of these parameters. # - The automatically generated "maker_opts.ctl" file **needs to be modified** in order to specify the genome file and evidence files to be used as input. You can use the text editor **"vi"** or **"nano"** that is already installed in the **MASTER** instance # Delete the current file and copy the staged version here and copy the pre-edited version of the "maker_opts.ctl" file that is staged in /opt/WQ-MAKER_example_data. # In[9]: get_ipython().system('rm maker_opts.ctl') get_ipython().system('cp /opt/WQ-MAKER_example_data/maker_opts.ctl .') # ### Step 6. Run WQ-MAKER on MASTER # In[10]: get_ipython().system('ls') # In[11]: import os os.system("wq_maker -contigs-per-split 1 -cores 1 -memory 2048 -disk 4096 -N wq_test_${USER} -d all -o master.dbg -debug_size_limit=0 > log_file 2>&1 &") # * -contigs-per-split 1: splits the genome file into 1 contig/scaffold/sequence per file. By specifiying this option, we are telling wq_maker to split the genome file into 1 sequence per file. By default, the wq_maker splits the fasta file into 10 sequences per file and this case, it is not ideal because, there will be 2 files (1 containing chromosomes from 1-10 and the other containing 11-12). This will decrease the speed at the wq_maker annotates the genome. # # **Unless otherwise you have a complete genome containing chromosomes or very few scaffolds, it is not recommended to use this option. For example if you have a genome that contains 10,000 sequences, then this option will create 10,000 files on your working directory which is not ideal of navigation purposes. Check to see how many contigs/scaffolds/chromosomes you have in your genome using # $$grep ">" -c $$ # and if the number is too high, then avoid this option** # # * -N maker_run_ud sets the project name to wq_test_{USER}. This is mandatory if we need to run WQ-MAKER. # * -d all Sets the debug flag for Work Queue. For all debugging output, try 'all' # * -o master.dbg Sets the debug file for Work Queue # * -debug_size_limit=0 Sets the byte wrap around on the debug file. 0 signifies it is never wrapped (Default it 1M) # * -stats test_out_stats.txt Specifies the file were Work Queue master stats are written # * log_file.txt captures the stdout # Wait for 1-2 minutes for the MASTER to advertise master status to the catalog server before your run WQ-MAKER on the WORKERS. # In[12]: get_ipython().system('tail log_file') # ### Step 7. Run WQ-MAKER on WORKERS # For running WQ-MAKER on WORKERS you need three files. # * Ansible config file # * Maker hostes file # * Ansible playbook # \1. Copy *ansible.cfg* file into your home directory which will help you to avoid host verification # In[13]: get_ipython().system('cp /opt/WQ-MAKER_example_data/.ansible.cfg ~') # In[14]: get_ipython().system('cat ~/.ansible.cfg') # \2. Copy `maker-hosts` file into your working directory and populate it with ip addresses of the workers # In[15]: get_ipython().system('cp /opt/WQ-MAKER_example_data/maker-hosts .') get_ipython().system('echo "129.114.17.181" >> maker-hosts # This ip address is specific to my account. This will not work for you') get_ipython().system('echo "149.165.169.203" >> maker-hosts # This ip address is specific to my account. This will not work for you') # In[16]: get_ipython().system('cat maker-hosts') # \3.Copy the Ansible playbook to your working directory # In[17]: get_ipython().system('cp /opt/WQ-MAKER_example_data/worker-launch.yml .') # In[18]: get_ipython().system('cat worker-launch.yml') # * -hosts is the name of the hosts (workers in this case. It can be anything) # tasks is the task that need to be performed by the Ansible (In this case run work_queue_worker) # name is just name of the task (It can be anything) # * -N maker_run_test sets the project name to maker_run_test. This is mandatory if we need to run WQ-MAKER # * -s /home/upendra/ Set the location for creating the working directory of the worker # * --debug-rotate-max=0 Set the maximum size of the debug log (default 10M, 0 disables) # * -d all Sets the debug flag for Work Queue. For all debugging output, try 'all' # * -o worker.dbg Sets the debug file for Work Queue # Run WQ-MAKER on the WORKERS now # In[19]: os.system("ansible-playbook -u ${USER} -i maker-hosts worker-launch.yml > log_file_2.txt 2>&1 &") # To check the status of the WQ-MAKER job, run the following. # In[20]: get_ipython().system('work_queue_status -M wq_test_${USER}') # ### Step 8. Stats output from MASTER instance # In[21]: get_ipython().system('tail log_file') # The following are the output files from WQ-MAKER # In[22]: get_ipython().system('ls test_genome.maker.output') # * The maker_opts.log, maker_exe.log, and maker_bopts.log files are logs of the control files used for this run of MAKER. # * The mpi_blastdb directory contains FASTA indexes and BLAST database files created from the input EST, protein, and repeat databases. # * test_genome_master_datastore_index.log contains information on both the run status of individual contigs and information on where individual contig data is stored. # * The test_genome_datastore directory contains a set of subfolders, each containing the final MAKER output for individual contigs from the genomic fasta file. # Check the test_genome_master_datastore_index.log and task_outputs.txt to see if there were any failures: # In[23]: get_ipython().system('cat test_genome.maker.output/test_genome_master_datastore_index.log') # All completed. Other possible status entries include: # # *FAILED* - indicates a failed run on this contig, MAKER will retry these # # *RETRY* - indicates that MAKER is retrying a contig that failed # # *SKIPPED_SMALL* - indicates the contig was too short to annotate (minimum contig length is specified in maker_opt.ctl) # # *DIED_SKIPPED_PERMANENT* - indicates a failed contig that MAKER will not attempt to retry (number of times to retry a contig is specified in maker_opt.ctl) # # The actual output data is stored in in nested set of directories under* test_genome_datastore* in a nested directory structure. # A typical set of outputs for chromosome 6 looks like this: # In[24]: get_ipython().system('ls test_genome.maker.output/test_genome_datastore/*/*/Chr6') # * The Chr6.gff file is in GFF3 format and contains the maker gene models and underlying evidence such as repeat regions, alignment data, and ab initio gene predictions, as well as fasta sequence. Having all of these data in one file is important to enable visualization of the called gene models and underlying evidence, especially using tools like Apollo which enable manual editing and curation of gene models. # * The fasta files Chr6.maker.proteins.fasta and Chr6.maker.transcripts.fasta contain the protein and transcript sequences for the final MAKER gene calls. # * The Chr6.maker.non_overlapping_ab_initio.proteins.fasta and Chr6.maker.non_overlapping_ab_initio.transcripts.fasta files are models that don't overlap MAKER genes that were rejected for lack of support. # * The Chr6.maker.snap_masked.proteins.fasta and Chr6.maker.snap_masked.transcript.fasta are the initial SNAP predicted models not further processed by MAKER # The output directory theVoid.Chr1 contains raw output data from all of the pipeline steps. One useful file found here is the repeat-masked version of the contig, query.masked.fasta. # ### Step 9: Merge the gff files # In[25]: get_ipython().run_line_magic('cd', 'test_genome.maker.output/') get_ipython().system('gff3_merge -n -d test_genome_master_datastore_index.log') # * -d The location of the MAKER datastore index log file. # * -n Do not print fasta sequence in footer # The final output from gff3_merge is **"test_genome.all.gff"** # In[26]: get_ipython().system('head test_genome.all.gff') # Extract only the maker annotations # In[27]: get_ipython().system('grep -P "\\tmaker\\t" test_genome.all.gff > test_genome.all.maker.gff') # In[28]: get_ipython().system('ls') # Check the number of genes # In[29]: get_ipython().system('grep -P "\\tgene\\t" test_genome.all.maker.gff | wc -l') # ### Step 10: Generate some stats from the WQ-MAKER run # In[1]: import pandas as pd # Change the format of the maker_wq.stats file # In[32]: get_ipython().system("sed 's/# //' ../maker_wq.stats > temp && mv temp ../maker_wq.stats") # In[62]: input_file1 = "../maker_wq.stats" df = pd.read_csv(input_file1, sep=" ") # In[64]: df.head() # #### Let's generate some plots # In[65]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt # #### Plot between time and workers connected # In[70]: export_fig_file = os.path.splitext(input_file1)[0] + ".png" button = widgets.Button(description='Export Figure') text_box = widgets.Text(value=export_fig_file, placeholder='Type the filename you would like the figure exported to, i.e., "example.png"', description='Export File:', disabled=False ) plt.plot(df[['timestamp']], df[['workers_connected']]) plt.xlabel('Time') plt.ylabel('workers_connected') current_fig = plt.gcf() button.on_click(lambda _: current_fig.savefig(export_fig_file)) display(button) display(text_box) # #### Plot between time and committed cores # In[71]: plt.plot(df[['timestamp']], df[['committed_cores']]) plt.xlabel('Time') plt.ylabel('committed_cores') current_fig = plt.gcf() button.on_click(lambda _: current_fig.savefig(export_fig_file)) display(button) display(text_box) # #### AED score plots # In[72]: get_ipython().system('grep "AED" test_genome.all.maker.gff | cut -f 9 | tr \';\' \'\\n\' | grep -e "ID" -e "_AED" | perl -0777 -pe \'s/\\n_AED=/\\t/gxms\' | sed \'s/ID=//g\' > test_genome.all.maker.AED.txt') # In[73]: input_file2 = "test_genome.all.maker.AED.txt" df2 = pd.read_csv(input_file2, sep="\t", header=None) # In[74]: df2.head() # In[75]: df2.columns df2 = df2.rename(columns={0: 'gene', 1: 'AED'}) # In[76]: df2.head() # In[77]: export_fig_file = os.path.splitext(input_file2)[0] + ".png" button = widgets.Button(description='Export Figure') text_box = widgets.Text(value=export_fig_file, placeholder='Type the filename you would like the figure exported to, i.e., "example.png"', description='Export File:', disabled=False ) def filter(aed=0.0): df_filtered = df2.query('AED<={}'.format(aed)) plt.plot(df_filtered[['AED']]) plt.xlabel('gene') plt.ylabel('AED') current_fig = plt.gcf() button.on_click(lambda _: current_fig.savefig(text_box.value)) # Min value = 0.0, Max value = 1.0, Slider increments by 0.1 interact(filter,i=FloatSlider(min=0, max=1.0, step=0.1, continuous_update=False)); display(text_box) display(button) # #### AED score plots after filtering # In[78]: get_ipython().system('quality_filter.pl -s test_genome.all.maker.gff > test_genome.all.maker.filtered.gff') # In[79]: get_ipython().system('grep "AED" test_genome.all.maker.filtered.gff | cut -f 9 | tr \';\' \'\\n\' | grep -e "ID" -e "_AED" | perl -0777 -pe \'s/\\n_AED=/\\t/gxms\' | sed \'s/ID=//g\' > test_genome.all.maker.filtered.AED.txt') # In[80]: input_file3 = "test_genome.all.maker.filtered.AED.txt" df3 = pd.read_csv(input_file3, sep="\t", header=None) # In[81]: df3.head() # In[82]: df3 = df3.rename(columns={0: 'gene', 1: 'AED'}) # In[83]: df3.head() # In[84]: export_fig_file = os.path.splitext(input_file3)[0] + ".png" button = widgets.Button(description='Export Figure') text_box = widgets.Text(value=export_fig_file, placeholder='Type the filename you would like the figure exported to, i.e., "example.png"', description='Export File:', disabled=False ) def filter(aed=0.0): df_filtered = df3.query('AED<={}'.format(aed)) plt.plot(df_filtered[['AED']]) plt.xlabel('gene') plt.ylabel('AED') current_fig = plt.gcf() button.on_click(lambda _: current_fig.savefig(text_box.value)) # Min value = 0.0, Max value = 1.0, Slider increments by 0.1 interact(filter,i=FloatSlider(min=0, max=1.0, step=0.1, continuous_update=False)); display(text_box) display(button)