#!/usr/bin/env python
# coding: utf-8

# *This notebook contains material from [PyRosetta](https://RosettaCommons.github.io/PyRosetta.notebooks);
# content is available [on Github](https://github.com/RosettaCommons/PyRosetta.notebooks.git).*

# # Example of Using PyRosetta with GNU Parallel
#
# *Note:* In this tutorial, we will write a simple PyRosetta script to disk and demonstrate how to run it in parallel using GNU Parallel. This Jupyter notebook uses parallelization and is **not** meant to be executed within a Google Colab environment.
#
# **Please see setup instructions in Chapter 16.00**

# In[1]:


import logging
logging.basicConfig(level=logging.INFO)
import os
import sys

if 'google.colab' in sys.modules:
    print("This Jupyter notebook uses parallelization and is therefore not set up for the Google Colab environment.")
    sys.exit(0)


# In[2]:


get_ipython().run_cell_magic('file', 'outputs/mutate_residue_from_command_line.py', '''\
import argparse
import os
import pyrosetta
import uuid


__author__ = "My Name"
__email__ = "name@email.com"


def main(target=None, new_res=None):
    """Example function to run my custom PyRosetta script."""

    # Initialize PyRosetta with custom flags
    pyrosetta.init("-ignore_unrecognized_res 1 -renumber_pdb 1 -mute all")

    # Setup pose
    pose = pyrosetta.pose_from_file("inputs/5JG9.clean.pdb")

    # Setup directory structure: each run writes into its own unique directory
    main_dir = os.getcwd()
    unique_dir = os.path.join("outputs", "testing_" + uuid.uuid4().hex)
    if not os.path.isdir(unique_dir):
        os.mkdir(unique_dir)
    os.chdir(unique_dir)

    # Create scorefunction
    scorefxn = pyrosetta.create_score_function("ref2015_cart")

    # PyRosetta design protocol: keep only chain A
    keep_chA = pyrosetta.rosetta.protocols.grafting.simple_movers.KeepRegionMover(
        res_start=str(pose.chain_begin(1)), res_end=str(pose.chain_end(1))
    )
    keep_chA.apply(pose)

    # Mutate the target residue to the requested amino acid
    mutate = pyrosetta.rosetta.protocols.simple_moves.MutateResidue(
        target=target, new_res=new_res
    )
    mutate.apply(pose)

    # Minimize backbone and side-chain torsions in Cartesian space
    mm = pyrosetta.rosetta.core.kinematics.MoveMap()
    mm.set_bb(True)
    mm.set_chi(True)
    min_mover = pyrosetta.rosetta.protocols.minimization_packing.MinMover()
    min_mover.set_movemap(mm)
    min_mover.score_function(scorefxn)
    min_mover.min_type("lbfgs_armijo_nonmonotone")
    min_mover.cartesian(True)
    min_mover.tolerance(0.01)
    min_mover.max_iter(200)
    min_mover.apply(pose)

    total_score = scorefxn(pose)

    # Setup outputs
    pdb_output_filename = "_".join(["5JG9.clean", str(target), str(new_res)]) + ".pdb"

    pyrosetta.dump_pdb(pose, pdb_output_filename)

    # Append scores to scorefile
    pyrosetta.toolbox.py_jobdistributor.output_scorefile(
        pose=pose, pdb_name="5JG9.clean", current_name=pdb_output_filename,
        scorefilepath="score.fasc", scorefxn=scorefxn,
        nstruct=1, native_pose=None, additional_decoy_info=None, json_format=True
    )
    os.chdir(main_dir)


if __name__ == "__main__":

    # Declare parser object for managing input options
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--target", type=int,
                        help="Target residue to mutate as integer.")
    parser.add_argument("-r", "--new_res", type=str,
                        help="Three letter amino acid code to which to mutate target.")
    args = parser.parse_args()

    # Run protocol
    main(target=args.target, new_res=args.new_res)
''')
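# Before launching many copies of this script in parallel, it is worth checking that it executes cleanly on a single input. Below is a minimal sketch of such a smoke test (assuming PyRosetta is importable in the current environment and `inputs/5JG9.clean.pdb` exists; the parameters `-t 2 -r ALA` are arbitrary example values):

# In[ ]:


import subprocess

# Run the freshly written script once with one example mutation
# before generating and submitting the full set of commands
if not os.getenv("DEBUG"):
    subprocess.run(
        [sys.executable, "outputs/mutate_residue_from_command_line.py", "-t", "2", "-r", "ALA"],
        check=True,
    )


# Now we will run this script in parallel several different ways to demonstrate different types of job submission styles.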
# ### 1. Parallelize the script in an interactive session:
#
# On your laptop, you have access to as many cores as are on your machine. On a high-performance computing cluster with Slurm scheduling, you first have to request as many cores as you want to run on in an interactive login session:
#
# >qlogin -c 8 --mem=16g
#
# will reserve 8 CPU cores and 16 GB of RAM for you and start your session on a node that has available resources.
#
# Then, we need to write a run file to disk specifying our input parameters:

# In[5]:


with open("outputs/run_file_parallel_interactive.txt", "w") as f:
    for target in [2, 4, 6, 8, 10]:
        for new_res in ["ALA", "TRP"]:
            f.write("{0} outputs/mutate_residue_from_command_line.py -t {1} -r {2}\n".format(sys.executable, target, new_res))


# **Note**: it's always a good idea to run just one command first (as in the single test run above) to make sure there aren't any errors in your script!
#
# **Note**: if `sys.executable` does not point to the correct python executable, activate the correct conda environment in your interactive session first.
#
# Now submit `outputs/run_file_parallel_interactive.txt` to GNU Parallel from the command line in your interactive session:
#
# >cat outputs/run_file_parallel_interactive.txt | parallel -j 8 --no-notice &
#
# **Note**: The `parallel` executable is usually located at `/usr/bin/parallel`, but the full path may differ on your computer. For installation info, visit: https://www.gnu.org/software/parallel/

# ### 2. Parallelize the script on a high-performance computing cluster with Slurm scheduling (non-interactive submission):
#
# We use GNU Parallel again, but this time there is no need to pre-request server allocations. We can submit jobs to the Slurm scheduler directly from within this Jupyter notebook or from the command line!
#
# Useful background information:
#
#  - "Slurm is an open source, fault-tolerant, and highly scalable cluster management and job scheduling system for large and small Linux clusters. ... As a cluster workload manager, Slurm has three key functions. First, it allocates exclusive and/or non-exclusive access to resources (compute nodes) to users for some duration of time so they can perform work. Second, it provides a framework for starting, executing, and monitoring work (normally a parallel job) on the set of allocated nodes. Finally, it arbitrates contention for resources by managing a queue of pending work." Read further: https://slurm.schedmd.com/overview.html
#
#  - With the Slurm scheduler we will use the `sbatch` command, so it may be useful to review the available options: https://slurm.schedmd.com/sbatch.html
#
# First, write a Slurm submission script to disk specifying the job requirements. In this example, our conda environment is called `pyrosetta-bootcamp`:

# In[7]:


with open("outputs/sbatch_parallel.sh", "w") as f:
    f.write("#!/bin/bash \n")  # Bash script
    f.write("#SBATCH -p short \n")  # Specify "short" partition/queue
    f.write("#SBATCH -n 8 \n")  # Specify eight cores
    f.write("#SBATCH -N 1 \n")  # Specify one node
    f.write("#SBATCH --mem=16g \n")  # Specify 16 GB RAM over eight cores
    f.write("#SBATCH -o sbatch_parallel.log \n")  # Specify output log filename
    f.write("conda activate pyrosetta-bootcamp \n")  # Activate conda environment
    f.write("cat outputs/run_file_parallel_interactive.txt | /usr/bin/parallel -j 8 --no-notice \n")  # Submit jobs to GNU Parallel


# Then, submit `outputs/sbatch_parallel.sh` to the Slurm scheduler with the `sbatch` command:

# In[17]:


if not os.getenv("DEBUG"):
    get_ipython().system('sbatch outputs/sbatch_parallel.sh')


# We can then periodically check on the status of our jobs:

# In[22]:


get_ipython().system('squeue -u $USER')
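# While the job runs, the log file named in the `#SBATCH -o` line above collects both Slurm and GNU Parallel output. A minimal sketch for peeking at it (assuming the job has started and `sbatch_parallel.log` has been created in the current working directory):

# In[ ]:


# Show the last lines of the scheduler/GNU Parallel log, if it exists yet
if not os.getenv("DEBUG") and os.path.isfile("sbatch_parallel.log"):
    get_ipython().system('tail -n 20 sbatch_parallel.log')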
# ### 3. Submit jobs individually to the Slurm scheduler:
#
# This time, we submit each job individually using the `sbatch` command directly on our PyRosetta script.
#
# **Warning**: do not submit more than ~1000 jobs with this method, or you risk clogging the Slurm scheduler!
#
# First, copy your python executable and paste it on the first line of the PyRosetta script after `#!`, followed by `#SBATCH` directives for each job:

# In[8]:


sys.executable


# In[9]:


get_ipython().run_cell_magic('file', 'outputs/mutate_residue_from_sbatch.py', '''\
#!/home/klimaj/anaconda3/envs/pyrosetta-bootcamp/bin/python
#SBATCH -p short
#SBATCH -n 1
#SBATCH --mem=2g
#SBATCH -o sbatch.log

import argparse
import os
import pyrosetta
import uuid


__author__ = "My Name"
__email__ = "name@email.com"


def main(target=None, new_res=None):
    """Example function to run my custom PyRosetta script."""

    # Initialize PyRosetta with custom flags
    pyrosetta.init("-ignore_unrecognized_res 1 -renumber_pdb 1 -mute all")

    # Setup pose
    pose = pyrosetta.pose_from_file("inputs/5JG9.clean.pdb")

    # Setup directory structure: each run writes into its own unique directory
    main_dir = os.getcwd()
    unique_dir = os.path.join("outputs", "testing_" + uuid.uuid4().hex)
    if not os.path.isdir(unique_dir):
        os.mkdir(unique_dir)
    os.chdir(unique_dir)

    # Create scorefunction
    scorefxn = pyrosetta.create_score_function("ref2015_cart")

    # PyRosetta design protocol: keep only chain A
    keep_chA = pyrosetta.rosetta.protocols.grafting.simple_movers.KeepRegionMover(
        res_start=str(pose.chain_begin(1)), res_end=str(pose.chain_end(1))
    )
    keep_chA.apply(pose)

    # Mutate the target residue to the requested amino acid
    mutate = pyrosetta.rosetta.protocols.simple_moves.MutateResidue(
        target=target, new_res=new_res
    )
    mutate.apply(pose)

    # Minimize backbone and side-chain torsions in Cartesian space
    mm = pyrosetta.rosetta.core.kinematics.MoveMap()
    mm.set_bb(True)
    mm.set_chi(True)
    min_mover = pyrosetta.rosetta.protocols.minimization_packing.MinMover()
    min_mover.set_movemap(mm)
    min_mover.score_function(scorefxn)
    min_mover.min_type("lbfgs_armijo_nonmonotone")
    min_mover.cartesian(True)
    min_mover.tolerance(0.01)
    min_mover.max_iter(200)
    min_mover.apply(pose)

    total_score = scorefxn(pose)

    # Setup outputs
    pdb_output_filename = "_".join(["5JG9.clean", str(target), str(new_res)]) + ".pdb"

    pyrosetta.dump_pdb(pose, pdb_output_filename)

    # Append scores to scorefile
    pyrosetta.toolbox.py_jobdistributor.output_scorefile(
        pose=pose, pdb_name="5JG9.clean", current_name=pdb_output_filename,
        scorefilepath="score.fasc", scorefxn=scorefxn,
        nstruct=1, native_pose=None, additional_decoy_info=None, json_format=True
    )
    os.chdir(main_dir)


if __name__ == "__main__":

    # Declare parser object for managing input options
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--target", type=int,
                        help="Target residue to mutate as integer.")
    parser.add_argument("-r", "--new_res", type=str,
                        help="Three letter amino acid code to which to mutate target.")
    args = parser.parse_args()

    # Run protocol
    main(target=args.target, new_res=args.new_res)
''')


# Then, loop over your input parameters, submitting the PyRosetta script to the scheduler with the `sbatch` command:

# In[25]:


if not os.getenv("DEBUG"):
    for target in [2, 4, 6, 8, 10]:
        for new_res in ["ALA", "TRP"]:
            get_ipython().system('sbatch ./outputs/mutate_residue_from_sbatch.py -t $target -r $new_res')


# We can then periodically check on the status of our jobs:

# In[26]:


get_ipython().system('squeue -u $USER')
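# Each run writes its decoy and appends a score record to a `score.fasc` file inside its own `outputs/testing_*` directory. Below is a minimal sketch for gathering those results once the jobs finish (assuming the scorefiles were written as one JSON record per line, as `json_format=True` requests):

# In[ ]:


import glob
import json

# Collect every score record produced by the parallel runs into one list of dicts
records = []
for scorefile in glob.glob(os.path.join("outputs", "testing_*", "score.fasc")):
    with open(scorefile) as f:
        records.extend(json.loads(line) for line in f if line.strip())

print("Collected scores for {0} decoys".format(len(records)))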