#!/usr/bin/env python
# coding: utf-8
#
# *This notebook contains material from [PyRosetta](https://RosettaCommons.github.io/PyRosetta.notebooks);
# content is available [on Github](https://github.com/RosettaCommons/PyRosetta.notebooks.git).*
#
# < [Part I: Parallelized Global Ligand Docking with `pyrosetta.distributed`](http://nbviewer.jupyter.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/16.05-Ligand-Docking-dask.ipynb) | [Contents](toc.ipynb) | [Index](index.ipynb) | [PyRosettaCluster Tutorial 1B. Reproduce simple protocol](http://nbviewer.jupyter.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/16.07-PyRosettaCluster-Reproduce-simple-protocol.ipynb) >
# # PyRosettaCluster Tutorial 1A. Simple protocol
#
# PyRosettaCluster Tutorial 1A is a Jupyter Lab notebook that generates a decoy using `PyRosettaCluster`. It is the simplest use case, where one protocol takes one input `.pdb` file and returns one output `.pdb` file.
#
# All information needed to reproduce the simulation is included in the output `.pdb` file. After completing PyRosettaCluster Tutorial 1A, see PyRosettaCluster Tutorial 1B to learn how to reproduce simulations from PyRosettaCluster Tutorial 1A.
# *Warning*: This notebook uses `pyrosetta.distributed.viewer` code, which runs in `jupyter notebook` and might not run if you're using `jupyterlab`.
# *Note:* This Jupyter notebook uses parallelization and is **not** meant to be executed within a Google Colab environment.
# *Note:* This Jupyter notebook requires the PyRosetta distributed layer which is obtained by building PyRosetta with the `--serialization` flag or installing PyRosetta from the RosettaCommons conda channel
#
# **Please see Chapter 16.00 for setup instructions**
# *Note:* This Jupyter notebook is intended to be run within **Jupyter Lab**, but may still be run as a standalone Jupyter notebook.
# ### 1. Import packages
# In[7]:
import bz2
import glob
import logging
import os
import pyrosetta
import pyrosetta.distributed.io as io
import pyrosetta.distributed.viewer as viewer
from pyrosetta.distributed.cluster import PyRosettaCluster
# Surface INFO-level records (e.g. from pyrosetta.distributed) in the notebook output.
logging.basicConfig(level=logging.INFO)
# ### 2. Initialize a compute cluster using `dask`
#
# 1. Click the "Dask" tab in Jupyter Lab (arrow, left)
# 2. Click the "+ NEW" button to launch a new compute cluster (arrow, lower)
#
# ![title](Media/dask_labextension_1.png)
#
# 3. Once the cluster has started, click the brackets to "inject client code" for the cluster into your notebook
#
# ![title](Media/dask_labextension_2.png)
#
# Inject client code here, then run the cell:
# In[8]:
# Example of the client code injected by the dask-labextension. Delete this
# cell and inject your own client code with your scheduler's IP/port address.
if os.getenv("DEBUG"):
    # DEBUG is set (e.g. during automated testing): skip connecting to a scheduler.
    client = None
else:
    from dask.distributed import Client
    client = Client("tcp://127.0.0.1:40329")
client
# Providing a `client` allows you to monitor parallelization diagnostics from within this Jupyter Lab Notebook. However, providing a `client` is only optional for the `PyRosettaCluster` instance and `reproduce` function. If you do not provide a `client`, then `PyRosettaCluster` will instantiate a `LocalCluster` object using the `dask` module by default, or an `SGECluster` or `SLURMCluster` object using the `dask-jobqueue` module if you provide the `scheduler` argument parameter, e.g.:
# ***
# ```
# PyRosettaCluster(
# ...
# client=client, # Monitor diagnostics with existing client (see above)
# scheduler=None, # Bypasses making a LocalCluster because client is provided
# ...
# )
# ```
# ***
# ```
# PyRosettaCluster(
# ...
# client=None, # Existing client was not input (default)
#     scheduler=None, # Runs the simulations on a LocalCluster (default)
# ...
# )
# ```
# ***
# ```
# PyRosettaCluster(
# ...
# client=None, # Existing client was not input (default)
#     scheduler="sge", # Runs the simulations on the SGE job scheduler
# ...
# )
# ```
# ***
# ```
# PyRosettaCluster(
# ...
# client=None, # Existing client was not input (default)
#     scheduler="slurm", # Runs the simulations on the SLURM job scheduler
# ...
# )
# ```
# ### 3. Define or import the user-provided PyRosetta protocol(s):
#
# Remember, you *must* import `pyrosetta` locally within each user-provided PyRosetta protocol. Other libraries may not need to be locally imported because they are serializable by the `distributed` module. Although, it is a good practice to locally import all of your modules in each user-provided PyRosetta protocol.
# In[9]:
# Bring the user-provided PyRosetta protocol into scope so it can be handed
# to PyRosettaCluster.distribute() below; skipped when DEBUG is set.
if not os.environ.get("DEBUG"):
    from additional_scripts.my_protocols import my_protocol
# In[10]:
if not os.environ.get("DEBUG"):
    # Ship the protocol module to every dask worker node so that the remote
    # tasks can import it.
    client.upload_file("additional_scripts/my_protocols.py")
# #### Let's look at the definition of the user-provided PyRosetta protocol `my_protocol` located in `additional_scripts/my_protocols.py`:
# ```
# def my_protocol(input_packed_pose=None, **kwargs):
# """
# Relax the input `PackedPose` object.
#
# Args:
# input_packed_pose: A `PackedPose` object to be repacked. Optional.
# **kwargs: PyRosettaCluster task keyword arguments.
#
# Returns:
# A `PackedPose` object.
# """
# import pyrosetta # Local import
# import pyrosetta.distributed.io as io # Local import
# import pyrosetta.distributed.tasks.rosetta_scripts as rosetta_scripts # Local import
#
# packed_pose = io.pose_from_file(kwargs["s"])
#
# xml = """
#
#
#
#
#
#
#
#
#
#
#
# """
#
# return rosetta_scripts.SingleoutputRosettaScriptsTask(xml)(packed_pose)
# ```
# ### 4. Define the user-provided keyword argument(s) (i.e. `kwargs`):
# Upon PyRosetta initialization on the remote worker, the "`options`" and "`extra_options`" `kwargs` get concatenated before initialization. However, specifying the "`extra_options`" `kwargs` will override the default `-out:levels all:warning` command line flags, and specifying the "`options`" `kwargs` will override the default `-ex1 -ex2aro` command line flags.
# In[11]:
def create_kwargs():
    """Yield the keyword arguments for each PyRosettaCluster task.

    Yields a single task dictionary containing the PyRosetta initialization
    options for the remote workers and, under the "s" key, the path to the
    input PDB file from which the workers build the starting pose.
    """
    task_kwargs = {
        "options": "-ex1",
        # Passed to pyrosetta.init() on the distributed workers:
        "extra_options": "-out:level 300 -multithreading:total_threads 1",
        "set_logging_handler": "interactive",
        # Input structure; read on the workers via io.pose_from_file(kwargs["s"])
        "s": os.path.join(os.getcwd(), "inputs", "1QYS.pdb"),
    }
    yield task_kwargs
# Ideally, all pose manipulation is accomplished with the user-provided PyRosetta protocols. If you must manipulate a pose prior to instantiating `PyRosettaCluster`, here are some considerations:
# - Avoid passing `Pose` and `PackedPose` objects through `create_kwargs()`. You might notice that the above cell passes the protein structure information to `PyRosettaCluster` as a `str` type locating the `.pdb` file. In this way, the input `PackedPose` object is instantiated from that `str` within `PyRosettaCluster` on the remote workers (using `io.pose_from_file(kwargs["s"])`) using a random seed which is saved by `PyRosettaCluster`. This allows the protocol to be reproduced, and avoids passing redundant large chunks of data over the network.
# - It may be tempting to instantiate your pose before `PyRosettaCluster`, and pass a `Pose` or `PackedPose` object into the `create_kwargs()`. However, in this case PyRosetta will be initialized with a random seed outside `PyRosettaCluster`, and that random seed will not be saved by `PyRosettaCluster`. As a consequence, any action taken on the pose (e.g. filling in missing heavy atoms) will not be reproducible.
# - If you must instantiate your pose before `PyRosettaCluster`, then to ensure reproducibility you must initialize PyRosetta with the constant seed `1111111` within the Jupyter notebook or standalone python script using:
#
# ```
# import pyrosetta
# pyrosetta.init("-run:constant_seed 1")
# ```
#
# The `-run:constant_seed 1` command line flag defaults to the seed `1111111` ([documentation](https://www.rosettacommons.org/docs/latest/rosetta_basics/options/run-options)). Then, instantiate the pose:
#
# ```
# input_packed_pose = pyrosetta.io.pose_from_sequence("TEST")
# ...Perform any pose manipulation...
# ```
#
# and then instantiate `PyRosettaCluster` with the additional `input_packed_pose` parameter argument, e.g.:
#
# ```
# PyRosettaCluster(
# ...
# input_packed_pose=input_packed_pose,
# ...
# )
# ```
#
# For an initialization example, see Tutorial 4.
#
# In summary, the best practice involves giving `create_kwargs` information which will be used by the distributed protocol to create a pose within `PyRosettaCluster`. In edge cases, the user may provide a `Pose` or `PackedPose` object to the `input_packed_pose` argument of `PyRosettaCluster` and set a constant seed of `1111111` outside of `PyRosettaCluster`.
# ### 5. Launch the original simulation using the `distribute()` method
#
# The protocol produces an output decoy, the exact coordinates of which we will reproduce in Tutorial 1B.
# If the Jupyter Lab Notebook or standalone PyRosetta script did not yet initialize PyRosetta before instantiating `PyRosettaCluster` (preferred workflow), then `PyRosettaCluster` automatically initializes PyRosetta within the Jupyter Lab Notebook or standalone PyRosetta script with the command line flags `-run:constant_seed 1 -multithreading:total_threads 1 -mute all`. Thus, the master node is initialized with the default constant seed, where the master node acts as the client to the distributed workers. The distributed workers actually run the user-provided PyRosetta protocol(s), and each distributed worker initializes PyRosetta with a random seed, which is the seed saved by PyRosettaCluster for downstream reproducibility. The master node is always initialized with a constant seed as best practices.
# To monitor parallelization diagnostics in real-time, in the "Dask" tab, click the various diagnostic tools _(arrows)_ to open new tabs:
# ![title](Media/dask_labextension_4.png)
# Arrange the diagnostic tool tabs within Jupyter Lab how you best see fit by clicking and dragging them:
# ![title](Media/dask_labextension_3.png)
# In[12]:
if not os.environ.get("DEBUG"):
    output_path = os.path.join(os.getcwd(), "outputs_1A")
    # nstruct=4 runs the first user-provided PyRosetta protocol four times
    # in parallel on the dask workers.
    cluster = PyRosettaCluster(
        tasks=create_kwargs,
        client=client,  # Reuse the existing client for diagnostics monitoring
        scratch_dir=output_path,
        output_path=output_path,
        nstruct=4,
    )
    cluster.distribute(protocols=[my_protocol])
# While jobs are running, you may monitor their progress using the dask dashboard diagnostics within Jupyter Lab!
# ### 6. Visualize the resultant decoy
# Gather the output decoys on disk into poses in memory:
# In[13]:
if not os.environ.get("DEBUG"):
    # Output decoys are written as bzip2-compressed PDB files under
    # <output_path>/decoys/<...>/*.pdb.bz2; load each one into memory.
    results = glob.glob(os.path.join(output_path, "decoys", "*", "*.pdb.bz2"))
    packed_poses = []
    for compressed_path in results:
        with open(compressed_path, "rb") as handle:
            pdbstring = bz2.decompress(handle.read()).decode()
        packed_poses.append(io.pose_from_pdbstring(pdbstring))
# View the poses in memory by clicking and dragging to rotate, and zooming in and out with the mouse scroll wheel.
# In[14]:
if not os.environ.get("DEBUG"):
    # Build the viewer, layer on the display modules, then render by calling it.
    view = viewer.init(packed_poses, window_size=(800, 600))
    display_modules = (
        viewer.setStyle(),
        viewer.setStyle(colorscheme="whiteCarbon", radius=0.25),
        viewer.setHydrogenBonds(),
        viewer.setHydrogens(polar_only=True),
        viewer.setDisulfides(radius=0.25),
    )
    for module in display_modules:
        view.add(module)
    view()
# Using the `pyrosetta.distributed.viewer` macromolecular visualizer, you can visualize your results in real-time as they complete.
# ![title](Media/viewer_1.png)
# ### Congrats!
#
# You have successfully performed a PyRosetta simulation using `PyRosettaCluster`! In the next tutorial we will reproduce one of the decoys precisely to make our computational science more reproducible.
#
# < [Part I: Parallelized Global Ligand Docking with `pyrosetta.distributed`](http://nbviewer.jupyter.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/16.05-Ligand-Docking-dask.ipynb) | [Contents](toc.ipynb) | [Index](index.ipynb) | [PyRosettaCluster Tutorial 1B. Reproduce simple protocol](http://nbviewer.jupyter.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/16.07-PyRosettaCluster-Reproduce-simple-protocol.ipynb) >