#!/usr/bin/env python # coding: utf-8 # # Performing a quick distance comparison using PSA # In this example, the trajectories have been pre-aligned using the fitting scheme described in: # # S.L. Seyler, A. Kumar, M.F. Thorpe, and O. Beckstein, Path # Similarity Analysis: a Method for Quantifying Macromolecular # Pathways. `arXiv:1505.04807v1`_ [q-bio.QM], 2015. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') # Suppress FutureWarning about element-wise comparison to None # Occurs when calling PSA plotting functions import warnings warnings.filterwarnings('ignore') # --- # ## 1) Set up input data for `PSA` using `MDAnalysis` # In[2]: from MDAnalysis import Universe from MDAnalysis.analysis.psa import PSAnalysis from pair_id import PairID # Initialize lists for the methods on which to perform PSA. PSA will be performed for four different simulations methods with three runs for each: **DIMS**, **FRODA**, **rTMD-F**, and **rTMD-S**. Also initialize a `PSAIdentifier` object to keep track of the data corresponding to comparisons between pairs of simulations. # In[3]: method_names = ['DIMS', 'FRODA', 'GOdMD', 'MDdMD', 'rTMD-F', 'rTMD-S', 'ANMP', 'iENM', 'MAP', 'MENM-SD', 'MENM-SP', 'Morph', 'LinInt'] labels = [] # Heat map labels simulations = [] # List of simulation topology/trajectory filename pairs universes = [] # List of MDAnalysis Universes representing simulations # For each method, get the topology and each of three total trajectories (per method). Each simulation is represented as a `(topology, trajectory)` pair of file names, which is appended to a master list of simulations. # In[4]: for method in method_names: # Note: DIMS uses the PSF topology format topname = 'top.psf' if 'DIMS' in method or 'TMD' in method else 'top.pdb' pathname = 'fitted_psa.dcd' method_dir = 'methods/{}'.format(method) if method is not 'LinInt': for run in xrange(1, 4): # 3 runs per method run_dir = '{}/{:03n}'.format(method_dir, run) topology = '{}/{}'.format(method_dir, topname) trajectory = '{}/{}'.format(run_dir, pathname) labels.append(method + '(' + str(run) + ')') simulations.append((topology, trajectory)) else: # only one LinInt trajectory topology = '{}/{}'.format(method_dir, topname) trajectory = '{}/{}'.format(method_dir, pathname) labels.append(method) simulations.append((topology, trajectory)) # Generate a list of universes from the list of simulations. # In[5]: for sim in simulations: universes.append(Universe(*sim)) # --- # ## 2) Compute and plot all-pairs distances using PSA # Initialize a PSA comparison from the universe list using a C$_\alpha$ trajectory representation, then generate `PSA` `Path`s from the universes. # In[6]: psa_short = PSAnalysis(universes, path_select='name CA', labels=labels) psa_short.generate_paths() # ### Computing mutual distances using Hausdorff and (discrete) Fréchet path metrics # **Hausdorff**: compute the Hausdorff distances between all unique pairs of `Path`s and store the distance matrix. # In[7]: psa_short.run(metric='hausdorff') hausdorff_distances = psa_short.get_pairwise_distances() # Plot clustered heat maps using Ward hierarchical clustering. The first heat map is plotted with the corresponding dendrogram and is fully labeled by the method names; the second heat map is annotated by the Hausdorff distances. # In[8]: psa_short.plot(filename='dh_ward_psa-short.pdf', linkage='ward'); # In[9]: psa_short.plot_annotated_heatmap(filename='dh_ward_psa-short_annot.pdf', linkage='ward'); # **Fréchet**: compute the (discrete) Fréchet distances between all unique pairs of `Path`s and store the distance matrix. # In[10]: psa_short.run(metric='discrete_frechet') frechet_distances = psa_short.get_pairwise_distances() # As above, plot heat maps for (discrete) Fréchet distances. # In[11]: psa_short.plot(filename='df_ward_psa-short.pdf', linkage='ward'); # In[12]: psa_short.plot_annotated_heatmap(filename='df_ward_psa-short_annot.pdf', linkage='ward'); # --- # ## 3) Extract specific data from PSA # Get the Simulation IDs and PSA ID for the second DIMS simulation (DIMS 2) and third rTMD-F simulation (rTMD-F 3). # In[13]: identifier = PairID() for name in method_names: run_ids = [1] if 'LinInt' in name else [1,2,3] identifier.add_sim(name, run_ids) # In[14]: sid1 = identifier.get_sim_id('DIMS 2') sid2 = identifier.get_sim_id('rTMD-F 3') pid = identifier.get_pair_id('DIMS 2', 'rTMD-F 3') # Use the **Simulation ID**s to locate Hausdorff and (discrete) Fréchet distances DIMS 2/rTMD-F 3 comparison: # In[15]: print hausdorff_distances[sid1,sid2] print frechet_distances[sid1,sid2] # Use the **Pair ID** when the distances are in the form of a distance vector (see `scipy.spatial.distance.squareform`) # In[16]: from scipy.spatial.distance import squareform hausdorff_vectorform = squareform(hausdorff_distances) frechet_vectorform = squareform(frechet_distances) # In[17]: print hausdorff_vectorform[pid] print frechet_vectorform[pid] # Check that data obtained from the distance matrix is the same as that accessed from the distance vector # In[18]: print hausdorff_distances[sid1,sid2] == hausdorff_vectorform[pid] print frechet_distances[sid1,sid2] == frechet_vectorform[pid]