#!/usr/bin/env python
# coding: utf-8

# # Performing a quick distance comparison using PSA

# In this example, the trajectories have been pre-aligned using the fitting scheme described in:
# 
# S.L. Seyler, A. Kumar, M.F. Thorpe, and O. Beckstein, Path
# Similarity Analysis: a Method for Quantifying Macromolecular
# Pathways. `arXiv:1505.04807v1`_ [q-bio.QM], 2015.

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# Suppress FutureWarning about element-wise comparison to None
# Occurs when calling PSA plotting functions
import warnings
warnings.filterwarnings('ignore')


# ---
# ## 1) Set up input data for `PSA` using `MDAnalysis`

# In[2]:


from MDAnalysis import Universe
from MDAnalysis.analysis.psa import PSAnalysis
from pair_id import PairID


# Initialize lists for the methods on which to perform PSA. PSA will be performed for four different simulations methods with three runs for each: **DIMS**, **FRODA**, **rTMD-F**, and **rTMD-S**. Also initialize a `PSAIdentifier` object to keep track of the data corresponding to comparisons between pairs of simulations.

# In[3]:


method_names = ['DIMS', 'FRODA', 'GOdMD', 'MDdMD', 'rTMD-F', 'rTMD-S',
                'ANMP', 'iENM', 'MAP', 'MENM-SD', 'MENM-SP',
                'Morph', 'LinInt']
labels = [] # Heat map labels
simulations = [] # List of simulation topology/trajectory filename pairs
universes = [] # List of MDAnalysis Universes representing simulations


# For each method, get the topology and each of three total trajectories (per method). Each simulation is represented as a `(topology, trajectory)` pair of file names, which is appended to a master list of simulations.

# In[4]:


for method in method_names:
    # Note: DIMS uses the PSF topology format
    topname = 'top.psf' if 'DIMS' in method or 'TMD' in method else 'top.pdb'
    pathname = 'fitted_psa.dcd'
    method_dir = 'methods/{}'.format(method)
    if method is not 'LinInt':
        for run in xrange(1, 4): # 3 runs per method
            run_dir = '{}/{:03n}'.format(method_dir, run)
            topology = '{}/{}'.format(method_dir, topname)
            trajectory = '{}/{}'.format(run_dir, pathname)
            labels.append(method + '(' + str(run) + ')')
            simulations.append((topology, trajectory))
    else: # only one LinInt trajectory
        topology = '{}/{}'.format(method_dir, topname)
        trajectory = '{}/{}'.format(method_dir, pathname)
        labels.append(method)
        simulations.append((topology, trajectory))


# Generate a list of universes from the list of simulations.

# In[5]:


for sim in simulations:
    universes.append(Universe(*sim))


# ---
# ## 2) Compute and plot all-pairs distances using PSA

# Initialize a PSA comparison from the universe list using a C$_\alpha$ trajectory representation, then generate `PSA` `Path`s from the universes.

# In[6]:


psa_short = PSAnalysis(universes, path_select='name CA', labels=labels)
psa_short.generate_paths()


# ### Computing mutual distances using Hausdorff and (discrete) Fréchet path metrics

# **Hausdorff**: compute the Hausdorff distances between all unique pairs of `Path`s and store the distance matrix.

# In[7]:


psa_short.run(metric='hausdorff')
hausdorff_distances = psa_short.get_pairwise_distances()


# Plot clustered heat maps using Ward hierarchical clustering. The first heat map is plotted with the corresponding dendrogram and is fully labeled by the method names; the second heat map is annotated by the Hausdorff distances.

# In[8]:


psa_short.plot(filename='dh_ward_psa-short.pdf', linkage='ward');


# In[9]:


psa_short.plot_annotated_heatmap(filename='dh_ward_psa-short_annot.pdf', linkage='ward');


# **Fréchet**: compute the (discrete) Fréchet distances between all unique pairs of `Path`s  and store the distance matrix.

# In[10]:


psa_short.run(metric='discrete_frechet')
frechet_distances = psa_short.get_pairwise_distances()


# As above, plot heat maps for (discrete) Fréchet distances.

# In[11]:


psa_short.plot(filename='df_ward_psa-short.pdf', linkage='ward');


# In[12]:


psa_short.plot_annotated_heatmap(filename='df_ward_psa-short_annot.pdf', linkage='ward');


# ---
# ## 3) Extract specific data from PSA

# Get the Simulation IDs and PSA ID for the second DIMS simulation (DIMS 2) and third rTMD-F simulation (rTMD-F 3).

# In[13]:


identifier = PairID()
for name in method_names:
    run_ids = [1] if 'LinInt' in name else [1,2,3]
    identifier.add_sim(name, run_ids)


# In[14]:


sid1 = identifier.get_sim_id('DIMS 2')
sid2 = identifier.get_sim_id('rTMD-F 3')
pid = identifier.get_pair_id('DIMS 2', 'rTMD-F 3')


# Use the **Simulation ID**s to locate Hausdorff and (discrete) Fréchet distances DIMS 2/rTMD-F 3 comparison:

# In[15]:


print hausdorff_distances[sid1,sid2]
print frechet_distances[sid1,sid2]


# Use the **Pair ID** when the distances are in the form of a distance vector (see `scipy.spatial.distance.squareform`)

# In[16]:


from scipy.spatial.distance import squareform
hausdorff_vectorform = squareform(hausdorff_distances)
frechet_vectorform = squareform(frechet_distances)


# In[17]:


print hausdorff_vectorform[pid]
print frechet_vectorform[pid]


# Check that data obtained from the distance matrix is the same as that accessed from the distance vector

# In[18]:


print hausdorff_distances[sid1,sid2] == hausdorff_vectorform[pid]
print frechet_distances[sid1,sid2] == frechet_vectorform[pid]