#!/usr/bin/env python
# coding: utf-8

# # COMP 364: Molecular Structures with BioPython
# 
# 
# Last week we learned about tools to deal with biological sequences with `BioPython`.
# 
# Sequence analysis is very important, but ultimately, biological function is determined by structure.
# 
# Here is the (partial) sequence for a dopamine transporter protein:
# 
# ```
# DERETWSGKVDFLLSVIGFAVDLANVWRFPYLCYKNGGGAFLVPYGIMLAVGGIPLFYMELALGQHNRKGAITCWGRLVP
# LFKGIGYAVVLIAFYVDFYYNVIIAWSLRFFFASFTNSLPWTSCNNIWNTPNCRPFEGHVEGFQSAASEYFNRYILELNR
# SEGIHDLGAIKWDMALCLLIVYLICYFSLWKGISTSGKVVWFTALFPYAVLLILLIRGLTLPGSFLGIQYYLTPNFSAIY
# KAEVWVDAATQVFFSLGPGFGVLLAYASYNKYHNNVYKDALLTSFINSATSFIAGFVIFSVLGYMAHTLGVRIEDVATEG
# PGLVFVVYPAAIATMPASTFWALIFFMMLATLGLDSSFGGSEAIITALSDEFPKIKRNRELFVAGLFSLYFVVGLASCTQ
# GGFYFFHLLDRYAAGYSILVAVFFEAIAVSWIYGTNRFSEDIRDMIGFPPGRYWQVCWRFVAPIFLLFITVYGLIGYEPL
# TYADYVYPSWANALGWCIAGSSVVMIPAVAIFKLLSTPGSLRQRFTILTTPWRDQ
# ```
# 
# This protein's job is to take the dopamine neurotransmitter out of the neuron's synapses and terminate that feel-good positive response we get from dopamine.
# 
# This big string of text doesn't tell us much about how it can actually do this.
# 
# Structural biologists are able to determine what that sequence looks like when it takes its natural shape in the cell.
# 
# Here is that sequence's [structure](http://www.rcsb.org/pdb/ngl/ngl.do?pdbid=4XP1&bionumber=1) in 3D.
# 
# Clearly there is a lot more information coded in the sequence that we need to consider.
# 
# Today I'll walk you through extracting some useful information from this particular protein using BioPython's `PDB` module.

# ### What is a structure?
# 
# Biomolecules such as proteins, RNA, DNA, and chemicals (such as dopamine) are made up of atoms.
# 
# Each atom occupies a point in 4 dimensions: X, Y, Z, and time. For now we will forget about time and only deal with fixed shapshots of atoms.
# 
# Structural biologists isolate molecules in the lab, and using tools such as X-ray crystallography can obtain the X, Y, and Z coordinates of all the atoms in the molecule.
# 
# ![](http://i0.wp.com/cen.xraycrystals.org/wp-content/uploads/2014/07/09232-cover1-BFormDNAcxd.jpg?resize=250%2C288)
# 
# This is the famous X-ray diffraction pattern of DNA used by Watson & Crick & Franklin to solve the double helix structure of DNA.
# 
# Other notable structure solving technologies: electron microscopy, cryo-electron microscopy, Nuclear Magnetic Resonance (NMR).
# 
# We don't have to worry about how they did it, but for many years now, solved structures (3D positions of biomolecules atoms) have been deposited in the [RCSB PDB Database](http://www.rcsb.org/pdb/home/home.do)
# 
# 

# ### Downloading a Structure File from the PDB database
# 
# Every entry in the database has a unique ID code.
# 
# The dopamine transporter we are interested in is: 4XP1
# 
# The main file format for sequences is `FASTA`, for structures it is `PDB` or `mmCIF`.
# 
# BioPython.PDB has parsers for both.
# 
# [Here](http://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ) is the main BioPython.PDB reference page.
# 
# We can automatically download a structure from the database using the `PDBList` object's method `retrieve_pdb_file`.

# In[1]:


from Bio.PDB import *

pdbl = PDBList()
pdbl.retrieve_pdb_file('4XP1')


# That warning tells us that the PDB format is deprecated and that the new default is mmCIF.
# 
# We'll go into what that looks like in a little bit.

# Now we should have a folder containing `4XP1.cif` in our working directory.

# In[2]:


import os
os.listdir()


# In[3]:


os.listdir('xp')


# ### Parsing Structure files

# In[4]:


parser = MMCIFParser()


# The first argument to the instance method `get_structure` is an optional name for the molecule, and the second argument is the path to the structure file.

# In[5]:


structure = parser.get_structure('4XP1', 'xp/4xp1.cif')


# Now we have extracted all the structural information from the file.
# 
# Let's take a peek at the attributes.

# In[6]:


def cleandir(obj):
    print(", ".join([a for a in dir(obj) if not a.startswith("_")]))
cleandir(structure)


# Structure objects are organized in a specific hierarchy of objects.
# 
# ![](http://biopython.org/wiki/Smcra.png)
# 
# We're just going to focus on the core elements which are: Model --> Chain --> Residue --> Atom.
# 
# Each structure file can contain multiple "models" of the same molecule.
# 
# Each model contains several "chains" or strands of protein/RNA/DNA/.
# 
# Each "chain" is made up of residues, or amino acids/DNA bases/RNA bases.
# 
# Each residue is made up of Atoms.

# ### Visualizing structures in jupyter Notebooks

# There is a very nice "widget" for notebooks that lets us visualize structure objects.
# 
# ```
# conda config --add channels conda-forge
# conda install nglview -c bioconda
# # might need: jupyter-nbextension enable nglview --py --sys-prefix
# ```
# 
# To do more advanced manipulations, it is better to use standalone tools such as [Chimera](https://www.cgl.ucsf.edu/chimera/) and [pyMOL](https://pymol.org/2/).
# 
# Let's view our dopamine transporter protein bound to dopamine.

# In[7]:


import nglview as nv


# In[8]:


view = nv.show_biopython(structure)
view.clear_representations()
#view as ball and stick (atom and bond)
view.add_ball_and_stick()


# In[9]:


view


# As expected, we see a bunch of atoms in 3D space. 
# 
# This can often be hard to look at so we usually visualize this with what's known as a "ribbon" representation.

# In[10]:


view.clear_representations()
#add ribbons
view.add_cartoon('protein')
#add ball and stick for non-rotien
view.add_ball_and_stick('not protein')
view


# Individual atoms are not shown and instead, each chain is visualized as a continuous ribbon. 
# 
# We can see here that we have 3 different chains and a couple of ligands.
# 
# Let's see how we can get a handle on the different components.
# 
# How many models are there in this structure? It looks like there is only one transporter protein so we should only have a single model.

# In[11]:


for model in structure:
    print(f"model {model}")


# Ok we have one model. 
# 
# But we should have several chains as we saw above.

# In[12]:


model = structure[0] #since we only have one model
for chain in model:
    print(f"chain {chain}, Chain ID: {chain.id}")


# Ok as we guessed, we have 3 chains: A, L, and H.
# 
# We can go one step further and get all the residues in a chain.
# 
# We can access individual chains like keys in a dictionary from a model.

# In[13]:


chain_A = model['A']


# In[14]:


for res in chain_A:
    print(f"Residue name: {res.resname}, number: {res.id[1]}")


# Now we see that each chain is made up of `Residue` objects.
# 
# Finally, each residue should be made up of atoms.
# 
# Let's get residue 56 from Chain A.

# In[15]:


res = chain_A[56]


# In[16]:


print(res)


# In[17]:


for atom in res:
    print(f"{atom.name}")


# Putting it all together, we can print every atom in every chain.

# In[18]:


for model in structure:
    for chain in model:
        for residue in chain:
            for atom in residue:
                print(atom)


# # Mini project: find the dopamine binding pockets
# 
# Let's say I want to design a drug that binds to the transporter and blocks it from binding to dopamine.
# 
# This should have the effect of making people happier because dopamine now won't be removed from the synapses.
# 
# Actually, that's exactly what many antidepressants and narcotics (cocaine, amphetamines) do.
# 
# In order to do this, we need to know which residues dopamine binds so we can design a chemical that targets these kinds of residues.
# 
# ## mmCIF annotations
# 
# mmCIF files are like dictionaries with key:value pairs.
# 
# When we parsed the file above, we really only got the structure info, but there is a lot more information that we can access by parsing a little differently.
# 

# MMCIF files have a ton of information and they look like this:
# 
# ```
# # 
# loop_
# _pdbx_nonpoly_scheme.asym_id 
# _pdbx_nonpoly_scheme.entity_id 
# _pdbx_nonpoly_scheme.mon_id 
# _pdbx_nonpoly_scheme.ndb_seq_num 
# _pdbx_nonpoly_scheme.pdb_seq_num 
# _pdbx_nonpoly_scheme.auth_seq_num 
# _pdbx_nonpoly_scheme.pdb_mon_id 
# _pdbx_nonpoly_scheme.auth_mon_id 
# _pdbx_nonpoly_scheme.pdb_strand_id 
# _pdbx_nonpoly_scheme.pdb_ins_code 
# D 4  NA  1  701 701 NA  NA  A . 
# E 4  NA  1  702 702 NA  NA  A . 
# F 5  CL  1  703 704 CL  CL  A . 
# G 6  MAL 1  704 1   MAL MAL A . 
# H 6  MAL 1  705 2   MAL MAL A . 
# I 7  NAG 1  706 1   NAG NAG A . 
# J 8  P4G 1  707 1   P4G P4G A . 
# K 9  LDP 1  708 2   LDP LDP A . 
# ```
# 
# This happens to be an entry for "non-polymer" entities aka ligands/chemicals.
# 
# There are 10 `_pdbx_nonpoly_scheme..` headers. Each corresponds to a label for the columns in the entries below.
# 
# So `_pdbx_nonpoly_scheme.mon_id` corresponds to the third column which is the `id` of the ligand.
# 
# The `loop_` means, when parsing, loop over all the rows for each column label to make a list.
# 
# For example: `_pdbx_nonpoly_scheme.asym_id ` after looping would map to: `['D', 'E', 'F', 'G', ...]`
# 
# We're interested in `LDP` which is dopamine, but we see that there are others.
# 
# BioPython lets us parse this information into a dictionary.

# In[19]:


struc_dict = MMCIF2Dict.MMCIF2Dict('xp/4xp1.cif')


# In[20]:


print(struc_dict.keys())


# As you can see we have tons of keys.
# 
# Binding sites are often annotated by the researches in their mmCIF files.
# 
# ```
# loop_
# _struct_site.id 
# _struct_site.pdbx_evidence_code 
# _struct_site.pdbx_auth_asym_id 
# _struct_site.pdbx_auth_comp_id 
# _struct_site.pdbx_auth_seq_id 
# _struct_site.pdbx_auth_ins_code 
# _struct_site.pdbx_num_residues 
# _struct_site.details 
# AC1 Software A NA  701 ? 5 'binding site for residue NA A 701'                             
# AC2 Software A NA  702 ? 5 'binding site for residue NA A 702'                             
# AC3 Software A CL  703 ? 4 'binding site for residue CL A 703'                             
# AC4 Software A MAL 704 ? 4 'binding site for residue MAL A 704'                            
# AC5 Software A MAL 705 ? 4 'binding site for residue MAL A 705'                            
# AC6 Software A P4G 707 ? 1 'binding site for residue P4G A 707'                            
# AC7 Software A LDP 708 ? 9 'binding site for residue LDP A 708'                            
# AC8 Software A EDO 709 ? 2 'binding site for residue EDO A 709'                            
# AC9 Software A Y01 710 ? 4 'binding site for residue Y01 A 710'                            
# AD1 Software A CLR 711 ? 5 'binding site for residue CLR A 711'                            
# AD2 Software L NA  301 ? 4 'binding site for residue NA L 301'                             
# AD3 Software A NAG 706 ? 1 'binding site for Mono-Saccharide NAG A 706 bound to ASN A 141' 
# ```

# In[21]:


struc_dict['_struct_site.details']


# So the binding site for LDP has ID `AC7`.

# Now if we look at another entry in the mmCIF we can get a list of all the residues in each binding site:
# 
# ```
# loop_
# _struct_site_gen.id 
# _struct_site_gen.site_id 
# _struct_site_gen.pdbx_num_res 
# _struct_site_gen.label_comp_id 
# _struct_site_gen.label_asym_id 
# _struct_site_gen.label_seq_id 
# _struct_site_gen.pdbx_auth_ins_code 
# _struct_site_gen.auth_comp_id 
# _struct_site_gen.auth_asym_id 
# _struct_site_gen.auth_seq_id 
# _struct_site_gen.label_atom_id 
# _struct_site_gen.label_alt_id 
# _struct_site_gen.symmetry 
# _struct_site_gen.details 
# 1  AC1 5 ALA A 20  ? ALA A 44  . ? 1_555 ? 
# 2  AC1 5 ASN A 25  ? ASN A 49  . ? 1_555 ? 
# 3  AC1 5 SER A 255 ? SER A 320 . ? 1_555 ? 
# 4  AC1 5 ASN A 287 ? ASN A 352 . ? 1_555 ? 
# 5  AC1 5 HOH P .   ? HOH A 814 . ? 1_555 ? 
# ...
# ...
# ```

# There are other useful entries such as the title of the publication that this structure was featured in.

# In[22]:


struc_dict['_citation.title']


# All entry types are documented [here](http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Index/).
# 
# Let's extract all residues in the LDP binding site.

# In[23]:


site_ID = struc_dict['_struct_site_gen.site_id']
site_chain = struc_dict['_struct_site_gen.auth_asym_id']
site_resnum = struc_dict['_struct_site_gen.auth_seq_id']
site_resname = struc_dict['_struct_site_gen.label_comp_id']

cif_binding_residues = []
for bind_id, ch, num, name in zip(site_ID, site_chain, site_resnum, site_resname):
    if bind_id == "AC7":
        print(bind_id, ch, num, name)
        try:
            cif_binding_residues.append(structure[0][ch][int(num)])
        except:
            continue
    else:
        continue


# Let's say we don't trust this annotation. We can compute our own ligand binding site and compare.
# 
# NOTE: Obviously we could have done this directly with the `structure` object we obtained earlier.
# 
# ### Step 1: Find the LDP residue
# 
# We can get all the residues of a model using the `model.get_residues()` method.

# In[24]:


#finding LDP residue
LDP = None
for res in structure[0].get_residues():
    if res.resname == "LDP":
        LDP = res
        break
print(LDP)


# ### Step 2: find all other residues within a certain distance cutoff
# 
# Let's set a distance cutoff of 10 Angstroms.
# 
# We will count as binding residues any residue whose $\alpha$-carbon is within 10 Angstrom of any atom in LDP.
# 
# BioPython gives us a vector of X, Y, Z coordinates for every atom in the residue.
# 
# Let's get the X, Y, Z coordinates of the alpha carbon of residue 56 in chain A.
# 
# For this, we use the `coord` attribute.

# In[25]:


res_1_CA = structure[0]['A'][56]['CA']
print(res_1_CA.coord)


# Now we can use this to compute distances!

# Let's take another atom.

# In[32]:


res_2_CA = structure[0]['A'][327]['CA']
print(res_2_CA.coord)


# The difference between the coordinates will be a 3D vector.

# In[28]:


diff = res_1_CA.coord - res_2_CA.coord
print(diff)


# To get positive values we square the vector and then take the square root.

# In[29]:


import numpy as np
dist = np.sqrt(diff * diff)
print(dist)


# We now have the distance along each axis between the two CA atoms of the two residues.
# 
# We can apply this to every residue.

# In[38]:


cutoff = 10

binding_residues = []

for res in structure[0].get_residues():
    #skip the LDP residue
    if res == LDP:
        continue
    #non-amino acid residues are tagged with an 'H' in their id tuple
    #we want to skip these
    elif res.id[0].startswith("H"):
        continue
    else:
        alpha_carbon = res['CA']
        distances = []
        for atom in LDP:
            #subtract the two position vectors
            diff_vector = alpha_carbon.coord - atom.coord
            #to get a positive value we square the difference vector
            #we then take the square root to go back to the original scale
            distances.append(np.sqrt(np.sum(diff_vector * diff_vector)))
        #we get the nearest atom using min(distances) and see if it falls inside
        #the cutoff
        if min(distances) < cutoff:
            binding_residues.append(res)


# In[39]:


print(binding_residues)


# In[40]:


#view = nv.demo()
view = nv.show_biopython(structure)

# use hex values for now.
residues = structure[0].get_residues()
#this is a bit of a hack to set the binding residues to red in the visualization
colors = ['0x0000FF' if r not in binding_residues else '0xFF0000' for r in residues]
view._set_color_by_residue(colors, component_index=0, repr_index=0)
view


# Ok now let's compare this to our list of residues obtained from the mmCIF annotations.

# In[41]:


view = nv.show_biopython(structure)

# use hex values for now.
residues = structure[0].get_residues()
#this is a bit of a hack to set the binding residues to red in the visualization
colors = ['0x0000FF' if r not in cif_binding_residues else '0xFF0000' for r in residues]
view._set_color_by_residue(colors, component_index=0, repr_index=0)
view


# It looks like we were maybe a bit too permissive with our distance cutoff.