#!/usr/bin/env python # coding: utf-8 # # COMP 364: Molecular Structures with BioPython # # # Last week we learned about tools to deal with biological sequences with `BioPython`. # # Sequence analysis is very important, but ultimately, biological function is determined by structure. # # Here is the (partial) sequence for a dopamine transporter protein: # # ``` # DERETWSGKVDFLLSVIGFAVDLANVWRFPYLCYKNGGGAFLVPYGIMLAVGGIPLFYMELALGQHNRKGAITCWGRLVP # LFKGIGYAVVLIAFYVDFYYNVIIAWSLRFFFASFTNSLPWTSCNNIWNTPNCRPFEGHVEGFQSAASEYFNRYILELNR # SEGIHDLGAIKWDMALCLLIVYLICYFSLWKGISTSGKVVWFTALFPYAVLLILLIRGLTLPGSFLGIQYYLTPNFSAIY # KAEVWVDAATQVFFSLGPGFGVLLAYASYNKYHNNVYKDALLTSFINSATSFIAGFVIFSVLGYMAHTLGVRIEDVATEG # PGLVFVVYPAAIATMPASTFWALIFFMMLATLGLDSSFGGSEAIITALSDEFPKIKRNRELFVAGLFSLYFVVGLASCTQ # GGFYFFHLLDRYAAGYSILVAVFFEAIAVSWIYGTNRFSEDIRDMIGFPPGRYWQVCWRFVAPIFLLFITVYGLIGYEPL # TYADYVYPSWANALGWCIAGSSVVMIPAVAIFKLLSTPGSLRQRFTILTTPWRDQ # ``` # # This protein's job is to take the dopamine neurotransmitter out of the neuron's synapses and terminate that feel-good positive response we get from dopamine. # # This big string of text doesn't tell us much about how it can actually do this. # # Structural biologists are able to determine what that sequence looks like when it takes its natural shape in the cell. # # Here is that sequence's [structure](http://www.rcsb.org/pdb/ngl/ngl.do?pdbid=4XP1&bionumber=1) in 3D. # # Clearly there is a lot more information coded in the sequence that we need to consider. # # Today I'll walk you through extracting some useful information from this particular protein using BioPython's `PDB` module. # ### What is a structure? # # Biomolecules such as proteins, RNA, DNA, and chemicals (such as dopamine) are made up of atoms. # # Each atom occupies a point in 4 dimensions: X, Y, Z, and time. For now we will forget about time and only deal with fixed shapshots of atoms. # # Structural biologists isolate molecules in the lab, and using tools such as X-ray crystallography can obtain the X, Y, and Z coordinates of all the atoms in the molecule. # # ![](http://i0.wp.com/cen.xraycrystals.org/wp-content/uploads/2014/07/09232-cover1-BFormDNAcxd.jpg?resize=250%2C288) # # This is the famous X-ray diffraction pattern of DNA used by Watson & Crick & Franklin to solve the double helix structure of DNA. # # Other notable structure solving technologies: electron microscopy, cryo-electron microscopy, Nuclear Magnetic Resonance (NMR). # # We don't have to worry about how they did it, but for many years now, solved structures (3D positions of biomolecules atoms) have been deposited in the [RCSB PDB Database](http://www.rcsb.org/pdb/home/home.do) # # # ### Downloading a Structure File from the PDB database # # Every entry in the database has a unique ID code. # # The dopamine transporter we are interested in is: 4XP1 # # The main file format for sequences is `FASTA`, for structures it is `PDB` or `mmCIF`. # # BioPython.PDB has parsers for both. # # [Here](http://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ) is the main BioPython.PDB reference page. # # We can automatically download a structure from the database using the `PDBList` object's method `retrieve_pdb_file`. # In[1]: from Bio.PDB import * pdbl = PDBList() pdbl.retrieve_pdb_file('4XP1') # That warning tells us that the PDB format is deprecated and that the new default is mmCIF. # # We'll go into what that looks like in a little bit. # Now we should have a folder containing `4XP1.cif` in our working directory. # In[2]: import os os.listdir() # In[3]: os.listdir('xp') # ### Parsing Structure files # In[4]: parser = MMCIFParser() # The first argument to the instance method `get_structure` is an optional name for the molecule, and the second argument is the path to the structure file. # In[5]: structure = parser.get_structure('4XP1', 'xp/4xp1.cif') # Now we have extracted all the structural information from the file. # # Let's take a peek at the attributes. # In[6]: def cleandir(obj): print(", ".join([a for a in dir(obj) if not a.startswith("_")])) cleandir(structure) # Structure objects are organized in a specific hierarchy of objects. # # ![](http://biopython.org/wiki/Smcra.png) # # We're just going to focus on the core elements which are: Model --> Chain --> Residue --> Atom. # # Each structure file can contain multiple "models" of the same molecule. # # Each model contains several "chains" or strands of protein/RNA/DNA/. # # Each "chain" is made up of residues, or amino acids/DNA bases/RNA bases. # # Each residue is made up of Atoms. # ### Visualizing structures in jupyter Notebooks # There is a very nice "widget" for notebooks that lets us visualize structure objects. # # ``` # conda config --add channels conda-forge # conda install nglview -c bioconda # # might need: jupyter-nbextension enable nglview --py --sys-prefix # ``` # # To do more advanced manipulations, it is better to use standalone tools such as [Chimera](https://www.cgl.ucsf.edu/chimera/) and [pyMOL](https://pymol.org/2/). # # Let's view our dopamine transporter protein bound to dopamine. # In[7]: import nglview as nv # In[8]: view = nv.show_biopython(structure) view.clear_representations() #view as ball and stick (atom and bond) view.add_ball_and_stick() # In[9]: view # As expected, we see a bunch of atoms in 3D space. # # This can often be hard to look at so we usually visualize this with what's known as a "ribbon" representation. # In[10]: view.clear_representations() #add ribbons view.add_cartoon('protein') #add ball and stick for non-rotien view.add_ball_and_stick('not protein') view # Individual atoms are not shown and instead, each chain is visualized as a continuous ribbon. # # We can see here that we have 3 different chains and a couple of ligands. # # Let's see how we can get a handle on the different components. # # How many models are there in this structure? It looks like there is only one transporter protein so we should only have a single model. # In[11]: for model in structure: print(f"model {model}") # Ok we have one model. # # But we should have several chains as we saw above. # In[12]: model = structure[0] #since we only have one model for chain in model: print(f"chain {chain}, Chain ID: {chain.id}") # Ok as we guessed, we have 3 chains: A, L, and H. # # We can go one step further and get all the residues in a chain. # # We can access individual chains like keys in a dictionary from a model. # In[13]: chain_A = model['A'] # In[14]: for res in chain_A: print(f"Residue name: {res.resname}, number: {res.id[1]}") # Now we see that each chain is made up of `Residue` objects. # # Finally, each residue should be made up of atoms. # # Let's get residue 56 from Chain A. # In[15]: res = chain_A[56] # In[16]: print(res) # In[17]: for atom in res: print(f"{atom.name}") # Putting it all together, we can print every atom in every chain. # In[18]: for model in structure: for chain in model: for residue in chain: for atom in residue: print(atom) # # Mini project: find the dopamine binding pockets # # Let's say I want to design a drug that binds to the transporter and blocks it from binding to dopamine. # # This should have the effect of making people happier because dopamine now won't be removed from the synapses. # # Actually, that's exactly what many antidepressants and narcotics (cocaine, amphetamines) do. # # In order to do this, we need to know which residues dopamine binds so we can design a chemical that targets these kinds of residues. # # ## mmCIF annotations # # mmCIF files are like dictionaries with key:value pairs. # # When we parsed the file above, we really only got the structure info, but there is a lot more information that we can access by parsing a little differently. # # MMCIF files have a ton of information and they look like this: # # ``` # # # loop_ # _pdbx_nonpoly_scheme.asym_id # _pdbx_nonpoly_scheme.entity_id # _pdbx_nonpoly_scheme.mon_id # _pdbx_nonpoly_scheme.ndb_seq_num # _pdbx_nonpoly_scheme.pdb_seq_num # _pdbx_nonpoly_scheme.auth_seq_num # _pdbx_nonpoly_scheme.pdb_mon_id # _pdbx_nonpoly_scheme.auth_mon_id # _pdbx_nonpoly_scheme.pdb_strand_id # _pdbx_nonpoly_scheme.pdb_ins_code # D 4 NA 1 701 701 NA NA A . # E 4 NA 1 702 702 NA NA A . # F 5 CL 1 703 704 CL CL A . # G 6 MAL 1 704 1 MAL MAL A . # H 6 MAL 1 705 2 MAL MAL A . # I 7 NAG 1 706 1 NAG NAG A . # J 8 P4G 1 707 1 P4G P4G A . # K 9 LDP 1 708 2 LDP LDP A . # ``` # # This happens to be an entry for "non-polymer" entities aka ligands/chemicals. # # There are 10 `_pdbx_nonpoly_scheme..` headers. Each corresponds to a label for the columns in the entries below. # # So `_pdbx_nonpoly_scheme.mon_id` corresponds to the third column which is the `id` of the ligand. # # The `loop_` means, when parsing, loop over all the rows for each column label to make a list. # # For example: `_pdbx_nonpoly_scheme.asym_id ` after looping would map to: `['D', 'E', 'F', 'G', ...]` # # We're interested in `LDP` which is dopamine, but we see that there are others. # # BioPython lets us parse this information into a dictionary. # In[19]: struc_dict = MMCIF2Dict.MMCIF2Dict('xp/4xp1.cif') # In[20]: print(struc_dict.keys()) # As you can see we have tons of keys. # # Binding sites are often annotated by the researches in their mmCIF files. # # ``` # loop_ # _struct_site.id # _struct_site.pdbx_evidence_code # _struct_site.pdbx_auth_asym_id # _struct_site.pdbx_auth_comp_id # _struct_site.pdbx_auth_seq_id # _struct_site.pdbx_auth_ins_code # _struct_site.pdbx_num_residues # _struct_site.details # AC1 Software A NA 701 ? 5 'binding site for residue NA A 701' # AC2 Software A NA 702 ? 5 'binding site for residue NA A 702' # AC3 Software A CL 703 ? 4 'binding site for residue CL A 703' # AC4 Software A MAL 704 ? 4 'binding site for residue MAL A 704' # AC5 Software A MAL 705 ? 4 'binding site for residue MAL A 705' # AC6 Software A P4G 707 ? 1 'binding site for residue P4G A 707' # AC7 Software A LDP 708 ? 9 'binding site for residue LDP A 708' # AC8 Software A EDO 709 ? 2 'binding site for residue EDO A 709' # AC9 Software A Y01 710 ? 4 'binding site for residue Y01 A 710' # AD1 Software A CLR 711 ? 5 'binding site for residue CLR A 711' # AD2 Software L NA 301 ? 4 'binding site for residue NA L 301' # AD3 Software A NAG 706 ? 1 'binding site for Mono-Saccharide NAG A 706 bound to ASN A 141' # ``` # In[21]: struc_dict['_struct_site.details'] # So the binding site for LDP has ID `AC7`. # Now if we look at another entry in the mmCIF we can get a list of all the residues in each binding site: # # ``` # loop_ # _struct_site_gen.id # _struct_site_gen.site_id # _struct_site_gen.pdbx_num_res # _struct_site_gen.label_comp_id # _struct_site_gen.label_asym_id # _struct_site_gen.label_seq_id # _struct_site_gen.pdbx_auth_ins_code # _struct_site_gen.auth_comp_id # _struct_site_gen.auth_asym_id # _struct_site_gen.auth_seq_id # _struct_site_gen.label_atom_id # _struct_site_gen.label_alt_id # _struct_site_gen.symmetry # _struct_site_gen.details # 1 AC1 5 ALA A 20 ? ALA A 44 . ? 1_555 ? # 2 AC1 5 ASN A 25 ? ASN A 49 . ? 1_555 ? # 3 AC1 5 SER A 255 ? SER A 320 . ? 1_555 ? # 4 AC1 5 ASN A 287 ? ASN A 352 . ? 1_555 ? # 5 AC1 5 HOH P . ? HOH A 814 . ? 1_555 ? # ... # ... # ``` # There are other useful entries such as the title of the publication that this structure was featured in. # In[22]: struc_dict['_citation.title'] # All entry types are documented [here](http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Index/). # # Let's extract all residues in the LDP binding site. # In[23]: site_ID = struc_dict['_struct_site_gen.site_id'] site_chain = struc_dict['_struct_site_gen.auth_asym_id'] site_resnum = struc_dict['_struct_site_gen.auth_seq_id'] site_resname = struc_dict['_struct_site_gen.label_comp_id'] cif_binding_residues = [] for bind_id, ch, num, name in zip(site_ID, site_chain, site_resnum, site_resname): if bind_id == "AC7": print(bind_id, ch, num, name) try: cif_binding_residues.append(structure[0][ch][int(num)]) except: continue else: continue # Let's say we don't trust this annotation. We can compute our own ligand binding site and compare. # # NOTE: Obviously we could have done this directly with the `structure` object we obtained earlier. # # ### Step 1: Find the LDP residue # # We can get all the residues of a model using the `model.get_residues()` method. # In[24]: #finding LDP residue LDP = None for res in structure[0].get_residues(): if res.resname == "LDP": LDP = res break print(LDP) # ### Step 2: find all other residues within a certain distance cutoff # # Let's set a distance cutoff of 10 Angstroms. # # We will count as binding residues any residue whose $\alpha$-carbon is within 10 Angstrom of any atom in LDP. # # BioPython gives us a vector of X, Y, Z coordinates for every atom in the residue. # # Let's get the X, Y, Z coordinates of the alpha carbon of residue 56 in chain A. # # For this, we use the `coord` attribute. # In[25]: res_1_CA = structure[0]['A'][56]['CA'] print(res_1_CA.coord) # Now we can use this to compute distances! # Let's take another atom. # In[32]: res_2_CA = structure[0]['A'][327]['CA'] print(res_2_CA.coord) # The difference between the coordinates will be a 3D vector. # In[28]: diff = res_1_CA.coord - res_2_CA.coord print(diff) # To get positive values we square the vector and then take the square root. # In[29]: import numpy as np dist = np.sqrt(diff * diff) print(dist) # We now have the distance along each axis between the two CA atoms of the two residues. # # We can apply this to every residue. # In[38]: cutoff = 10 binding_residues = [] for res in structure[0].get_residues(): #skip the LDP residue if res == LDP: continue #non-amino acid residues are tagged with an 'H' in their id tuple #we want to skip these elif res.id[0].startswith("H"): continue else: alpha_carbon = res['CA'] distances = [] for atom in LDP: #subtract the two position vectors diff_vector = alpha_carbon.coord - atom.coord #to get a positive value we square the difference vector #we then take the square root to go back to the original scale distances.append(np.sqrt(np.sum(diff_vector * diff_vector))) #we get the nearest atom using min(distances) and see if it falls inside #the cutoff if min(distances) < cutoff: binding_residues.append(res) # In[39]: print(binding_residues) # In[40]: #view = nv.demo() view = nv.show_biopython(structure) # use hex values for now. residues = structure[0].get_residues() #this is a bit of a hack to set the binding residues to red in the visualization colors = ['0x0000FF' if r not in binding_residues else '0xFF0000' for r in residues] view._set_color_by_residue(colors, component_index=0, repr_index=0) view # Ok now let's compare this to our list of residues obtained from the mmCIF annotations. # In[41]: view = nv.show_biopython(structure) # use hex values for now. residues = structure[0].get_residues() #this is a bit of a hack to set the binding residues to red in the visualization colors = ['0x0000FF' if r not in cif_binding_residues else '0xFF0000' for r in residues] view._set_color_by_residue(colors, component_index=0, repr_index=0) view # It looks like we were maybe a bit too permissive with our distance cutoff.