Open-source framework for simple and fast integration of protein structure data with sequence annotations and genetic variation.
View instructions provided in the main README.md available at https://github.com/bartongroup/ProteoFAV
import proteofav
ProteoFAV implements two approaches to handle datasets. One can fetch a few files on the fly using functions conveniently provided. For large scale studies, however, is preferable to use a local source for the multiple data used, such as the mmCIF files for three-dimensional protein structures.
# Setting Logging Level
from proteofav.config import logging
logger = logging.getLogger()
assert len(logger.handlers) == 1
handler = logger.handlers[0]
handler.setLevel(logging.WARNING)
import os
from proteofav.structures import mmCIF, PDB
pdb_id = "2pah"
# create tmp dir
out_dir = os.path.join(os.getcwd(), "tmp")
os.makedirs(out_dir, exist_ok=True)
# output file names
out_mmcif = os.path.join(out_dir, "{}.cif".format(pdb_id))
out_mmcif_bio = os.path.join(out_dir, "{}_bio.cif".format(pdb_id))
out_pdb = os.path.join(out_dir, "{}.pdb".format(pdb_id))
# download structures
mmCIF.download(identifier=pdb_id, filename=out_mmcif)
mmCIF.download(identifier=pdb_id, filename=out_mmcif_bio,
bio_unit=True, bio_unit_preferred=True)
PDB.download(identifier=pdb_id, filename=out_pdb)
assert os.path.exists(out_mmcif)
assert os.path.exists(out_mmcif_bio)
assert os.path.exists(out_pdb)
mmcif = mmCIF.read(filename=out_mmcif)
print(mmcif.head())
print(mmcif.columns)
group_PDB id type_symbol label_atom_id label_alt_id label_comp_id \ 0 ATOM 1 N N . VAL 1 ATOM 2 C CA . VAL 2 ATOM 3 C C . VAL 3 ATOM 4 O O . VAL 4 ATOM 5 C CB . VAL label_asym_id label_entity_id label_seq_id pdbx_PDB_ins_code \ 0 A 1 1 ? 1 A 1 1 ? 2 A 1 1 ? 3 A 1 1 ? 4 A 1 1 ? ... Cartn_z occupancy B_iso_or_equiv pdbx_formal_charge \ 0 ... 18.770 1.0 56.51 ? 1 ... 20.244 1.0 59.09 ? 2 ... 20.700 1.0 44.63 ? 3 ... 20.204 1.0 59.84 ? 4 ... 20.638 1.0 53.90 ? auth_seq_id auth_comp_id auth_asym_id auth_atom_id pdbx_PDB_model_num \ 0 118 VAL A N 1 1 118 VAL A CA 1 2 118 VAL A C 1 3 118 VAL A O 1 4 118 VAL A CB 1 pdbe_label_seq_id 0 1 1 1 2 1 3 1 4 1 [5 rows x 22 columns] Index(['group_PDB', 'id', 'type_symbol', 'label_atom_id', 'label_alt_id', 'label_comp_id', 'label_asym_id', 'label_entity_id', 'label_seq_id', 'pdbx_PDB_ins_code', 'Cartn_x', 'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv', 'pdbx_formal_charge', 'auth_seq_id', 'auth_comp_id', 'auth_asym_id', 'auth_atom_id', 'pdbx_PDB_model_num', 'pdbe_label_seq_id'], dtype='object')
mmcif_bio = mmCIF.read(filename=out_mmcif_bio)
print(mmcif_bio.head())
print(mmcif_bio.columns)
group_PDB id type_symbol label_atom_id label_alt_id label_comp_id \ 0 ATOM 1 N N . VAL 1 ATOM 2 C CA . VAL 2 ATOM 3 C C . VAL 3 ATOM 4 O O . VAL 4 ATOM 5 C CB . VAL label_asym_id label_entity_id label_seq_id pdbx_PDB_ins_code \ 0 A 1 1 ? 1 A 1 1 ? 2 A 1 1 ? 3 A 1 1 ? 4 A 1 1 ? ... B_iso_or_equiv pdbx_formal_charge auth_seq_id \ 0 ... 56.51 ? 118 1 ... 59.09 ? 118 2 ... 44.63 ? 118 3 ... 59.84 ? 118 4 ... 53.90 ? 118 auth_comp_id auth_asym_id auth_atom_id pdbx_PDB_model_num \ 0 VAL A N 1 1 VAL A CA 1 2 VAL A C 1 3 VAL A O 1 4 VAL A CB 1 pdbe_label_seq_id orig_label_asym_id orig_auth_asym_id 0 1 A A 1 1 A A 2 1 A A 3 1 A A 4 1 A A [5 rows x 24 columns] Index(['group_PDB', 'id', 'type_symbol', 'label_atom_id', 'label_alt_id', 'label_comp_id', 'label_asym_id', 'label_entity_id', 'label_seq_id', 'pdbx_PDB_ins_code', 'Cartn_x', 'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv', 'pdbx_formal_charge', 'auth_seq_id', 'auth_comp_id', 'auth_asym_id', 'auth_atom_id', 'pdbx_PDB_model_num', 'pdbe_label_seq_id', 'orig_label_asym_id', 'orig_auth_asym_id'], dtype='object')
For a forma description of each colum please see http://mmcif.wwpdb.org/
# Column names mimic of a PDB file mimics those of the mmCIF format
# Please prefer processing mmCIF instead PDB, which were deprecated
pdb = PDB.read(filename=out_pdb)
print(pdb.head())
print(pdb.columns)
group_PDB id label_atom_id label_alt_id label_comp_id label_asym_id \ 0 HETATM 5316 FE F E . 1 HETATM 5317 FE F E . label_seq_id_full label_seq_id pdbx_PDB_ins_code Cartn_x \ 0 FE C FE C ? . ? 6 1 FE D FE D ? . ? -39 ... Cartn_z occupancy B_iso_or_equiv type_symbol \ 0 ... 284 42.9 25 1.0 0 84. FE 1 ... 235 43.6 84 1.0 0 91. FE auth_atom_id auth_comp_id auth_asym_id auth_seq_id_full auth_seq_id \ 0 FE E . FE C FE C 1 FE E . FE D FE D pdbx_PDB_model_num 0 1 1 1 [2 rows x 21 columns] Index(['group_PDB', 'id', 'label_atom_id', 'label_alt_id', 'label_comp_id', 'label_asym_id', 'label_seq_id_full', 'label_seq_id', 'pdbx_PDB_ins_code', 'Cartn_x', 'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv', 'type_symbol', 'auth_atom_id', 'auth_comp_id', 'auth_asym_id', 'auth_seq_id_full', 'auth_seq_id', 'pdbx_PDB_model_num'], dtype='object')
from proteofav.sifts import SIFTS
# output file names
out_sifts = os.path.join(out_dir, "{}.xml".format(pdb_id))
SIFTS.download(identifier=pdb_id, filename=out_sifts)
assert os.path.exists(out_sifts)
sifts = SIFTS.read(filename=out_sifts)
print(sifts.head())
PDB_regionId PDB_regionStart PDB_regionEnd PDB_regionResNum \ 0 1 1 335 1 1 1 1 335 2 2 1 1 335 3 3 1 1 335 4 4 1 1 335 5 PDB_dbAccessionId PDB_dbResNum PDB_dbResName PDB_dbChainId PDB_Annotation \ 0 2pah 118 VAL A Observed 1 2pah 119 PRO A Observed 2 2pah 120 TRP A Observed 3 2pah 121 PHE A Observed 4 2pah 122 PRO A Observed PDB_entityId ... SCOP_regionEnd SCOP_regionResNum \ 0 A ... 335 1 1 A ... 335 2 2 A ... 335 3 3 A ... 335 4 4 A ... 335 5 SCOP_dbAccessionId PDB_codeSecondaryStructure PDB_nameSecondaryStructure \ 0 42581 T loop 1 42581 T loop 2 42581 T loop 3 42581 T loop 4 42581 T loop Pfam_regionId Pfam_regionStart Pfam_regionEnd Pfam_regionResNum \ 0 - 0 0 NaN 1 1 2 332 2 2 1 2 332 3 3 1 2 332 4 4 1 2 332 5 Pfam_dbAccessionId 0 NaN 1 PF00351 2 PF00351 3 PF00351 4 PF00351 [5 rows x 34 columns]
The SIFT record also contains mappings to many other databases, such as:
Bear in mind that SIFT mapping occurs at residue, but also at the domain level. The default action is to load the residue mapping.
Also see the PDB_Annotation which flags several types of annotation at residue level, for example whether a given UniProt residues was observed.
from proteofav.dssp import DSSP
# output file names
out_dssp = os.path.join(out_dir, "{}.dssp".format(pdb_id))
DSSP.download(identifier=pdb_id, filename=out_dssp)
# sometimes fecthing from the DSSP FTP server at ftp://ftp.cmbi.ru.nl/pub/molbio/data/dssp/ times out...
print(os.path.exists(out_dssp))
True
dssp = DSSP.read(filename=out_dssp)
print(dssp.head())
RES RES_FULL INSCODE CHAIN AA SS ACC TCO KAPPA ALPHA PHI PSI 0 118 118 A V 127 0.000 360.0 360.0 360.0 124.7 1 119 119 A P 42 -0.071 360.0 -105.4 -51.6 149.9 2 120 120 A W 120 -0.593 41.9 -178.8 -81.0 139.2 3 121 121 A F 17 -0.980 33.0 -92.9 -138.9 150.9 4 122 122 A P 4 -0.405 27.9 -176.6 -65.3 130.9
from proteofav.validation import Validation
out_validation = os.path.join(out_dir, "{}_validation.xml".format(pdb_id))
Validation.download(identifier=pdb_id, filename=out_validation)
assert os.path.exists(out_validation)
validation = Validation.read(filename=out_validation)
print(validation.head())
validation_rscc validation_rama validation_icode validation_ligRSRZ \ 0 0.896 NaN ? NaN 1 0.960 Favored ? NaN 2 0.961 Favored ? NaN 3 0.920 Favored ? NaN 4 0.973 Favored ? NaN validation_ligRSRnbrMean validation_flippable-sidechain validation_psi \ 0 NaN NaN NaN 1 NaN NaN 149.9 2 NaN NaN 139.2 3 NaN NaN 150.9 4 NaN NaN 130.9 validation_rsr validation_owab validation_ligRSRnumnbrs ... \ 0 0.233 52.97 NaN ... 1 0.190 28.84 NaN ... 2 0.154 33.47 NaN ... 3 0.229 39.98 NaN ... 4 0.197 26.14 NaN ... validation_chain validation_phi validation_said validation_rsrz \ 0 A NaN A -0.160 1 A -51.6 A -0.274 2 A -81.0 A -0.874 3 A -138.9 A -0.308 4 A -65.3 A -0.204 validation_seq validation_ligRSRnbrStdev validation_altcode \ 0 1 NaN . 1 2 NaN . 2 3 NaN . 3 4 NaN . 4 5 NaN . validation_lig_rsrz_nbr_id validation_NatomsEDS validation_resnum 0 NaN 7 118 1 NaN 7 119 2 NaN 14 120 3 NaN 11 121 4 NaN 7 122 [5 rows x 27 columns]
PDB validation record is convenient when filtering a protein structure for analysis.
Protein structure representation is a hierarchical data structure (See http://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ). So to obtain the data in tabular format, ProteoFAV transforms the data. For example, for use cases that require one residue per row, the residue three-dimensional coordinates can be represented by the residue's Cα. Other filtering parameters are obtained with filter_structures
from proteofav.structures import filter_structures
mmcif_sel = filter_structures(mmcif, excluded_cols=None,
models='first', chains='A', res=None, res_full=None,
comps=None, atoms='CA', lines=None, category='auth',
residue_agg=False,
add_res_full=True, add_atom_altloc=False, reset_atom_id=True,
remove_altloc=False, remove_hydrogens=True, remove_partial_res=False)
print(mmcif_sel.head())
group_PDB id type_symbol label_atom_id label_alt_id label_comp_id \ 1 ATOM 2 C CA . VAL 8 ATOM 9 C CA . PRO 15 ATOM 16 C CA . TRP 29 ATOM 30 C CA . PHE 40 ATOM 41 C CA . PRO label_asym_id label_entity_id label_seq_id pdbx_PDB_ins_code \ 1 A 1 1 ? 8 A 1 2 ? 15 A 1 3 ? 29 A 1 4 ? 40 A 1 5 ? ... B_iso_or_equiv pdbx_formal_charge auth_seq_id \ 1 ... 59.09 ? 118 8 ... 20.13 ? 119 15 ... 33.96 ? 120 29 ... 34.42 ? 121 40 ... 28.65 ? 122 auth_comp_id auth_asym_id auth_atom_id pdbx_PDB_model_num \ 1 VAL A CA 1 8 PRO A CA 1 15 TRP A CA 1 29 PHE A CA 1 40 PRO A CA 1 pdbe_label_seq_id label_seq_id_full auth_seq_id_full 1 1 1 118 8 2 2 119 15 3 3 120 29 4 4 121 40 5 5 122 [5 rows x 24 columns]
Three dimensional coordinates of all atoms can be represented by the residues centroid
from proteofav.structures import residues_aggregation
mmcif_sel = residues_aggregation(mmcif, agg_method='centroid', category='auth')
print(mmcif_sel.head())
index pdbx_PDB_model_num auth_asym_id auth_seq_id group_PDB id \ 0 0 1 A 118 ATOM 1 1 1 1 A 119 ATOM 8 2 2 1 A 120 ATOM 15 3 3 1 A 121 ATOM 29 4 4 1 A 122 ATOM 40 type_symbol label_atom_id label_alt_id label_comp_id ... \ 0 N N . VAL ... 1 N N . PRO ... 2 N N . TRP ... 3 N N . PHE ... 4 N N . PRO ... pdbx_PDB_ins_code Cartn_x Cartn_y Cartn_z occupancy \ 0 ? -7.310714 21.031714 20.424143 1.0 1 ? -4.434571 21.470714 22.630714 1.0 2 ? -1.007000 15.673429 23.737071 1.0 3 ? -5.282455 15.784000 26.269091 1.0 4 ? -2.392571 13.578286 28.745714 1.0 B_iso_or_equiv pdbx_formal_charge auth_comp_id auth_atom_id \ 0 52.974286 ? VAL N 1 28.844286 ? PRO N 2 33.466429 ? TRP N 3 39.981818 ? PHE N 4 26.141429 ? PRO N pdbe_label_seq_id 0 1 1 2 2 3 3 4 4 5 [5 rows x 23 columns]
new_out_pdb = os.path.join(out_dir, "{}_new.pdb".format(pdb_id))
PDB.write(table=mmcif, filename=new_out_pdb)
sifts = SIFTS.read(filename=out_sifts)
print(sifts.head())
uniprot_ids = sifts.UniProt_dbAccessionId.unique()
print(uniprot_ids)
PDB_regionId PDB_regionStart PDB_regionEnd PDB_regionResNum \ 0 1 1 335 1 1 1 1 335 2 2 1 1 335 3 3 1 1 335 4 4 1 1 335 5 PDB_dbAccessionId PDB_dbResNum PDB_dbResName PDB_dbChainId PDB_Annotation \ 0 2pah 118 VAL A Observed 1 2pah 119 PRO A Observed 2 2pah 120 TRP A Observed 3 2pah 121 PHE A Observed 4 2pah 122 PRO A Observed PDB_entityId ... SCOP_regionEnd SCOP_regionResNum \ 0 A ... 335 1 1 A ... 335 2 2 A ... 335 3 3 A ... 335 4 4 A ... 335 5 SCOP_dbAccessionId PDB_codeSecondaryStructure PDB_nameSecondaryStructure \ 0 42581 T loop 1 42581 T loop 2 42581 T loop 3 42581 T loop 4 42581 T loop Pfam_regionId Pfam_regionStart Pfam_regionEnd Pfam_regionResNum \ 0 - 0 0 NaN 1 1 2 332 2 2 1 2 332 3 3 1 2 332 4 4 1 2 332 5 Pfam_dbAccessionId 0 NaN 1 PF00351 2 PF00351 3 PF00351 4 PF00351 [5 rows x 34 columns] ['P00439']
UniProt provides extensive, high-quality annotation for residues in proteins
from proteofav.annotation import Annotation
out_annotation = os.path.join(out_dir, "{}.gff".format(uniprot_ids[0]))
Annotation.download(identifier=uniprot_ids[0], filename=out_annotation)
assert os.path.exists(out_annotation)
Note also that GFF files althoug tabular, contains some extra level nesting in the GROUP
column. ProteoFAV tries to deconvolute this information
annotation = Annotation.read(filename=out_annotation)
print(annotation.head())
NAME SOURCE TYPE START END SCORE STRAND FRAME \ 0 P00439 UniProtKB Chain 1 452 . . . 1 P00439 UniProtKB Domain 36 114 . . . 2 P00439 UniProtKB Metal binding 285 285 . . . 3 P00439 UniProtKB Metal binding 290 290 . . . 4 P00439 UniProtKB Metal binding 330 330 . . . GROUP Dbxref ID \ 0 ID=PRO_0000205548;Note=Phenylalanine-4-hydroxy... NaN [PRO_0000205548] 1 Note=ACT;Ontology_term=ECO:0000255;evidence=EC... NaN NaN 2 Note=Iron%3B via tele nitrogen;Ontology_term=E... NaN NaN 3 Note=Iron%3B via tele nitrogen;Ontology_term=E... NaN NaN 4 Note=Iron;Ontology_term=ECO:0000250;evidence=E... NaN NaN Note Ontology_term \ 0 [Phenylalanine-4-hydroxylase] NaN 1 [ACT] [ECO:0000255] 2 [Iron; via tele nitrogen] [ECO:0000250] 3 [Iron; via tele nitrogen] [ECO:0000250] 4 [Iron] [ECO:0000250] evidence 0 NaN 1 [ECO:0000255|PROSITE-ProRule:PRU01007] 2 [ECO:0000250|UniProtKB:P04176] 3 [ECO:0000250|UniProtKB:P04176] 4 [ECO:0000250|UniProtKB:P04176]
We could fetch genetic variants from UniProt and Ensembl with:
Variants.fetch(identifier=uniprot_ids[0], id_source='uniprot',
synonymous=False, uniprot_vars=True,
ensembl_germline_vars=True, ensembl_somatic_vars=True)
but select_variants
handles merging of Ensembl vars for us
from proteofav.variants import Variants
uniprot, ensembl = Variants.select(identifier=uniprot_ids[0], id_source='uniprot',
synonymous=False, uniprot_vars=True,
ensembl_germline_vars=True, ensembl_somatic_vars=True)
print(uniprot.head())
accession alternativeSequence association_description association_disease \ 0 P00439 A NaN True 1 P00439 L haplotypes 1,4 True 2 P00439 L NaN True 3 P00439 S haplotype 36 True 4 P00439 V NaN True association_evidences_code \ 0 ECO:0000269 1 ECO:0000269 2 NaN 3 ECO:0000269 4 ECO:0000269 association_evidences_source_alternativeUrl \ 0 [http://europepmc.org/abstract/MED/22513348, h... 1 [http://europepmc.org/abstract/MED/22513348, h... 2 NaN 3 http://europepmc.org/abstract/MED/2014802 4 [http://europepmc.org/abstract/MED/8889590, ht... association_evidences_source_id \ 0 [22513348, 8889590, 8088845, 12501224] 1 [22513348, 1672290, 8889590, 12501224, 1672294] 2 NaN 3 2014802 4 [8889590, 12501224, 22513348, 23792259] association_evidences_source_name \ 0 PubMed 1 PubMed 2 NaN 3 PubMed 4 PubMed association_evidences_source_url \ 0 [http://www.ncbi.nlm.nih.gov/pubmed/22513348, ... 1 [http://www.ncbi.nlm.nih.gov/pubmed/22513348, ... 2 NaN 3 http://www.ncbi.nlm.nih.gov/pubmed/2014802 4 [http://www.ncbi.nlm.nih.gov/pubmed/8889590, h... association_name \ 0 [Phenylketonuria (PKU), Hyperphenylalaninemia ... 1 Phenylketonuria (PKU) 2 Hyperphenylalaninemia (HPA) 3 Phenylketonuria (PKU) 4 Phenylketonuria (PKU) ... siftPrediction siftScore \ 0 ... tolerated 0.11 1 ... deleterious 0 2 ... NaN NaN 3 ... NaN NaN 4 ... tolerated 0.06 somaticStatus sourceType taxid type wildType xrefs_id \ 0 0 mixed 9606 VARIANT V rs796052017 1 0 mixed 9606 VARIANT P rs5030851 2 0 uniprot 9606 VARIANT Q rs199475662 3 0 uniprot 9606 VARIANT L rs62642930 4 0 mixed 9606 VARIANT A rs5030857 xrefs_name \ 0 [dbSNP, Ensembl, 1000Genomes, ESP, ExAC] 1 [dbSNP, Ensembl, ESP, ExAC] 2 [dbSNP, Ensembl] 3 [dbSNP, Ensembl] 4 [dbSNP, Ensembl, 1000Genomes, ESP, ExAC] xrefs_url 0 [http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?t... 1 [http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?t... 2 [http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?t... 3 [http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?t... 4 [http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?t... [5 rows x 42 columns]
print(ensembl.head())
Parent allele begin clinical_significance codons \ 0 ENST00000553106 HGMD_MUTATION 377 [] 1 ENST00000553106 C/T 75 [] Gat/Aat 2 ENST00000553106 HGMD_MUTATION 300 [] 3 ENST00000553106 HGMD_MUTATION 245 [] 4 ENST00000553106 HGMD_MUTATION 415 [] consequenceType end feature_type frequency \ 0 coding_sequence_variant 377 transcript_variation NaN 1 missense_variant 75 transcript_variation NaN 2 coding_sequence_variant 300 transcript_variation NaN 3 coding_sequence_variant 245 transcript_variation NaN 4 coding_sequence_variant 415 transcript_variation NaN polyphenScore residues seq_region_name siftScore translation \ 0 NaN ENSP00000448059 NaN ENSP00000448059 1 0.014 D/N ENSP00000448059 0.49 ENSP00000448059 2 NaN ENSP00000448059 NaN ENSP00000448059 3 NaN ENSP00000448059 NaN ENSP00000448059 4 NaN ENSP00000448059 NaN ENSP00000448059 xrefs_id 0 CD011183 1 rs767453024 2 CM950893 3 CM941133 4 CM920564
For merging variants from the UniProt and Ensembl
from proteofav.mergers import uniprot_vars_ensembl_vars_merger
variants = uniprot_vars_ensembl_vars_merger(uniprot, ensembl)
print(variants.head())
Parent accession allele alternativeSequence association_description \ 0 NaN P00439 NaN del NaN 1 NaN P00439 NaN del NaN 2 NaN P00439 NaN K NaN 3 NaN P00439 NaN del NaN 4 NaN P00439 NaN L NaN association_disease association_evidences_code \ 0 NaN NaN 1 NaN NaN 2 NaN NaN 3 NaN NaN 4 True ECO:0000269 association_evidences_source_alternativeUrl association_evidences_source_id \ 0 NaN NaN 1 NaN NaN 2 NaN NaN 3 NaN NaN 4 http://europepmc.org/abstract/MED/23792259 23792259 association_evidences_source_name \ 0 NaN 1 NaN 2 NaN 3 NaN 4 PubMed ... siftScore somaticStatus \ 0 ... NaN 0.0 1 ... NaN 0.0 2 ... 0 0.0 3 ... NaN 0.0 4 ... NaN 0.0 sourceType taxid translation type wildType xrefs_id \ 0 uniprot 9606.0 NaN VARIANT L NaN 1 uniprot 9606.0 NaN VARIANT Y NaN 2 large_scale_study 9606.0 NaN VARIANT T COSM546084 3 uniprot 9606.0 NaN VARIANT L NaN 4 uniprot 9606.0 NaN VARIANT F NaN xrefs_name xrefs_url 0 NaN NaN 1 NaN NaN 2 cosmic curated http://cancer.sanger.ac.uk/cosmic/mutation/ove... 3 NaN NaN 4 NaN NaN [5 rows x 50 columns]
from proteofav.mergers import table_merger
# before merging we need to select/filter or add extra columns with necessary data
from proteofav.structures import filter_structures
from proteofav.dssp import filter_dssp
from proteofav.sifts import filter_sifts
from proteofav.validation import filter_validation
from proteofav.annotation import filter_annotation
# does residue aggregation and adds 'res_full' and removes hydrogens
mmcif = filter_structures(mmcif, excluded_cols=None,
models='first', chains=None, res=None, res_full=None,
comps=None, atoms=None, lines=None, category='auth',
residue_agg=True, agg_method='centroid',
add_res_full=True, add_atom_altloc=False, reset_atom_id=True,
remove_altloc=False, remove_hydrogens=True, remove_partial_res=False)
# adds 'full_chain' and 'rsa'
dssp = filter_dssp(dssp, excluded_cols=None,
chains=None, chains_full=None, res=None,
add_full_chain=True, add_ss_reduced=False,
add_rsa=True, rsa_method="Sander", add_rsa_class=False,
reset_res_id=True)
# does nothing
sifts = filter_sifts(sifts, excluded_cols=None, chains=None,
chain_auth=None, res=None, uniprot=None, site=None)
# adds 'res_full'
validation = filter_validation(validation, excluded_cols=None, chains=None, res=None,
add_res_full=True)
# annotation residue aggregation
annotation = filter_annotation(annotation, identifier=None, annotation_agg=True,
query_type='', group_residues=True,
drop_types=('Helix', 'Beta strand', 'Turn', 'Chain'))
table = table_merger(mmcif_table=mmcif, dssp_table=dssp, sifts_table=sifts,
validation_table=validation, annotation_table=annotation,
variants_table=variants)
print(table.head())
index pdbx_PDB_model_num auth_asym_id auth_seq_id group_PDB id \ 0 0 1 A 118 ATOM 1 1 1 1 A 119 ATOM 8 2 1 1 A 119 ATOM 8 3 2 1 A 120 ATOM 15 4 3 1 A 121 ATOM 29 type_symbol label_atom_id label_alt_id label_comp_id \ 0 N N . VAL 1 N N . PRO 2 N N . PRO 3 N N . TRP 4 N N . PHE ... siftScore \ 0 ... 0.14 1 ... 0.01 2 ... (0.03, 0.01) 3 ... 0 4 ... NaN somaticStatus sourceType taxid translation type \ 0 0.0 large_scale_study 9606.0 ENSP00000448059 VARIANT 1 0.0 large_scale_study 9606.0 ENSP00000448059 VARIANT 2 0.0 large_scale_study 9606.0 ENSP00000448059 VARIANT 3 0.0 large_scale_study 9606.0 ENSP00000448059 VARIANT 4 0.0 uniprot 9606.0 NaN VARIANT wildType xrefs_id xrefs_name \ 0 V rs776442422 ExAC 1 P rs398123292 (1000Genomes, ExAC) 2 P rs374999809 (ExAC, ESP) 3 W rs775327122 ExAC 4 F NaN NaN xrefs_url 0 http://exac.broadinstitute.org/awesome?query=r... 1 (http://www.ensembl.org/Homo_sapiens/Variation... 2 (http://evs.gs.washington.edu/EVS/PopStatsServ... 3 http://exac.broadinstitute.org/awesome?query=r... 4 NaN [5 rows x 155 columns]
from proteofav.mergers import Tables
# files are read/stored in the directories defined in the user defined config.ini file.
table = Tables.generate(merge_tables=True, uniprot_id=None, pdb_id=pdb_id, bio_unit=False,
sifts=True, dssp=False, validation=True, annotations=True, variants=True,
residue_agg='centroid', overwrite=False)
print(table.head())
index pdbx_PDB_model_num auth_asym_id auth_seq_id group_PDB id \ 0 0 1 A 118 ATOM 1 1 1 1 A 119 ATOM 8 2 1 1 A 119 ATOM 8 3 2 1 A 120 ATOM 15 4 3 1 A 121 ATOM 29 type_symbol label_atom_id label_alt_id label_comp_id \ 0 N N . VAL 1 N N . PRO 2 N N . PRO 3 N N . TRP 4 N N . PHE ... siftScore \ 0 ... 0.14 1 ... 0.01 2 ... (0.03, 0.01) 3 ... 0 4 ... NaN somaticStatus sourceType taxid translation type \ 0 0.0 large_scale_study 9606.0 ENSP00000448059 VARIANT 1 0.0 large_scale_study 9606.0 ENSP00000448059 VARIANT 2 0.0 large_scale_study 9606.0 ENSP00000448059 VARIANT 3 0.0 large_scale_study 9606.0 ENSP00000448059 VARIANT 4 0.0 uniprot 9606.0 NaN VARIANT wildType xrefs_id xrefs_name \ 0 V rs776442422 ExAC 1 P rs398123292 (1000Genomes, ExAC) 2 P rs374999809 (ExAC, ESP) 3 W rs775327122 ExAC 4 F NaN NaN xrefs_url 0 http://exac.broadinstitute.org/awesome?query=r... 1 (http://www.ensembl.org/Homo_sapiens/Variation... 2 (http://evs.gs.washington.edu/EVS/PopStatsServ... 3 http://exac.broadinstitute.org/awesome?query=r... 4 NaN [5 rows x 142 columns]
One can use ProteoFAV for high-throughput structural characterization of binding sites, such as in Britto-Borges and Barton, 2017.
For example, the cAMP-dependent protein kinase catalytic subunit alpha (PKAα) is a small protein kinase that is critical homeostatic process in human tissue and in stress response in lower organisms UniProt:P17612. Accordinly, the function of the protein has been extensively studied, including the three dimensional structure with high sequence coverage and resolution.
uniprot_id = 'P17612'
gff_path = os.path.join(out_dir, uniprot_id + ".gff")
Annotation.download(
identifier=uniprot_id,
filename=gff_path)
P17612_annotation = Annotation.read(filename=gff_path)
# phosphorylated sites in UniProt
P17612_annotation[P17612_annotation.GROUP.str.contains('Note=Phospho')]
NAME | SOURCE | TYPE | START | END | SCORE | STRAND | FRAME | GROUP | Dbxref | ID | Note | Ontology_term | evidence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10 | P17612 | UniProtKB | Modified residue | 11 | 11 | . | . | . | Note=Phosphoserine%3B by autocatalysis;Ontolog... | NaN | NaN | [Phosphoserine; by autocatalysis] | [ECO:0000250] | [ECO:0000250|UniProtKB:P05132] |
11 | P17612 | UniProtKB | Modified residue | 49 | 49 | . | . | . | Note=Phosphothreonine;Ontology_term=ECO:000024... | [PMID:18691976] | NaN | [Phosphothreonine] | [ECO:0000244] | [ECO:0000244|PubMed:18691976] |
12 | P17612 | UniProtKB | Modified residue | 140 | 140 | . | . | . | Note=Phosphoserine;Ontology_term=ECO:0000250;e... | NaN | NaN | [Phosphoserine] | [ECO:0000250] | [ECO:0000250|UniProtKB:P05132] |
13 | P17612 | UniProtKB | Modified residue | 196 | 196 | . | . | . | Note=Phosphothreonine;Ontology_term=ECO:000026... | [PMID:12372837] | NaN | [Phosphothreonine] | [ECO:0000269] | [ECO:0000269|PubMed:12372837] |
14 | P17612 | UniProtKB | Modified residue | 198 | 198 | . | . | . | Note=Phosphothreonine%3B by PDPK1;Ontology_ter... | [PMID:12372837,PMID:16765046,PMID:20137943,PMI... | NaN | [Phosphothreonine; by PDPK1] | [ECO:0000269,ECO:0000269,ECO:0000269,ECO:00002... | [ECO:0000269|PubMed:12372837,ECO:0000269|PubMe... |
15 | P17612 | UniProtKB | Modified residue | 202 | 202 | . | . | . | Note=Phosphothreonine;Ontology_term=ECO:000026... | [PMID:17909264] | NaN | [Phosphothreonine] | [ECO:0000269] | [ECO:0000269|PubMed:17909264] |
16 | P17612 | UniProtKB | Modified residue | 331 | 331 | . | . | . | Note=Phosphotyrosine;Ontology_term=ECO:0000250... | NaN | NaN | [Phosphotyrosine] | [ECO:0000250] | [ECO:0000250|UniProtKB:P05132] |
17 | P17612 | UniProtKB | Modified residue | 339 | 339 | . | . | . | Note=Phosphoserine;Ontology_term=ECO:0000244,E... | [PMID:18691976,PMID:19690332,PMID:24275569,PMI... | NaN | [Phosphoserine] | [ECO:0000244,ECO:0000244,ECO:0000244,ECO:00002... | [ECO:0000244|PubMed:18691976,ECO:0000244|PubMe... |
phospho_residues = P17612_annotation.loc[P17612_annotation.GROUP.str.contains('Note=Phospho'), 'START']
from proteofav.sifts import sifts_best
P17612_best_structure = sifts_best('P17612')['P17612'][0]
P17612_best_structure['experimental_method'] == 'X-ray diffraction'
True
P17612_best_structure['tax_id'] == 9606 # human
True
table = Tables.generate(
merge_tables=True,
uniprot_id='P17612',
bio_unit=False,
sifts=True,
validation=True,
annotations=True,
residue_agg='centroid',
overwrite=False)
# every residue in the structure not mapped to the UniProt is discarded
table.dropna(subset=['UniProt_dbResNum'], axis=0, inplace=True)
table['UniProt_dbResNum'] = table['UniProt_dbResNum'].astype(int)
table[table['UniProt_dbResNum'].isin(phospho_residues)]
index | pdbx_PDB_model_num | auth_asym_id | auth_seq_id | group_PDB | id | type_symbol | label_atom_id | label_alt_id | label_comp_id | ... | CATH_regionResNum | CATH_dbAccessionId | Pfam_regionId | Pfam_regionStart | Pfam_regionEnd | Pfam_regionResNum | Pfam_dbAccessionId | annotation | site | accession | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
35 | 286 | 1 | A | 48 | ATOM | 277 | N | N | . | THR | ... | 49 | 3.30.200.20 | 1 | 44.0 | 298.0 | 49 | PF00069 | Domain: ['Protein kinase'] (nan), Modified res... | 49 | P17612 |
130 | 39 | 1 | A | 139 | ATOM | 1040 | N | N | . | SER | ... | 140 | 1.10.510.10 | 1 | 44.0 | 298.0 | 140 | PF00069 | Domain: ['Protein kinase'] (nan), Modified res... | 140 | P17612 |
189 | 101 | 1 | A | 195 | ATOM | 1517 | N | N | . | THR | ... | 196 | 1.10.510.10 | 1 | 44.0 | 298.0 | 196 | PF00069 | Domain: ['Protein kinase'] (nan), Modified res... | 196 | P17612 |
191 | 103 | 1 | A | 197 | HETATM | 1538 | N | N | . | TPO | ... | 198 | 1.10.510.10 | 1 | 44.0 | 298.0 | 198 | PF00069 | Domain: ['Protein kinase'] (nan), Modified res... | 198 | P17612 |
195 | 109 | 1 | A | 201 | ATOM | 1567 | N | N | . | THR | ... | 202 | 1.10.510.10 | 1 | 44.0 | 298.0 | 202 | PF00069 | Domain: ['Protein kinase'] (nan), Mutagenesis:... | 202 | P17612 |
326 | 251 | 1 | A | 330 | ATOM | 2586 | N | N | . | TYR | ... | 331 | 3.30.200.20 | - | 0.0 | 0.0 | NaN | NaN | Domain: ['AGC-kinase C-terminal'] (nan), Modif... | 331 | P17612 |
334 | 259 | 1 | A | 338 | HETATM | 2648 | N | N | . | SEP | ... | 339 | 3.30.200.20 | - | 0.0 | 0.0 | NaN | NaN | Domain: ['AGC-kinase C-terminal'] (nan), Modif... | 339 | P17612 |
7 rows × 91 columns
phospho_residues_b = table.loc[table['UniProt_dbResNum'].isin(phospho_residues), 'B_iso_or_equiv'].mean()
all_residues_b = table.loc[:, 'B_iso_or_equiv'].mean()
phospho_residues_b > all_residues_b
False
Overall phophorylated Ser/Thr have are have high b-factors, hot residues, that is not true for the 3ovv
structure.
table.loc[table['UniProt_dbResNum'].isin(phospho_residues), 'PDB_codeSecondaryStructure'].value_counts()
T 4 H 2 E 1 Name: PDB_codeSecondaryStructure, dtype: int64
4 of 7 residues occur on Turns
table.loc[table['UniProt_dbResNum'].isin(phospho_residues), 'PDB_Annotation'].all()
'Observed'
And all residues were observed in the structure, not labeled in the REM465 field
table.loc[table['UniProt_dbResNum'].isin(phospho_residues), 'validation_rama']
35 Favored 130 Favored 189 Favored 191 NaN 195 Favored 326 Favored 334 NaN Name: validation_rama, dtype: object
5 out 7 have are not Ramachandran outliers, the NaN values were given for the Phopho resides observed in the protein crystal