#!/usr/bin/env python
# coding: utf-8

# ## Distribution of publication count for Dmel TF genes
#
# For each TF gene, count the number of *curated* publications,
# using data from GO and Monarch.
#
# This file is a notebook export: bare expressions such as ``len(tf_genes)``
# only display a value when run cell-by-cell in Jupyter/IPython and are
# no-ops when the file is executed as a plain script.

import csv

import matplotlib.pyplot as plt
import ontobio.golr.golr_associations as ga

# Fetch all Dmel TF genes
DNA_BINDING_TF = 'GO:0003700'
DMEL = 'NCBITaxon:7227'
tf_genes = ga.get_subjects_for_object(object=DNA_BINDING_TF, subject_taxon=DMEL)
len(tf_genes)


def get_pubs_for_gene(g):
    """Return the set of PMID identifiers found on associations for gene ``g``.

    Two association searches are issued through ``ga.search_associations``:
    one without an object category (the original notebook labels this the
    "Monarch" query) and one with ``object_category='function'`` (labelled
    "GO").  Only identifiers starting with ``'PMID'`` are kept.

    Args:
        g: Gene identifier passed as ``subject`` to the association search.

    Returns:
        A ``set`` of publication id strings, empty when none are found.
    """
    pubs = set()

    # Monarch: each association may carry a list of publication dicts.
    # The key may be absent or None, so fall back to an empty sequence.
    r = ga.search_associations(subject=g, rows=-1)
    for a in r['associations']:
        for p in a.get('publications') or ():
            pub_id = p.get('id')
            # Skip entries without an id rather than failing on None.
            if pub_id and pub_id.startswith('PMID'):
                pubs.add(pub_id)

    # GO: references are plain id strings; the key may also be absent.
    r = ga.search_associations(subject=g, rows=-1, object_category='function')
    for a in r['associations']:
        for ref in a.get('reference') or ():
            if ref and ref.startswith('PMID'):
                pubs.add(ref)

    return pubs


# Smoke check on the first gene (displays the count in a notebook)
len(get_pubs_for_gene(tf_genes[0]))

# Find all (gene, numberOfPubs) pairs
pairs = []
for g in tf_genes:
    n_pubs = len(get_pubs_for_gene(g))
    pairs.append((g, n_pubs))

# Check
vals = [n_pubs for _, n_pubs in pairs]
vals[0:5]

# Check
tf_genes_with_no_pubs = [g for g, n_pubs in pairs if n_pubs == 0]
tf_genes_with_no_pubs

# Genes with fewer than 5 pubs
[g for g, n_pubs in pairs if n_pubs < 5]

# ``get_ipython`` only exists inside IPython/Jupyter; when the file is run as
# a plain script the inline-backend magic is simply skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Histogram
plt.hist(vals, bins=40)
plt.ylabel('No of genes')
plt.xlabel('No of pubs')
plt.show()

# Save results: one "gene,count" row per TF gene
with open('gene-pubs.csv', 'w', newline='') as csvfile:
    w = csv.writer(csvfile, delimiter=',')
    w.writerows(pairs)