#!/usr/bin/env python
# coding: utf-8

# ## Distribution of publication count for Dmel TF genes
# 
# For each TF gene, count the number of *curated* publications, using data from GO and Monarch

# In[28]:


import ontobio.golr.golr_associations as ga


# In[5]:


# Look up every Dmel gene annotated to the DNA-binding TF term
DNA_BINDING_TF = 'GO:0003700'
DMEL = 'NCBITaxon:7227'
tf_genes = ga.get_subjects_for_object(
    subject_taxon=DMEL,
    object=DNA_BINDING_TF,
)
len(tf_genes)


# In[33]:


# Routine to go to GO and Monarch to fetch all annotations for a gene
def get_pubs_for_gene(g):
    """Return the set of PMIDs cited by the associations of a gene.

    Two association searches are issued through ``ga.search_associations``:

    * an unfiltered search (the "Monarch" pass), where each association
      lists publications as dicts under ``'publications'``;
    * a search restricted to ``object_category='function'`` (the "GO"
      pass), where each association lists reference IDs as plain strings
      under ``'reference'``.

    Only identifiers starting with ``'PMID'`` are kept. An association whose
    publication/reference field is missing or ``None`` contributes nothing
    instead of raising ``KeyError``.

    Args:
        g: Gene identifier, passed as ``subject`` to both searches.

    Returns:
        set: The distinct PMID identifiers collected from both searches.
    """
    pubs = set()

    # Monarch
    # NOTE(review): rows=-1 presumably requests every row - confirm against ontobio
    monarch = ga.search_associations(subject=g, rows=-1)
    for assoc in monarch['associations']:
        # `or ()` covers both an absent key and an explicit None value
        for pub in assoc.get('publications') or ():
            if pub['id'].startswith('PMID'):
                pubs.add(pub['id'])

    # GO
    go = ga.search_associations(subject=g, rows=-1, object_category='function')
    for assoc in go['associations']:
        # Same tolerance as above: skip associations without references
        for ref in assoc.get('reference') or ():
            if ref.startswith('PMID'):
                pubs.add(ref)

    return pubs
    
# Smoke test: count the publications found for the first TF gene
len(get_pubs_for_gene(tf_genes[0]))


# In[15]:


# Build one (gene, number of publications) tuple per TF gene
pairs = [(gene, len(get_pubs_for_gene(gene))) for gene in tf_genes]
    

# In[16]:


# Publication counts only, in the same order as `pairs`; peek at the first five
vals = [count for _gene, count in pairs]
vals[:5]


# In[20]:


# TF genes for which no PMID was collected at all
tf_genes_with_no_pubs = [gene for gene, count in pairs if count == 0]
tf_genes_with_no_pubs


# In[21]:


# TF genes backed by fewer than 5 publications
[gene for gene, count in pairs if count < 5]


# In[23]:


import matplotlib.pyplot as plt
# The inline-plotting magic only exists inside IPython/Jupyter; when this
# exported notebook runs under plain Python, get_ipython is undefined.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # not under IPython: keep matplotlib's default backend


# In[24]:


# Distribution of publication counts across TF genes (40 bins)
plt.hist(vals, bins=40)
plt.xlabel('No of pubs')
plt.ylabel('No of genes')
plt.show()


# In[27]:


# Dump the (gene, publication count) pairs, one CSV row each
import csv
with open('gene-pubs.csv', 'w', newline='') as out_file:
    csv.writer(out_file, delimiter=',').writerows(pairs)
 

# In[ ]: