#!/usr/bin/env python
# coding: utf-8

# # GESIS Binder Stats
#
# This notebook demonstrates the use of the [GESIS Gallery API](https://notebooks.gesis.org/gallery/api/v1.0/) for repositories launched on GESIS Binder.
#
# The notebook is based on: https://github.com/betatim/binderlyzer.

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import datetime
import pandas as pd
import requests
import time

# In[ ]:

# The gallery database stores launch data from 2019-05-10 onwards,
# so to make sure no launches are missed we query from 2019-05-09.
from_dt = datetime.datetime(2019, 5, 9).isoformat()
url = f'https://notebooks.gesis.org/gallery/api/v1.0/launches/{from_dt}/'

# In[ ]:

launches = []
# The API paginates its results (100 per page), so we have to walk
# through all pages to collect the complete data.
next_page = 1
while next_page is not None:
    api_url = url + '?page=' + str(next_page)
    r = requests.get(api_url)
    response = r.json()
    # The API rate-limits requests (2 per second, 100 per minute); a
    # rate-limited response carries a message instead of launch data.
    message = response.get("message", "")
    if message not in ["2 per 1 second", "100 per 1 minute"]:
        launches.extend(response['launches'])
        next_page = response['next_page']
    else:
        # rate-limited: wait a second, then retry the same page
        time.sleep(1)

# In[ ]:

data = pd.DataFrame.from_dict(launches)
data.head(5)

# In[ ]:

# split the spec ("org/repo/ref") into columns to make it easier to
# grab the org, repo and ref
data['repo'] = data['spec'].apply(lambda s: s.rsplit("/", 1)[0])
data['org'] = data['spec'].apply(lambda s: s.split("/", 1)[0])
data['ref'] = data['spec'].apply(lambda s: s.rsplit("/", 1)[1])

data = data.drop(columns=['schema', 'version', 'spec'])

# In[ ]:

data.head()

# # Where are repositories hosted?

# In[ ]:

(data.groupby("provider")
     .size()
     .reset_index(name='Launches')
     .sort_values('Launches', ascending=False))

# In[ ]:

# total launches per repo; merged back in below as a new repo_counts column
totals_per_repo = (data.groupby(["repo"])
                       .size()
                       .reset_index(name='repo_counts'))

# In[ ]:

# total launches per org; merged back in below as a new org_counts column
totals_per_org = (data.groupby(["org"])
                      .size()
                      .reset_index(name='org_counts'))

# In[ ]:

data_ = pd.merge(data, totals_per_repo, on='repo')
data_ = pd.merge(data_, totals_per_org, on='org')

# In[ ]:

(data_.groupby(["org", "repo", "ref", "repo_counts", "org_counts"])
      .size()
      # give the column a nice name
      .reset_index(name='ref_counts')
      # sort first by total launches per org and repo, then within a repo by ref launches
      .sort_values(['org_counts', 'repo_counts', 'ref_counts'],
                   ascending=[False, False, False])
      .set_index(["org", "repo", "ref"])
)
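# The matplotlib import above is never used, so as a small addition (not part
# of the original analysis) here is a minimal sketch of one possible
# visualisation: the ten most-launched repositories as a horizontal bar chart,
# built from the `totals_per_repo` table computed earlier.

# In[ ]:

# illustrative sketch: top 10 repositories by launch count
top_repos = (totals_per_repo
             .sort_values('repo_counts', ascending=False)
             .head(10))
top_repos.plot.barh(x='repo', y='repo_counts', legend=False)
plt.xlabel('Launches')
plt.ylabel('Repository')
plt.title('Top 10 repositories launched on GESIS Binder')
plt.tight_layout()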
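# Launches per day would be another natural view. This sketch assumes each raw
# launch record carries a `timestamp` field that pandas can parse; check the
# output of `data.head(5)` above for the actual column name and adjust if it
# differs.

# In[ ]:

# assumption: the raw launch records include a 'timestamp' column
raw = pd.DataFrame.from_dict(launches)
if 'timestamp' in raw.columns:
    daily = (pd.to_datetime(raw['timestamp'])
             .dt.floor('D')   # bucket each launch into its day
             .value_counts()
             .sort_index())
    daily.plot()
    plt.xlabel('Date')
    plt.ylabel('Launches per day')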
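# As a design note on the fetch loop above: the pagination and rate-limit
# handling can be wrapped in a small helper so that a rate-limited request is
# always retried for the same page. A minimal sketch reusing the endpoint and
# rate-limit messages from above; the function name and the `pause` parameter
# are illustrative, not part of the gallery API.

# In[ ]:

def fetch_all_launches(base_url, pause=1.0):
    """Fetch every page of launches, sleeping whenever the API rate-limits us."""
    results = []
    page = 1
    while page is not None:
        response = requests.get(base_url, params={'page': page}).json()
        # a rate-limited response carries a message instead of launch data
        if response.get("message", "") in ["2 per 1 second", "100 per 1 minute"]:
            time.sleep(pause)  # wait, then retry the same page
            continue
        results.extend(response['launches'])
        page = response['next_page']
    return results

# equivalent to the pagination loop above:
# launches = fetch_all_launches(url)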