#!/usr/bin/env python
# coding: utf-8

# # GESIS Binder Stats
#
# This notebook demonstrates the use of the [GESIS Gallery API](https://notebooks.gesis.org/gallery/api/v1.0/) for repositories launched on GESIS Binder.
#
# The notebook is based on: https://github.com/betatim/binderlyzer.

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import datetime
import pandas as pd
import requests
import time

# In[ ]:

# The gallery database stores launch data from 2019-05-10 onwards,
# so to make sure no launches are missed we query from 2019-05-09.
from_dt = datetime.datetime(2019, 5, 9).isoformat()
url = f'https://notebooks.gesis.org/gallery/api/v1.0/launches/{from_dt}/'

# In[ ]:

launches = []
# The API paginates its results (100 per page), so we have to walk
# through all pages to collect the complete data.
next_page = 1
while next_page is not None:
    api_url = url + '?page=' + str(next_page)
    r = requests.get(api_url)
    response = r.json()
    # The API rate-limits requests (2 per second, 100 per minute); a
    # rate-limited response carries a message instead of launch data.
    message = response.get("message", "")
    if message not in ["2 per 1 second", "100 per 1 minute"]:
        launches.extend(response['launches'])
        next_page = response['next_page']
    else:
        # rate-limited: wait a second, then retry the same page
        time.sleep(1)

# In[ ]:

data = pd.DataFrame.from_dict(launches)
data.head(5)

# In[ ]:

# split the spec ("org/repo/ref") into columns to make it easier to
# grab the org, repo and ref
data['repo'] = data['spec'].apply(lambda s: s.rsplit("/", 1)[0])
data['org'] = data['spec'].apply(lambda s: s.split("/", 1)[0])
data['ref'] = data['spec'].apply(lambda s: s.rsplit("/", 1)[1])

data = data.drop(columns=['schema', 'version', 'spec'])

# In[ ]:

data.head()

# # Where are repositories hosted?

# In[ ]:

(data.groupby("provider")
     .size()
     .reset_index(name='Launches')
     .sort_values('Launches', ascending=False))

# In[ ]:

# total launches per repo; merged back in below as a new repo_counts column
totals_per_repo = (data.groupby(["repo"])
                       .size()
                       .reset_index(name='repo_counts'))

# In[ ]:

# total launches per org; merged back in below as a new org_counts column
totals_per_org = (data.groupby(["org"])
                      .size()
                      .reset_index(name='org_counts'))

# In[ ]:

data_ = pd.merge(data, totals_per_repo, on='repo')
data_ = pd.merge(data_, totals_per_org, on='org')

# In[ ]:

(data_.groupby(["org", "repo", "ref", "repo_counts", "org_counts"])
      .size()
      # give the column a nice name
      .reset_index(name='ref_counts')
      # sort first by total launches per org and repo, then within a repo by ref launches
      .sort_values(['org_counts', 'repo_counts', 'ref_counts'],
                   ascending=[False, False, False])
      .set_index(["org", "repo", "ref"])
)
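# The matplotlib import above is never used, so as a small addition (not part
# of the original analysis) here is a minimal sketch of one possible
# visualisation: the ten most-launched repositories as a horizontal bar chart,
# built from the `totals_per_repo` table computed earlier.

# In[ ]:

# illustrative sketch: top 10 repositories by launch count
top_repos = (totals_per_repo
             .sort_values('repo_counts', ascending=False)
             .head(10))
top_repos.plot.barh(x='repo', y='repo_counts', legend=False)
plt.xlabel('Launches')
plt.ylabel('Repository')
plt.title('Top 10 repositories launched on GESIS Binder')
plt.tight_layout()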
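# Launches per day would be another natural view. This sketch assumes each raw
# launch record carries a `timestamp` field that pandas can parse; check the
# output of `data.head(5)` above for the actual column name and adjust if it
# differs.

# In[ ]:

# assumption: the raw launch records include a 'timestamp' column
raw = pd.DataFrame.from_dict(launches)
if 'timestamp' in raw.columns:
    daily = (pd.to_datetime(raw['timestamp'])
             .dt.floor('D')   # bucket each launch into its day
             .value_counts()
             .sort_index())
    daily.plot()
    plt.xlabel('Date')
    plt.ylabel('Launches per day')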
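# As a design note on the fetch loop above: the pagination and rate-limit
# handling can be wrapped in a small helper so that a rate-limited request is
# always retried for the same page. A minimal sketch reusing the endpoint and
# rate-limit messages from above; the function name and the `pause` parameter
# are illustrative, not part of the gallery API.

# In[ ]:

def fetch_all_launches(base_url, pause=1.0):
    """Fetch every page of launches, sleeping whenever the API rate-limits us."""
    results = []
    page = 1
    while page is not None:
        response = requests.get(base_url, params={'page': page}).json()
        # a rate-limited response carries a message instead of launch data
        if response.get("message", "") in ["2 per 1 second", "100 per 1 minute"]:
            time.sleep(pause)  # wait, then retry the same page
            continue
        results.extend(response['launches'])
        page = response['next_page']
    return results

# equivalent to the pagination loop above:
# launches = fetch_all_launches(url)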