This notebook demonstrates the use of the GESIS Gallery API for repositories launched on GESIS Binder. It is based on: https://github.com/betatim/binderlyzer.
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import requests
import time
# The gallery database only holds launches from 2019-05-10 onward, so
# starting the query one day earlier (2019-05-09) guarantees every stored
# launch is captured.
start_date = datetime.datetime(year=2019, month=5, day=9)
from_dt = start_date.isoformat()
url = f'https://notebooks.gesis.org/gallery/api/v1.0/launches/{from_dt}/'
# Fetch every launch record. The API paginates (100 results per page), so
# keep requesting pages until the server reports there is no next page.
launches = []
next_page = 1
while next_page is not None:
    # timeout added so a stalled connection cannot hang the loop forever
    r = requests.get(f"{url}?page={next_page}", timeout=30)
    response = r.json()
    # The API rate-limits clients; when throttled it replies with a
    # "message" of "2 per 1 second" or "100 per 1 minute" instead of data.
    message = response.get("message", "")
    if message in ("2 per 1 second", "100 per 1 minute"):
        # back off briefly, then retry the same page
        time.sleep(1)
        continue
    launches.extend(response['launches'])
    next_page = response['next_page']
# Collect all fetched launch records into a single DataFrame for analysis.
data = pd.DataFrame(launches)
data.head(5)
# Each spec looks like "<org>/<repo-name>/<ref>"; split it into separate
# columns so launches can be grouped by org, repo, and ref individually.
data['repo'] = data['spec'].str.rsplit("/", n=1).str[0]
data['org'] = data['spec'].str.split("/", n=1).str[0]
data['ref'] = data['spec'].str.rsplit("/", n=1).str[1]
# The raw spec (plus schema/version bookkeeping) is no longer needed.
data = data.drop(columns=['schema', 'version', 'spec'])
data.head()
(data.groupby("provider")
.size()
.reset_index(name='Launches')
.sort_values('Launches', ascending=False))
# Total launches per repository, used later to rank repos.
totals_per_repo = data.groupby(["repo"]).size().reset_index(name='repo_counts')
# Total launches per organisation/user, used later to rank orgs.
totals_per_org = data.groupby(["org"]).size().reset_index(name='org_counts')
# Attach both totals to every launch row.
data_ = (data
         .merge(totals_per_repo, on='repo')
         .merge(totals_per_org, on='org'))
(data_.groupby(["org", "repo", "ref", "repo_counts", "org_counts"])
.size()
# give the column a nice name
.reset_index(name='ref_counts')
# sort first by total launches, then within a repo by ref launches
.sort_values(['org_counts', 'repo_counts', 'ref_counts'],
ascending=[False,False, False])
.set_index(["org", 'repo', 'ref'])
)