Anoek@Online-Go was nice enough to dump some information from their player database for me to analyze. The data available includes each player's registration date, last-active time, rating, and win/loss/draw counts.
This is an update of a previous analysis (https://nbviewer.jupyter.org/gist/thouis/0f93643a8f14ff802f92)
# Setup, connect to database, fetch data
%pylab inline
from pylab import *
# from prettyplotlib import *
import numpy as np
import datetime
import matplotlib.pyplot as plt
plt.style.use('ggplot')
figsize(12,8)
# connect to the player database
import sqlite3
db = sqlite3.connect('playerinfo_dump-2020-01-02.db')
cursor = db.cursor()

# One row per player:
#   registered  - string in YYYY-MM-DD HH:MM:SS... format
#   last_active - milliseconds since the epoch in scientific notation, but
#                 sometimes missing (the string "None", or a database NULL)
#   rating      - the player's rating
#   games       - wins + losses + draws
rows = cursor.execute(
    'SELECT registered, last_active, rating, wins+losses+draws FROM pony').fetchall()
if not rows:
    # zip(*[]) would otherwise fail with an opaque unpacking error
    raise ValueError('no player rows returned from the pony table')
registered_dates, last_active, rating, games_count = zip(*rows)

# Convert to numpy arrays
# Only the date part (first 10 characters) of the registration string is kept.
registered_dates = np.array([c[:10] for c in registered_dates], dtype='datetime64')

# Convert milliseconds to seconds since the epoch. Missing values become -inf
# so that "time since last active" is +inf and drops out of any "< N days" filter.
_MISSING_LAST_ACTIVE = (None, "None")
last_active = np.array(
    [-np.inf if x in _MISSING_LAST_ACTIVE else float(x) for x in last_active]) / 1000
rating = np.array(rating)
games_count = np.array(games_count)
# Helper function for plotting time series nicely
import time
import datetime
import matplotlib.dates as mdates
def plot_time_series(dates, values):
    """Plot ``values`` against ``dates`` as a thin black line on the current axes.

    The x axis gets a major tick and label per year and a minor tick per
    month, with the year labels rotated for readability.

    Parameters
    ----------
    dates : array-like of datetime64 or date/datetime objects
        X coordinates. Converted with ``astype(datetime.datetime)`` before
        plotting.
    values : array-like
        Y coordinates, same length as ``dates``.
    """
    # setup - see http://matplotlib.org/examples/api/date_demo.html
    years = mdates.YearLocator()    # every year
    months = mdates.MonthLocator()  # every month
    years_fmt = mdates.DateFormatter('%Y')

    # np.asarray lets callers pass plain lists as well as arrays
    dates = np.asarray(dates).astype(datetime.datetime)

    # Axes.plot handles date values directly; plot_date is a legacy wrapper
    # around the same call and is deprecated in recent matplotlib.
    ax = plt.gca()
    ax.plot(dates, values, '-k')
    ax.xaxis.set_major_locator(years)
    ax.xaxis.set_major_formatter(years_fmt)
    ax.xaxis.set_minor_locator(months)

    plt.gcf().autofmt_xdate()
    # override the rotation/alignment chosen by autofmt_xdate
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=70, ha='center')
# useful for some plots where aliasing hides actual data
def jitter(data, lo=-1, hi=1):
    """Return ``data`` with independent uniform noise from [lo, hi) added.

    One noise sample is drawn per element, so arrays of any dimensionality
    (and plain sequences) are supported.
    """
    # np.shape, not len(): len() only counts the first axis of an n-d array
    return data + np.random.uniform(lo, hi, np.shape(data))
Populating the interactive namespace from numpy and matplotlib
/Users/thouis/VENV/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['power', 'fft', 'random', 'linalg', 'info'] `%matplotlib` prevents importing * from pylab and numpy "\n`%matplotlib` prevents importing * from pylab and numpy"
Online-Go.Com started as a turn-based server in 2006. Nova.gs started in July 2013 as a live or turn-based server, and merged with Online-Go.Com in October 2013. The data we have includes both sets of users.
# The Nova/OGS merger left the registration dates out of order, so sort them
# before plotting the running total.
sorted_registered_dates = np.sort(registered_dates).astype(datetime.datetime)
running_total = np.arange(len(sorted_registered_dates))
plot_time_series(sorted_registered_dates, running_total)
plt.title('Total registered users by date')
plt.ylabel('# of registered users');
Since the merge of OGS and Nova, registrations have increased and currently run at around 150 per day.
# Larger font for the annotated plots.
font = {'size': 15}
matplotlib.rc('font', **font)

# Milestones marked on the plots. Plain dates, so they compare and plot
# directly against the date objects held in sorted_registered_dates.
# NOTE(review): the accompanying text says Nova.gs started in July 2013, but
# this marker is 1 January 2013 -- confirm which is right.
nova_start = datetime.date(2013, 1, 1)
ogs_merge = datetime.date(2013, 10, 1)
alphago_paper = datetime.date(2016, 1, 28)
leesedol_alphago = datetime.date(2016, 3, 9)
alphazero = datetime.date(2017, 12, 5)  # kept for reference; not drawn below

# Registration trends since late 2012. Nova start is the black line;
# the OGS merge and the AlphaGo milestones are green lines.
# The cliff in the curve is a DB-update data artifact.
recent = sorted_registered_dates[sorted_registered_dates > datetime.date(2012, 11, 1)]
# offset so the y axis still shows the total number of registered users
base = len(sorted_registered_dates) - len(recent)
plot_time_series(recent, base + arange(len(recent)))
axvline(nova_start, c='k', lw=0.5)
axvline(ogs_merge, c='g', lw=0.5)
axvline(alphago_paper, c='g', lw=0.5)
axvline(leesedol_alphago, c='g', lw=0.5)
text(nova_start, 90000, " Nova")
text(ogs_merge, 150000, " Merge\nwith OGS")
text(alphago_paper, 400000, "AlphaGo\nPaper", ha='right')
text(leesedol_alphago, 500000, "Lee Sedol vs.\nAlphago", ha='left')
ylabel('# of registered users')
# the title now matches the filter above (the plot starts in November 2012)
title('Same graph as above, since November of 2012')

# Same curve again, zoomed in on registrations after 1 January 2015.
figure()
recent = sorted_registered_dates[sorted_registered_dates > datetime.date(2015, 1, 1)]
base = len(sorted_registered_dates) - len(recent)
plot_time_series(recent, base + arange(len(recent)))
axvline(alphago_paper, c='g', lw=0.5)
axvline(leesedol_alphago, c='g', lw=0.5)
text(alphago_paper, 400000, "AlphaGo\nPaper", ha='right')
text(leesedol_alphago, 500000, "Lee Sedol vs. Alphago", ha='left')
ylabel('# of registered users')
title('... and since 2015')
show()
figure(figsize=(15, 8))
# Whole days between each registration and the newest registration in the dump.
days_since_registered = ((registered_dates.max() - registered_dates) / timedelta64(1, 'D')).astype(int)
# One bin per week, restricted to roughly the last two years.
hist(days_since_registered[days_since_registered < 2 * 52 * 7], bins=arange(2 * 53) * 7)
ylabel('registrations per week')
yticks([1000, 2000, 3000], fontsize=30)
# x counts days *back* from the newest registration (the dump is dated
# 2020-01-02), so 0 is the most recent date and larger values are older.
xticks([0, 365, 2*365], ['Jan 2020', 'Jan 2019', 'Jan 2018'], fontsize=25);
OGS keeps track of when users connect to the site. last_active stores the timestamp of the last time a user was online.
# last_active is in seconds since the epoch. Missing values ("None") are stored
# as -inf; isfinite() also screens out any other non-finite values.
has_timestamp = isfinite(last_active)

# Convert only the real timestamps. Casting -inf to datetime64 is not well
# defined, so entries without a timestamp are set to NaT explicitly.
last_day_active = np.full(last_active.shape, np.datetime64('NaT'), dtype='datetime64[s]')
last_day_active[has_timestamp] = last_active[has_timestamp].astype('datetime64[s]')

# Reference point: the most recent activity among users that have a timestamp.
most_recent = last_day_active[has_timestamp].max()
days_since_active = (most_recent - last_day_active) / np.timedelta64(1, 'D')

# Keep users seen within the last year; users without a timestamp never qualify.
keep = has_timestamp.copy()
keep[has_timestamp] = days_since_active[has_timestamp] < 365
days_since_active = days_since_active[keep]
last_day_active = last_day_active[keep]
print("Number of users: {}".format(len(last_active)))
print("Number with a last_active entry in the last 365 days: {}".format(sum(keep)))
hist(days_since_active, 200)
ylabel('# of users last seen N days ago')
xlabel('number of days (N)')
axis('tight');
Number of users: 714264 Number with a last_active entry in the last 365 days: 134549
We can look at users' activity vs. when they registered.
from matplotlib.patches import Ellipse


def _activity_scatter(since_registered, since_active, window, c, plot_title):
    """Scatter days-since-last-active against days-since-registration.

    Opens a new figure showing only users registered fewer than ``window``
    days ago, then annotates two regions with a label and an ellipse: the
    diagonal around (c, c) and the x axis around (c, 0).
    """
    shown = since_registered < window
    figure(figsize=(12, 12))
    scatter(since_registered[shown], since_active[shown], alpha=0.2, lw=0)
    xlabel('days since registering')
    ylabel('days since last active')
    title(plot_title)
    text(c, c, 'Users that registered and never came back', color='k', rotation=45, va='center', ha='center')
    text(c, 0, 'Users that were recently active', color='k', va='center', ha='center')
    # angle is passed by keyword: recent matplotlib no longer accepts it positionally
    for center, angle in (((c, c), 45), ((c, 0), 0)):
        outline = Ellipse(center, 1.5 * c, c / 8, angle=angle, edgecolor='k', linewidth=2, fill=False)
        gca().add_artist(outline)
    axis('tight')


days_since_active = (max(last_active) - last_active) / (24 * 60 * 60)
days_since_registered = (max(registered_dates) - registered_dates)
DATA_BREAKPOINT = 2 * 365
# Users without a last_active value sit at +inf days and are dropped here.
keep = days_since_active < DATA_BREAKPOINT
days_since_active = days_since_active[keep]
days_since_registered = days_since_registered[keep] / timedelta64(1, 'D')
# Registration dates only have day resolution; spread the points within each day.
days_since_registered += np.random.uniform(-0.5, 0.5, days_since_registered.shape)

_activity_scatter(days_since_registered, days_since_active, DATA_BREAKPOINT, 400,
                  'Last active vs. days-since-registration for the last 2 years')

# The same plot, zoomed in on progressively more recent registrations.
for DATA_BREAKPOINT, c in ((180, 100), (60, 30)):
    _activity_scatter(days_since_registered, days_since_active, DATA_BREAKPOINT, c,
                      'Same as above, for last {} days'.format(DATA_BREAKPOINT))
We can look at the number of registrations each day and what percentage of those users have been active in the last two weeks. Around 5% of new registrations seem to result in a long-term active player (still active at least a year later), with a low rate of attrition after that.
# Per registration day (counted back from the newest registration), tally how
# many users registered and how many of them were active in the last 14 days.
SECONDS_PER_DAY = 24 * 60 * 60
DATA_BREAKPOINT = 2 * 365

registration_age = ((max(registered_dates) - registered_dates) / timedelta64(1,'D')).astype(int)
keep = registration_age < DATA_BREAKPOINT
days_since_registered = registration_age[keep]
days_since_active = ((max(last_active) - last_active) / SECONDS_PER_DAY)[keep]

registrations_per_day = bincount(days_since_registered)
recently_active = days_since_active <= 14
active_per_days = bincount(days_since_registered, weights=recently_active)
# smooth to show trend
from scipy.ndimage import gaussian_filter


def smooth(d):
    """Return ``d`` smoothed with a Gaussian kernel of sigma 7 samples.

    With one sample per day that is a one-week sigma. The input is converted
    to float first: gaussian_filter returns its input dtype, so integer counts
    would otherwise come back truncated to integers.
    """
    return gaussian_filter(np.asarray(d, dtype=float), 7)
# Fraction of each day's registrants seen in the last two weeks, smoothed.
smoothed_active = smooth(active_per_days)
smoothed_registered = smooth(registrations_per_day)
ratio = smoothed_active / smoothed_registered
plot(ratio)
title('Recent activity in players registered in the last two years')
xlabel('days since registration')
ylabel('fraction active in the last two weeks')

# Reference lines at 10% and 5%, each labelled just above the line.
for level, label, colour in ((0.1, '10%', 'k'), (0.05, '5%', 'b')):
    axhline(level, c=colour)
    text(600, level, label, color=colour, va='bottom')

# y ticks every 10%, labelled as percentages
yticks(linspace(0, 1.0, 11), ['%02d%%' % pct for pct in range(0, 101, 10)])
axis('tight')
show()
Higher ratings tend to correlate with more active players.
def rating_to_rank(r):
    """Convert a numeric rating to a rank label such as ``"4k"`` or ``"2d"``.

    Each rank spans 100 rating points and 2100 is the bottom of 1d:
    [2100, 2200) is 1d, [2200, 2300) is 2d, and below that [2000, 2100) is 1k,
    [1900, 2000) is 2k, and so on.
    """
    # Floor division keeps the label a whole number ("20k", not "20.0k") and
    # puts ratings that are not multiples of 100 into the band they fall in.
    bands_above_1d = int((r - 2100) // 100)
    if bands_above_1d >= 0:
        return "{}d".format(1 + bands_above_1d)
    return "{}k".format(-bands_above_1d)
# Ratings at which the rating axis gets a rank-labelled tick.
yt = [100, 500, 1000, 1500, 1700, 1900, 2100, 2300, 2500, 2700]

days_since_active = (max(last_active) - last_active) / (24 * 60 * 60)
days_since_registered = ((max(registered_dates) - registered_dates) / timedelta64(1,'D')).astype(int)

# One scatter per activity window: players last seen within the window who
# registered more than 30 days before they were last seen.
for active_within_days in (30, 7):
    recently_seen = days_since_active < active_within_days
    established = days_since_registered > days_since_active + 30
    mask = recently_seen & established
    scatter(days_since_active[mask], rating[mask], alpha=0.2, lw=0)
    axis('tight')
    xlabel('days since active')
    ylabel('rating')
    yticks(yt, [rating_to_rank(tick) for tick in yt], fontsize=20)
    show()
How strong are active players (active in the last day, or week, and registered more than a month ago)?
days_since_active = (max(last_active) - last_active) / (24 * 60 * 60)
days_since_registered = ((max(registered_dates) - registered_dates) / timedelta64(1,'D')).astype(int)

# Rating histograms for players registered more than 30 days ago, first for
# those active in the last day, then for those active in the last week.
activity_windows = (
    (1, 'players active in the last day'),
    (7, 'players active in the last week'),
)
for position, (max_days_inactive, heading) in enumerate(activity_windows):
    if position > 0:
        figure()  # each histogram after the first gets its own figure
    mask = (days_since_registered > 30) & (days_since_active <= max_days_inactive)
    hist(rating[mask], 25)
    axis('tight')
    title(heading)
    xlabel('rating')
    ylabel('# of players')
    xticks(yt, [rating_to_rank(tick) for tick in yt], rotation=70)
show()
!gist -p --update https://gist.github.com/fc4003becb21e6265f913d8ec8355231 Players.ipynb
https://gist.github.com/fc4003becb21e6265f913d8ec8355231