from __future__ import print_function # Setup, connect to database, fetch data %pylab inline from pylab import * from prettyplotlib import * import numpy as np import datetime from prettyplotlib import brewer2mpl blue_green = brewer2mpl.get_map('BuGn', 'Sequential', 9).mpl_colormap figsize(8,6) # connect to the player database import sqlite3 db = sqlite3.connect('games.sql') cursor = db.cursor() # useful constants ONE_DAY = 3600 * 24.0 # fetch all the data, convert to numpy arrays ratings_white, ratings_black, start_times, end_times, board_sizes, move_times = \ zip(*cursor.execute('SELECT rating_white, rating_black, start_time, end_time, size, move_time FROM games').fetchall()) # Convert to numpy arrays ratings_white = np.array(ratings_white) ratings_black = np.array(ratings_black) start_times = np.array(start_times, dtype='datetime64') last_start_time = start_times.max() end_times = np.array([t or last_start_time for t in end_times], dtype='datetime64') board_sizes = np.array(board_sizes) move_times = np.array(move_times, dtype=float) # games without time control have move_time of 0 - set it to a move per day like most correspondence move_times[move_times == 0] = ONE_DAY # ...and clamp correspondence games at one move per day for simplicity below move_times[move_times > ONE_DAY] = ONE_DAY # Some basics # how many days of data do we have? 
number_of_days = start_times.ptp() / np.timedelta64(1, 'D')
print("{:.3} days of data".format(number_of_days))

# call anything less than 5 minutes / move "live", and more than 6 hours correspondence
live_games_mask = move_times < 60 * 5
corr_games_mask = move_times > 60 * 60 * 6

print("{} total games".format(len(board_sizes)))
print("{} live".format(sum(live_games_mask)))
print("{} correspondence".format(sum(corr_games_mask)))
print("")

# tally live games by board size, most common first
counts = np.bincount(board_sizes[live_games_mask])
order = np.argsort(counts)[::-1]
print("Game count by board size")
for sz in order:
    if counts[sz] > 0:
        print("{}x{}\t\t{}\t\t{:.2%}".format(sz, sz, counts[sz],
                                             counts[sz] / float(len(board_sizes))))
print("\n")

# show game starts over the last N days
plot(np.bincount(((start_times - start_times.min()) / np.timedelta64(1, 'D')).astype(int)));
title('# of game started by day')
ylabel('# of games started')
xlabel('day')
# label ticks with real dates derived from the data (was hard-coded to 2014-09-09)
first_day = start_times.min().astype('datetime64[D]')
xticks(np.arange(0, 31, 5),
       first_day + np.arange(0, 31, 5) * np.timedelta64(1, 'D'),
       rotation=45);

hist(move_times[live_games_mask] / 60.0, 30)
title('Live games only - # of games by time per move')
xlabel('minutes per move')
ylabel('# of games');

def rating_to_rank(r):
    """Convert a numeric rating to a kyu/dan rank string (rating 2100 == 1 dan)."""
    if r >= 2100:
        # floor division: '/' would yield floats ("1.5d") under Python 3
        return "{}d".format(1 + (r - 2100) // 100)
    else:
        return "{}k".format((2100 - r) // 100)

rank_ticks = np.array([100, 500, 1000, 1500, 1800, 2000, 2200, 2400])
rank_names = [rating_to_rank(r) for r in rank_ticks]

# Ranks of players
average_ratings = (ratings_white + ratings_black) / 2.0
hist(average_ratings, 25)
title('Average rank of players for all games')
axis('tight')
xticks(rank_ticks, rank_names)
ylabel('# of games')

figure()
hist(average_ratings[live_games_mask], 25)
title('Average rank of players for live games only')
axis('tight')
xticks(rank_ticks, rank_names)
ylabel('# of games');

# find the time of day - dates are in UTC
start_hours = np.array([t.item().hour for t in start_times])[live_games_mask]
# most OGS players are from the US, so move to EDT by subtracting 4
# (mod 24)
start_hours = (start_hours + 20) % 24
hist(start_hours, 24)
axis('tight')
title('game starts by hour EDT')
xlabel('hour in EDT')
ylabel('# of games')
figure();

# group ranks into 2-stone bands, and put everything above 4d into one group
ranks = np.round(average_ratings / 200)[live_games_mask]
ranks[ranks > 12] = 12
rank_count = ranks.max() - ranks.min() + 1
print(rank_count, ranks.max())

# 2D histogram of hour-of-day vs. rank band; bin counts must be integers
D = np.histogram2d(start_hours, ranks, (24, int(rank_count)))[0].T

# apply some smoothing across ranks
Dsmooth = 0.5 * D
Dsmooth[1:, :] += 0.25 * D[:-1, :]
Dsmooth[:-1, :] += 0.25 * D[1:, :]
D = Dsmooth

# origin must be 'lower' ('lower-left' is not a valid matplotlib origin)
imshow(D, cmap=blue_green, interpolation='nearest', origin='lower')
axis('tight')
title('Game starts by EDT hour (x) and rank (y)')
xlabel('hour EDT')
ylabel('rank')
yticks(rank_ticks / 200 - ranks.min(), rank_names)

figure()
# apply a normalization to smooth out some of the spikes in the less active rank/hour combinations
D = D + 25
D = D / D.sum(axis=1).reshape((-1, 1))
imshow(D, cmap=blue_green, interpolation='nearest', origin='lower')
axis('tight')
title('Game starts by EDT hour (x) and rank (y), Normalized within rank')
xlabel('hour EDT')
ylabel('rank')
yticks(rank_ticks / 200 - ranks.min(), rank_names);

# We don't have the number of moves made in a game, but we can use (moves / minute)
# and a cummulative sum of rates
rates = 60 * 1.0 / move_times
starts_ends = np.hstack((start_times, end_times))
se_rates = np.hstack((rates, -rates))
order = np.argsort(starts_ends)
starts_ends = starts_ends[order]
cummulative_rates = np.cumsum(se_rates[order])

# sample every 1 minute
sample_times = np.arange(starts_ends[0], starts_ends[-1], np.timedelta64(1, 'm'))
sample_rates = np.interp(sample_times.astype(float), starts_ends.astype(float), cummulative_rates)

from scipy.signal import medfilt  # NOTE(review): imported but never used below

def smooth(v, order):
    """Apply `order` passes of a [0.25, 0.5, 0.25] kernel to 1-D array `v`.

    Edge samples keep their own weight at the boundary (no wrap-around).
    Returns a new array; `v` itself is never modified in place.
    """
    for i in range(order):
        tmp = 0.5 * v
        tmp[1:] += v[:-1] * 0.25
        tmp[:-1] += v[1:] * 0.25
        tmp[0] += 0.25 * v[0]
        tmp[-1] += 0.25 * v[-1]
        v = tmp
    # return v, not tmp: tmp is unbound (NameError) when order == 0
    return v

# add together
all the days, then sample across minutes start_day = np.datetime64('2014-09-09T00:00') time_from_midnight = (sample_times - start_day) / np.timedelta64(1, 'D') time_from_midnight -= time_from_midnight.astype(int) minutes_from_midnight = (time_from_midnight * 24 * 60).astype(int) counts = np.bincount(minutes_from_midnight, sample_rates) counts = smooth(counts, 31) plot(counts / number_of_days) # normalize by number of days we are summing together # label by hour xticks(np.arange(0, 25, 2) * 60, np.arange(0, 25, 2)); xlabel('Hour EDT') ylabel('Moves per minute') title('Total moves on server per minute by time of day'); !gist -p --update https://gist.github.com/thouis/be68b0d138e70b43769c "OGS Games.ipynb"