#!/usr/bin/env python
# coding: utf-8

# # Lets-Plot in 2020

# ### Preparation

# In[1]:

from sys import executable

# Install colorcet with pip through the IPython shell hook.
get_ipython().system('{executable} -m pip install colorcet')


# In[2]:

import numpy as np
import pandas as pd
import colorcet as cc
from PIL import Image

from lets_plot import *
from lets_plot.bistro.corr import *


# In[3]:

LetsPlot.setup_html()


# In[4]:

def _leading_int(text_column):
    """Return the first space-separated token of each string, cast to int."""
    return text_column.str.split(' ').str[0].astype(int)


# Load the git history export and keep only the columns used below.
df = pd.read_csv(
    "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/lets_plot_git_history.csv",
    sep=';',
)
df = df[['author_date', 'author_name', 'files_changed', 'insertions', 'deletions']]
df['author_date'] = pd.to_datetime(df['author_date'], utc=True)

# The three counters arrive as text; only the leading token is kept as a number.
df['files_changed'] = _leading_int(df['files_changed'])
df['insertions'] = _leading_int(df['insertions'])
# Missing deletions are treated as zero before parsing.
df['deletions'] = _leading_int(df['deletions'].fillna('0'))
df['diff'] = df['insertions'] - df['deletions']

# Calendar parts of the commit timestamp, used for grouping later on.
df['month'] = df['author_date'].dt.month
df['day'] = df['author_date'].dt.day
df['weekday'] = df['author_date'].dt.weekday
df['hour'] = df['author_date'].dt.hour

# Restrict to commits authored in 2020, in chronological order.
in_2020 = df['author_date'].dt.year == 2020
df = df[in_2020].sort_values(by='author_date').reset_index(drop=True)
df.head()


# ### General Analytics

# In[5]:

df['author_name'].value_counts().to_frame(('commits_number', 'sum')).reset_index()


# In[6]:

# Column -> aggregations computed per author.
agg_features = {
    'files_changed': ['sum', 'mean'],
    'insertions': ['sum', 'mean'],
    'deletions': ['sum', 'mean'],
    'diff': ['sum'],
}
agg_df = df.groupby('author_name').agg(agg_features).reset_index()

# The commit count is merged in separately, but it is plotted like the other
# features, so it is registered in the same mapping.
agg_features['commits_number'] = ['sum']
commit_counts = df['author_name'].value_counts().to_frame(('commits_number', 'sum')).reset_index()
agg_df = pd.merge(
    agg_df.sort_index(axis=1, level=0),
    commit_counts.sort_index(axis=1, level=0),
    left_on='author_name',
    right_on='author_name',
)
# One palette colour per author row.
agg_df['color'] = cc.palette['glasbey_bw'][:agg_df.shape[0]]

# Build one bar chart per (feature, aggregation) pair, in mapping order.
plots = []
for feature, stats in agg_features.items():
    for agg in stats:
        agg_df = agg_df.sort_values(by=(feature, agg), ascending=False)
        title_template = 'total {0}' if agg == 'sum' else 'mean {0} per commit'
        aes_name = title_template.format(feature.replace('_', ' '))

        # Flatten the two-level column labels so the plot can refer to plain names.
        plotted_df = agg_df[[('author_name', ''), (feature, agg), ('color', '')]]
        plotted_df.columns = plotted_df.columns.get_level_values(0)

        bar_tooltips = layer_tooltips().line('^x').line(f'{aes_name}|^y')
        plots.append(
            ggplot(plotted_df)
            + geom_bar(aes(x='author_name', y=feature, color='color', fill='color'),
                       stat='identity', alpha=.25, size=1,
                       tooltips=bar_tooltips)
            + scale_color_identity() + scale_fill_identity()
            + xlab('') + ylab('')
            + ggtitle(aes_name.title())
        )

# Arrange the charts in a two-column grid of equally sized cells.
w, h = 400, 300
bunch = GGBunch()
# Each entry: (index into ``plots``, grid column, grid row).
grid_layout = [
    (7, 0, 0), (6, 1, 0),
    (0, 0, 1), (1, 1, 1),
    (2, 0, 2), (3, 1, 2),
    (4, 0, 3), (5, 1, 3),
]
for plot_idx, col, row in grid_layout:
    bunch.add_plot(plots[plot_idx], col * w, row * h, w, h)
bunch.show()


# Looking at the total values, we clearly see that Igor Alshannikov and Ivan Kupriyanov outcompete the rest. But there is a real intrigue as to who takes the third place.
#
# Meanwhile, we see more diversity in the mean values of the different contribution types.

# In[7]:

hourly_commits = df['hour'].value_counts().to_frame('count').reset_index().sort_values(by='hour')
(
    ggplot(hourly_commits)
    + geom_histogram(aes(x='hour', y='count', color='hour', fill='hour'),
                     stat='identity', show_legend=False,
                     tooltips=layer_tooltips().line('^y'))
    + scale_x_discrete(breaks=list(range(24)))
    + scale_color_gradient(low='#e0ecf4', high='#8856a7')
    + scale_fill_gradient(low='#e0ecf4', high='#8856a7')
    + ylab('commits number')
    + ggtitle('Total Hourly Committing') + ggsize(600, 450)
)


# The peak of commit activity is around 18:00 (6 p.m.). The evening seems to be a good time to save the day's results.

# ### Higher Resolution

# In[8]:

# Only commits with at least one insertion are plotted
# (presumably because the y axis is logarithmic).
commits_with_insertions = df[df['insertions'] > 0]
lollipop_tooltips = (
    layer_tooltips()
    .line('@author_name')
    .line('@|@insertions')
    .line('@|@month')
)
(
    ggplot(commits_with_insertions)
    + geom_lollipop(aes(x='author_date', y='insertions', fill='month'),
                    shape=21, fatten=1, color='black',
                    tooltips=lollipop_tooltips)
    + scale_x_datetime(name='date')
    + scale_y_log10(name='insertions (log)')
    + scale_fill_brewer(name='', type='qual', palette='Accent')
    + facet_grid(y='author_name')
    + ggtitle('Lollipop Plot of Commits by Authors') + ggsize(800, 1000)
)


# Some of the team members started their work only a few months ago, so they still have time to catch up next year.

# In[9]:

weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
commit_tooltips = (
    layer_tooltips()
    .line('author|@author_name')
    .line('@|@insertions')
    .line('@|@deletions')
    .line('files changed|@files_changed')
)
(
    ggplot(df)
    + geom_point(aes(x='weekday', y='insertions', color='author_name', size='files_changed'),
                 shape=8, alpha=.75, position='jitter', show_legend=False,
                 tooltips=commit_tooltips)
    + scale_x_discrete(labels=weekday_names)
    # Powers of two from 1 to 32768 as y-axis breaks.
    + scale_y_log10(breaks=[2 ** n for n in range(16)])
    + scale_color_brewer(type='qual', palette='Pastel1')
    + scale_size(range=[3, 7], trans='sqrt')
    + ggtitle('All Commits') + ggsize(800, 600)
    + theme(axis_tooltip='blank')
)


# Usually no one works at the weekend. But if something needs to be done, it gets done.

# ### And Finally...

# In[10]:

# The median insertions for each day of the month become the radius of one
# polygon vertex; the vertices are spread evenly over a full turn.
r = df.groupby('day').insertions.median().values
theta = np.linspace(0, 2 * np.pi, r.size)
x = r * np.cos(theta)
y = r * np.sin(theta)
daily_insertions_df = pd.DataFrame({'x': x, 'y': y})


# In[11]:

MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Load the mask image, shrink it to a coarse grid and close the file afterwards.
mask_width, mask_height = 60, 80
with Image.open("images/snowman_mask.bmp") as mask_image:
    mask = np.array(mask_image.resize((mask_width, mask_height), Image.Resampling.BILINEAR))

# A pixel whose mean intensity is above mid-scale becomes 0 (dropped below),
# every other pixel becomes 1 (kept as a tile).
grid = [[(0 if color.mean() > 255 / 2 else 1) for color in row] for row in mask]
grid_df = pd.DataFrame(grid).stack().to_frame('month_id')
grid_df.index.set_names(['y', 'x'], inplace=True)
grid_df = grid_df.reset_index()
# Mirror the row coordinate (presumably because image rows count downwards
# while the plot's y axis points upwards).
grid_df.y = grid_df.y.max() - grid_df.y
grid_df = grid_df[grid_df['month_id'] > 0].reset_index(drop=True)

# Scale the per-month commit counts so that they add up to the number of tiles.
agg_df = np.round(df.month.value_counts() * grid_df.shape[0] / df.shape[0]).to_frame('commits_number')
# Rounding can leave the total off by a few tiles; give the remainder to the
# first row. This must be a single positional assignment: the former
# ``agg_df.iloc[0].commits_number += ...`` modified a temporary row object,
# so the correction was not guaranteed to reach ``agg_df``.
agg_df.iloc[0, 0] += grid_df.shape[0] - agg_df['commits_number'].sum()
agg_df['commits_number'] = agg_df['commits_number'].astype(int)
agg_df.index.name = 'month_id'
agg_df = agg_df.reset_index()

# Hand out consecutive runs of tiles to the months, in ``agg_df`` order.
grid_df['commits_number'] = 0
start_idx = 0
for month, commits_number in zip(agg_df['month_id'], agg_df['commits_number']):
    # ``.loc`` label slices include BOTH ends, so the last label of a run of
    # ``commits_number`` rows is ``start_idx + commits_number - 1``.
    stop_idx = start_idx + commits_number - 1
    grid_df.loc[start_idx:stop_idx, 'month'] = MONTHS[month - 1]
    grid_df.loc[start_idx:stop_idx, 'commits_number'] = commits_number
    start_idx += commits_number


# In[12]:

blank_theme = theme_void() + theme(axis_ticks_length=0, legend_position='none')


def _corr_tier(columns, flip):
    """Build a lower-triangle correlation plot of ``df[columns]`` with the blank theme.

    ``columns`` is the ordered list of ``df`` column names to correlate and
    ``flip`` is forwarded to ``corr_plot``. Every tier shares the same tile
    type and the same blue/green/darkgreen gradient.
    """
    return (
        corr_plot(data=df[columns], flip=flip)
        .tiles(type='lower', diag=True)
        .palette_gradient(low='blue', mid='green', high='darkgreen')
        .build()
        + blank_theme
    )


# Polygon built from the daily median insertions.
ps = ggplot(daily_insertions_df, aes(x='x', y='y')) + \
    geom_polygon(color='#f03b20', fill='#fd8d3c', size=1) + coord_fixed() + blank_theme

# Three tiers of increasing size; each right-hand plot uses the reversed
# column order with ``flip=True`` as the counterpart of the left-hand one.
p1l = _corr_tier(['insertions', 'deletions'], flip=False)
p1r = _corr_tier(['deletions', 'insertions'], flip=True)
p2l = _corr_tier(['insertions', 'deletions', 'diff'], flip=False)
p2r = _corr_tier(['diff', 'deletions', 'insertions'], flip=True)
p3l = _corr_tier(['insertions', 'deletions', 'diff', 'files_changed'], flip=False)
p3r = _corr_tier(['files_changed', 'diff', 'deletions', 'insertions'], flip=True)

# Greeting text.
pt = ggplot({'x': [0], 'y': [0], 'greetings': ['Happy New Year!']}, aes(x='x', y='y')) + \
    geom_text(aes(label='greetings'), color='blue', size=20, family='Times New Roman', fontface='bold') + blank_theme

# Mask tiles coloured by the month they were assigned to.
pm = ggplot(grid_df, aes(x='x', y='y')) + \
    geom_tile(aes(fill='month'), width=.8, height=.8,
              tooltips=layer_tooltips().line('@|@month')
                                       .line('@|@commits_number')) + \
    scale_fill_brewer(type='qual', palette='Set2') + \
    blank_theme

# Place everything on a grid with 50x50 cells.
w, h = 50, 50
bunch = GGBunch()
bunch.add_plot(ps, 3 * w, 0, 2 * w, 2 * h)
bunch.add_plot(p1l, 2 * w, 2 * h, 2 * w, 2 * h)
bunch.add_plot(p1r, 4 * w, 2 * h, 2 * w, 2 * h)
bunch.add_plot(p2l, w, 4 * h, 3 * w, 3 * h)
bunch.add_plot(p2r, 4 * w, 4 * h, 3 * w, 3 * h)
bunch.add_plot(p3l, 0, 7 * h, 4 * w, 4 * h)
bunch.add_plot(p3r, 4 * w, 7 * h, 4 * w, 4 * h)
bunch.add_plot(pt, 0, 11 * h, 16 * w, 2 * h)
bunch.add_plot(pm, 8 * w, 3 * h, 8 * w, 8 * h)
bunch.show()