#!/usr/bin/env python
# coding: utf-8

# # Lets-Plot in 2020

# ### Preparation

# In[1]:


# Install colorcet with the same Python interpreter that runs this kernel.
# IPython interpolates `{executable}` from the name imported on the line above.
from sys import executable
get_ipython().system('{executable} -m pip install colorcet')


# In[2]:


import numpy as np
import pandas as pd
import colorcet as cc
from PIL import Image

from lets_plot import *
from lets_plot.bistro.corr import *


# In[3]:


# Configure Lets-Plot's HTML output once, ahead of the plotting cells below.
LetsPlot.setup_html()


# In[4]:


# Load the semicolon-separated git history, keep only the columns used below,
# parse them into proper dtypes and derive the per-commit calendar features.
df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/lets_plot_git_history.csv",
        sep=';',
    )
    [['author_date', 'author_name', 'files_changed', 'insertions', 'deletions']]
    .assign(
        author_date=lambda d: pd.to_datetime(d.author_date, utc=True),
        # The stat columns are text whose first space-separated token is the count.
        files_changed=lambda d: d.files_changed.str.split(' ').str[0].astype(int),
        insertions=lambda d: d.insertions.str.split(' ').str[0].astype(int),
        # Missing deletions are treated as zero deleted lines.
        deletions=lambda d: d.deletions.fillna('0').str.split(' ').str[0].astype(int),
    )
    .assign(
        diff=lambda d: d.insertions - d.deletions,
        month=lambda d: d.author_date.dt.month,
        day=lambda d: d.author_date.dt.day,
        weekday=lambda d: d.author_date.dt.weekday,
        hour=lambda d: d.author_date.dt.hour,
    )
    # Keep only the commits authored in 2020, in chronological order.
    .loc[lambda d: d.author_date.dt.year == 2020]
    .sort_values(by='author_date')
    .reset_index(drop=True)
)

df.head()


# ### General Analytics

# In[5]:


# Number of commits per author. The tuple column name mirrors the
# (feature, aggregation) column labels of the aggregated frame in the next cell.
(
    df.author_name
    .value_counts()
    .to_frame(('commits_number', 'sum'))
    .reset_index()
)


# In[6]:


# Per-author aggregates: each feature maps to the aggregations computed for it.
agg_features = {
    'files_changed': ['sum', 'mean'],
    'insertions': ['sum', 'mean'],
    'deletions': ['sum', 'mean'],
    'diff': ['sum'],
}
author_stats = df.groupby('author_name').agg(agg_features).reset_index()
commit_counts = df.author_name.value_counts().to_frame(('commits_number', 'sum')).reset_index()
# Registered only after the groupby: the commit count comes from value_counts,
# but it should be plotted by the loop below like every other feature.
agg_features['commits_number'] = ['sum']
agg_df = pd.merge(author_stats.sort_index(axis=1, level=0),
                  commit_counts.sort_index(axis=1, level=0),
                  left_on='author_name', right_on='author_name')
# One fixed colour per author, assigned before any per-plot sorting so that an
# author keeps the same colour in every chart.
agg_df['color'] = cc.palette['glasbey_bw'][:agg_df.shape[0]]

# One bar chart per (feature, aggregation) pair, authors ordered by descending value.
plots = []
for feature, aggregations in agg_features.items():
    for agg in aggregations:
        agg_df = agg_df.sort_values(by=(feature, agg), ascending=False)
        label_template = 'total {0}' if agg == 'sum' else 'mean {0} per commit'
        aes_name = label_template.format(feature.replace('_', ' '))
        plotted_df = agg_df[[('author_name', ''), (feature, agg), ('color', '')]]
        # Flatten the two-level column labels down to the plain feature names.
        plotted_df.columns = plotted_df.columns.get_level_values(0)
        bar_tooltips = layer_tooltips().line('^x').line('{0}|^y'.format(aes_name))
        bars = geom_bar(aes(x='author_name', y=feature, color='color', fill='color'),
                        stat='identity', alpha=.25, size=1, tooltips=bar_tooltips)
        plots.append(
            ggplot(plotted_df)
            + bars
            + scale_color_identity() + scale_fill_identity()
            + xlab('') + ylab('')
            + ggtitle(aes_name.title())
        )

# Two-column grid, filled row by row: the commit count and total diff go first,
# followed by the remaining plots in the order they were created.
w, h = 400, 300
bunch = GGBunch()
for slot, plot_index in enumerate([7, 6, 0, 1, 2, 3, 4, 5]):
    grid_row, grid_col = divmod(slot, 2)
    bunch.add_plot(plots[plot_index], grid_col * w, grid_row * h, w, h)
bunch.show()


# Looking at the total values, we clearly see that Igor Alshannikov and Ivan Kupriyanov outcompete the rest. It is far less clear, however, who takes third place.
# 
# Meanwhile, we see more diversity in mean values of different contribution types.

# In[7]:


# Commit counts per hour of the day, sorted by hour.
hourly_commits_df = (
    df.hour
    .value_counts()
    .to_frame('count')
    .reset_index()
    .sort_values(by='hour')
)
# The outline and the fill share one gradient.
hour_gradient = dict(low='#e0ecf4', high='#8856a7')

(
    ggplot(hourly_commits_df)
    + geom_histogram(
        aes(x='hour', y='count', color='hour', fill='hour'),
        stat='identity', show_legend=False,
        tooltips=layer_tooltips().line('^y'),
    )
    + scale_x_discrete(breaks=list(range(24)))
    + scale_color_gradient(**hour_gradient)
    + scale_fill_gradient(**hour_gradient)
    + ylab('commits number')
    + ggtitle('Total Hourly Committing')
    + ggsize(600, 450)
)


# The peak of commit activity is around 6 p.m. (18:00). The evening seems to be a good time to save daily results.

# ### Higher Resolution

# In[8]:


# Only commits that inserted at least one line are plotted; the y axis is logarithmic.
commits_with_insertions = df[df.insertions > 0]
lollipop_tooltips = (
    layer_tooltips()
    .line('@author_name')
    .line('@|@insertions')
    .line('@|@month')
)

(
    ggplot(commits_with_insertions)
    + geom_lollipop(
        aes(x='author_date', y='insertions', fill='month'),
        shape=21, fatten=1, color='black',
        tooltips=lollipop_tooltips,
    )
    + scale_x_datetime(name='date')
    + scale_y_log10(name='insertions (log)')
    + scale_fill_brewer(name='', type='qual', palette='Accent')
    # One facet row per author.
    + facet_grid(y='author_name')
    + ggtitle('Lollipop Plot of Commits by Authors')
    + ggsize(800, 1000)
)


# Some of the team members started their work only a few months ago, so they still have time to catch up next year.

# In[9]:


# Every commit as a jittered point: weekday on x, inserted lines (log scale) on y,
# colour by author and point size by the number of files changed.
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday', 'Sunday']
commit_tooltips = (
    layer_tooltips()
    .line('author|@author_name')
    .line('@|@insertions')
    .line('@|@deletions')
    .line('files changed|@files_changed')
)

(
    ggplot(df)
    + geom_point(
        aes(x='weekday', y='insertions', color='author_name', size='files_changed'),
        shape=8, alpha=.75, position='jitter', show_legend=False,
        tooltips=commit_tooltips,
    )
    + scale_x_discrete(labels=weekday_labels)
    # Axis breaks at the powers of two from 1 to 32768.
    + scale_y_log10(breaks=[2 ** n for n in range(16)])
    + scale_color_brewer(type='qual', palette='Pastel1')
    + scale_size(range=[3, 7], trans='sqrt')
    + ggtitle('All Commits')
    + ggsize(800, 600)
    + theme(axis_tooltip='blank')
)


# Usually no one works at the weekend. But if something needs to be done - it should be.

# ### And Finally...

# In[10]:


# Closed outline in polar form: the radius is the median number of insertions for
# each day of the month, and the angles are spread evenly over a full turn.
r = df.groupby('day').insertions.median().values
angles = np.linspace(0, 2 * np.pi, r.size)
x = r * np.cos(angles)
y = r * np.sin(angles)
daily_insertions_df = pd.DataFrame({'x': x, 'y': y})


# In[11]:


# Build the tile grid for the snowman picture: every dark pixel of the mask image
# becomes one tile, and the tiles are shared out between months in proportion to
# each month's number of commits.
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
mask_width, mask_height = 60, 80

# Open the mask in a `with` block so the file handle is closed as soon as the
# pixels have been copied into the array.
with Image.open("images/snowman_mask.bmp") as mask_image:
    mask = np.array(mask_image.resize((mask_width, mask_height), Image.Resampling.BILINEAR))
# 1 marks a dark pixel (a tile to draw), 0 marks a light background pixel.
grid = [[(0 if color.mean() > 255 / 2 else 1) for color in row] for row in mask]

# Long format: one row per pixel, with its (y, x) position taken from the grid indices.
grid_df = pd.DataFrame(grid).stack().to_frame('month_id')
grid_df.index = grid_df.index.set_names(['y', 'x'])
grid_df = grid_df.reset_index()
# Mirror vertically so that the first image row ends up at the largest y.
grid_df['y'] = grid_df['y'].max() - grid_df['y']
grid_df = grid_df[grid_df['month_id'] > 0].reset_index(drop=True)

tiles_total = grid_df.shape[0]
# Tiles per month, proportional to the month's share of all commits. value_counts
# orders the months by descending commit count, so the busiest month comes first.
tiles_per_month = np.round(df.month.value_counts() * tiles_total / df.shape[0])
# Rounding can leave the total a few tiles off; the first month absorbs the
# difference so that the counts add up to exactly `tiles_total`.
# The correction is written straight into the Series: the former chained form
# `agg_df.iloc[0].commits_number += ...` updates a temporary row object and is
# silently lost when pandas Copy-on-Write is active.
tiles_per_month.iloc[0] += tiles_total - tiles_per_month.sum()
agg_df = tiles_per_month.astype(int).to_frame('commits_number')
agg_df.index.name = 'month_id'
agg_df = agg_df.reset_index()

# Hand out consecutive runs of tiles to the months. Note that `commits_number`
# holds the scaled tile count of the month, not its raw number of commits.
grid_df['commits_number'] = 0
start_idx = 0
for month, commits_number in agg_df.itertuples(index=False):
    # `.loc` label slices include BOTH end points, so the last label of a run of
    # `commits_number` rows is start + commits_number - 1. The previous upper
    # bound (start + commits_number) spilled one row into the next month's run.
    last_idx = start_idx + commits_number - 1
    grid_df.loc[start_idx:last_idx, 'month'] = MONTHS[month - 1]
    grid_df.loc[start_idx:last_idx, 'commits_number'] = commits_number
    start_idx += commits_number


# In[12]:


# Shared theme: no axes, ticks or legend on any of the pieces composed below.
blank_theme = theme_void() + theme(axis_ticks_length=0, legend_position='none')


def _corr_triangle(columns, flip):
    """Build lower-triangle correlation tiles (diagonal included) for `df[columns]`.

    `flip` is passed through to `corr_plot`; the blank theme is applied to the result.
    """
    builder = corr_plot(data=df[columns], flip=flip)
    builder = builder.tiles(type='lower', diag=True)
    builder = builder.palette_gradient(low='blue', mid='green', high='darkgreen')
    return builder.build() + blank_theme


# Polygon traced by the daily median insertions.
ps = (
    ggplot(daily_insertions_df, aes(x='x', y='y'))
    + geom_polygon(color='#f03b20', fill='#fd8d3c', size=1)
    + coord_fixed()
    + blank_theme
)

# Three pairs of correlation triangles of growing size. Each right-hand plot uses
# the reversed column order of its left-hand partner together with flip=True.
corr_levels = [
    ['insertions', 'deletions'],
    ['insertions', 'deletions', 'diff'],
    ['insertions', 'deletions', 'diff', 'files_changed'],
]
p1l, p2l, p3l = [_corr_triangle(columns, flip=False) for columns in corr_levels]
p1r, p2r, p3r = [_corr_triangle(columns[::-1], flip=True) for columns in corr_levels]

# Greeting text.
pt = (
    ggplot({'x': [0], 'y': [0], 'greetings': ['Happy New Year!']}, aes(x='x', y='y'))
    + geom_text(aes(label='greetings'), color='blue', size=20,
                family='Times New Roman', fontface='bold')
    + blank_theme
)

# Mask tiles coloured by month.
pm = (
    ggplot(grid_df, aes(x='x', y='y'))
    + geom_tile(aes(fill='month'), width=.8, height=.8,
                tooltips=layer_tooltips().line('@|@month').line('@|@commits_number'))
    + scale_fill_brewer(type='qual', palette='Set2')
    + blank_theme
)

# Placement of every piece as (plot, column, row, column span, row span),
# measured in cells of w x h pixels.
w, h = 50, 50
placements = [
    (ps, 3, 0, 2, 2),
    (p1l, 2, 2, 2, 2),
    (p1r, 4, 2, 2, 2),
    (p2l, 1, 4, 3, 3),
    (p2r, 4, 4, 3, 3),
    (p3l, 0, 7, 4, 4),
    (p3r, 4, 7, 4, 4),
    (pt, 0, 11, 16, 2),
    (pm, 8, 3, 8, 8),
]
bunch = GGBunch()
for piece, col, row, col_span, row_span in placements:
    bunch.add_plot(piece, col * w, row * h, col_span * w, row_span * h)
bunch.show()