import plotly
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
# Fail fast unless the installed matplotlib is exactly version 3.1.0; the
# assertion message below tells the reader how to reinstall it.
# NOTE: assert statements are skipped when Python runs with -O.
assert matplotlib.__version__ == "3.1.0","""
Please install matplotlib version 3.1.0 by running:
1) !pip uninstall matplotlib
2) !pip install matplotlib==3.1.0
"""
%matplotlib inline
# Widen the notebook container to 90% of the available width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container {width:90% !important;}</style>"))
# Load the combined data set used by every cell below.
data = pd.read_csv('combined_set.csv')

# Bin each year's 'Log GDP per capita' into five quantile-based classes.
# Rows that end up without a class (NaN) are assigned 'Lowest'.
data['Mean Log GDP per capita'] = (
    data.groupby('Year')['Log GDP per capita']
    .transform(pd.qcut, q=5, labels=['Lowest', 'Low', 'Medium', 'High', 'Highest'])
    .fillna('Lowest')
)

# Report the number of missing cells against the total number of cells.
print(f'total number of missings vals: {data.isna().sum().sum()} out of {data.size}')
total number of missings vals: 228 out of 36024
I started learning Python more seriously about two years ago. Since then rarely a week has passed where I did not marvel at the simplicity and ease of use of Python itself or one of the many amazing open source libraries in the ecosystem. The more commands, patterns and concepts I become familiar with, the more everything just makes sense.
The opposite holds true for plotting with Python. Initially almost every chart I created looked like a crime escaped from the eighties. What makes matters worse is that to create said abominations, most of the time I had to spend hours on Stack Overflow researching nitty-gritty commands to change the slant of the x-ticks or something similarly silly. Don't even get me started on multi charts. While the results often look fairly impressive and it is wonderful to create those charts programmatically (e.g. 50 in a row), it is just so much work.
For a brief moment in time I thought Bokeh >Link< would become my go-to solution. I came across Bokeh when I was working on geospatial visualizations. However, I quickly realized that Bokeh, while different, was just as stupidly complicated as matplotlib.
I did try out plot.ly a while ago while, again, working on visualization of geospatial data. Back then it seemed even more stupid than the aforementioned libraries. You needed an account and everything was rendered online and you would then download it. I quickly discarded plot.ly.
Ultimately I settled on using Pandas native plotting for quick inspections and Seaborn for charts for reports and presentations (where visuals matter). However, recently I watched a YouTube video about plotly express, where most importantly they got rid of all this online nonsense. I played around with it and must say, this might actually change my plotting life for the better.
In the following article, I will talk about:
I taught statistics (Stats 119) whilst studying at university in San Diego. Stats 119 is an intro class to statistics. The curriculum included statistical fundamentals like data aggregation (visual and quantitative), concepts of odds and probabilities, regression, sampling, and - to me the most important one - distributions. This was the time my understanding of quantities and phenomena almost entirely shifted to a representation through distributions (most of the time Gaussian).
To this day I find it astonishing how far the two quantities mean and standard deviation can get you in really grasping a distribution, the meaning and its implications.
By knowing these two numbers it is straightforward to conclude how likely a certain outcome is; one immediately knows where the bulk of the results are going to be. It gives you a frame of reference to distinguish anecdotal events from statistically significant ones.
# Histogram of the exponentiated 'Log GDP per capita' values for 2018.
np.exp(data.loc[data['Year'] == 2018, 'Log GDP per capita']).plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x12b6a47f0>
# Histogram of the 'Year' column, demonstrating figsize, title and axis limits.
data['Year'].plot.hist(
    figsize=(17, 6),
    title='Number of countries (y-axis) with certain nubmer of observations (x-axis)',
    xlim=(2000, 2025),  # original note: makes no sense
    ylim=(2, 50),  # original note: makes no sense
)
<matplotlib.axes._subplots.AxesSubplot at 0x1318a57b8>
# Histogram of the 2018 'Life Ladder' values with the default binning.
data.loc[data['Year'] == 2018, 'Life Ladder'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x130d40668>
# Same histogram, but with explicit bin edges from 2 up to (excluding) 8 in
# steps of 0.25.
data.loc[data['Year'] == 2018, 'Life Ladder'].plot.hist(
    bins=np.arange(2, 8, 0.25),
)
<matplotlib.axes._subplots.AxesSubplot at 0x1333aa080>
# Bar chart of the 15 countries with the largest 2018 'Life Ladder' values.
(
    data.loc[data['Year'] == 2018]
    .set_index('Country name')['Life Ladder']
    .nlargest(15)
    .plot.bar(figsize=(12, 8))
)
<matplotlib.axes._subplots.AxesSubplot at 0x1332f2f98>
# Horizontal bar chart: the 2018 per-continent mean of 'Log GDP per capita',
# exponentiated after averaging and sorted in ascending order.
np.exp(
    data.loc[data['Year'] == 2018]
    .groupby('Continent')['Log GDP per capita']
    .mean()
).sort_values().plot.barh(figsize=(12, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x130e24080>
# Box plot of 'Life Ladder' across all rows of the data set.
data['Life Ladder'].plot.box(figsize=(12, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x13117af60>
# Scatter plot comparing the two life-expectancy columns.
data[['Healthy life expectancy at birth', 'Gapminder Life Expectancy']].plot.scatter(
    x='Healthy life expectancy at birth',
    y='Gapminder Life Expectancy',
    figsize=(12, 8),
)
<matplotlib.axes._subplots.AxesSubplot at 0x13152a080>
# Hexbin plot for 2018: life expectancy vs. generosity, with each hexagon
# coloured by the 'Life Ladder' values that fall into it.
data.loc[data['Year'] == 2018].plot.hexbin(
    x='Healthy life expectancy at birth',
    y='Generosity',
    C='Life Ladder',
    gridsize=20,
    figsize=(12, 8),
    cmap="Blues",  # defaults to greenish
    sharex=False,  # workaround kept from the original: required to get rid of a bug
)
<matplotlib.axes._subplots.AxesSubplot at 0x130f6bb70>
# Pie chart of the summed 2018 'Gapminder Population' per continent.
(
    data.loc[data['Year'] == 2018]
    .groupby(['Continent'])['Gapminder Population']
    .sum()
    .plot.pie(
        figsize=(16, 10),
        cmap="Blues_r",  # defaults to orange
    )
)
<matplotlib.axes._subplots.AxesSubplot at 0x13341f7b8>
# Area chart of the summed 'Gapminder Population' per year; unstack() turns
# the continents into columns so each one becomes its own band.
(
    data.groupby(['Year', 'Continent'])['Gapminder Population']
    .sum()
    .unstack()
    .plot.area(
        figsize=(12, 8),
        cmap="Blues",  # defaults to orangish
    )
)
<matplotlib.axes._subplots.AxesSubplot at 0x133460a20>
# Line chart of Germany's 'Life Ladder' value, indexed by year.
(
    data.loc[data['Country name'] == 'Germany']
    .set_index('Year')['Life Ladder']
    .plot.line(figsize=(12, 8))
)
<matplotlib.axes._subplots.AxesSubplot at 0x1335ced68>
As mentioned before, I am a big fan of distributions. Histograms and kernel density estimates alike are potent ways of visualizing the key features of a particular variable. Let's look at how we generate distributions for
# Start from seaborn's defaults, then use the white style with muted colours.
sns.reset_defaults()
sns.set(style="white", palette="muted")

# Keep only the Asian rows of 2018.
sns_data = data.loc[(data['Continent'] == 'Asia') & (data['Year'] == 2018)]

# Distribution plot of 'Life Ladder' for that subset.
sns.distplot(sns_data['Life Ladder'], label='Life Ladder')
sns.despine()  # strip spines for a cleaner look
# Draw one 'Life Ladder' KDE curve per GDP class, using 2018 rows only.
# The selected rows for each class are kept in __sns_data, keyed by class.
__sns_data = {}
_is_2018 = data['Year'] == 2018  # loop-invariant, so computed once
for val in data['Mean Log GDP per capita'].cat.categories:
    __sns_data[val] = data[
        _is_2018 &
        (data['Mean Log GDP per capita'] == val)
    ]
    sns.kdeplot(
        __sns_data[val]['Life Ladder'],
        label=val
    )
sns.despine()
Whenever I want to visually explore the relationship between two or more variables, it typically comes down to some form of scatterplot and an assessment of joint distributions. There are three variations of a conceptually similar plot, where in the center graph a form of joint distribution is shown and at the right and top side of the center graph the marginal distributions are depicted.
# Reset seaborn, then use the white style with a 7x5 default figure size.
sns.reset_defaults()
sns.set(style="white", rc={'figure.figsize': (7, 5)})

# Joint plot (scatter variant) of log GDP per capita vs. 'Life Ladder'.
sns.jointplot(
    data=data,
    x='Log GDP per capita',
    y='Life Ladder',
    kind='scatter',
)
<seaborn.axisgrid.JointGrid at 0x134fb22e8>
# Joint plot (KDE variant) of log GDP per capita vs. 'Life Ladder'.
sns.jointplot(
    data=data,
    x='Log GDP per capita',
    y='Life Ladder',
    kind='kde',
)
<seaborn.axisgrid.JointGrid at 0x13534abe0>
# Joint plot (hexbin variant) of log GDP per capita vs. 'Life Ladder'.
sns.jointplot(
    data=data,
    x='Log GDP per capita',
    y='Life Ladder',
    kind='hex',
)
<seaborn.axisgrid.JointGrid at 0x134decf98>
# 2018 scatter plot of log GDP per capita vs. 'Life Ladder', coloured by
# continent.
sns.scatterplot(
    data=data.loc[data['Year'] == 2018],
    x='Log GDP per capita',
    y='Life Ladder',
    hue='Continent',
)
sns.despine()

# Same plot, with the marker size additionally driven by population.
sns.scatterplot(
    data=data.loc[data['Year'] == 2018],
    x='Log GDP per capita',
    y='Life Ladder',
    hue='Continent',
    size='Gapminder Population',
)
sns.despine()
# Use a wide 18x6 default figure for the violin plot.
sns.set(style="white", rc={'figure.figsize': (18, 6)})

# 'Life Ladder' distribution per continent, split by GDP class.
sns.violinplot(
    data=data,
    x='Continent',
    y='Life Ladder',
    hue='Mean Log GDP per capita',
)
sns.despine()
sns.set(style="white", palette="muted", color_codes=True)

# Pairwise plots of the selected 2018 indicators, coloured by GDP class.
# Rows with a missing value in any selected column are dropped first.
g = sns.pairplot(
    data.loc[
        data['Year'] == 2018,
        [
            'Life Ladder', 'Log GDP per capita',
            'Social support', 'Healthy life expectancy at birth',
            'Freedom to make life choices', 'Generosity',
            'Perceptions of corruption', 'Positive affect', 'Negative affect',
            'Confidence in national government', "Mean Log GDP per capita",
        ],
    ].dropna(),
    hue="Mean Log GDP per capita",
)
# Write the resulting figure to disk.
g.fig.savefig('yolo.png')
# One 'Life Ladder' histogram per continent, wrapped after three columns.
g = sns.FacetGrid(data, col="Continent", col_wrap=3, height=4)
g = g.map(plt.hist, "Life Ladder", bins=np.arange(2, 9, 0.5))

# Grid of GDP class (rows) by continent (columns): mean 'Life Ladder' per
# year drawn as a line in each panel.
g = sns.FacetGrid(
    data.groupby(['Mean Log GDP per capita', 'Year', 'Continent'])['Life Ladder']
    .mean()
    .reset_index(),
    row='Mean Log GDP per capita',
    col='Continent',
    margin_titles=True,
)
g = g.map(plt.plot, 'Year', 'Life Ladder')
def vertical_mean_line(
    x,
    *,
    mean_threshold=6,
    label_x_pos_adjustment=0.08,
    label_y_pos_adjustment=5,
    label_left_shift=1.4,
    **kwargs
):
    """Draw a dashed vertical line at the mean of *x* and annotate it.

    The annotation shows the mean and standard deviation of *x*. It is placed
    to the right of the line when the mean is below ``mean_threshold`` and to
    the left otherwise. The defaults reproduce the values that were previously
    hard-coded; adjust them to the scale of your data.

    Args:
        x: Values providing ``.mean()`` and ``.std()`` (e.g. a pandas Series).
        mean_threshold: Mean below which the label goes right of the line.
        label_x_pos_adjustment: Horizontal offset added to the mean when the
            label is placed on the right.
        label_y_pos_adjustment: y coordinate of the label.
        label_left_shift: Horizontal offset subtracted from the mean when the
            label is placed on the left.
        **kwargs: Only ``color`` is read (default ``"r"``); it is used for
            both the line and the text. Other keys are ignored.
    """
    # Compute the statistics once instead of on every use.
    mean, std = x.mean(), x.std()
    color = kwargs.get("color", "r")

    plt.axvline(mean, linestyle="--", color=color)
    txkw = dict(size=15, color=color)

    if mean < mean_threshold:
        tx = "mean: {:.2f}\n(std: {:.2f})".format(mean, std)
        plt.text(mean + label_x_pos_adjustment, label_y_pos_adjustment, tx, **txkw)
    else:
        tx = "mean: {:.2f}\n (std: {:.2f})".format(mean, std)
        plt.text(mean - label_left_shift, label_y_pos_adjustment, tx, **txkw)
# Mean 'Life Ladder' per continent and year, flattened back to columns.
_ = (
    data.groupby(['Continent', 'Year'])['Life Ladder']
    .mean()
    .reset_index()
)

# One panel per continent: a shaded KDE of the yearly means, then the mean
# marker drawn by vertical_mean_line on top.
g = sns.FacetGrid(
    _,
    col="Continent",
    col_wrap=3,
    height=4,
    aspect=0.9,
    margin_titles=True,
)
g.map(sns.kdeplot, "Life Ladder", shade=True, color='royalblue')
g.map(vertical_mean_line, "Life Ladder")
<seaborn.axisgrid.FacetGrid at 0x130b87be0>
def draw_heatmap(data, inner_row, inner_col, outer_row, outer_col, values, vmin, vmax):
    """Draw a grid of heatmaps that share one colour bar, then show the figure.

    A seaborn FacetGrid is built from ``outer_row`` x ``outer_col``; each
    facet is drawn by ``draw_heatmap_facet`` as an ``inner_row`` x
    ``inner_col`` heatmap of ``values``.

    Args:
        data: DataFrame containing all referenced columns.
        inner_row: Column used for the rows inside each heatmap.
        inner_col: Column used for the columns inside each heatmap.
        outer_row: Column that defines the rows of the facet grid.
        outer_col: Column that defines the columns of the facet grid.
        values: Column whose values fill the heatmap cells.
        vmin: Lower colour-scale limit forwarded to each heatmap.
        vmax: Upper colour-scale limit forwarded to each heatmap.

    Side effects: calls ``sns.set(font_scale=1)`` and ``plt.show()``.
    """
    sns.set(font_scale=1)
    fg = sns.FacetGrid(
        data,
        row=outer_row,
        col=outer_col,
        margin_titles=True
    )
    # Rectangle (left, bottom, width, height) for the shared colour-bar axes.
    position = 1.4, .2, .1, .6
    cbar_ax = fg.fig.add_axes(position)

    fg.map_dataframe(
        draw_heatmap_facet,
        x_col=inner_col,
        y_col=inner_row,
        values=values,
        cbar_ax=cbar_ax,
        vmin=vmin,
        vmax=vmax
    )

    fg.fig.subplots_adjust(right=1.3)
    plt.show()
def draw_heatmap_facet(*args, **kwargs):
    """Draw the heatmap for one facet; used as the FacetGrid.map_dataframe callback.

    Required keyword arguments (removed from ``kwargs`` before forwarding):
        data: DataFrame holding the rows of this facet.
        x_col: Column that becomes the heatmap columns.
        y_col: Column that becomes the heatmap rows.
        values: Column whose values fill the cells.

    All remaining keyword arguments are forwarded to ``sns.heatmap``.
    Positional arguments are accepted but not used.
    """
    data = kwargs.pop('data')
    x_col = kwargs.pop('x_col')
    y_col = kwargs.pop('y_col')
    values = kwargs.pop('values')

    # Reshape long -> wide: one row per y_col value, one column per x_col value.
    d = data.pivot(index=y_col, columns=x_col, values=values)
    # Cell annotations are the values rounded to four decimals.
    annot = d.round(4).values
    # 30-step "Blues" palette followed by every second colour of that same
    # palette (45 colours in total).
    cmap = sns.color_palette("Blues", 30) + sns.color_palette("Blues", 30)[0::2]
    sns.heatmap(
        d,
        **kwargs,
        annot=annot,
        center=0,
        cmap=cmap,
        linewidth=.5
    )
# Data preparation
_ = data.copy()

# Replace each year by the interval it falls into (bin edges 2006, 2008,
# 2012, 2018).
_['Year'] = pd.cut(_['Year'], bins=[2006, 2008, 2012, 2018])

# Three quantile classes of log GDP per capita within each continent/period
# group; rows left without a class default to 'Low'.
_['GDP per Capita'] = (
    _.groupby(['Continent', 'Year'])['Log GDP per capita']
    .transform(pd.qcut, q=3, labels=['Low', 'Medium', 'High'])
    .fillna('Low')
)

# Three quantile classes of perceived corruption within each continent/GDP
# class group.
_['Corruption'] = (
    _.groupby(['Continent', 'GDP per Capita'])['Perceptions of corruption']
    .transform(pd.qcut, q=3, labels=['Low', 'Medium', 'High'])
)

# Drop Oceania, then average 'Life Ladder' per combination of the four keys.
_ = (
    _[_['Continent'] != 'Oceania']
    .groupby(['Year', 'Continent', 'GDP per Capita', 'Corruption'])['Life Ladder']
    .mean()
    .reset_index()
)
# Combinations without a mean get the placeholder value -10.
_['Life Ladder'] = _['Life Ladder'].fillna(-10)

draw_heatmap(
    data=_,
    inner_row='Year',
    inner_col='Continent',
    outer_row='Corruption',
    outer_col='GDP per Capita',
    values='Life Ladder',
    vmin=3,
    vmax=8,
)
/Users/fabianbosler/anaconda3/envs/data_extraction/lib/python3.6/site-packages/seaborn/axisgrid.py:848: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
# Interactive bubble chart for 2018: bubble size follows population, colour
# follows continent, and hovering shows the country name.
fig = px.scatter(
    data_frame=data.loc[data['Year'] == 2018],
    x="Log GDP per capita",
    y="Life Ladder",
    color="Continent",
    size="Gapminder Population",
    size_max=60,
    hover_name="Country name",
)
fig.show()
# Animated bubble chart: one frame per year, one panel per continent, with
# each country tracked across frames by its name.
fig = px.scatter(
    data_frame=data,
    x="Log GDP per capita",
    y="Life Ladder",
    color="Continent",
    facet_col="Continent",
    size="Gapminder Population",
    size_max=45,
    hover_name="Country name",
    animation_frame="Year",
    animation_group="Country name",
    category_orders={'Year': list(range(2007, 2019))},
)
fig.show()
def q_bin_in_3(col: pd.Series) -> pd.Series:
    """Bin *col* into three quantile-based classes.

    Args:
        col: Numeric values to bin.

    Returns:
        The result of ``pd.qcut`` with ``q=3``, labelled 'Low', 'Medium' and
        'High' from the lowest to the highest bin.
    """
    return pd.qcut(
        col,
        q=3,
        labels=['Low', 'Medium', 'High']
    )
_ = data.copy()

# Bin each indicator into Low/Medium/High within every year. Each pair is
# (column to write, column to read); only 'Life Expectancy' is a new column.
for target, source in [
    ('Social support', 'Social support'),
    ('Life Expectancy', 'Healthy life expectancy at birth'),
    ('Generosity', 'Generosity'),
    ('Perceptions of corruption', 'Perceptions of corruption'),
]:
    _[target] = _.groupby('Year')[source].transform(q_bin_in_3)

# Average 'Life Ladder' for every combination of the four binned indicators.
_ = (
    _.groupby(['Social support', 'Life Expectancy', 'Generosity', 'Perceptions of corruption'])['Life Ladder']
    .mean()
    .reset_index()
)

# Parallel-categories diagram coloured by the mean 'Life Ladder'.
fig = px.parallel_categories(
    _,
    color="Life Ladder",
    color_continuous_scale=px.colors.sequential.Inferno,
)
fig.show()
# Stacked bars of population per continent, coloured by GDP class, with one
# panel per year.
fig = px.bar(
    data_frame=data,
    x="Continent",
    y="Gapminder Population",
    color="Mean Log GDP per capita",
    barmode="stack",
    facet_col="Year",
    category_orders={"Year": range(2007,2019)},
    hover_name='Country name',
    hover_data=["Mean Log GDP per capita", "Gapminder Population", "Life Ladder"],
)
fig.show()
# Animated choropleth map (one frame per year) coloured by healthy life
# expectancy; countries are located via the 'ISO3' column.
fig = px.choropleth(
    data_frame=data,
    locations="ISO3",
    color="Healthy life expectancy at birth",
    animation_frame="Year",
    hover_name="Country name",
)
fig.show()
# Animated choropleth map (one frame per year) coloured by 'Life Ladder';
# countries are located via the 'ISO3' column.
fig = px.choropleth(
    data_frame=data,
    locations="ISO3",
    color="Life Ladder",
    animation_frame="Year",
    hover_name="Country name",
)
fig.show()