#!/usr/bin/env python
# coding: utf-8

# # *matplotlib* et visualisation de données:
# ## Exemple sur la base de données *belgianmunicipalities.csv*
#
# ***
# > __Auteur__: Joseph Salmon
# >
#
# ## Sommaire
#
# * __[Import](#import)__
# * __[Améliorations visuelles](#style)__
# * __[Visualisations classiques](#visu)__
#
# ## Imports

# In[1]:

import os
import sys

# Useful to get an identical display across notebooks (author only):
# sys.path.append("./codes")
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams.update({'figure.max_open_warning': 25})


# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')


# # Data visualisation, seaborn and pandas

# In[3]:

import seaborn as sns
import pandas as pd

dirname = "../prebuiltimages/"
imageformat = ".pdf"
saving = False


def my_saving_display(fig, dirname, filename, imageformat, saving=False):
    """Save ``fig`` under ``dirname + filename + imageformat`` on request.

    Parameters
    ----------
    fig : object with a ``savefig`` method
        Figure to write to disk.
    dirname : str
        Path prefix; concatenated as-is, so it must carry its own
        trailing separator.
    filename : str
        Base name of the output file, without extension.
    imageformat : str
        Extension appended to ``filename``, including the leading dot.
    saving : bool, default False
        Nothing is written unless this is exactly ``True``.
    """
    if saving is True:
        image_name = dirname + filename + imageformat
        fig.savefig(image_name)


# In[4]:

from download import download

url = "http://josephsalmon.eu/enseignement/datasets/belgianmunicipalities.csv"
path_target = "./belgianmunicipalities.csv"
download(url, path_target, replace=False)


# In[5]:

# The notebook cell was the bare IPython alias `cat belgianmunicipalities.csv`,
# which is not valid Python once exported; run it through the shell instead.
get_ipython().system('cat belgianmunicipalities.csv')


# In[6]:

df_belgium = pd.read_csv(path_target, index_col='Commune')
df_belgium = df_belgium.drop(['Unnamed: 0'], axis=1)
df_belgium.head()


# In[7]:

# Replace the numeric province codes by short labels.
dictionnaire = {1: 'Anv.', 2: 'Brab.', 3: 'Fl.occ.', 4: 'Fl.or.',
                5: 'Hainaut', 6: 'Liège', 7: 'Limb.', 8: 'Lux.', 9: 'Namur'}
df_belgium = df_belgium.replace({'Province': dictionnaire})


# In[8]:

# Non-null count of the first column. `.iloc[0]` makes the positional access
# explicit (integer keys on a label-indexed Series are deprecated).
nb_samples = df_belgium.count().iloc[0]


# In[9]:

from scipy import stats
from mpl_toolkits.mplot3d import Axes3D

# Random setting: y-coordinates are 1 plus small Gaussian noise.
X = df_belgium['medianincome']
y = np.ones(nb_samples,) + 0.05 * np.random.randn(nb_samples,)

# Various statistics:
meanX = np.mean(X)  # mean
minX = np.min(X)  # min
maxX = np.max(X)  # max
medX = np.median(X)  # median
MADX = np.median(np.abs(X - medX))  # median absolute deviation
s = np.std(X)  # standard deviation

alpha_trim = 0.15
tmeanX = stats.trim_mean(X, alpha_trim)  # trimmed mean (level: alpha_trim)

brown = (0.64, 0.16, 0.16)
purple = (148. / 255, 0, 211. / 255)


# In[10]:

np.ptp(X.values)


# In[11]:

# Sample on a single horizontal axis, with the empirical mean marked.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_ylim(0, 1.5)
ax.set_xlim(minX - 0.1 * np.ptp(X.values), maxX + 0.1 * np.ptp(X.values))
ax.get_xaxis().tick_bottom()
ax.axes.get_yaxis().set_visible(False)
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))
ax.spines['left'].set_color('none')

# Line width is passed as a number (the original passed the string '1').
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([meanX, meanX], [0, 1.5], color=brown, lw=1.5, ls="--")

plt.xlabel(r'$y$', fontsize=18)
plt.annotate(r'$\overline{y}_n$ : moyenne empirique',
             xy=(meanX, 0.4), xycoords='data', xytext=(+10, +30),
             textcoords='offset points', fontsize=18, color=brown)

plt.tight_layout()
plt.show()

# my_saving_display(fig1, dirname, "GammaSampleMean", imageformat)


# In[12]:

# Same layout, with the empirical median marked.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_ylim(0, 1.5)
ax.set_xlim(minX - 0.1 * np.ptp(X.values), maxX + 0.1 * np.ptp(X.values))
ax.get_xaxis().tick_bottom()
ax.axes.get_yaxis().set_visible(False)
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))
ax.spines['left'].set_color('none')

ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([medX, medX], [0, 1.5], color=purple, lw=1.5, ls="--")

plt.xlabel(r'$y$', fontsize=18)
plt.annotate(r'$\rm{Med}_n(y):$ médiane empirique',
             xy=(medX, 1), xycoords='data', xytext=(-85, +30),
             textcoords='offset points', fontsize=18, color=purple)

plt.tight_layout()
plt.show()

my_saving_display(fig1, dirname, "GammaSampleMediane", imageformat)


# In[13]:

# Same layout again; the drawing calls for this figure follow below.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_ylim(0, 1.5)
ax.set_xlim(minX - 0.1 * np.ptp(X.values), maxX + 0.1 * np.ptp(X.values))
ax.get_xaxis().tick_bottom()
ax.axes.get_yaxis().set_visible(False)
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))
ax.spines['left'].set_color('none')
# Line width is passed as a number (the original passed the string '1').
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([tmeanX, tmeanX], [0, 1.5], c='blue', lw=1.5, ls="--")

plt.xlabel(r'$y$', fontsize=18)
# Raw string: "\o" is not a valid escape sequence in a regular literal.
# The resulting text is unchanged.
tt = r"$\overline{y}_{n,%s} :$ moyenne tronquée" % str(alpha_trim)
plt.annotate(tt, xy=(tmeanX, 1), xycoords='data', xytext=(+22, +50),
             textcoords='offset points', fontsize=18, color='blue')

plt.tight_layout()
plt.show()

my_saving_display(fig1, dirname, "GammaSampleTrimmed", imageformat)


# In[14]:

# Mean, median and trimmed mean on the same figure.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_ylim(0, 1.5)
ax.set_xlim(minX - 0.1 * np.ptp(X.values), maxX + 0.1 * np.ptp(X.values))
ax.get_xaxis().tick_bottom()
ax.axes.get_yaxis().set_visible(False)
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))
ax.spines['left'].set_color('none')

ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([meanX, meanX], [0, 1.5], color=brown, lw=1.5, ls="--")
ax.plot([medX, medX], [0, 1.5], color=purple, lw=1.5, ls="--")
ax.plot([tmeanX, tmeanX], [0, 1.5], color='blue', lw=1.5, ls="--")

plt.xlabel(r'$y$', fontsize=18)
plt.annotate(r'$\rm{Med}_n(y):$ médiane empirique',
             xy=(medX, 1), xycoords='data', xytext=(-85, +30),
             textcoords='offset points', fontsize=18, color=purple)
plt.annotate(r'$\bar{y}_n :$ moyenne empirique',
             xy=(meanX, 0.4), xycoords='data', xytext=(+10, +30),
             textcoords='offset points', fontsize=18, color=brown)
plt.annotate(tt, xy=(tmeanX, 1), xycoords='data', xytext=(+22, +50),
             textcoords='offset points', fontsize=18, color='blue')

plt.tight_layout()
plt.show()

my_saving_display(fig1, dirname, "GammaSampleMedianeMean", imageformat)


# In[15]:

# Axis setup for the next figure; the remaining calls follow below.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_ylim(0, 1.5)
ax.set_xlim(minX - 0.1 * np.ptp(X.values), maxX + 0.1 * np.ptp(X.values))
ax.get_xaxis().tick_bottom()
ax.axes.get_yaxis().set_visible(False)
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))
ax.spines['left'].set_color('none')

# Line width is passed as a number (the original passed the string '1').
ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([meanX, meanX], [0, 1.5], color=brown, lw=1.5, ls="--")

# Arrows of length `s` on each side of the mean, one in each direction.
arrow_style = dict(fc=brown, ec=brown, head_width=0.05, head_length=0.1,
                   length_includes_head=True)
for x_start, dx in ((meanX, -s), (meanX - s, s), (meanX, s), (meanX + s, -s)):
    plt.arrow(x_start, 1.2, dx, 0, **arrow_style)

plt.xlabel(r'$y$', fontsize=18)
plt.annotate(r'$\bar{y}_n :$ moyenne empirique',
             xy=(meanX, 0.4), xycoords='data', xytext=(+5, +30),
             textcoords='offset points', fontsize=18, color=brown)
for x_label in (meanX + s * 0.4, meanX - s * 0.6):
    plt.annotate(r'$s_n$', xy=(x_label, 1), xycoords='data',
                 xytext=(+10, +30), textcoords='offset points',
                 fontsize=18, color=brown)

plt.tight_layout()
plt.show()

my_saving_display(fig1, dirname, "GammaSD", imageformat)


# In[16]:

# Same figure built around the median, with arrows of length MADX.
fig1, ax = plt.subplots(figsize=(10, 3))
ax.set_ylim(0, 1.5)
ax.set_xlim(minX - 0.1 * np.ptp(X.values), maxX + 0.1 * np.ptp(X.values))
ax.get_xaxis().tick_bottom()
ax.axes.get_yaxis().set_visible(False)
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['bottom'].set_position(('data', 0.5))
ax.spines['left'].set_color('none')

ax.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax.plot([medX, medX], [0, 1.5], color=purple, lw=1.5, ls="--")

arrow_style = dict(fc=purple, ec=purple, head_width=0.05, head_length=0.1,
                   length_includes_head=True)
for x_start, dx in ((medX, -MADX), (medX - MADX, MADX),
                    (medX, MADX), (medX + MADX, -MADX)):
    plt.arrow(x_start, 1.2, dx, 0, **arrow_style)

plt.xlabel(r'$y$', fontsize=18)
plt.annotate(r'$\rm{Med}_n(y):$ médiane empirique',
             xy=(medX, 0.4), xycoords='data', xytext=(+10, +30),
             textcoords='offset points', fontsize=18, color=purple)
for x_label in (medX + MADX * 0.1, medX - MADX * 1.2):
    plt.annotate(r'$\rm{MAD}_n(y)$', xy=(x_label, 1), xycoords='data',
                 xytext=(+10, +35), textcoords='offset points',
                 fontsize=14, color=purple)

plt.tight_layout()
plt.show()

my_saving_display(fig1, dirname, "GammaMAD", imageformat)


# # Global histogram

# In[17]:

plt.figure()
# NOTE(review): `distplot` is deprecated in recent seaborn releases; kept
# as-is because the seaborn version targeted by this script is unknown.
g = sns.distplot(df_belgium['medianincome'], kde=False, rug=False,
                 hist_kws={"linewidth": 1, "alpha": 0.75, "color": brown})
fig = g.get_figure()

# Written unconditionally: the `saving` flag is not consulted here.
filename = "belgianmunicipalities_hist"
image_name = os.path.join(dirname, filename + imageformat)
fig.savefig(image_name, bbox_inches='tight')


# In[18]:

plt.figure()
# NOTE(review): `shade=` is deprecated in recent seaborn releases in favour
# of `fill=`; kept for the same reason as above.
g = sns.kdeplot(df_belgium['medianincome'], shade=True)
fig = g.get_figure()

filename = "belgianmunicipalities_kde"
image_name = os.path.join(dirname, filename + imageformat)
fig.savefig(image_name, bbox_inches='tight')


# In[19]:

yrange_lim_down = (0., 1.)
yrange_lim_up = (-0.5, 2)
yrange_lim = (-0.1, 0.5)
xrange_lim = (12000, 26000)
sorted_data = np.sort(X)


# In[20]:

# Top panel: the sample on one axis. Bottom panel: cumulative frequencies.
fig1, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 6))
plt.subplots_adjust(hspace=0.3)
# ax1 = fig1.add_subplot(211)
ax1.set_ylim(yrange_lim_up)
ax1.set_xlim(xrange_lim)
ax1.get_xaxis().tick_bottom()
ax1.axes.get_yaxis().set_visible(False)
ax1.spines['right'].set_color('none')
ax1.spines['top'].set_color('none')
ax1.spines['bottom'].set_position(('data', 0.5))
ax1.spines['left'].set_color('none')

# Line width is passed as a number (the original passed the string '1').
ax1.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax1.set_xlabel(r'$y$', fontsize=18)

plt.suptitle(r"Nombre d'échantillons: " + "$n={0}$".format(nb_samples),
             multialignment='center')

ax2.set_xlim(xrange_lim)
ax2.set_ylim(yrange_lim_down)
ax2.step(sorted_data,
         np.arange(sorted_data.size, dtype='float') / nb_samples,
         color=brown)
ax2.set_ylabel(r'Fréquence cumulée', fontsize=18)
ax2.set_xlabel(r'$y$', fontsize=18)

# White annotation, used to align the images in the slides
# (useful to the teacher only).
p = 0.1
q = np.percentile(X, p * 100)
ax2.annotate(r'$F_n^\leftarrow(p)={}$'.format(q), xy=(q, 0),
             xycoords='data', xytext=(-40, -40),
             textcoords='offset points', fontsize=16, color='white')

plt.tight_layout()
plt.show()

my_saving_display(fig1, dirname, "Gammaecdf", imageformat, saving=True)


# In[21]:

# Same two panels, with two quantile levels marked on the bottom one.
fig3, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 6))
plt.subplots_adjust(hspace=0.3)
ax1.set_ylim(yrange_lim_up)
ax1.set_xlim(xrange_lim)
ax1.get_xaxis().tick_bottom()
ax1.axes.get_yaxis().set_visible(False)
ax1.spines['right'].set_color('none')
ax1.spines['top'].set_color('none')
ax1.spines['bottom'].set_position(('data', 0.5))
ax1.spines['left'].set_color('none')

ax1.scatter(X, y, c='black', s=300, marker='o', edgecolors=brown, lw=1)
ax1.set_xlabel(r'$y$', fontsize=18)

plt.suptitle(r"Nombre d'échantillons: " + "$n={0}$".format(nb_samples),
             multialignment='center')

ax2.set_xlim(xrange_lim)
ax2.set_ylim(yrange_lim_down)
ax2.step(sorted_data,
         np.arange(sorted_data.size, dtype='float') / nb_samples,
         color=brown)
ax2.set_ylabel(r'Fréquence cumulée', fontsize=18)
ax2.set_xlabel(r'$y$', fontsize=18)

##############################################################################
# Quantile function: the two levels to display. Each entry pairs a level p
# with the vertical offset (in points) of its "p=..." label.
for p, p_label_dy in ((0.1, 10), (0.90, -20)):
    q = np.percentile(X, p * 100)
    ax2.plot([q, xrange_lim[0]], [p, p], color=brown, lw=1.5, ls="--")
    ax2.plot([q, q], [0, p], color=brown, lw=1.5, ls="--")
    ax2.annotate(r'$F_n^\leftarrow(p)={}$'.format(q), xy=(q, 0),
                 xycoords='data', xytext=(-40, -40),
                 textcoords='offset points', fontsize=16, color=brown)
    ax2.annotate(r'$p={}$'.format(p), xy=(0, p), xycoords='axes fraction',
                 xytext=(5, p_label_dy), textcoords='offset points',
                 fontsize=18, color=brown)

# The step curve and labels are drawn a second time, after the dashed
# guides, exactly as in the original cell.
ax2.step(sorted_data,
         np.arange(sorted_data.size, dtype='float') / nb_samples,
         color=brown)
ax2.set_ylabel(r'Fréquence cumulée', fontsize=18)
ax2.set_xlabel(r'$y$', fontsize=18)

plt.tight_layout()
plt.show()

my_saving_display(fig3, dirname, "GammaQuantiles", imageformat, saving=True)


# # Classical visualisations
# ## Barplots:
# display the mean or the median per region.
#

# In[22]:

# Keyword arguments shared by the per-province plots below.
province_income = dict(x='Province', y='medianincome', data=df_belgium)

plt.figure()
fig_barplot = sns.barplot(**province_income)


# In[23]:

# Same bar plot with a logarithmic y-axis.
plt.figure()
fig_barplot = sns.barplot(**province_income)
fig_barplot.set_yscale('log')


# ## Boxplots:
# Visualise the median, the 1/4 and 3/4 quantiles, 1.5 inter-quartile
# ranges and outliers?
# see e.g.: http://www.itse.be/statistique2010/co/233_Cours_boxplot.html

# In[24]:

plt.figure()
boxplot_sns = sns.boxplot(**province_income)
figure = boxplot_sns.get_figure()
figure.savefig("../srcimages/boxplots.svg", format='svg')


# ## Violin plots:
# https://datavizcatalogue.com/methods/violin_plot.html

# In[25]:

plt.figure()
boxplot_sns = sns.violinplot(**province_income)
fig_violon = boxplot_sns.get_figure()
fig_violon.savefig("../srcimages/violons.svg", format='svg')


# ## Scatter plot
# Relationship between the 'medianincome' and 'averageincome' columns.

# In[26]:

fig2 = plt.figure(figsize=(6, 6))
plt.plot(df_belgium['medianincome'], df_belgium['averageincome'], '.',
         label='villes')

# Reference line y = x.
diagonal = np.linspace(0, 30000, num=100)
plt.plot(diagonal, diagonal, label='bissectrice')

plt.xlabel('Médiane impôt sur le revenu')
plt.ylabel('Moyenne impôt sur le revenu')
plt.xlim(0, 30000)  # x-axis runs from 0 to 30000
plt.ylim(0, 50000)  # y-axis runs from 0 to 50000
plt.legend()

filename = "belgianmunicipalities"
image_name = os.path.join(dirname, filename + imageformat)
# fig2.savefig(image_name, bbox_inches='tight')

# **Remark**: the mean always pulls a town's wealth upwards compared with
# the median (a single rich household does not change the median, but can
# change the mean drastically).
# ## Scatter plot:
# This time the points are coloured by province, to look at the data at a
# finer granularity.

# In[27]:

g = sns.lmplot(x='medianincome', y='averageincome', data=df_belgium,
               fit_reg=False, hue='Province')
g.fig.set_figheight(6)
g.fig.set_figwidth(7)
# plt.plot(df['medianincome'], df['medianincome'])
plt.xlabel('Médiane impôt sur le revenu')
plt.ylabel('Moyenne impôt sur le revenu')
plt.xlim((0, 30000))  # x-axis runs from 0 to 30000
plt.ylim((0, 50000))  # y-axis runs from 0 to 50000

filename = "belgianmunicipalities_clean"
image_name = os.path.join(dirname, filename + imageformat)
g.savefig(image_name, bbox_inches='tight')


# In[28]:

import seaborn as sns

plt.figure()
plt.xlim(12000, 25000)
cdf_plot = sns.kdeplot(df_belgium['medianincome'], cumulative=True)


# In[29]:

def _empirical_cdf(values):
    """Return the sorted sample and the empirical CDF at each sorted point.

    The k-th smallest of the n values is paired with k / n.

    The previous code cumulated the counts of n equal-width histogram bins
    and plotted them against the sorted values. That pairs the CDF at the
    bin edges with unrelated abscissae, and is only right for evenly spaced
    data.
    """
    ordered = np.sort(np.asarray(values))
    return ordered, np.arange(1, ordered.size + 1) / ordered.size


sorted_income, ncdf = _empirical_cdf(df_belgium['medianincome'])

fig_cdf = plt.figure(figsize=(7, 5))
plt.plot(sorted_income, ncdf)
plt.xlim(12000, 25000)
plt.title('Fonction de répartition empirique:\n revenus médians belges')

filename = "belgianmunicipalities_median_cdf"
image_name = os.path.join(dirname, filename + imageformat)
fig_cdf.savefig(image_name, bbox_inches='tight')


# In[30]:

# Same empirical CDF restricted to the rows whose Province is "Anv.".
df_belgium_1 = df_belgium[df_belgium["Province"] == "Anv."]
sorted_income, ncdf = _empirical_cdf(df_belgium_1['medianincome'])

fig_cdf_anv = plt.figure(figsize=(7, 5))
plt.plot(sorted_income, ncdf)
plt.title("Fonction de répartition empirique: \n revenus médians de la province d'Anvers")
plt.xlim(12000, 25000)

filename = "belgianmunicipalities_anvers_cdf"
image_name = os.path.join(dirname, filename + imageformat)
fig_cdf_anv.savefig(image_name, bbox_inches='tight')


# In[31]:

# Quartiles (25 %, 50 %, 75 %) of the 'averageincome' column.
np.percentile(df_belgium['averageincome'], [100 / 4, 100 / 2, 100 * 3 / 4])


# In[32]:

# Quartiles (25 %, 50 %, 75 %) of the 'medianincome' column.
np.percentile(df_belgium['medianincome'], [100 / 4, 100 / 2, 3 * 100 / 4])


# In[33]:

np.var(df_belgium['medianincome'])


# In[34]:

np.std(df_belgium['medianincome'])


# In[35]:

# Inter-quartile range of 'medianincome'.
quantiles = np.percentile(df_belgium['medianincome'], [25, 75])
quantiles[1] - quantiles[0]


# In[36]:

# Inter-quartile range of 'averageincome'.
quantiles = np.percentile(df_belgium['averageincome'], [25, 75])
quantiles[1] - quantiles[0]


# # Boxplot / box-and-whisker plot

# In[37]:

fig_boxplot = plt.figure()
ax = sns.boxplot(x=df_belgium['medianincome'])

filename = "belgianmunicipalities_boxplot"
image_name = os.path.join(dirname, filename + imageformat)
fig_boxplot.savefig(image_name, bbox_inches='tight')


# In[38]:

# Run the companion script in this namespace; per the original comment it
# creates a `fig` object. The file is read inside a `with` block so the
# handle is closed (the original `open(...).read()` left it open).
# UTF-8 is Python's default source encoding.
# NOTE(review): this executes whatever the local file contains; keep it
# restricted to trusted files.
with open('plot_species_kde.py', encoding='utf-8') as script_file:
    script_source = script_file.read()
exec(script_source)  # create a fig object
my_saving_display(fig, dirname, "KDE2D", imageformat)