#!/usr/bin/env python # coding: utf-8 # # HLMA 408: Échantillonage aléatoire # # *** # > __Auteur__: Joseph Salmon # > # # ## Sommaire # # * __[Introduction et présentation](#intro)__
# # # # Introduction et présentation # ## Import des packages usuels: # In[1]: import platform import sys from download import download import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from ipywidgets import interact # widget manipulation from scipy.special import comb, binom from scipy.stats import norm # ## Commande "magique" pour un affichage plus avancé en Jupyter: # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') # ## Préparation pour l'affichage graphique et sauvegarder les images: # In[3]: # saving tools for the course: sns.set_context("paper", font_scale=1) sns.set_style("ticks") sns.set_palette("colorblind") # colors brown = (0.64, 0.16, 0.16) purple = (148. / 255, 0, 211. / 255) dirname = "../prebuiltimages/" imageformat = ".pdf" # ## Grossesses et cigarettes, impact sur la santé du nouveau né # # Description des données: # # "This dataset is found from # http://www.stat.berkeley.edu/users/statlabs/labs.html. # It accompanies the excellent text Stat Labs: Mathematical Statistics through Applications # Springer-Verlag (2001) by Deborah Nolan and Terry Speed." # # Plus de détails: # https://www.stat.berkeley.edu/users/statlabs/papers/sample.pdf # (notamment sur les biais de collectes des données...) # ## Téléchargement et import pour sauvegarder les données # In[4]: # to use the default values of utils for instance saving = True path_target = "./utils.py" url_shared_files = "http://josephsalmon.eu/enseignement/Montpellier/HLMA408/sharedcode/utils.py" download(url_shared_files, path_target, replace=False) from utils import my_saving_display # ## Téléchargement et import des données # In[5]: url = "http://josephsalmon.eu/enseignement/datasets/babies23.data" # url = "http://www.stat.berkeley.edu/users/statlabs/data/babies23.data" # backup url, without header. path_target = "./babies23.data" download(url, path_target, replace=False) # ## Option de *pandas* et de pré-traitement: # In[6]: # Preoprocessing: only run once or big trouble (think about it!) is_preprocessing_done = 0 # init at 0,if greater don't redo it pd.options.display.max_rows = 8 # set not to display to many lines in pandas pd.set_option('precision', 0) # set to display number at precision 0 in pandas # ## Lecture de la base de données et constructions d'un dataframe: # In[7]: df_babies = pd.read_csv("babies23.data", skiprows=38, sep='\s+') # \s+ : for hanlding spaces df_babies.head(n=10) # df stands for Data Frame # # Pré-traitement: # Lire l'entête du fichier pour comprendre tout ça... les unités, les données manquantes etc. # In[8]: if is_preprocessing_done<1: print("You have to do the pre-processing only once, to avoid unit issues") # Remark: use inplace option to avoid useless copies for nans df_babies['smoke'].replace(9, np.nan, inplace=True) # handle missing values df_babies['smoke'].replace(0, False, inplace=True) df_babies['smoke'].replace(1, True, inplace=True) df_babies['smoke'].replace(2, True, inplace=True) df_babies['smoke'].replace(3, True, inplace=True) df_babies.dropna(inplace=True) print("This is done only because {} < 1".format(is_preprocessing_done)) is_preprocessing_done +=1 # # Analyse du sexe de l'enfant # In[9]: df_babies.head(20) # In[10]: plt.figure(figsize=(6,4)) df_babies.groupby("smoke").wt.plot(kind='kde') plt.legend() plt.xlabel('Poids du bébé (en kg)') plt.ylabel('Proportion') plt.title("Densité du poids du bébé selon le statut tabagique") # In[11]: smoke_by_type = df_babies.smoke.value_counts() n_total = df_babies.smoke.count() print("Il y a {} bébés dont on connait le statut tabagique de la mère".format(n_total)) # In[12]: smoke_by_type # In[13]: smoke_ratio= smoke_by_type[True] / df_babies.smoke.count() # In[14]: print("La proportion de mères fumeuses est de {0:.2f} % dans la popultation totale.".format(smoke_ratio * 100)) # ## Tirage aléatoire de n_samples échantillons (parmi les 1226) # In[15]: n_samples = 91 # change the random_state value to get another random sample # df_extract = df_babies.sample(n=n_samples, random_state=45) df_extract = df_babies.sample(n=n_samples) smoke_by_type_extract = df_extract.smoke.value_counts() smoke_ratio_extract = smoke_by_type_extract[True] / df_extract.smoke.count() print("La proportion de mères fumeuses est de {0:.2f} % dans la popultation extraite.".format(smoke_ratio_extract * 100)) # In[16]: comb(n_total, n_samples, exact=True) # In[17]: n_digits = sys.getsizeof(comb(n_total, n_samples, exact=True)) # In[18]: print(r"Il y a environ 10^{2} façon de choisir {0} nombres parmi {1}".format(n_samples, n_total, n_digits)) # ## Intervalle de confiance avec le TCL # # In[19]: n_samples = 1000 X = np.random.random(n_samples) meanX = np.mean(X) sdX = np.std(X) # In[20]: fig = plt.figure(figsize=(10,4)) ax = sns.stripplot(x=X,alpha=0.3,jitter = 0.1, size=10, marker=".") plt.title(r"Échantillon (loi uniforme, $n ={0}$)".format(n_samples)) plt.axvline(meanX, linestyle='--', color='k', label="$\\bar{x}_n$") plt.legend() plt.show() # In[21]: Z = (X - meanX) / sdX # In[22]: fig = plt.figure(figsize=(10,4)) ax = sns.stripplot(x=Z,alpha=0.3,jitter = 0.1, size=10, marker=".") plt.title(r"Échantillon standardiseé (loi uniforme, $n ={0}$)".format(n_samples)) plt.axvline(0, linestyle='--', color='k', label="$\\bar{x}_n$") plt.legend() plt.show() # In[23]: alpha = 0.00001 c = norm.ppf(1- alpha/ 2) IC = np.array([meanX - c * sdX / np.sqrt(n_samples), meanX + c * sdX / np.sqrt(n_samples)]) # In[ ]: # In[24]: IC, meanX, c # In[25]: from matplotlib.ticker import StrMethodFormatter fig = plt.figure(figsize=(10, 4)) p = plt.axvspan(IC[0], IC[1], facecolor=sns.color_palette()[ 5], alpha=0.5, label="Intervalle à {}%".format(1-alpha)) ax = sns.stripplot(x=X, alpha=0.3, jitter=0.1, size=10, marker=".") plt.title(r"Échantillon (loi uniforme, $n ={0}$)".format(n_samples)) plt.axvline(meanX, linestyle='--', color='k', label="$\\bar{x}_n$") bottom, top = plt.ylim() xtixcks_val = np.arange(0, 1.1, 0.1) plt.xticks(xtixcks_val, xtixcks_val) plt.gca().xaxis.set_major_formatter(StrMethodFormatter('{x:,.2f}')) # 2 decimal places plt.legend() plt.show() my_saving_display(fig, dirname, "jitter_IC_TCL", imageformat, saving=saving) # In[26]: def IC_by_TCL(alpha=0.05, n_samples=1000): X = np.random.random(n_samples) meanX = np.mean(X) sdX = np.std(X) Z = (X - meanX) / sdX c = norm.ppf(1 - alpha / 2) IC = np.array([meanX - c * sdX / np.sqrt(n_samples), meanX + c * sdX / np.sqrt(n_samples)]) fig = plt.figure(figsize=(10, 4)) p = plt.axvspan(IC[0], IC[1], facecolor=sns.color_palette()[ 5], alpha=0.5, label="Intervalle à {}%".format(1 - alpha)) ax = sns.stripplot(x=X, alpha=0.3, jitter=0.1, size=10, marker=".") plt.title(r"Échantillon (loi uniforme, $n ={0}$)".format(n_samples)) plt.axvline(meanX, linestyle='--', color='k', label="$\\bar{x}_n$") bottom, top = plt.ylim() xtixcks_val = np.arange(0, 1.1, 0.1) plt.xticks(xtixcks_val, xtixcks_val) plt.gca().xaxis.set_major_formatter( StrMethodFormatter('{x:,.2f}')) # 2 decimal places plt.legend() plt.show() # # Visualiseur de l'impact de $\alpha$ et de $n$ sur les IC du TCL # In[27]: interact(IC_by_TCL, alpha=(0.001, .999, 0.001),n_samples=(100,2000,100)) # change the first and second value to check more quantiles # In[28]: # np.shape(np.where(np.logical_and(X>=IC[0], X<=IC[1])))[1] # # On répète la création de la statisque sur 500 tirages d'échantillons de taille 1000: # # In[29]: is_IC_has_param = 0 # before starting no IC contains the true parameter n_repetition = 500 for i in range(n_repetition): n_samples = 1000 X = np.random.random(n_samples) meanX = np.mean(X) sdX = np.std(X) Z = (X - meanX) / sdX alpha = 0.04 c = norm.ppf(1- alpha/ 2) IC = np.array([meanX - c * sdX / np.sqrt(n_samples), meanX + c * sdX / np.sqrt(n_samples)]) if (IC[0]<0.5) and (IC[1]>0.5): is_IC_has_param += 1 # else: # print(IC) "Proportion de fois que l'échantiollon contient le vrai paramètre (=espérance): {}".format(is_IC_has_param/n_repetition) # In[ ]: