#!/usr/bin/env python # coding: utf-8 # # HLMA 408: Estimation et Tests # # *** # > __Auteur__: Joseph Salmon # > # # # # Introduction et présentation # ## Import des packages usuels # In[1]: import platform import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from ipywidgets import interact # widget manipulation from scipy.special import comb, binom from scipy.stats import poisson, binned_statistic, chisquare, chi2 # ## Commande "magique" pour un affichage plus avancé en Jupyter # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') # ## Préparation pour l'affichage graphique et sauvegarder les images # In[3]: # saving tools for the course: sns.set_context("paper", font_scale=1) sns.set_style("ticks") sns.set_palette("colorblind") # colors brown = (0.64, 0.16, 0.16) purple = (148. / 255, 0, 211. / 255) dirname = "../prebuiltimages/" imageformat = ".pdf" # ## Palindromes, ADN et cytomegalovirus (CMV) # # Description des données: # # "This dataset is found from # http://www.stat.berkeley.edu/users/statlabs/labs.html. # It accompanies the excellent text Stat Labs: Mathematical Statistics through Applications # Springer-Verlag (2001) by Deborah Nolan and Terry Speed." # # Plus de détails: # https://www.stat.berkeley.edu/users/statlabs/papers/sample.pdf # (notamment sur les biais de collecte des données...) # ## Téléchargement et import pour sauvegarder les données # In[4]: Eleve = False # à changer en True pour un étudiant title_font = {'size': '16'} axis_font = {'size': '14'} if not Eleve: from matplotlib import rc rc('text', usetex=True) font = {'family': 'sans-serif'} rc('font', **font) saving = True from download import download else: saving = False # In[5]: # Pour utiliser les paramètres par défaut qui sont dans `utils.py` from utils import my_saving_display saving = True path_target = "./utils.py" url_shared_files = "http://josephsalmon.eu/enseignement/Montpellier/HLMA408/sharedcode/utils.py" download(url_shared_files, path_target, replace=False) # In[6]: url = "http://josephsalmon.eu/enseignement/datasets/hcmv.data" # url = "http://www.stat.berkeley.edu/users/statlabs/data/hcmv.data" # backup url, without header. path_target = "./hcmv.data" download(url, path_target, replace=False) # In[7]: df_mcv = pd.read_csv("hcmv.data", sep='\s+') # \s+ : gestion des espaces df_mcv.head(n=10) # df = Data Frame # In[8]: fig = plt.figure(figsize=(9, 2.5)) ax = sns.stripplot(x=df_mcv["location"], alpha=0.5, jitter=0.2, size=10, marker="d") plt.ylim([-0.4, 0.4]) ax.set_xlabel('Position',**axis_font) sns.despine() plt.tight_layout() plt.show() my_saving_display(fig, dirname, "jitter_location", imageformat, saving=saving) # # Loi de Poisson et processus de Poisson # # https://fr.wikipedia.org/wiki/Loi_de_Poisson # In[9]: for i in [1, 2, 5, 9]: fig = plt.figure(figsize=(8, 4)) theta = i plt.xlabel('Entiers', **axis_font) plt.ylabel('Probabilité', **axis_font) plt.title( "Probabilités des entiers pour une loi de Poisson de paramètre $\\theta={}$".format(theta), **title_font) x = np.arange(18) plt.ylim([0, 0.4]) plt.stem(x, poisson.pmf(x, theta), basefmt="k--") plt.xticks(x, x) plt.axvline(i, linestyle='--', color='k', label="$\\theta$") plt.tight_layout() plt.legend() plt.show() my_saving_display(fig, dirname, "poisson_distrib_{}".format(theta), imageformat, saving=saving) # In[10]: def poisson_distribution(theta=0.9): """Visualisation de la loi de Poisson.""" fig, ax1 = plt.subplots(1, 1, figsize=(8, 4)) ax1.set_xlabel('Entiers',**axis_font) ax1.set_ylabel('Probabilité',**axis_font) ax1.set_title( "Probabilités des entiers pour une loi de Poisson de paramètre $\\theta={}$".format(theta), **title_font) x = np.arange(18) ax1.set_ylim([0, 0.4]) plt.stem(x, poisson.pmf(x, theta), basefmt="k--") plt.xticks(x, x) ax1.axvline(theta, linestyle='--', color='k', label="$\\theta$") plt.tight_layout() plt.legend() plt.show() # In[11]: interact(poisson_distribution, theta=(0.1, 20, 0.2)); # # Processus de Poisson # In[12]: n_basis = 229354 n_palindromes = df_mcv.count()['location'] lambda_tot = n_palindromes/n_basis # In[13]: for i in range(4): # Determiner le nombre de points à tirer pour le processus n_to_sample = np.random.poisson(lambda_tot * n_basis) points_from_poisson_prcs = np.random.uniform( 0, n_basis+1, size=n_to_sample) fig = plt.figure(figsize=(9, 2.5)) plt.title("Visualisation d'un tirage d'un processus de Poisson".format(theta), **title_font) ax = sns.stripplot(x=points_from_poisson_prcs, alpha=0.5, jitter=0.2, size=10, marker="d") plt.xlim([0, n_basis+1]) plt.ylim([-0.4, 0.4]) sns.despine() plt.tight_layout() plt.show() my_saving_display(fig, dirname, "jitter_location_poisson_prcs{}".format(i), imageformat, saving=saving) # # Traitement des données pour obtenir un compte par plage de 4000 bp # In[14]: lambda_hat = n_palindromes/n_basis * 4000 lambda_hat # In[15]: n_palindromes # In[16]: n_palindromes/n_basis # In[17]: bins = np.arange(1, n_basis, step=4000) n_bins = bins.shape[0] n_bins # In[18]: bins[-3:] # check the last will be [228001, \infty].... bins.shape # In[19]: # display bins: fig = plt.figure(figsize=(9, 2.5)) ax = sns.stripplot(x=df_mcv["location"], alpha=0.5, jitter=0.2, size=10, marker="d") ax.vlines(bins, np.repeat(-0.4, n_bins), np.repeat(0.4, n_bins), linestyle='--', color='k') plt.ylim([-0.4, 0.4]) ax.set_xlabel('Position',**axis_font) sns.despine() plt.tight_layout() plt.show() my_saving_display(fig, dirname, "jitter_location_bins", imageformat, saving=saving) # In[20]: mcv_array = df_mcv['location'].values mcv_array mcv_count = np.zeros(n_basis,) mcv_count[mcv_array-1] = 1 # In[21]: a = pd.cut(df_mcv['location'], bins, right=False, labels=bins[:-1]) # In[22]: counts_by_boxes = a.value_counts(sort=False) counts_by_boxes.mean() # In[23]: b = pd.cut(counts_by_boxes, [0, 3, 4, 5, 6, 7, 8, 9, 15], right=False, labels=[ '0-2', '3', '4', '5', '6', '7', '8', '9-']) # In[24]: counts_by_boxes_bis = b.value_counts(sort=False) counts_by_boxes_bis.sum() # In[25]: counts_by_boxes_bis # In[26]: poisson.cdf(2, lambda_hat) # In[27]: poisson.pmf(0, lambda_hat) + poisson.pmf(1, lambda_hat) + poisson.pmf(2, lambda_hat) # same as before # In[28]: f_exp = np.array([poisson.cdf(2, lambda_hat), poisson.pmf(3, lambda_hat), poisson.pmf(4, lambda_hat), poisson.pmf(5, lambda_hat), poisson.pmf(6, lambda_hat), poisson.pmf(7, lambda_hat), poisson.pmf(8, lambda_hat), 1 - poisson.cdf(8, lambda_hat)]) * counts_by_boxes_bis.sum() for i,val in enumerate(f_exp): print("Fréquence théorique de la case {0} vaut {1:0.1f}".format(i,val)) # In[29]: chi2_val,chi2_pval = chisquare(counts_by_boxes_bis, f_exp=f_exp) # In[30]: chi2_val # In[31]: # Vérification de la formule du cours : hattheta = counts_by_boxes.mean() print(counts_by_boxes_bis.sum() * np.exp(-hattheta) * hattheta**3 / (3*2)) # # Chi2 # # In[32]: x = np.linspace(-1, 30, 300) for df in [3, 5, 9]: alpha = 0.05 quantile = chi2.ppf(1 - alpha, df) fig, ax1 = plt.subplots(1, 1) ax1.plot(x, chi2.pdf(x, df), 'k-', lw=2, label=r"densité") ax1.set_ylim(0, 0.25) ax1.set_xlim(0, 30) ax1.fill_between(x, 0, chi2.pdf(x, df), where=x <= - quantile, color=sns.color_palette()[0]) ax1.fill_between(x, 0, chi2.pdf(x, df), where=x >= quantile, color=sns.color_palette()[0]) quantile_string = "{0:0.2f}".format(quantile) plt.title(r"Visualisation d'un quantile de $\chi^2({})$".format(df) + "\n" + r"Aire $ \alpha = {0:.2f},$ seuil = $".format(alpha, quantile, df)+quantile_string.zfill(5) + "$", **title_font) plt.legend() plt.show() my_saving_display(fig, dirname, "chi2_quantile005_" + str(df), imageformat, saving=saving) # In[33]: # Pour forcer d'avoir 6 case d'affichage pour un nombre quantile_string.zfill(6) # In[34]: def chi2_quantile(alpha=0.9, df=5): """Visualize Chi2 quantiles""" quantile = chi2.ppf(1 - alpha, df) fig, ax1 = plt.subplots(1, 1) ax1.plot(x, chi2.pdf(x, df), 'k-', lw=2, label=r"densité") ax1.set_ylim(0, 0.25) ax1.set_xlim(0, 30) ax1.fill_between(x, 0, chi2.pdf(x, df), where=x <= - quantile, color=sns.color_palette()[0]) ax1.fill_between(x, 0, chi2.pdf(x, df), where=x >= quantile, color=sns.color_palette()[0]) quantile_string = "{0:0.2f}".format(quantile) plt.title(r"Visualisation d'un quantile de $\chi^2({})$".format(df) + "\n" + r"Aire $ \alpha = {0:.2f},$ seuil = $".format(alpha, quantile, df) + quantile_string.zfill(5) + "$", **title_font) plt.legend() plt.show() # In[35]: interact(chi2_quantile, alpha=(0.001, .999, 0.001),df=(1,20,1)); # change the first and second value to check more quantiles # In[36]: df = 6 1-chi2.cdf(chi2_val, df) # In[ ]: