#!/usr/bin/env python
# coding: utf-8

# Effect Size
# ======================
#
# Credits: Forked from [CompStats](https://github.com/AllenDowney/CompStats) by Allen Downey. License: [Creative Commons Attribution 4.0 International](http://creativecommons.org/licenses/by/4.0/).

# In[1]:

from __future__ import print_function, division

import numpy
import scipy.stats

import matplotlib.pyplot as pyplot

from ipywidgets import interact, fixed
import ipywidgets as widgets

# seed the random number generator so we all get the same results
numpy.random.seed(17)

# some nice colors from http://colorbrewer2.org/
COLOR1 = '#7fc97f'
COLOR2 = '#beaed4'
COLOR3 = '#fdc086'
COLOR4 = '#ffff99'
COLOR5 = '#386cb0'

get_ipython().run_line_magic('matplotlib', 'inline')


# To explore statistics that quantify effect size, we'll look at the difference in height between men and women. I used data from the Behavioral Risk Factor Surveillance System (BRFSS) to estimate the mean and standard deviation of height in cm for adult women and men in the U.S.
#
# I'll use `scipy.stats.norm` to represent the distributions. The result is an `rv` object (which stands for random variable).

# In[2]:

mu1, sig1 = 178, 7.7
male_height = scipy.stats.norm(mu1, sig1)


# In[3]:

mu2, sig2 = 163, 7.3
female_height = scipy.stats.norm(mu2, sig2)


# The following function evaluates the normal (Gaussian) probability density function (PDF) within 4 standard deviations of the mean. It takes an rv object and returns a pair of NumPy arrays.

# In[4]:

def eval_pdf(rv, num=4):
    mean, std = rv.mean(), rv.std()
    xs = numpy.linspace(mean - num*std, mean + num*std, 100)
    ys = rv.pdf(xs)
    return xs, ys


# Here's what the two distributions look like.

# In[5]:

xs, ys = eval_pdf(male_height)
pyplot.plot(xs, ys, label='male', linewidth=4, color=COLOR2)

xs, ys = eval_pdf(female_height)
pyplot.plot(xs, ys, label='female', linewidth=4, color=COLOR3)
pyplot.xlabel('height (cm)')
None


# Let's assume for now that those are the true distributions for the population. Of course, in real life we never observe the true population distribution. We generally have to work with a random sample.
#
# I'll use `rvs` to generate random samples from the population distributions. Note that these are totally random, totally representative samples, with no measurement error!

# In[6]:

male_sample = male_height.rvs(1000)


# In[7]:

female_sample = female_height.rvs(1000)


# Both samples are NumPy arrays. Now we can compute sample statistics like the mean and standard deviation.

# In[8]:

mean1, std1 = male_sample.mean(), male_sample.std()
mean1, std1


# The sample mean is close to the population mean, but not exact, as expected.

# In[9]:

mean2, std2 = female_sample.mean(), female_sample.std()
mean2, std2


# And the results are similar for the female sample.
#
# Now, there are many ways to describe the magnitude of the difference between these distributions. An obvious one is the difference in the means:

# In[10]:

difference_in_means = male_sample.mean() - female_sample.mean()
difference_in_means   # in cm


# On average, men are 14--15 centimeters taller. For some applications, that would be a good way to describe the difference, but there are a few problems:
#
# * Without knowing more about the distributions (like the standard deviations) it's hard to interpret whether a difference like 15 cm is a lot or not.
#
# * The magnitude of the difference depends on the units of measure, making it hard to compare across different studies; the quick example below makes the point.
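# Here's a quick sketch of that second point: expressing the same difference in inches (the only assumption is the standard conversion of 2.54 cm per inch) gives a different number, even though nothing about the populations has changed.

cm_per_inch = 2.54   # standard conversion factor, the only assumption here
difference_in_means / cm_per_inch   # the same effect, now roughly 6 inches
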
#
# There are a number of ways to quantify the difference between distributions. A simple option is to express the difference as a percentage of the mean.

# In[11]:

# Exercise: what is the relative difference in means, expressed as a percentage?

relative_difference = difference_in_means / male_sample.mean()
relative_difference * 100   # percent


# But a problem with relative differences is that you have to choose which mean to express them relative to.

# In[12]:

relative_difference = difference_in_means / female_sample.mean()
relative_difference * 100   # percent


# Part Two
# ========
#
# An alternative way to express the difference between distributions is to see how much they overlap. To define overlap, we choose a threshold between the two means. The simplest threshold is the midpoint between the means:

# In[13]:

simple_thresh = (mean1 + mean2) / 2
simple_thresh


# A better, but slightly more complicated threshold is the place where the PDFs cross.

# In[14]:

thresh = (std1 * mean2 + std2 * mean1) / (std1 + std2)
thresh


# In this example, there's not much difference between the two thresholds.
#
# Now we can count how many men are below the threshold:

# In[15]:

male_below_thresh = sum(male_sample < thresh)
male_below_thresh


# And how many women are above it:

# In[16]:

female_above_thresh = sum(female_sample > thresh)
female_above_thresh


# The "overlap" is the total area under the curves that ends up on the wrong side of the threshold.

# In[17]:

overlap = male_below_thresh / len(male_sample) + female_above_thresh / len(female_sample)
overlap


# Or in more practical terms, you might report the fraction of people who would be misclassified if you tried to use height to guess sex:

# In[18]:

misclassification_rate = overlap / 2
misclassification_rate


# Another way to quantify the difference between distributions is what's called "probability of superiority", which is a problematic term, but in this context it's the probability that a randomly-chosen man is taller than a randomly-chosen woman.

# In[19]:

# Exercise: suppose I choose a man and a woman at random.
# What is the probability that the man is taller?

sum(x > y for x, y in zip(male_sample, female_sample)) / len(male_sample)


# Overlap (or misclassification rate) and "probability of superiority" have two good properties:
#
# * As probabilities, they don't depend on units of measure, so they are comparable between studies.
#
# * They are expressed in operational terms, so a reader has a sense of what practical effect the difference makes.
#
# There is one other common way to express the difference between distributions. Cohen's $d$ is the difference in means, standardized by dividing by the standard deviation. Here's a function that computes it:

# In[20]:

def CohenEffectSize(group1, group2):
    """Compute Cohen's d.

    group1: Series or NumPy array
    group2: Series or NumPy array

    returns: float
    """
    diff = group1.mean() - group2.mean()

    n1, n2 = len(group1), len(group2)
    var1 = group1.var()
    var2 = group2.var()

    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    d = diff / numpy.sqrt(pooled_var)
    return d


# Computing the denominator is a little complicated; in fact, people have proposed several ways to do it. This implementation uses the "pooled standard deviation": the square root of the sample-size-weighted average of the two group variances.
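# For comparison, here's a sketch of one common alternative: weight the unbiased variances by their degrees of freedom, which is the textbook definition of the pooled standard deviation. The function name `CohenEffectSizeUnbiased` is mine, added just for illustration; with samples of 1000, the two versions agree to a few decimal places.

def CohenEffectSizeUnbiased(group1, group2):
    """Compute Cohen's d using the degrees-of-freedom-weighted pooled variance.

    (Illustrative variant, not part of the original notebook.)

    group1: Series or NumPy array
    group2: Series or NumPy array

    returns: float
    """
    diff = group1.mean() - group2.mean()

    n1, n2 = len(group1), len(group2)
    var1 = group1.var(ddof=1)   # unbiased sample variances
    var2 = group2.var(ddof=1)

    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    return diff / numpy.sqrt(pooled_var)

# CohenEffectSizeUnbiased(male_sample, female_sample) gives nearly the same value.
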
# And here's the result for the difference in height between men and women.

# In[21]:

CohenEffectSize(male_sample, female_sample)


# Most people don't have a good sense of how big $d=1.9$ is, so let's make a visualization to get calibrated.
#
# Here's a function that encapsulates the code we already saw for computing overlap and probability of superiority.

# In[22]:

def overlap_superiority(control, treatment, n=1000):
    """Estimates overlap and superiority based on a sample.

    control: scipy.stats rv object
    treatment: scipy.stats rv object
    n: sample size

    returns: tuple of floats (overlap, superiority)
    """
    control_sample = control.rvs(n)
    treatment_sample = treatment.rvs(n)
    thresh = (control.mean() + treatment.mean()) / 2

    control_above = sum(control_sample > thresh)
    treatment_below = sum(treatment_sample < thresh)
    overlap = (control_above + treatment_below) / n

    superiority = sum(x > y for x, y in zip(treatment_sample, control_sample)) / n
    return overlap, superiority


# Here's the function that takes Cohen's $d$, plots normal distributions with the given effect size, and prints their overlap and superiority.

# In[23]:

def plot_pdfs(cohen_d=2):
    """Plot PDFs for distributions that differ by some number of stds.

    cohen_d: number of standard deviations between the means
    """
    control = scipy.stats.norm(0, 1)
    treatment = scipy.stats.norm(cohen_d, 1)

    xs, ys = eval_pdf(control)
    pyplot.fill_between(xs, ys, label='control', color=COLOR3, alpha=0.7)

    xs, ys = eval_pdf(treatment)
    pyplot.fill_between(xs, ys, label='treatment', color=COLOR2, alpha=0.7)

    o, s = overlap_superiority(control, treatment)
    print('overlap', o)
    print('superiority', s)


# Here's an example that demonstrates the function:

# In[24]:

plot_pdfs(2)


# And an interactive widget you can use to visualize what different values of $d$ mean:

# In[25]:

slider = widgets.FloatSlider(min=0, max=4, value=2)
interact(plot_pdfs, cohen_d=slider)
None


# Cohen's $d$ has a few nice properties:
#
# * Because mean and standard deviation have the same units, their ratio is dimensionless, so we can compare $d$ across different studies.
#
# * In fields that commonly use $d$, people are calibrated to know what values should be considered big, surprising, or important.
#
# * Given $d$ (and the assumption that the distributions are normal), you can compute overlap, superiority, and related statistics; there's a short sketch of that calculation after the summary below.

# In summary, the best way to report effect size often depends on the audience and your goals. There is often a tradeoff between summary statistics that have good technical properties and statistics that are meaningful to a general audience.
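# As promised above, here's a sketch of how overlap and superiority follow analytically from $d$ under the normality assumption, using only the standard normal CDF. The values should match what `plot_pdfs(2)` prints, up to sampling noise.

d = 2.0

# P(treatment > control): the difference of two unit-variance normals with
# means d apart is normal with mean d and standard deviation sqrt(2).
superiority_analytic = scipy.stats.norm.cdf(d / numpy.sqrt(2))

# Overlap as defined above: both tails that fall on the wrong side of the
# midpoint threshold, each of which has probability Phi(-d/2).
overlap_analytic = 2 * scipy.stats.norm.cdf(-d / 2)

superiority_analytic, overlap_analytic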