import pandas as pd, numpy as np
%matplotlib inline
%pylab inline
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
Populating the interactive namespace from numpy and matplotlib
from scipy.stats import norm
# quantile function in python
# R
# https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Normal.html
# PYTHON
# https://stackoverflow.com/questions/24695174/python-equivalent-of-qnorm-qf-and-qchi2-of-r
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html
# qnorm(0.975) IN R
norm.ppf(.975)
1.959963984540054
# assume
# sample mean = 5,
# standard deviation = 2
# sample size = 20.
# use a 95% confidence level
a = 5
s = -2
n = 20
error = norm.ppf(.975)*s/np.sqrt(20)
left = a-error
right = a+error
print ('left: {}'.format(left))
print ('right: {}'.format(right))
left: 5.876522540576581 right: 4.123477459423419
The mean of predicting values are within confidence interval between 4.12 and 5.88 with 95% confidence interval, data is normally distributed and samples are independent
from scipy.stats import t
# quantile function in the The Student t Distribution
# R
# https://stat.ethz.ch/R-manual/R-devel/library/stats/html/TDist.html
# PYTHON
# https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.t.html
# https://stackoverflow.com/questions/19339305/python-function-to-get-the-t-statistic
# qt(0.975,df=n-1) IN R
t.ppf(.975, df=n-1)
2.093024054408263
# assume
# sample mean = 5,
# standard deviation = 2
# sample size = 20.
# use a 95% confidence level
a = 5
s = -2
n = 20
error = t.ppf(.975, df=n-1)*s/np.sqrt(20)
left = a-error
right = a+error
print ('left: {}'.format(left))
print ('right: {}'.format(right))
left: 5.936028812839819 right: 4.063971187160181
The true mean has a probability of 95% of being in the interval between 4.06 and 5.94 assuming that the original random variable is normally distributed, and the samples are independent.
df = pd.read_csv('http://www.cyclismo.org/tutorial/R/_static/w1.dat')
df.head(3)
vals | |
---|---|
0 | 0.43 |
1 | 0.40 |
2 | 0.45 |
df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
vals | 54.0 | 0.765 | 0.378122 | 0.13 | 0.48 | 0.72 | 1.0075 | 1.76 |
error = t.ppf(.975, df=len(df)-1)*std(df)/np.sqrt(len(df))
print ('error : {}'.format(error))
# R
# error <- qt(0.975,df=length(w1$vals)-1)*std(df)/sqrt(length(w1$vals))
error : vals 0.102247 dtype: float64
left = mean(df)-error
right = mean(df)+error
print ('left: {}'.format(left))
print ('right: {}'.format(right))
left: vals 0.662753 dtype: float64 right: vals 0.867247 dtype: float64
There is a 95% probability that the true mean is between 0.66 and 0.87 assuming that the original random variable is normally distributed, and the samples are independent.
# df1
df_Comparison1 = pd.DataFrame({'Mean':[10,15],
'Std. Dev.':[3,2.5],
'Number':[300,230]})
df_Comparison1 = df_Comparison1.rename({0: 'Group1', 1: 'Group1'})
# df2
df_Comparison2 = pd.DataFrame({'Mean':[12,13],
'Std. Dev.':[4,5.3],
'Number':[210,340]})
df_Comparison2 = df_Comparison2.rename({0: 'Group1', 1: 'Group1'})
# df3
df_Comparison3 = pd.DataFrame({'Mean':[30,28.5],
'Std. Dev.':[4.5,3],
'Number':[420,400]})
df_Comparison3 = df_Comparison3.rename({0: 'Group1', 1: 'Group1'})
df_Comparison1
Mean | Number | Std. Dev. | |
---|---|---|---|
Group1 | 10 | 300 | 3.0 |
Group1 | 15 | 230 | 2.5 |
df_Comparison2
Mean | Number | Std. Dev. | |
---|---|---|---|
Group1 | 12 | 210 | 4.0 |
Group1 | 13 | 340 | 5.3 |
df_Comparison3
Mean | Number | Std. Dev. | |
---|---|---|---|
Group1 | 30.0 | 420 | 4.5 |
Group1 | 28.5 | 400 | 3.0 |
# R
# pmin() function returns the parallel minima vector of multiple vectors or matrix.
# http://www.endmemo.com/program/R/pmin.php
# e.g.
#> x <- c(3, 26, 122, 6)
#> y <- c(43,2,54,8)
#> z <- c(9,32,1,9)
#> pmax(x,y,z)
#[1] 43 32 122 9
#> pmin(x,y,z)
#[1] 3 2 1 6
# PYTHON
# np.minimum()
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.minimum.html
m1 = np.array([10,12,30])
m2 = np.array([10.5,13,28.5])
sd1 = np.array([3,4,4.5])
sd2 = np.array([2.5,5.3,3])
num1 = np.array([300,210,420])
num2 = np.array([230,340,400])
se = np.sqrt(sd1*sd1/num1+sd2*sd2/num2)
#error = qt(0.975,df=pmin(num1,num2)-1)*se
error = t.ppf(.975, df=np.minimum(num1,num2) -1)*se
se
array([ 0.23911067, 0.39850737, 0.26592158])
error
array([ 0.47113823, 0.78560924, 0.52278249])
left = (m1-m2)-error
right = (m1-m2)+error
print ('left: {}'.format(left))
print ('right: {}'.format(right))
left: [-0.97113823 -1.78560924 0.97721751] right: [-0.02886177 -0.21439076 2.02278249]
This gives the confidence intervals for each of the three tests. For example, in the first experiment the 95% confidence interval is between -0.97 and -0.03 assuming that the random variables are normally distributed, and the samples are independent.