In [1]:

from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Lecture 18¶

Student's lament¶

In [2]:

# Load the midterm data: each row pairs a Section number with a Midterm score.
scores_file = 'scores_by_section.csv'
scores = Table.read_table(scores_file)
scores

Out[2]:

Section	Midterm
1	22
2	12
2	23
2	14
1	20
3	25
4	19
1	24
5	8
6	14

... (349 rows omitted)

In [3]:

# Count how many rows (students) fall in each section.
section_sizes = scores.group('Section')
section_sizes

Out[3]:

Section	count
1	32
2	32
3	27
4	30
5	33
6	32
7	24
8	29
9	30
10	34

... (2 rows omitted)

In [4]:

# Average midterm score for each section; .show() prints every section
# instead of the truncated default preview.
section_averages = scores.group('Section', np.average)
section_averages.show()

Section	Midterm average
1	15.5938
2	15.125
3	13.6667
4	14.7667
5	17.4545
6	15.0312
7	16.625
8	16.3103
9	14.5667
10	15.2353
11	15.8077
12	15.7333

In [5]:

# Draw 27 students at random, without replacement, from the whole class.
# 27 matches the number of rows in Section 3.
sample_size = 27
random_sample = scores.sample(sample_size, with_replacement = False)
random_sample

Out[5]:

Section	Midterm
9	18
1	18
4	19
4	21
4	13
5	15
10	20
5	16
5	11
9	19

... (17 rows omitted)

In [6]:

# Average midterm score of the random sample drawn above.
sample_scores = random_sample.column('Midterm')
np.average(sample_scores)

Out[6]:

16.185185185185187

In [7]:

# Draw a fresh random sample of 27 and average it again; the result varies
# from one draw to the next.
random_sample = scores.sample(27, with_replacement = False)
resampled_scores = random_sample.column('Midterm')
np.average(resampled_scores)

Out[7]:

13.37037037037037

In [8]:

# Simulate the statistic under random selection: repeatedly draw a group of
# 27 students (the size of Section 3) without replacement from the whole
# class and record the group's average midterm score.
repetitions = 50000
sample_size = 27

# Preallocate the results. Growing the array with np.append copies every
# existing element on each iteration, making the loop quadratic in
# `repetitions`; filling by index keeps it linear.
averages = np.empty(repetitions)

for i in np.arange(repetitions):
    random_sample = scores.sample(sample_size, with_replacement = False)
    new_average = np.average(random_sample.column('Midterm'))
    averages[i] = new_average

In [9]:

# Observed statistic: the average midterm score of Section 3. Computed from
# the data instead of typed in as a rounded literal (13.6667), so it stays in
# sync with `scores` and carries full precision.
section_3_scores = scores.where('Section', 3).column('Midterm')
observed_average = np.average(section_3_scores)

In [10]:

# Histogram of the simulated sample averages, with the observed average
# marked as a red dot on the horizontal axis.
simulated = Table().with_column('Random Sample Average', averages)
simulated.hist(bins = 25)
plots.scatter(observed_average, 0, color = 'red', s=40);

In [11]:

#################

In [12]:

# Proportion of simulated averages at or below the observed average.
# Divide by the actual number of simulations rather than a hard-coded 50000,
# so the proportion stays correct if the repetition count changes.
np.count_nonzero(averages <= observed_average) / len(averages)

Out[12]:

0.0594

In [13]:

# Proportion of simulated averages at or below 13.60 (the value drawn as the
# gold line in the plot that follows; presumably an approximate 5% cutoff).
# Divide by the actual number of simulations rather than a hard-coded 50000.
np.count_nonzero(averages <= 13.60) / len(averages)

Out[13]:

0.05234

In [14]:

# Same histogram of simulated averages, now with the observed average (red
# dot) and a vertical gold line at 13.6.
simulated = Table().with_column('Random Sample Average', averages)
simulated.hist(bins = 25)
plots.scatter(observed_average, 0, color='red', s=30)
cutoff = 13.6
plots.plot([cutoff, cutoff], [0, 0.35], color='gold', lw=2);

In [ ]: