from __future__ import print_function, division
%matplotlib inline
import pandas as pd
import thinkstats2
import thinkplot
# Source page for the class-size spreadsheet loaded below:
# http://schools.nyc.gov/AboutUs/schools/data/classsize/classsize_2014_11_14.htm
# skiprows=6 tells pandas to ignore the first six rows of the sheet when parsing.
df = pd.read_excel(
    'CityLevelDistributionSummaryPreliminary2015.xlsx',
    skiprows=6,
)
df
| | GRADE LEVEL | PROGRAM TYPE | CORE SUBJECT (MS CORE and 09-12 ONLY) | SERVICE CATEGORY (0K-08 ONLY) | CLASS SIZE | NUMBER OF CLASSES | NUMBER OF STUDENTS | PERCENT OF STUDENTS IN BOROUGH / PROGRAM / GRADE / SUBJECT |
---|---|---|---|---|---|---|---|---|
0 | 0K | GEN ED / ICT / G&T | - | - | <15 | 86 | 1002 | 0.014110 |
1 | 0K | GEN ED / ICT / G&T | - | - | 15 | 29 | 435 | 0.006126 |
2 | 0K | GEN ED / ICT / G&T | - | - | 16 | 35 | 560 | 0.007886 |
3 | 0K | GEN ED / ICT / G&T | - | - | 17 | 59 | 1003 | 0.014124 |
4 | 0K | GEN ED / ICT / G&T | - | - | 18 | 95 | 1710 | 0.024080 |
5 | 0K | GEN ED / ICT / G&T | - | - | 19 | 98 | 1862 | 0.026221 |
6 | 0K | GEN ED / ICT / G&T | - | - | 20 | 177 | 3540 | 0.049851 |
7 | 0K | GEN ED / ICT / G&T | - | - | 21 | 232 | 4872 | 0.068608 |
8 | 0K | GEN ED / ICT / G&T | - | - | 22 | 298 | 6556 | 0.092322 |
9 | 0K | GEN ED / ICT / G&T | - | - | 23 | 364 | 8372 | 0.117896 |
10 | 0K | GEN ED / ICT / G&T | - | - | 24 | 444 | 10656 | 0.150059 |
11 | 0K | GEN ED / ICT / G&T | - | - | 25 | 865 | 21625 | 0.304526 |
12 | 0K | GEN ED / ICT / G&T | - | - | 26 | 191 | 4966 | 0.069932 |
13 | 0K | GEN ED / ICT / G&T | - | - | 27 | 68 | 1836 | 0.025855 |
14 | 0K | GEN ED / ICT / G&T | - | - | 28 | 42 | 1176 | 0.016561 |
15 | 0K | GEN ED / ICT / G&T | - | - | 29 | 10 | 290 | 0.004084 |
16 | 0K | GEN ED / ICT / G&T | - | - | 30 | 12 | 360 | 0.005070 |
17 | 0K | GEN ED / ICT / G&T | - | - | 31 | 3 | 93 | 0.001310 |
18 | 0K | GEN ED / ICT / G&T | - | - | 32 | 1 | 32 | 0.000451 |
19 | 0K | GEN ED / ICT / G&T | - | - | 33 | 2 | 66 | 0.000929 |
20 | 01 | GEN ED / ICT / G&T | - | - | <15 | 46 | 582 | 0.008027 |
21 | 01 | GEN ED / ICT / G&T | - | - | 15 | 25 | 375 | 0.005172 |
22 | 01 | GEN ED / ICT / G&T | - | - | 16 | 37 | 592 | 0.008165 |
23 | 01 | GEN ED / ICT / G&T | - | - | 17 | 35 | 595 | 0.008206 |
24 | 01 | GEN ED / ICT / G&T | - | - | 18 | 41 | 738 | 0.010178 |
25 | 01 | GEN ED / ICT / G&T | - | - | 19 | 75 | 1425 | 0.019653 |
26 | 01 | GEN ED / ICT / G&T | - | - | 20 | 107 | 2140 | 0.029514 |
27 | 01 | GEN ED / ICT / G&T | - | - | 21 | 147 | 3087 | 0.042575 |
28 | 01 | GEN ED / ICT / G&T | - | - | 22 | 198 | 4356 | 0.060077 |
29 | 01 | GEN ED / ICT / G&T | - | - | 23 | 210 | 4830 | 0.066614 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
474 | 09-12 | SPEC ED | Math | - | 11 | 55 | 605 | 0.087719 |
475 | 09-12 | SPEC ED | Math | - | 12 | 52 | 624 | 0.090474 |
476 | 09-12 | SPEC ED | Math | - | 13 | 67 | 871 | 0.126287 |
477 | 09-12 | SPEC ED | Math | - | 14 | 80 | 1120 | 0.162389 |
478 | 09-12 | SPEC ED | Math | - | 15 | 101 | 1515 | 0.219661 |
479 | 09-12 | SPEC ED | Math | - | >15 | 41 | 685 | 0.099319 |
480 | 09-12 | SPEC ED | Science | - | <6 | 18 | 90 | 0.015169 |
481 | 09-12 | SPEC ED | Science | - | 6 | 33 | 198 | 0.033373 |
482 | 09-12 | SPEC ED | Science | - | 7 | 27 | 189 | 0.031856 |
483 | 09-12 | SPEC ED | Science | - | 8 | 20 | 160 | 0.026968 |
484 | 09-12 | SPEC ED | Science | - | 9 | 28 | 252 | 0.042474 |
485 | 09-12 | SPEC ED | Science | - | 10 | 41 | 410 | 0.069105 |
486 | 09-12 | SPEC ED | Science | - | 11 | 37 | 407 | 0.068599 |
487 | 09-12 | SPEC ED | Science | - | 12 | 41 | 492 | 0.082926 |
488 | 09-12 | SPEC ED | Science | - | 13 | 51 | 663 | 0.111748 |
489 | 09-12 | SPEC ED | Science | - | 14 | 64 | 896 | 0.151020 |
490 | 09-12 | SPEC ED | Science | - | 15 | 85 | 1275 | 0.214900 |
491 | 09-12 | SPEC ED | Science | - | >15 | 53 | 901 | 0.151862 |
492 | 09-12 | SPEC ED | Social Studies | - | <6 | 32 | 160 | 0.022679 |
493 | 09-12 | SPEC ED | Social Studies | - | 6 | 39 | 234 | 0.033168 |
494 | 09-12 | SPEC ED | Social Studies | - | 7 | 50 | 350 | 0.049610 |
495 | 09-12 | SPEC ED | Social Studies | - | 8 | 35 | 280 | 0.039688 |
496 | 09-12 | SPEC ED | Social Studies | - | 9 | 60 | 540 | 0.076541 |
497 | 09-12 | SPEC ED | Social Studies | - | 10 | 42 | 420 | 0.059532 |
498 | 09-12 | SPEC ED | Social Studies | - | 11 | 55 | 605 | 0.085755 |
499 | 09-12 | SPEC ED | Social Studies | - | 12 | 59 | 708 | 0.100354 |
500 | 09-12 | SPEC ED | Social Studies | - | 13 | 66 | 858 | 0.121616 |
501 | 09-12 | SPEC ED | Social Studies | - | 14 | 60 | 840 | 0.119064 |
502 | 09-12 | SPEC ED | Social Studies | - | 15 | 93 | 1395 | 0.197732 |
503 | 09-12 | SPEC ED | Social Studies | - | >15 | 40 | 665 | 0.094259 |
504 rows × 8 columns
# Partition the table by the 'GRADE LEVEL' column and print each group key
# to see which grade levels are present.
grouped = df.groupby('GRADE LEVEL')
for name, members in grouped:
    print(name)
01 02 03 04 05 06 07 08 09-12 0K 0K-09 MS Core
# Build a PMF of class size for the '08' grade-level group, using the
# number of classes of each size as the weight.
grade8 = grouped.get_group('08')

# The open-ended labels '<15' and '>34' are swapped for the stand-in numbers
# 14 and 35 so the whole column can be cast to int.
size = grade8['CLASS SIZE'].replace({'<15': 14, '>34': 35})
size = size.astype(int)
classes = grade8['NUMBER OF CLASSES']

pmf = thinkstats2.Pmf(dict(zip(size, classes)))
thinkplot.Pmf(pmf)
pmf.Mean()
27.406021505376348
def BiasPmf(pmf, label):
    """Return a reweighted, normalized copy of `pmf`.

    The input object is left untouched: all changes are applied to a copy
    obtained from `pmf.Copy(label=label)`. For every value x yielded by
    `pmf.Items()`, `Mult(x, x)` is called on the copy, and the copy is
    normalized before being returned.

    pmf: distribution object providing Copy, Items, Mult and Normalize
    label: label handed to Copy for the new distribution

    returns: the modified copy
    """
    weighted = pmf.Copy(label=label)
    # Only the value of each item is used; the paired probability is ignored.
    # NOTE(review): Mult(x, x) presumably scales the entry for x by x itself,
    # i.e. larger values get proportionally more weight -- confirm against
    # the Pmf implementation.
    for value, _ in pmf.Items():
        weighted.Mult(value, value)
    weighted.Normalize()
    return weighted
# Plot the original distribution together with its BiasPmf counterpart,
# then show the mean of the biased one.
biased = BiasPmf(pmf, label='biased')
thinkplot.Pmf(pmf)
thinkplot.Pmf(biased)
biased.Mean()
28.088749038748254
# https://www.purdue.edu/datadigest/2013-14/InstrStuLIfe/DistUGClasses.html
# Each entry of `sizes` is paired with the entry of `counts` at the same
# position when the PMF is built.
sizes = [1, 5, 15, 25, 35, 45, 75, 125]
counts = [138, 635, 1788, 1979, 796, 354, 487, 333]

# Plot settings shared with the other Purdue figures in this file.
xlim = [-5, 130]
formats = ['png', 'pdf']

pmf = thinkstats2.Pmf(dict(zip(sizes, counts)), label='actual')

# First figure: histogram of the actual distribution only.
thinkplot.PrePlot(2)
thinkplot.Hist(pmf)
thinkplot.Config(
    xlabel='class size',
    ylabel='PMF',
    xlim=xlim,
    loc='upper right',
)
thinkplot.Save('purdue1', formats=formats)
Writing purdue1.png Writing purdue1.pdf
<matplotlib.figure.Figure at 0x7f68eda74690>
# Second figure: actual and biased histograms on the same axes. The two
# series use opposite bar alignment (presumably so they do not overlap).
biased = BiasPmf(pmf, label='biased')

thinkplot.PrePlot(2)
thinkplot.Hist(pmf, align='right')
thinkplot.Hist(biased, color='orange', align='left')
thinkplot.Config(
    xlabel='class size',
    ylabel='PMF',
    xlim=xlim,
    loc='upper right',
)
thinkplot.Save('purdue2', formats=formats)
Writing purdue2.png Writing purdue2.pdf
<matplotlib.figure.Figure at 0x7f68ed11e350>
# Third figure: CDFs of the actual and biased distributions on shared axes.
thinkplot.PrePlot(2)
thinkplot.Cdf(pmf.MakeCdf())
thinkplot.Cdf(biased.MakeCdf(), color='orange')
thinkplot.Config(
    xlabel='class size',
    ylabel='CDF',
    xlim=xlim,
    loc='lower right',
)
thinkplot.Save('purdue3', formats=formats)
Writing purdue3.png Writing purdue3.pdf
<matplotlib.figure.Figure at 0x7f68edb981d0>
# Show the mean of the actual distribution next to the mean of the biased one.
(pmf.Mean(), biased.Mean())
(30.959754224270352, 56.01463671185027)