Simpson's paradox

Example from article

Actual data

In [1]:
admission_rate = (3738 + 1494) / 12763
admission_rate
Out[1]:
0.4099349682676487
In [2]:
total_men = 3738 + 4704
total_men
Out[2]:
8442
In [3]:
expect_men_admitted = total_men * admission_rate
expect_men_admitted
Out[3]:
3460.6710021154904
In [4]:
total_women = 1494 + 2827
total_women
Out[4]:
4321
In [5]:
expect_women_admitted = total_women * admission_rate
expect_women_admitted
Out[5]:
1771.32899788451

Data in the example

In [6]:
admission_rate = (250 + 250) / (250 + 250 + 300 + 400)
admission_rate
Out[6]:
0.4166666666666667
In [7]:
total_men = 250 + 300
total_men
Out[7]:
550
In [8]:
total_women = 250 + 400
total_women
Out[8]:
650
In [9]:
expect_men_admitted = total_men * admission_rate
expect_men_admitted
Out[9]:
229.16666666666669
In [10]:
expect_women_admitted = total_women * admission_rate
expect_women_admitted
Out[10]:
270.83333333333337

Regression example

In [11]:
import pandas as pd
import seaborn as sns
import numpy as np
In [12]:
men = pd.DataFrame({"height": np.random.normal(1.8, 0.05, 100), "weight": np.random.normal(85, 1, 100)})
In [13]:
men['bmi'] = men['weight'] / men['height']**2
In [14]:
men
Out[14]:
height weight bmi
0 1.846326 84.350208 24.743971
1 1.705580 83.648994 28.755193
2 1.756489 84.277095 27.316110
3 1.848740 84.762614 24.800061
4 1.828322 85.054365 25.444335
5 1.792602 85.289775 26.541740
6 1.790928 86.048167 26.827812
7 1.809917 83.658219 25.538256
8 1.804939 84.539923 25.949953
9 1.755725 86.369401 28.018621
10 1.817654 85.530689 25.888063
11 1.798073 84.488833 26.132729
12 1.789017 85.597579 26.744371
13 1.736679 85.642032 28.395390
14 1.717038 84.934525 28.808750
15 1.754468 85.220435 27.685526
16 1.745762 86.236397 28.295705
17 1.738966 83.296604 27.545156
18 1.744043 84.484906 27.775682
19 1.814423 87.287624 26.514010
20 1.696548 85.193880 29.598924
21 1.834508 86.174038 25.605727
22 1.847978 85.316498 24.982692
23 1.838084 84.265763 24.941385
24 1.791247 86.197007 26.864651
25 1.738719 85.308274 28.218388
26 1.810138 84.111496 25.670353
27 1.828684 84.670985 25.319619
28 1.822241 85.705984 25.810675
29 1.727342 87.253849 29.243388
... ... ... ...
70 1.812735 85.353152 25.974731
71 1.820731 85.452422 25.777023
72 1.850460 84.517401 24.682373
73 1.758177 84.955599 27.483171
74 1.720298 83.389695 28.177659
75 1.818238 84.249254 25.483829
76 1.801602 85.523859 26.349323
77 1.738942 85.662463 28.328297
78 1.828085 84.353179 25.241129
79 1.853273 84.966655 24.738281
80 1.779977 84.804354 26.766354
81 1.851053 84.969054 24.798354
82 1.807441 84.043159 25.726102
83 1.856138 85.852565 24.919121
84 1.837212 85.128751 25.220738
85 1.794949 86.020067 26.699028
86 1.771492 84.545193 26.940797
87 1.882290 86.189506 24.326592
88 1.817829 85.770484 25.955643
89 1.799267 85.202430 26.318481
90 1.846834 84.033850 24.637605
91 1.773097 85.051595 27.053130
92 1.732057 84.835168 28.278197
93 1.800811 84.196582 25.963185
94 1.850443 85.347839 24.925330
95 1.721525 83.980465 28.336855
96 1.798503 83.329209 25.761735
97 1.835831 86.217741 25.581795
98 1.790648 85.394169 26.632243
99 1.855777 84.535031 24.546248

100 rows × 3 columns

In [15]:
sns.lmplot(x='height', y='bmi', data=men)
C:\Users\mclou\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Out[15]:
<seaborn.axisgrid.FacetGrid at 0x1e360ad6c18>
In [16]:
women = pd.DataFrame({"height": np.random.normal(1.6, 0.05, 100), "weight": np.random.normal(60, 1, 100)})
In [17]:
women['bmi'] = women['weight'] / women['height']**2
In [18]:
women
Out[18]:
height weight bmi
0 1.635764 60.580269 22.640706
1 1.561134 59.842627 24.554452
2 1.600291 58.834190 22.973758
3 1.541475 59.004968 24.832234
4 1.617897 60.302164 23.037285
5 1.614042 61.837639 23.736871
6 1.593266 59.645419 23.496357
7 1.590521 60.987380 24.107989
8 1.619571 58.195426 22.186507
9 1.634805 59.926340 22.422582
10 1.526086 60.773087 26.094743
11 1.636303 60.233596 22.496313
12 1.585557 59.798904 23.786453
13 1.604561 60.725505 23.586247
14 1.618125 60.008555 22.918658
15 1.609263 61.205441 23.633928
16 1.614334 60.998740 23.406365
17 1.692908 59.084046 20.615958
18 1.545380 58.278846 24.402824
19 1.572084 59.759869 24.180088
20 1.543847 59.237603 24.853593
21 1.574731 60.450504 24.377392
22 1.581154 58.665411 23.465723
23 1.627137 60.157900 22.721890
24 1.573138 60.682653 24.520576
25 1.567199 60.141314 24.486370
26 1.584278 60.423229 24.073614
27 1.603280 60.133212 23.393512
28 1.648226 59.678417 21.967655
29 1.600238 58.983817 23.033699
... ... ... ...
70 1.652888 57.802395 21.157243
71 1.596816 62.396018 24.470728
72 1.657528 61.250404 22.293958
73 1.673148 58.042489 20.733715
74 1.507153 60.806251 26.769101
75 1.619881 59.094099 22.520506
76 1.622517 59.199302 22.487330
77 1.595906 60.075127 23.587412
78 1.659761 58.601985 21.272637
79 1.711083 59.453638 20.306556
80 1.632763 58.053691 21.776266
81 1.612295 59.658539 22.950060
82 1.624213 60.653229 22.991522
83 1.542336 59.840166 25.155607
84 1.591212 59.762734 23.603385
85 1.575955 59.963829 24.143583
86 1.623107 60.997220 23.153438
87 1.658766 60.111609 21.846824
88 1.561854 60.627608 24.853624
89 1.577502 60.144334 24.168787
90 1.623968 59.850311 22.694030
91 1.597063 61.065658 23.941591
92 1.682761 59.906006 21.155602
93 1.641190 60.122483 22.321282
94 1.581411 60.415878 24.158047
95 1.546597 60.029364 25.096279
96 1.591066 60.472437 23.888081
97 1.490416 59.096371 26.603924
98 1.555316 59.681624 24.671940
99 1.515912 61.311259 26.680384

100 rows × 3 columns

In [19]:
sns.lmplot(x='height', y='bmi', data=women)
C:\Users\mclou\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x1e361e48c50>
In [20]:
men['gender'] = 'male'
women['gender'] = 'female'
people = pd.concat([men, women])
In [21]:
people
Out[21]:
height weight bmi gender
0 1.846326 84.350208 24.743971 male
1 1.705580 83.648994 28.755193 male
2 1.756489 84.277095 27.316110 male
3 1.848740 84.762614 24.800061 male
4 1.828322 85.054365 25.444335 male
5 1.792602 85.289775 26.541740 male
6 1.790928 86.048167 26.827812 male
7 1.809917 83.658219 25.538256 male
8 1.804939 84.539923 25.949953 male
9 1.755725 86.369401 28.018621 male
10 1.817654 85.530689 25.888063 male
11 1.798073 84.488833 26.132729 male
12 1.789017 85.597579 26.744371 male
13 1.736679 85.642032 28.395390 male
14 1.717038 84.934525 28.808750 male
15 1.754468 85.220435 27.685526 male
16 1.745762 86.236397 28.295705 male
17 1.738966 83.296604 27.545156 male
18 1.744043 84.484906 27.775682 male
19 1.814423 87.287624 26.514010 male
20 1.696548 85.193880 29.598924 male
21 1.834508 86.174038 25.605727 male
22 1.847978 85.316498 24.982692 male
23 1.838084 84.265763 24.941385 male
24 1.791247 86.197007 26.864651 male
25 1.738719 85.308274 28.218388 male
26 1.810138 84.111496 25.670353 male
27 1.828684 84.670985 25.319619 male
28 1.822241 85.705984 25.810675 male
29 1.727342 87.253849 29.243388 male
... ... ... ... ...
70 1.652888 57.802395 21.157243 female
71 1.596816 62.396018 24.470728 female
72 1.657528 61.250404 22.293958 female
73 1.673148 58.042489 20.733715 female
74 1.507153 60.806251 26.769101 female
75 1.619881 59.094099 22.520506 female
76 1.622517 59.199302 22.487330 female
77 1.595906 60.075127 23.587412 female
78 1.659761 58.601985 21.272637 female
79 1.711083 59.453638 20.306556 female
80 1.632763 58.053691 21.776266 female
81 1.612295 59.658539 22.950060 female
82 1.624213 60.653229 22.991522 female
83 1.542336 59.840166 25.155607 female
84 1.591212 59.762734 23.603385 female
85 1.575955 59.963829 24.143583 female
86 1.623107 60.997220 23.153438 female
87 1.658766 60.111609 21.846824 female
88 1.561854 60.627608 24.853624 female
89 1.577502 60.144334 24.168787 female
90 1.623968 59.850311 22.694030 female
91 1.597063 61.065658 23.941591 female
92 1.682761 59.906006 21.155602 female
93 1.641190 60.122483 22.321282 female
94 1.581411 60.415878 24.158047 female
95 1.546597 60.029364 25.096279 female
96 1.591066 60.472437 23.888081 female
97 1.490416 59.096371 26.603924 female
98 1.555316 59.681624 24.671940 female
99 1.515912 61.311259 26.680384 female

200 rows × 4 columns

In [22]:
sns.lmplot(x='height', y='bmi', data=people)
C:\Users\mclou\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x1e361e48860>

End