Statistics¶

Some matplotlib and pandas experiments based on "Head First Statistics" from O'Reilly Media, Inc.

In [207]:

from pandas import *

1. First Impression¶

Why learn statistics? (p. 3)¶

In [208]:

profit_data = [("jul", 2.0), ("aug", 2.1), ("sep", 2.2), ("okt", 2.1), ("nov", 2.3), ("dez", 2.4)]
profit_data = DataFrame(profit_data, columns=["month", "profit"])
profit_data

Out[208]:

	month	profit
0	jul	2.0
1	aug	2.1
2	sep	2.2
3	okt	2.1
4	nov	2.3
5	dez	2.4

In [209]:

profit_fig, profit_axes = plt.subplots(nrows=1, ncols=2)
profit_fig.set_size_inches(15,5)
profit_axes[0].set_ylim(0, 2.5)
profit_data.plot(ax = profit_axes[0], x="month").set_title("Stagnating profits.")
profit_axes[1].set_ylim(2.0, 2.5)
profit_data.plot(ax = profit_axes[1], x="month").set_title("Awesome profits!");

The figures use different scaling and offset of the x axes, thus giving a different impression of the profit.

Manic Mango's Diagrams¶

In [210]:

mango_data = [("sports", 27500, 0.99), ("strategy", 11500, 0.9), ("action", 6000, 0.85), ("shooter", 3500, 0.95), ("others", 1500, 0.85)]
mango_data = DataFrame(mango_data, columns=["type", "sales", "satisfaction"])
mango_data

Out[210]:

	type	sales	satisfaction
0	sports	27500	0.99
1	strategy	11500	0.90
2	action	6000	0.85
3	shooter	3500	0.95
4	others	1500	0.85

In [211]:

mango_colors=["#F3E761", "#F0F0F0", "#97C2B9", "#E9ABC7", "#CAA882"]
mango_fig, mango_axes = subplots(nrows=1, ncols=2)
mango_fig.set_size_inches(16,7)
_mango_sales = mango_data.sort("sales")
mango_axes[0].pie(_mango_sales.sales, labels=_mango_sales.type + "\n" + _mango_sales.sales.astype(str), startangle=90, colors=mango_colors) # autopct='%1.1f%%'
mango_axes[0].set_title("sales by type");
mango_axes[1].pie(_mango_sales.satisfaction, labels=_mango_sales.type + "\n" + (_mango_sales.satisfaction*100).astype(str) + "%", startangle=90, colors=mango_colors)
mango_axes[1].set_title("satisfied players by type");

Pie charts are only useful if the values are not all similar, and if they are all parts of a common whole.

In [212]:

mango_regions = [("A", 1000), ("B", 5000), ("C", 7500), ("D", 8000), ("E", 9500)]
mango_regions = DataFrame(mango_regions, columns=["region", "sales"])
mango_regions

Out[212]:

	region	sales
0	A	1000
1	B	5000
2	C	7500
3	D	8000
4	E	9500

In [213]:

fig, axes = subplots(nrows=2, ncols=2)
fig.set_size_inches(16,11)
mango_regions_index = arange(len(mango_regions))
mango_width=0.8
axes[0,0].bar(mango_regions_index - mango_width/2, mango_regions.sales, width=mango_width, color="#cccccc")
axes[0,0].set_xticks(mango_regions_index)
axes[0,0].set_xticklabels(mango_regions.region);
axes[0,0].set_title("sales by region");
mango_data_index = arange(len(mango_data))

axes[0,1].barh(mango_data_index - mango_width/2, list(reversed(list(100*mango_data.satisfaction))), height=mango_width, color="#cccccc")
axes[0,1].set_yticks(mango_data_index)
axes[0,1].set_yticklabels(list(reversed(list(mango_data.type))))
axes[0,1].set_title("satisfaction by type");

mango_data_sat = list(reversed(list(mango_data.satisfaction*mango_data.sales)))
mango_data_notsat = list(reversed(list((1-mango_data.satisfaction)*mango_data.sales)))
mango_b1 = axes[1,0].barh(mango_data_index, mango_data_sat, height=mango_width/2, color="#8888ff")
mango_b2 = axes[1,0].barh(mango_data_index - mango_width/2, mango_data_notsat, height=mango_width/2, color="#88ff88")
axes[1,0].set_yticks(mango_data_index)
axes[1,0].set_yticklabels(list(reversed(list(mango_data.type))))
axes[1,0].set_title("satisfaction by type (absolute)");
axes[1,0].legend([mango_b1, mango_b2], ["satisfied", "unsatisfied"], loc="lower right");

mango_b3 = axes[1,1].barh(mango_data_index - mango_width/2, mango_data_sat, height=mango_width, color="#8888ff")
mango_b4 = axes[1,1].barh(mango_data_index - mango_width/2, mango_data_notsat, left=mango_data_sat, height=mango_width, color="#88ff88")
axes[1,1].set_yticks(mango_data_index)
axes[1,1].set_yticklabels(list(reversed(list(mango_data.type))))
axes[1,1].set_title("satisfaction by type (absolute)");
axes[1,1].legend([mango_b3, mango_b4], ["satisfied", "unsatisfied"], loc="lower right");

Histogram¶

In [214]:

score_data = [(0,199,5), (200,399,29), (400,599, 56), (600, 799, 17), (800,999,3)]
score_data = DataFrame(score_data, columns=("points_from", "points_to", "frequency"))
score_data

Out[214]:

	points_from	points_to	frequency
0	0	199	5
1	200	399	29
2	400	599	56
3	600	799	17
4	800	999	3

In [215]:

score_fig, score_axes = subplots(nrows=1, ncols=2)
score_fig.set_size_inches(16,4)
score_axes[0].bar(score_data.points_from, score_data.frequency, width=score_data.points_to-score_data.points_from+1, color="#cccccc");
score_axes[0].set_title("Frequency of games by points");
score_axes[0].set_xlabel("Points");
score_axes[0].set_ylabel("Frequency");
score_list = (score_data.points_to+score_data.points_from)/2
score_scores = []
for score in score_list:
    score_scores.extend(repeat(score, score_data[(score_data.points_from <= score) & (score_data.points_to >= score)].frequency))
score_axes[1].hist(score_scores, bins=sorted(set(list(score_data.points_from) + list(score_data.points_to+1))), color="#cccccc")
score_axes[1].set_title("Frequency of games by points");
score_axes[1].set_xlabel("Points");
score_axes[1].set_ylabel("Frequency");

In [216]:

gamelen_data = [(0,1,4300), (1,3,6900), (3,5,4900), (5,10,2000), (10,24,2100)]
gamelen_data = DataFrame(gamelen_data, columns=("len_from", "len_to", "frequency"))
gamelen_data

Out[216]:

	len_from	len_to	frequency
0	0	1	4300
1	1	3	6900
2	3	5	4900
3	5	10	2000
4	10	24	2100

In [217]:

gamelen_fig, gamelen_axis = subplots(nrows=1, ncols=2)
gamelen_fig.set_size_inches(16,4)
gamelen_axis[0].bar(gamelen_data.len_from, gamelen_data.frequency, width=gamelen_data.len_to-gamelen_data.len_from, color="#cccccc");
gamelen_axis[0].set_title("Frequency of games by length");
gamelen_axis[0].set_xlabel("Length/h");
gamelen_axis[0].set_ylabel("Frequency");
gamelen_axis[0].spines['right'].set_color('none')
gamelen_axis[0].spines['top'].set_color('none')
gamelen_axis[0].xaxis.set_ticks_position('bottom')
gamelen_axis[0].yaxis.set_ticks_position('left')
gamelen_list = (gamelen_data.len_to+gamelen_data.len_from)/2
gamelen_list2 = []
for gamelen in gamelen_list:
    gamelen_list2.extend(repeat(gamelen, gamelen_data[(gamelen_data.len_from <= gamelen) & (gamelen_data.len_to >= gamelen)].frequency))
gamelen_axis[1].hist(gamelen_list2, bins=sorted(set(list(gamelen_data.len_from) + list(gamelen_data.len_to))), color="#cccccc", normed=True)
gamelen_axis[1].set_title("Frequency of games by length (normalized)");
gamelen_axis[1].set_xlabel("Length/h");
gamelen_axis[1].set_ylabel("Frequency");
gamelen_axis[1].spines['right'].set_color('none')
gamelen_axis[1].spines['top'].set_color('none')
gamelen_axis[1].xaxis.set_ticks_position('bottom')
gamelen_axis[1].yaxis.set_ticks_position('left')

In [218]:

gamelen_fig, gamelen_axis = subplots(nrows=1, ncols=1)
gamelen_fig.set_size_inches(7,4)
gamelen_axis.plot([0] + list(gamelen_data.len_to), [0] + list(gamelen_data.frequency.cumsum()));
gamelen_axis.set_title("Cumulative sum of games by length");
gamelen_axis.set_xlabel("Length/h");
gamelen_axis.set_ylabel("Cumulative sum");
gamelen_axis.spines['right'].set_color('none')
gamelen_axis.spines['top'].set_color('none')
gamelen_axis.xaxis.set_ticks_position('bottom')
gamelen_axis.yaxis.set_ticks_position('left')
gamelen_axis.set_xlim(0)
gamelen_axis.set_ylim(0, 22500);
gamelen_axis.yaxis.set_major_locator(MultipleLocator(2500.0))

2. Averages¶

Arithmetic Mean¶

In [219]:

power_ages = DataFrame([19,20,20,20,21], columns=["age"])
power_ages

Out[219]:

	age
0	19
1	20
2	20
3	20
4	21

Arithmetic mean: \begin{equation} \textrm{mean}(x_1,\...,x_n)=\overline x={1\over n}\sum_{i=1}^n x_i \end{equation}

In [220]:

mean(power_ages.age)

Out[220]:

20.0

In [221]:

power_agefreqs = DataFrame([(19, 1), (20, 3), (21, 1)], columns=["age", "frequency"])
power_agefreqs

Out[221]:

	age	frequency
0	19	1
1	20	3
2	21	1

In [222]:

def avg(frame):
    return sum(1. * frame.age * frame.frequency) / sum(frame.frequency)
avg(power_agefreqs)

Out[222]:

20.0

In [223]:

kungfu_agefreqs = DataFrame([(19, 3), (20, 5), (21, 3), (136, 1), (138, 1)], columns=["age", "frequency"])
kungfu_agefreqs

Out[223]:

	age	frequency
0	19	3
1	20	5
2	21	3
3	136	1
4	138	1

In [224]:

avg(kungfu_agefreqs)

Out[224]:

38.0

In [225]:

agefreq_fig, agefreq_axes = subplots(nrows=1, ncols=2)
agefreq_fig.set_size_inches(16,4)
agefreq_axes[0].set_title("Age in Power-Workout");
agefreq_axes[0].bar(power_agefreqs.age, power_agefreqs.frequency, width=1, color="#cccccc");
agefreq_axes[0].set_xlim(18,24)
agefreq_axes[0].set_ylim(0, 5)
agefreq_axes[1].set_title("Age in Kung-Fu");
agefreq_axes[1].get_xaxis().set_visible(False)
agefreq_axes[1].get_yaxis().set_visible(False)
agefreq_axes[1].spines['bottom'].set_visible(False)
agefreq_axes[1].spines['top'].set_visible(False)
# Subdivide the right axes vertically: 1 row, 4 columns, select 3rd and 4th.
agefreq1 = agefreq_fig.add_subplot(143)
agefreq2 = agefreq_fig.add_subplot(144)
agefreq1.set_ylim(0, 10)
agefreq2.set_ylim(0, 10)
agefreq1.set_xlim(18,24)
agefreq2.set_xlim(135,140)
agefreq1.bar(kungfu_agefreqs.age, kungfu_agefreqs.frequency, width=1, color="#cccccc");
agefreq2.bar(kungfu_agefreqs.age, kungfu_agefreqs.frequency, width=1, color="#cccccc");
agefreq1.spines['right'].set_visible(False)
agefreq1.get_yaxis().tick_left()
agefreq2.spines['left'].set_visible(False)
agefreq2.get_yaxis().tick_right()
agefreq2.tick_params(labelright='off')

Median¶

Note that the mean may not occur in the data. The median has the advantage that it exists as a data point:

In [226]:

kungfu_ages = []
for index, data in kungfu_agefreqs.iterrows():
    kungfu_ages.extend(repeat(data["age"], data["frequency"]))
mean(kungfu_ages), median(kungfu_ages)

Out[226]:

(38.0, 20.0)

However, if the number of data points is even, the media is the value halfway between the two values around the middle:

In [227]:

median([10,20])

Out[227]:

15.0

In [228]:

skew_fig, skew_axes = subplots(nrows=1, ncols=3)
skew_fig.set_size_inches(17,4)
skew1 = randn(1000000)
skew2 = exp(1+0.4*skew1)
skew3 = -skew2
skew_axes[0].set_title("Unskewed normal, mean = median")
skew_axes[0].hist(skew1, bins=30, normed=True, color="#cccccc");
skew_axes[0].plot(mean(skew1), 0.2, 'go');
skew_axes[0].plot(median(skew1), 0.15, 'rD');
skew_axes[0].set_xlim(-4,4)
skew_axes[1].set_title("Right-skewed log-normal, median < mean");
skew_axes[1].hist(skew2, bins=30, normed=True, color="#cccccc");
skew_axes[1].plot(mean(skew2), 0.2, 'go');
skew_axes[1].plot(median(skew2), 0.15, 'rD');
skew_axes[1].set_xlim(0,10)
skew_axes[2].set_title("Left-skewed (mirrored) log-normal, median > mean");
skew_axes[2].hist(skew3, bins=30, normed=True, color="#cccccc");
skew_axes[2].plot(mean(skew3), 0.2, 'go');
skew_axes[2].plot(median(skew3), 0.15, 'rD');
skew_axes[2].set_xlim(-10,0);

In [229]:

swim_agefreqs = DataFrame([(1, 3), (2, 4), (3, 2), (31, 2), (32, 4), (33, 3)], columns=["age", "frequency"])
swim_agefreqs

Out[229]:

	age	frequency
0	1	3
1	2	4
2	3	2
3	31	2
4	32	4
5	33	3

The mean and median are both misleading in this case:

In [230]:

swim_ages = []
for index, data in swim_agefreqs.iterrows():
    swim_ages.extend(repeat(data["age"], data["frequency"]))
mean(swim_ages), median(swim_ages)

Out[230]:

(17.0, 17.0)

In [231]:

import scipy.stats
val, cnt = scipy.stats.mstats.mode(array(swim_ages))
val

Out[231]:

array([ 2.])

Note that 32 is another mode.

3. Variance and Deviation¶

Span, Interquantildistance, Percentils¶

In [232]:

player1 = DataFrame([(7,1),(8,1),(9,2),(10,3),(11,2),(12,1),(13,1)], columns=("points", "frequency"))
player2 = DataFrame([(7,1),(9,2),(10,5),(11,2),(13,1)], columns=("points", "frequency"))
player3 = DataFrame([(3,2),(6,1),(7,2),(10,3),(11,1),(13,1),(30,1)], columns=("points", "frequency"))
player1["player"] = 1
player2["player"] = 2
player3["player"] = 3
player_stats = pandas.concat([player1, player2, player3])
player_stats

Out[232]:

	points	frequency	player
0	7	1	1
1	8	1	1
2	9	2	1
3	10	3	1
4	11	2	1
5	12	1	1
6	13	1	1
0	7	1	2
1	9	2	2
2	10	5	2
3	11	2	2
4	13	1	2
0	3	2	3
1	6	1	3
2	7	2	3
3	10	3	3
4	11	1	3
5	13	1	3
6	30	1	3

In [233]:

player_data = []

def get_stats(pstats):
    player, stats = pstats
    vals = []
    for index, data in stats.iterrows():
        vals.extend(repeat(data["points"], data["frequency"]))
    vals = array(vals)
    player_data.append(vals)
    return player, mean(vals), median(vals), scipy.stats.mstats.mode(vals)[0][0], amin(vals), amax(vals), percentile(vals,25), percentile(vals,75)

player_avgs = DataFrame(map(get_stats, player_stats.groupby(player_stats.player)),
                        columns=("player", "mean", "median", "mode", "minpoints", "maxpoints", "q1","q3"))
player_avgs

Out[233]:

	player	mean	median	mode	minpoints	maxpoints	q1	q3
0	1	10	10	10	7	13	9.0	11.0
1	2	10	10	10	7	13	9.5	10.5
2	3	10	10	10	3	30	6.5	10.5

In [234]:

player_avgs['span'] = player_avgs.maxpoints - player_avgs.minpoints
player_avgs['interq'] = player_avgs.q3 - player_avgs.q1
player_avgs[['player', 'span', 'interq']]

Out[234]:

	player	span	interq
0	1	6	2
1	2	6	1
2	3	27	4

In [235]:

player_fig, player_axes = subplots()
player_fig.set_size_inches(7,4)
player_axes.boxplot(player_data, vert=False, whis=1.5);
player_axes.set_title("Results of basketball players");
player_axes.set_xlim(0,33)

Out[235]:

(0, 33)

Variance, Standard Deviation¶

\begin{equation} V(x) = {1 \over n} \sum_i(x_i-\overline x)^2 \end{equation}\begin{equation} \sigma(x) = \sqrt{V(x)} \end{equation}

A discussion for possible reasons why we prefer the mean square error can be found here: http://stats.stackexchange.com/questions/118/why-square-the-difference-instead-of-taking-the-absolute-value-in-standard-devia

In [236]:

player_avgs["var"] = var(player_data, axis=1)
player_avgs["std"] = std(player_data, axis=1)
player_avgs[["player", "var", "std"]]

Out[236]:

	player	var	std
0	1	2.727273	1.651446
1	2	2.000000	1.414214
2	3	49.272727	7.019453

Standard Score (z-score)¶

\begin{equation} z(x) = \frac{ x - \overline{x} }{ \sigma(x) } \end{equation}

In [237]:

scipy.stats.mstats.zscore(player_data[0]);
player_points = player_avgs["mean"] + player_avgs["std"]
array([scipy.stats.zmap([10, 11, player_points[0], 12, 30],
 player_data[0]), scipy.stats.zmap([10, 11, 12, 30],
 player_data[1]), scipy.stats.zmap([10, 11, 12, 30], player_data[2])])

Out[237]:

array([ array([  0.        ,   0.60553007,   1.        ,   1.21106014,  12.11060142]),
       array([  0.        ,   0.70710678,   1.41421356,  14.14213562]),
       array([ 0.        ,  0.14246123,  0.28492247,  2.84922466])], dtype=object)