Some matplotlib and pandas experiments based on "Head First Statistics" from O'Reilly Media, Inc.
from pandas import *
profit_data = [("jul", 2.0), ("aug", 2.1), ("sep", 2.2), ("okt", 2.1), ("nov", 2.3), ("dez", 2.4)]
profit_data = DataFrame(profit_data, columns=["month", "profit"])
profit_data
month | profit | |
---|---|---|
0 | jul | 2.0 |
1 | aug | 2.1 |
2 | sep | 2.2 |
3 | okt | 2.1 |
4 | nov | 2.3 |
5 | dez | 2.4 |
# Plot the same profit series twice, side by side; only the y-axis limits
# and the title differ between the two panels.
profit_fig, profit_axes = plt.subplots(nrows=1, ncols=2)
profit_fig.set_size_inches(15, 5)
_profit_panels = [
    ((0, 2.5), "Stagnating profits."),
    ((2.0, 2.5), "Awesome profits!"),
]
for _ax, (_ylim, _title) in zip(profit_axes, _profit_panels):
    # Limits are fixed before plotting, exactly as in the step-by-step version.
    _ax.set_ylim(*_ylim)
    profit_data.plot(ax=_ax, x="month").set_title(_title)
The two figures show the same data but use a different scaling and offset of the y axes, thus giving a different impression of the profit.
mango_data = [("sports", 27500, 0.99), ("strategy", 11500, 0.9), ("action", 6000, 0.85), ("shooter", 3500, 0.95), ("others", 1500, 0.85)]
mango_data = DataFrame(mango_data, columns=["type", "sales", "satisfaction"])
mango_data
type | sales | satisfaction | |
---|---|---|---|
0 | sports | 27500 | 0.99 |
1 | strategy | 11500 | 0.90 |
2 | action | 6000 | 0.85 |
3 | shooter | 3500 | 0.95 |
4 | others | 1500 | 0.85 |
# One wedge colour per game type, shared by both pie charts.
mango_colors = ["#F3E761", "#F0F0F0", "#97C2B9", "#E9ABC7", "#CAA882"]
mango_fig, mango_axes = subplots(nrows=1, ncols=2)
mango_fig.set_size_inches(16, 7)

# DataFrame.sort() no longer exists in pandas; sort_values() performs the
# same ascending sort by the "sales" column.
_mango_sales = mango_data.sort_values("sales")

# Left pie: sales per type, each wedge labelled "<type>\n<sales>".
_mango_sales_labels = _mango_sales["type"] + "\n" + _mango_sales.sales.astype(str)
mango_axes[0].pie(_mango_sales.sales, labels=_mango_sales_labels,
                  startangle=90, colors=mango_colors)  # autopct='%1.1f%%'
mango_axes[0].set_title("sales by type")

# Right pie: satisfaction ratio per type, labelled as a percentage.
_mango_sat_labels = (_mango_sales["type"] + "\n"
                     + (_mango_sales.satisfaction * 100).astype(str) + "%")
mango_axes[1].pie(_mango_sales.satisfaction, labels=_mango_sat_labels,
                  startangle=90, colors=mango_colors)
mango_axes[1].set_title("satisfied players by type");
Pie charts are only useful if the values are not all similar, and if they are all parts of a common whole.
mango_regions = [("A", 1000), ("B", 5000), ("C", 7500), ("D", 8000), ("E", 9500)]
mango_regions = DataFrame(mango_regions, columns=["region", "sales"])
mango_regions
region | sales | |
---|---|---|
0 | A | 1000 |
1 | B | 5000 |
2 | C | 7500 |
3 | D | 8000 |
4 | E | 9500 |
# 2x2 grid of bar charts built from mango_regions and mango_data.
#
# Every bar position below is computed as the bar's lower/left EDGE
# (e.g. "index - width/2" so the bar ends up centred on its tick).  That was
# Matplotlib's default before 2.0; newer versions centre bars by default, so
# align="edge" is passed explicitly to keep bars and ticks lined up.
fig, axes = subplots(nrows=2, ncols=2)
fig.set_size_inches(16, 11)
mango_width = 0.8

# Top left: vertical bars, sales per region.
mango_regions_index = arange(len(mango_regions))
axes[0, 0].bar(mango_regions_index - mango_width / 2, mango_regions.sales,
               width=mango_width, align="edge", color="#cccccc")
axes[0, 0].set_xticks(mango_regions_index)
axes[0, 0].set_xticklabels(mango_regions.region)
axes[0, 0].set_title("sales by region")

# The horizontal charts list the types bottom-to-top, hence the reversal.
mango_data_index = arange(len(mango_data))
_mango_types_reversed = list(reversed(list(mango_data.type)))

# Top right: satisfaction in percent per type.
axes[0, 1].barh(mango_data_index - mango_width / 2,
                list(reversed(list(100 * mango_data.satisfaction))),
                height=mango_width, align="edge", color="#cccccc")
axes[0, 1].set_yticks(mango_data_index)
axes[0, 1].set_yticklabels(_mango_types_reversed)
axes[0, 1].set_title("satisfaction by type")

# Absolute numbers of satisfied / unsatisfied players per type.
mango_data_sat = list(reversed(list(mango_data.satisfaction * mango_data.sales)))
mango_data_notsat = list(reversed(list((1 - mango_data.satisfaction) * mango_data.sales)))

# Bottom left: two half-height bars per type, one above and one below the tick.
mango_b1 = axes[1, 0].barh(mango_data_index, mango_data_sat,
                           height=mango_width / 2, align="edge", color="#8888ff")
mango_b2 = axes[1, 0].barh(mango_data_index - mango_width / 2, mango_data_notsat,
                           height=mango_width / 2, align="edge", color="#88ff88")
axes[1, 0].set_yticks(mango_data_index)
axes[1, 0].set_yticklabels(_mango_types_reversed)
axes[1, 0].set_title("satisfaction by type (absolute)")
axes[1, 0].legend([mango_b1, mango_b2], ["satisfied", "unsatisfied"], loc="lower right")

# Bottom right: stacked bars; the unsatisfied part starts where the satisfied ends.
mango_b3 = axes[1, 1].barh(mango_data_index - mango_width / 2, mango_data_sat,
                           height=mango_width, align="edge", color="#8888ff")
mango_b4 = axes[1, 1].barh(mango_data_index - mango_width / 2, mango_data_notsat,
                           left=mango_data_sat, height=mango_width,
                           align="edge", color="#88ff88")
axes[1, 1].set_yticks(mango_data_index)
axes[1, 1].set_yticklabels(_mango_types_reversed)
axes[1, 1].set_title("satisfaction by type (absolute)")
axes[1, 1].legend([mango_b3, mango_b4], ["satisfied", "unsatisfied"], loc="lower right");
score_data = [(0,199,5), (200,399,29), (400,599, 56), (600, 799, 17), (800,999,3)]
score_data = DataFrame(score_data, columns=("points_from", "points_to", "frequency"))
score_data
points_from | points_to | frequency | |
---|---|---|---|
0 | 0 | 199 | 5 |
1 | 200 | 399 | 29 |
2 | 400 | 599 | 56 |
3 | 600 | 799 | 17 |
4 | 800 | 999 | 3 |
# The same score distribution drawn twice: as explicit bars (left) and via
# hist() on reconstructed samples (right).
score_fig, score_axes = subplots(nrows=1, ncols=2)
score_fig.set_size_inches(16, 4)

# Left: one bar per interval.  x is the interval's lower bound, so the bar
# must be edge-aligned (the pre-2.0 Matplotlib default) to cover the interval.
score_axes[0].bar(score_data.points_from, score_data.frequency,
                  width=score_data.points_to - score_data.points_from + 1,
                  align="edge", color="#cccccc")

# Right: place each interval's frequency at the interval midpoint and let
# hist() bin the samples using the interval borders.
score_list = (score_data.points_to + score_data.points_from) / 2
# Midpoints and frequencies are row-aligned, so they can be expanded directly
# instead of looking each midpoint's row up again with a range mask.
score_scores = list(repeat(score_list.values, score_data.frequency.values))
score_axes[1].hist(
    score_scores,
    bins=sorted(set(list(score_data.points_from) + list(score_data.points_to + 1))),
    color="#cccccc",
)

for _ax in score_axes:
    _ax.set_title("Frequency of games by points")
    _ax.set_xlabel("Points")
    _ax.set_ylabel("Frequency")
gamelen_data = [(0,1,4300), (1,3,6900), (3,5,4900), (5,10,2000), (10,24,2100)]
gamelen_data = DataFrame(gamelen_data, columns=("len_from", "len_to", "frequency"))
gamelen_data
len_from | len_to | frequency | |
---|---|---|---|
0 | 0 | 1 | 4300 |
1 | 1 | 3 | 6900 |
2 | 3 | 5 | 4900 |
3 | 5 | 10 | 2000 |
4 | 10 | 24 | 2100 |
# Game lengths with unequal interval widths: raw frequencies as bars (left)
# versus a normalized histogram (right).
gamelen_fig, gamelen_axis = subplots(nrows=1, ncols=2)
gamelen_fig.set_size_inches(16, 4)

# Left: bar height is the raw frequency.  x is the interval's lower bound, so
# the bars are edge-aligned explicitly (newer Matplotlib centres by default).
gamelen_axis[0].bar(gamelen_data.len_from, gamelen_data.frequency,
                    width=gamelen_data.len_to - gamelen_data.len_from,
                    align="edge", color="#cccccc")
gamelen_axis[0].set_title("Frequency of games by length")

# Right: reconstruct samples at the interval midpoints and let hist() bin them
# on the interval borders.  Midpoints and frequencies are row-aligned, so they
# are expanded directly; a range-mask lookup per midpoint would match two rows
# whenever a midpoint coincided with a shared interval border.
gamelen_list = (gamelen_data.len_to + gamelen_data.len_from) / 2
gamelen_list2 = list(repeat(gamelen_list.values, gamelen_data.frequency.values))
# `density=True` replaces the removed `normed=True` keyword of hist().
gamelen_axis[1].hist(
    gamelen_list2,
    bins=sorted(set(list(gamelen_data.len_from) + list(gamelen_data.len_to))),
    color="#cccccc",
    density=True,
)
gamelen_axis[1].set_title("Frequency of games by length (normalized)")

# Shared cosmetics: axis labels, no top/right spines, ticks only bottom/left.
for _ax in gamelen_axis:
    _ax.set_xlabel("Length/h")
    _ax.set_ylabel("Frequency")
    _ax.spines['right'].set_color('none')
    _ax.spines['top'].set_color('none')
    _ax.xaxis.set_ticks_position('bottom')
    _ax.yaxis.set_ticks_position('left')
# Cumulative number of games plotted against the upper bound of each length
# interval, starting from the origin.
gamelen_fig, gamelen_axis = subplots(nrows=1, ncols=1)
gamelen_fig.set_size_inches(7, 4)

_cum_x = [0] + list(gamelen_data.len_to)
_cum_y = [0] + list(gamelen_data.frequency.cumsum())
gamelen_axis.plot(_cum_x, _cum_y)

gamelen_axis.set_title("Cumulative sum of games by length")
gamelen_axis.set_xlabel("Length/h")
gamelen_axis.set_ylabel("Cumulative sum")

# Hide the top/right spines and keep ticks on the remaining two sides.
for _side in ('right', 'top'):
    gamelen_axis.spines[_side].set_color('none')
gamelen_axis.xaxis.set_ticks_position('bottom')
gamelen_axis.yaxis.set_ticks_position('left')

# Both axes start at zero; y gets a major tick every 2500 games.
gamelen_axis.set_xlim(0)
gamelen_axis.set_ylim(0, 22500)
gamelen_axis.yaxis.set_major_locator(MultipleLocator(2500.0))
power_ages = DataFrame([19,20,20,20,21], columns=["age"])
power_ages
age | |
---|---|
0 | 19 |
1 | 20 |
2 | 20 |
3 | 20 |
4 | 21 |
Arithmetic mean: \begin{equation} \textrm{mean}(x_1,\ldots,x_n)=\overline{x}={1\over n}\sum_{i=1}^n x_i \end{equation}
# Arithmetic mean of the individual ages.
power_ages["age"].mean()
20.0
power_agefreqs = DataFrame([(19, 1), (20, 3), (21, 1)], columns=["age", "frequency"])
power_agefreqs
age | frequency | |
---|---|---|
0 | 19 | 1 |
1 | 20 | 3 |
2 | 21 | 1 |
def avg(frame):
return sum(1. * frame.age * frame.frequency) / sum(frame.frequency)
# Weighted mean age computed from the power_agefreqs frequency table.
avg(power_agefreqs)
20.0
kungfu_agefreqs = DataFrame([(19, 3), (20, 5), (21, 3), (136, 1), (138, 1)], columns=["age", "frequency"])
kungfu_agefreqs
age | frequency | |
---|---|---|
0 | 19 | 3 |
1 | 20 | 5 |
2 | 21 | 3 |
3 | 136 | 1 |
4 | 138 | 1 |
# Weighted mean age computed from the kungfu_agefreqs frequency table.
avg(kungfu_agefreqs)
38.0
# Age distributions of the two frequency tables.  The right-hand panel is
# drawn as a "broken" x axis: two narrow axes showing the ranges 18-24 and
# 135-140 of the same data.
agefreq_fig, agefreq_axes = subplots(nrows=1, ncols=2)
agefreq_fig.set_size_inches(16, 4)

# Left panel: plain bar chart.
agefreq_axes[0].set_title("Age in Power-Workout")
agefreq_axes[0].bar(power_agefreqs.age, power_agefreqs.frequency, width=1, color="#cccccc")
agefreq_axes[0].set_xlim(18, 24)
agefreq_axes[0].set_ylim(0, 5)

# Right panel: this axes only carries the title; its own ticks and its
# top/bottom spines are hidden because the two sub-axes below draw the data.
agefreq_axes[1].set_title("Age in Kung-Fu")
agefreq_axes[1].get_xaxis().set_visible(False)
agefreq_axes[1].get_yaxis().set_visible(False)
agefreq_axes[1].spines['bottom'].set_visible(False)
agefreq_axes[1].spines['top'].set_visible(False)

# Subdivide the right axes vertically: 1 row, 4 columns, select 3rd and 4th.
agefreq1 = agefreq_fig.add_subplot(143)
agefreq2 = agefreq_fig.add_subplot(144)
for _ax, _xlim in ((agefreq1, (18, 24)), (agefreq2, (135, 140))):
    _ax.set_ylim(0, 10)
    _ax.set_xlim(*_xlim)
    # Both halves plot the full table; the x limits select the visible part.
    _ax.bar(kungfu_agefreqs.age, kungfu_agefreqs.frequency, width=1, color="#cccccc")

# Remove the spines facing the gap and keep y ticks on the outer sides only.
agefreq1.spines['right'].set_visible(False)
agefreq1.get_yaxis().tick_left()
agefreq2.spines['left'].set_visible(False)
agefreq2.get_yaxis().tick_right()
# tick_params expects booleans; the string 'off' is truthy and would leave the
# right-hand labels switched on in current Matplotlib.
agefreq2.tick_params(labelright=False)
Note that the mean may not occur in the data. The median has the advantage that it exists as a data point:
# Expand the frequency table into one entry per person, then compare the
# mean with the median of the expanded sample.
kungfu_ages = list(repeat(kungfu_agefreqs["age"].values,
                          kungfu_agefreqs["frequency"].values))
mean(kungfu_ages), median(kungfu_ages)
(38.0, 20.0)
However, if the number of data points is even, the median is the value halfway between the two values around the middle:
# Even number of data points: the median lies halfway between the two values.
median([10,20])
15.0
# Mean versus median for a symmetric, a right-skewed and a left-skewed sample.
skew_fig, skew_axes = subplots(nrows=1, ncols=3)
skew_fig.set_size_inches(17, 4)

skew1 = randn(1000000)          # standard normal sample
skew2 = exp(1 + 0.4 * skew1)    # log-normal sample derived from skew1
skew3 = -skew2                  # mirror image of skew2

_skew_panels = [
    (skew1, "Unskewed normal, mean = median", (-4, 4)),
    (skew2, "Right-skewed log-normal, median < mean", (0, 10)),
    (skew3, "Left-skewed (mirrored) log-normal, median > mean", (-10, 0)),
]
for _ax, (_sample, _title, _xlim) in zip(skew_axes, _skew_panels):
    _ax.set_title(_title)
    # `density=True` replaces the removed `normed=True` keyword of hist().
    _ax.hist(_sample, bins=30, density=True, color="#cccccc")
    _ax.plot(mean(_sample), 0.2, 'go')     # green circle: mean
    _ax.plot(median(_sample), 0.15, 'rD')  # red diamond: median
    _ax.set_xlim(*_xlim)
swim_agefreqs = DataFrame([(1, 3), (2, 4), (3, 2), (31, 2), (32, 4), (33, 3)], columns=["age", "frequency"])
swim_agefreqs
age | frequency | |
---|---|---|
0 | 1 | 3 |
1 | 2 | 4 |
2 | 3 | 2 |
3 | 31 | 2 |
4 | 32 | 4 |
5 | 33 | 3 |
The mean and median are both misleading in this case:
# Expand the frequency table into one entry per person, then compute the
# mean and the median of the expanded sample.
swim_ages = list(repeat(swim_agefreqs["age"].values,
                        swim_agefreqs["frequency"].values))
mean(swim_ages), median(swim_ages)
(17.0, 17.0)
import scipy.stats
# Mode of the expanded ages; the result unpacks into the modal value(s) and
# the matching count(s).
_swim_mode = scipy.stats.mstats.mode(array(swim_ages))
val, cnt = _swim_mode
val
array([ 2.])
Note that 32 is another mode.
player1 = DataFrame([(7,1),(8,1),(9,2),(10,3),(11,2),(12,1),(13,1)], columns=("points", "frequency"))
player2 = DataFrame([(7,1),(9,2),(10,5),(11,2),(13,1)], columns=("points", "frequency"))
player3 = DataFrame([(3,2),(6,1),(7,2),(10,3),(11,1),(13,1),(30,1)], columns=("points", "frequency"))
player1["player"] = 1
player2["player"] = 2
player3["player"] = 3
player_stats = pandas.concat([player1, player2, player3])
player_stats
points | frequency | player | |
---|---|---|---|
0 | 7 | 1 | 1 |
1 | 8 | 1 | 1 |
2 | 9 | 2 | 1 |
3 | 10 | 3 | 1 |
4 | 11 | 2 | 1 |
5 | 12 | 1 | 1 |
6 | 13 | 1 | 1 |
0 | 7 | 1 | 2 |
1 | 9 | 2 | 2 |
2 | 10 | 5 | 2 |
3 | 11 | 2 | 2 |
4 | 13 | 1 | 2 |
0 | 3 | 2 | 3 |
1 | 6 | 1 | 3 |
2 | 7 | 2 | 3 |
3 | 10 | 3 | 3 |
4 | 11 | 1 | 3 |
5 | 13 | 1 | 3 |
6 | 30 | 1 | 3 |
# Expanded per-player samples, appended by get_stats() in call order.
player_data = []


def get_stats(pstats):
    """Summarise one player's results.

    pstats is a (player, frame) pair as yielded by iterating a groupby; the
    frame carries "points" and "frequency" columns.  The frequency table is
    expanded into one entry per game, and that sample is also appended to the
    module-level player_data list as a side effect.

    Returns the tuple
    (player, mean, median, mode, min, max, 25th percentile, 75th percentile).
    """
    player, stats = pstats
    # Repeat each points value by its frequency.
    vals = array(repeat(stats["points"].values, stats["frequency"].values))
    player_data.append(vals)
    return (
        player,
        mean(vals),
        median(vals),
        scipy.stats.mstats.mode(vals)[0][0],
        amin(vals),
        amax(vals),
        percentile(vals, 25),
        percentile(vals, 75),
    )


# A list comprehension evaluates get_stats() eagerly: under Python 3, map() is
# lazy, so both the rows and the player_data side effect would depend on when
# (and whether) the DataFrame constructor consumes the iterator.
player_avgs = DataFrame(
    [get_stats(group) for group in player_stats.groupby(player_stats.player)],
    columns=("player", "mean", "median", "mode", "minpoints", "maxpoints", "q1", "q3"),
)
player_avgs
player | mean | median | mode | minpoints | maxpoints | q1 | q3 | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 10 | 10 | 10 | 7 | 13 | 9.0 | 11.0 |
1 | 2 | 10 | 10 | 10 | 7 | 13 | 9.5 | 10.5 |
2 | 3 | 10 | 10 | 10 | 3 | 30 | 6.5 | 10.5 |
# Range (max - min) and interquartile range (q3 - q1) per player.
player_avgs['span'] = player_avgs['maxpoints'].sub(player_avgs['minpoints'])
player_avgs['interq'] = player_avgs['q3'].sub(player_avgs['q1'])
_spread_columns = ['player', 'span', 'interq']
player_avgs[_spread_columns]
player | span | interq | |
---|---|---|---|
0 | 1 | 6 | 2 |
1 | 2 | 6 | 1 |
2 | 3 | 27 | 4 |
# Horizontal box plots of the three players' expanded samples.
player_fig, player_axes = subplots()
player_fig.set_size_inches(7, 4)
player_axes.set_title("Results of basketball players")
player_axes.boxplot(player_data, vert=False, whis=1.5)
player_axes.set_xlim(0, 33)
(0, 33)
A discussion for possible reasons why we prefer the mean square error can be found here: http://stats.stackexchange.com/questions/118/why-square-the-difference-instead-of-taking-the-absolute-value-in-standard-devia
# Variance and standard deviation of each player's expanded sample
# (axis=1: one value per player row).
for _stat_name, _stat_func in (("var", var), ("std", std)):
    player_avgs[_stat_name] = _stat_func(player_data, axis=1)
player_avgs[["player", "var", "std"]]
player | var | std | |
---|---|---|---|
0 | 1 | 2.727273 | 1.651446 |
1 | 2 | 2.000000 | 1.414214 |
2 | 3 | 49.272727 | 7.019453 |
# Standard scores of player 1's own sample (result not used further).
scipy.stats.mstats.zscore(player_data[0])

# One standard deviation above each player's mean.
player_points = player_avgs["mean"] + player_avgs["std"]

# z-scores of a few reference results relative to each player's sample.
# The first list has one more entry than the other two, so the result is
# ragged; dtype=object is required because recent NumPy refuses to build an
# array from ragged sequences implicitly.
array([
    scipy.stats.zmap([10, 11, player_points[0], 12, 30], player_data[0]),
    scipy.stats.zmap([10, 11, 12, 30], player_data[1]),
    scipy.stats.zmap([10, 11, 12, 30], player_data[2]),
], dtype=object)
array([ array([ 0. , 0.60553007, 1. , 1.21106014, 12.11060142]), array([ 0. , 0.70710678, 1.41421356, 14.14213562]), array([ 0. , 0.14246123, 0.28492247, 2.84922466])], dtype=object)