南京大学《数据新闻》课程
# 原始数据
data = [29, 21, 28, 32, 25]
# 数据的“长度”:有几个数值?
len(data)
5
# 数值的总和是多少?
data_sum= 0
for i in data:
data_sum += i
print data_sum
135
# 你也可以调用一个求和的函数
import numpy as np
np.sum(data)
135
# 均值是多少?
float(data_sum)/len(data)
27.0
# 当然可以直接调用一个求均值的函数
np.mean(data)
27.0
# 原始数据(奇数个数值)
data = [23, 29, 32, 23, 21, 33, 25]
print data
[23, 29, 32, 23, 21, 33, 25]
# 排列数据后取中间的一个数值
data = sorted(data)
data
[21, 23, 23, 25, 29, 32, 33]
index = len(data)/2 # 从0开始计算
data[index]
25
# 或者直接调用中位数的函数
np.median(data)
25.0
# 原始数据(偶数个数值)
data = [23, 29, 32, 23, 21, 33, 25, 27]
data = sorted(data)
data
[21, 23, 23, 25, 27, 29, 32, 33]
np.median(data)
26.0
def get_median(data):
    """Return the median of a sequence of numbers.

    The input is sorted into a new list first, so the caller's
    sequence is never modified.  For an odd number of values the
    result is the middle value itself; for an even number it is the
    mean of the two middle values (always a float).
    """
    data = sorted(data)
    # Floor division: under Python 3, plain `/` would yield a float
    # and data[index] would raise TypeError.  `//` is correct on both
    # Python 2 and Python 3.
    index = len(data) // 2
    if len(data) % 2 == 0:
        # Even count: average the two central values.
        md = float(data[index] + data[index - 1]) / 2
    else:
        # Odd count: the single central value.
        md = data[index]
    return md
data1 = [23, 29, 32, 23, 21, 33, 25]
data2 = [23, 29, 32, 23, 21, 33, 25, 27]
print get_median(data1), get_median(data2)
25 26.0
data = [23, 29, 32, 23, 21, 33, 25]
data = sorted(data)
data
[21, 23, 23, 25, 29, 32, 33]
from collections import defaultdict
freqdict = defaultdict(int)
for i in data:
freqdict[i] += 1
freqdict
defaultdict(int, {21: 1, 23: 2, 25: 1, 29: 1, 32: 1, 33: 1})
# 按照value从大到小的递减的顺序排序
freqlist = sorted(freqdict.iteritems(), key=lambda (k,v): -v)
print freqlist,'\n', freqlist[0],'\n', freqlist[0][0]
[(23, 2), (32, 1), (33, 1), (21, 1), (25, 1), (29, 1)] (23, 2) 23
# 直接调用求众数的函数
from scipy import stats
print stats.mode(data)
print stats.mode(data).mode
ModeResult(mode=array([23]), count=array([2])) [23]
data = [29,21,28,32,25]
print np.max(data) # 最大值
print np.min(data) # 最小值
print np.max(data) - np.min(data) # 极差
print float(np.max(data) + np.min(data))/2 # 中程数
print np.mean(data) # 均值
32 21 11 26.5 27.0
mean = np.mean(data)
variance = np.sum([(i-mean)**2 for i in data])/len(data)
std = np.sqrt(variance)
print variance, std
14.0 3.74165738677
# 直接调用求方差和标准差的函数
print np.var(data)
print np.std(data)
14.0 3.74165738677
income_male = [1000, 1500, 2000, 3000, 2500, 4000, 5000, 3500]
income_female=[6000, 6200, 7000, 7100, 9000, 10000, 12000]
income_male_median = np.median(income_male)
income_female_median = np.median(income_female)
print income_male_median, income_female_median
2750.0 7100.0
# Plot the boxplot to see
# minimum value, 25%,50%,75% percentile, maximum value
%matplotlib inline
import matplotlib.pyplot as plt
plt.boxplot([income_male, income_female], # meanline=True,showmeans=True,
labels = ['$male$', '$female$'])
plt.show()
from scipy import stats
stats.ttest_ind(income_male, income_female)
Ttest_indResult(statistic=-5.7570564639816144, pvalue=6.6314258174265091e-05)
# Draw and plot three groups of 100 normal samples, first with a small
# spread (std1) and then with a large one (std2), to visualise how the
# group means separate or overlap.
centers = [5, 5.3, 4.7]
std1 = 0.1
colors = 'brg'
data1 = []
for ii in range(3):
    data1.append(stats.norm(centers[ii], std1).rvs(100))
    # Fix: use the qualified names np.arange / plt.plot.  The bare
    # `arange` and `plot` are only defined under `%pylab`-style star
    # imports, which this notebook does not use.
    plt.plot(np.arange(len(data1[ii])) + ii * len(data1[0]),
             data1[ii], '.', color=colors[ii])
std2 = 2
data2 = []
for ii in range(3):
    data2.append(stats.norm(centers[ii], std2).rvs(100))
    plt.plot(np.arange(len(data1[ii])) + ii * len(data2[0]),
             data2[ii], '.', color=colors[ii])
# https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_altman/altman_910.txt
data = pd.read_csv('/Users/chengjun/github/statsintro_python/ipynb/Data/data_altman/altman_910.txt',
header = None)
data=data.rename(columns = {0:'value', 1:'group'})
data.head()
value | group | |
---|---|---|
0 | 243 | 1 |
1 | 251 | 1 |
2 | 275 | 1 |
3 | 291 | 1 |
4 | 347 | 1 |
group1 = data.value[data.group==1]
group2 = data.value[data.group==2]
group3 = data.value[data.group==3]
plt.boxplot([group1, group2, group3], # meanline=True,showmeans=True,
labels = ['$Group\,1$', '$Group\,2$', '$Group\,3$'])
plt.show()
F_statistic, pVal = stats.f_oneway(group1, group2, group3)
print('The results from the one-way ANOVA, with the data from Altman 910: F={0:.1f}, p={1:.5f}'.format(F_statistic, pVal))
if pVal < 0.05:
print('One of the groups is significantly different.')
The results from the one-way ANOVA, with the data from Altman 910: F=3.7, p=0.04359 One of the groups is significantly different.
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
# the "C" indicates categorical data
model = ols('value ~ C(group)', data).fit()
print(anova_lm(model))
df sum_sq mean_sq F PR(>F) C(group) 2.0 15515.766414 7757.883207 3.711336 0.043589 Residual 19.0 39716.097222 2090.320906 NaN NaN
from scipy.stats.stats import pearsonr
xi = [1, 2, 3, 4, 5]
y = [3, 5, 9, 13, 16]
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(xi, y, 'g-s')
plt.xlabel('$x_i$', fontsize = 20)
plt.ylabel('$y$', fontsize = 20)
plt.title('$Scatter\,Plot$', fontsize = 20)
plt.show()
pearsonr(data1, data2)
(0.99484975116710994, 0.00044334353831205819)
def corr(data1, data2):
    """Return the Pearson correlation coefficient of two equal-length
    numeric sequences.

    Inputs are passed through ``np.asarray`` first, so plain Python
    lists and tuples are accepted as well as numpy arrays (the
    original implementation required numpy arrays).  Uses the
    population form E[XY] - E[X]E[Y] for the covariance and ddof=0
    standard deviations, matching the original computation.
    """
    x = np.asarray(data1, dtype=float)
    y = np.asarray(data2, dtype=float)
    # Population covariance: E[XY] - E[X]E[Y]
    cov = (x * y).mean() - x.mean() * y.mean()
    return cov / (x.std() * y.std())
data1 = np.array(data1)
data2 = np.array(data2)
corr(data1, data2)
0.99484975116710983
https://en.wikipedia.org/wiki/Chi-squared_test
Suppose we look at the House of Representatives for the 113th Congress. This data is taken from www.senate.gov.
Republican | Democrat | Total | |
---|---|---|---|
Male | 215 | 143 | 358 |
Female | 19 | 64 | 83 |
Total | 234 | 207 | 441 |
total = 441
republican_prob = 234.0/441
male_prob = 358.0/441
male_republican_expected = republican_prob * male_prob * total
print male_republican_expected
189.959183673
import scipy
house = [ [ 215, 143 ], [ 19, 64 ] ]
chi2, p, ddof, expected = scipy.stats.chi2_contingency( house )
msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n"
print( msg.format( chi2, p, ddof ) )
print( expected )
Test Statistic: 35.8877686481 p-value: 2.09016744218e-09 Degrees of Freedom: 1 [[ 189.95918367 168.04081633] [ 44.04081633 38.95918367]]
Suppose there is a city of 1 million residents with four neighborhoods: A, B, C, and D.
A random sample of 650 residents of the city is taken and their occupation is recorded as "blue collar", "white collar", or "no collar".
The null hypothesis is that each person's neighborhood of residence is independent of the person's occupational classification. The data are tabulated as:
A | B | C | D | Total | |
---|---|---|---|---|---|
White collar | 90 | 60 | 104 | 95 | 349 |
Blue collar | 30 | 50 | 51 | 20 | 151 |
No collar | 30 | 40 | 45 | 35 | 150 |
Total | 150 | 150 | 200 | 150 | 650 |
$ \frac{150}{650} \cdot \frac{349}{650} \cdot 650 = 80.54 $
Then in that "cell" of the table, we have
$\frac{(\text{observed}-\text{expected})^2}{\text{expected}} = \frac{(90-80.54)^2}{80.54}$.
The sum of these quantities over all of the cells is the test statistic.
Under the null hypothesis, it has approximately a chi-square distribution whose number of degrees of freedom are
$ (\text{number of rows}-1)(\text{number of columns}-1) = (3-1)(4-1) = 6. $
house = [ [ 90, 60, 104, 95 ], [ 30, 50, 51, 20 ], [30, 40, 45, 35] ]
chi2, p, ddof, expected = scipy.stats.chi2_contingency( house )
msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n"
print( msg.format( chi2, p, ddof ) )
print( expected )
Test Statistic: 24.5712028586 p-value: 0.00040984258611 Degrees of Freedom: 6 [[ 80.53846154 80.53846154 107.38461538 80.53846154] [ 34.84615385 34.84615385 46.46153846 34.84615385] [ 34.61538462 34.61538462 46.15384615 34.61538462]]
xi = np.array([1, 2, 3, 4, 5])
y = [3, 5, 9, 12, 18]
slope, intercept, r_value, p_value, std_err = stats.linregress(xi,y)
print slope, intercept, r_value, p_value
# plotting the line
y_fit = slope*xi+intercept
plt.scatter(xi,y)
plt.plot(xi,y_fit,'r-', label = '$Linear\,Fit$')
plt.xlabel('$x_i$', fontsize = 20)
plt.ylabel('$y$', fontsize = 20)
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()
3.7 -1.7 0.984655646513 0.00227643170424
import pandas as pd
data = {'xi':range(1, 12), 'y':[3, 5, 9, 12, 18, 22, 28, 35, 49, 60,65]}
df = pd.DataFrame(data)
pd.DataFrame({'A': range(5), 'B': np.random.randn(5)})
A | B | |
---|---|---|
0 | 0 | 1.101434 |
1 | 1 | -0.237321 |
2 | 2 | 1.033037 |
3 | 3 | 1.994974 |
4 | 4 | -0.839041 |
df.head()
xi | y | |
---|---|---|
0 | 1 | 3 |
1 | 2 | 5 |
2 | 3 | 9 |
3 | 4 | 12 |
4 | 5 | 18 |
df.tail()
xi | y | |
---|---|---|
6 | 7 | 28 |
7 | 8 | 35 |
8 | 9 | 49 |
9 | 10 | 60 |
10 | 11 | 65 |
import seaborn as sns
plt.show(sns.boxplot(df))
plt.show(sns.violinplot(df,widths=0.5))
plt.show(sns.distplot(df.y,rug=True,bins=15))
with sns.axes_style("white"):
plt.show(sns.jointplot(df.xi,df.y,kind="kde"))
plt.show(sns.lmplot("xi","y",df))
%reload_ext version_information
%version_information numpy, matplotlib
Software | Version |
---|---|
Python | 2.7.12 64bit [GCC 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.11.00)] |
IPython | 4.1.2 |
OS | Darwin 14.0.0 x86_64 i386 64bit |
numpy | 1.11.0 |
matplotlib | 1.5.1 |
Mon Oct 24 23:26:24 2016 CST |
http://nbviewer.jupyter.org/github/data-journalism/statsintro_python/tree/master/ipynb/