#!/usr/bin/env python # coding: utf-8 # *** # *** # # Introduction to Statistical Thinking # *** # *** # # # #### Table of Contents # # 1. [Introduction](./introduction.py) # 2. A Crash Course in Python # 3. [Visualizing Data](./visualizing_data.py) # 4. [Linear Algebra](./linear_algebra.py) # 5. [Statistics](./statistics.py) # 6. [Probability](./probability.py) # 7. [Hypothesis and Inference](./hypothesis_and_inference.py) # 8. [Gradient Descent](./gradient_descent.py) # 9. [Getting Data](./getting_data.py) # 10. [Working With Data](./working_with_data.py) # 11. [Machine Learning](./machine_learning.py) # # # # 12. [k-Nearest Neighbors](./nearest_neighbors.py) # 13. [Naive Bayes](./naive_bayes.py) # 14. [Simple Linear Regression](./simple_linear_regression.py) # 15. [Multiple Regression](./multiple_regression.py) # 16. [Logistic Regression](./logistic_regression.py) # 17. [Decision Trees](./decision_trees.py) # 18. [Neural Networks](./neural_networks.py) # 19. [Clustering](./clustering.py) # 20. [Natural Language Processing](./natural_language_processing.py) # 21. [Network Analysis](./network_analysis.py) # 22. [Recommender Systems](./recommender_systems.py) # 23. [Databases and SQL](./databases.py) # 24. [MapReduce](./mapreduce.py) # 25. Go Forth And Do Data Science # # # **The School of Athens** by Raphael (1509–1510), fresco at the Apostolic Palace, Vatican City. https://en.wikipedia.org/wiki/Platonic_Academy # # # The School of Athens by Raphael (1509–1510), fresco at the Apostolic Palace, Vatican City. https://en.wikipedia.org/wiki/Platonic_Academy # # # # Plato & Typological Thinking # # - Pythagoras held that # - all things are number # - the cosmos comes from numerical principles. # # `The theory of Forms` or `theory of Ideas` is a philosophical theory, concept, or world-view, attributed to Plato, that the `physical world` is not as real or true as timeless, absolute, unchangeable `ideas`. 
# # # - 真实的知识存在于普遍而永恒的法则之中。 # - The physical world of becoming is an imitation of the mathematical world of being. # - the realm of Being 本质世界(理念世界) # - perfect, eternal, and changeless forms, # - sensible world of becoming 现实世界 # - imperfect # # # Social Physics: Taking Physics as a Role Model # - French social thinker **Henri de Saint-Simon** proposed in 1803 the idea of describing society using laws similar to those of the physical and biological sciences. # - His student and collaborator was **Auguste Comte**, a French philosopher, the founder of sociology, who first defined the term: # # > Social physics is that science which occupies itself with social phenomena, considered in the same light as astronomical, physical, chemical, and physiological phenomena, that is to say as being subject to **natural and invariable laws**, the discovery of which is the special object of its researches. # # - Computational Social Science # # https://en.wikipedia.org/wiki/Social_physics # # # **Lambert Adolphe Jacques Quetelet** (1796-1874) # - introducing statistical methods to the social sciences # - in his book titled **Essays on Social Physics**, # - the concept of the "average man" # - characterized by the mean values # - follow a normal distribution. # - He collected data about many such variables. # - developed the body mass index scale # # > “His goal was to understand the statistical laws underlying such phenomena as crime rates, marriage rates or suicide rates. He wanted to explain the values of these variables by other social factors”. # # # # # **Population Thinking** # # # **Charles Robert Darwin** (12 February 1809 – 19 April 1882) # - the science of evolution. # # > favourable variations would make organisms better at surviving and passing the variations on to their offspring, while unfavourable variations would be lost. # # - Variation is the basis of natural selection. 
# # > 在类型逻辑中平均数是主要的内容。在总体逻辑中重要的是差异,平均数只是总体的一个特征值,是探讨真实原因的手段,而不是原因本身。 # # Statisticism or Damn Lies # "统计至上主义"天真地以为统计学是科学方法的完备基础。 # - 改进测量工具 # - 研究设计、概念化 # # Duncan, O.D. 1984. Notes on Social Measurement, Historical and Critical. New York: Russell Sage Foundation, p.226. # # The Paradigm of Demography # # # Otis Dudley Duncan (1921-2004) 确立一种新的学术传统 # - 蔑视模仿自然科学试图寻找普遍规律的做法; # - 记录和理解真实人口中的经验模式是第一要务; # - 变异是人类社会的本质。 # - 柏拉图:变异是对本质世界的拙劣复制。 # # # The School of Athens by Raphael (1509–1510), fresco at the Apostolic Palace, Vatican City. https://en.wikipedia.org/wiki/Platonic_Academy # ## 本体论: 世界的本质 # # # # > “我认为自然科学是以“挖掘”本质的世界中的真理为最终目的,这也是其精华所在。而社会科学是以“了解”形成的世界为最终目的。历史上很多人想在社会科学领域找到一种真理,能够适用于各个方面,并且做过许多这方面的尝试。我认为社会科学不应该是这样的。在社会科学中,我们的目的是要了解现实世界,而不是去挖掘永恒的真理。这可能和你们的想象不一样。......既然差异是世界的本质,那差异就应该是研究的对象。” --- 谢宇 # # - 高尔顿认为凯特莱的社会物理学用处不大,普通人不是万能的。 # - 左手入冰,右手入火,平均温度? # - 高尔顿说(社会)科学的探索必须关注变异和共变。 # - variation & Co-variation # # # ## 本体论: 世界的本质 # The measurements have both # - a central tendency, or mean, and # # - a spread around this central value, or variance. # - In the late 1860s, Galton conceived of a measure to quantify normal variation: # - the **standard deviation**. # - "Regression to mediocrity" # # ## 认识论: 人类知识的起源、本质、方法及局限 # # 谢宇:“你到底能知道什么,你怎样认识世界。” # - 自然科学追求永恒真理,关注典型现象; # - 典型现象 & 平均人 # - 社会科学关注所有个案组成的总体的状况。 # ## 方法论: 使用什么方法 # # 谢宇:“社会科学之所以复杂,是因为我们运用的数据是通过观察所得,而观察所得的数据必然受到外来因素的影响,这些外来因素都可能解释你的数据。” # - 自然科学使用实验来隔离外来因素的影响; # - “社会科学可以使用统计排除一些外来影响,但你不能排除所有的外来因素”。 # ## Three Basic Principles of Social Science Research # # - Variability Principle # - Social Grouping Principle # - Social Context Principle # ------ # # Statistics for Describing Data # ------ # # The mathematics and techniques with which we understand data. # In[4]: from collections import Counter #from linear_algebra import sum_of_squares, dot import math import numpy as np import matplotlib.pyplot as plt # def dot(v, w): # """v_1 * w_1 + ... 
# Imports used by the code below.
from collections import Counter
import math

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
from scipy.stats import pearsonr


# `dot` and `sum_of_squares` are defined here because the
# `from linear_algebra import ...` line is disabled, while variance() and
# covariance() below call them.
def dot(v, w):
    """Return the dot product v_1 * w_1 + ... + v_n * w_n.

    Pairs are formed with zip(), so trailing elements of the longer
    sequence are ignored.
    """
    return sum(v_i * w_i for v_i, w_i in zip(v, w))


def sum_of_squares(v):
    """Return v_1 * v_1 + ... + v_n * v_n."""
    return dot(v, v)


# In[1]:


daily_minutes = [
    1,68.77,51.25,52.08,38.36,44.54,57.13,
    51.4,41.42,31.22,34.76,54.01,38.79,
    47.59,49.1,27.66,41.03,36.73,48.65,28.12,
    46.62,35.57,32.98,35,26.07,23.77,39.73,
    40.57,31.65,31.21,36.32,20.45,21.93,26.02,
    27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,
    32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,
    26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,
    38.11,38.18,36.31,21.03,30.86,36.07,28.66,
    29.08,37.28,15.28,24.17,22.31,30.17,25.53,
    19.85,35.37,44.6,17.23,13.47,26.33,35.02,
    32.09,24.81,19.33,28.77,24.26,31.98,25.73,
    24.86,16.28,34.51,15.23,39.72,40.8,26.06,
    35.76,34.76,16.13,44.04,18.03,19.65,32.62,
    35.59,39.43,14.18,35.24,40.13,41.82,35.45,
    36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,
    26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,
    18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,
    28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,
    36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,
    18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,
    32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,
    27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,
    19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,
    27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,
    9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,
    26.89,23.48,8.38,27.81,32.35,23.84,
]


# In[2]:


num_friends = [
    100,49,41,40,25,21,21,19,19,18,
    18,16,15,15,15,15,14,14,13,13,
    13,13,12,12,11,10,10,10,10,10,
    10,10,10,10,10,10,10,10,10,10,
    9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
    6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,
    5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,
    4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,
]


# ## Distribution and Histogram

# In[5]:


# Bucket the minutes by their integer part so they can be drawn as bars.
time_counts = Counter(map(int, daily_minutes))
xs = range(69)
ys = [time_counts[x] for x in xs]
plt.bar(xs, ys)
plt.axis([0, 69, 0, 14])
plt.title("Histogram of Time Counts")
plt.xlabel("# of Time")
plt.ylabel("# of people")
plt.show()


# In[6]:


friend_counts = Counter(num_friends)
xs = range(101)
ys = [friend_counts[x] for x in xs]
plt.bar(xs, ys)
plt.axis([0, 101, 0, 25])
plt.title("Histogram of Friend Counts")
plt.xlabel("# of friends")
plt.ylabel("# of people")
plt.show()


# We can also draw them with ``plt.hist``

# In[7]:


plt.hist(daily_minutes)
plt.xlabel('Daily minutes')
plt.ylabel('Frequency')
plt.show()


# In[8]:


plt.hist(num_friends, bins=30)
plt.xlabel("# of friends")
plt.ylabel('Frequency')
plt.show()


# Unfortunately, this chart is still too difficult to interpret.
# - So you start generating some statistics.

# ## From Max to Min

# In[9]:


num_points = len(num_friends)               # 204
largest_value = max(num_friends)            # 100
smallest_value = min(num_friends)           # 1
print(num_points, largest_value, smallest_value)


# In[10]:


sorted_values = sorted(num_friends)
smallest_value = sorted_values[0]           # 1
second_smallest_value = sorted_values[1]    # 1
second_largest_value = sorted_values[-2]    # 49


# ## Mean, Median, Mode, and Quantile

# In[11]:


def mean(x):
    """Return the arithmetic mean of x.

    Raises ZeroDivisionError when x is empty.
    """
    return sum(x) / len(x)


# In[12]:


print("mean(num_friends)", mean(num_friends))


# In[13]:


np.mean(num_friends)


# In[14]:


def median(v):
    """Return the 'middle-most' value of v.

    For an odd number of elements this is the middle element of the
    sorted data; for an even number it is the average of the two
    middle elements.
    """
    n = len(v)
    sorted_v = sorted(v)
    midpoint = n // 2

    if n % 2 == 1:
        # odd: a single middle value exists
        return sorted_v[midpoint]

    # even: average the two values around the middle
    lo = midpoint - 1
    hi = midpoint
    return (sorted_v[lo] + sorted_v[hi]) / 2


# In[15]:


print("median(num_friends)", median(num_friends))


# In[16]:


np.median(num_friends)


# In[17]:


def quantile(x, p):
    """Return the value of x located at fraction p of the sorted data.

    p is a fraction (0.25 is the first quartile). The position is
    int(p * len(x)), clamped to the last element so that p == 1.0
    returns the maximum instead of indexing past the end.
    """
    ordered = sorted(x)
    p_index = min(int(p * len(ordered)), len(ordered) - 1)
    return ordered[p_index]


# In[18]:


print("quantile(num_friends, 0.10)", quantile(num_friends, 0.10))
print("quantile(num_friends, 0.25)", quantile(num_friends, 0.25))
print("quantile(num_friends, 0.75)", quantile(num_friends, 0.75))
print("quantile(num_friends, 0.90)", quantile(num_friends, 0.90))


# In[19]:


np.percentile(num_friends, 75)


# In[20]:


def mode(x):
    """Return a list of the most frequent value(s); there may be several."""
    counts = Counter(x)
    max_count = max(counts.values())
    return [x_i for x_i, count in counts.items()
            if count == max_count]


# In[21]:


print("mode(num_friends)", mode(num_friends))


# In[22]:


np.argmax(np.bincount(num_friends))
# Only the first occurrence is returned.


# In[23]:


np.bincount(num_friends)


# In[24]:


stats.mode(num_friends, axis=None)


# In[25]:


def data_range(x):
    """Return the spread between the largest and smallest value of x."""
    return max(x) - min(x)


# In[26]:


print("data_range(num_friends)", data_range(num_friends))


# In[49]:


sns.set(style="ticks", palette="pastel")
sns.boxplot(y=daily_minutes)


# In[50]:


sns.set(style="ticks", palette="pastel")
sns.boxplot(y=num_friends)


# ## Variance and Standard Deviation

# $$\sigma = \sqrt{\frac{\sum_{i=1}^N (x_i - \overline{x})^2}{N-1} }$$
#
# $$\sigma ^ 2 = \frac{\sum_{i=1}^N (x_i - \overline{x})^2}{N-1} $$

# In[27]:


def de_mean(x):
    """Translate x by subtracting its mean so the result has mean 0."""
    x_bar = mean(x)
    return [x_i - x_bar for x_i in x]


# In[28]:


def variance(x):
    """Return the sample variance of x (divides by n - 1).

    Assumes x has at least two elements.
    """
    n = len(x)
    deviations = de_mean(x)
    return sum_of_squares(deviations) / (n - 1)


print("variance(num_friends)", variance(num_friends))


# In[29]:


# ddof=1 makes NumPy divide by N - 1, matching the formula above and
# variance(); the NumPy default (ddof=0) divides by N.
print(np.var(num_friends, ddof=1))


# In[30]:


def standard_deviation(x):
    """Return the sample standard deviation: the square root of variance(x)."""
    return math.sqrt(variance(x))


print("standard_deviation(num_friends)", standard_deviation(num_friends))


# In[31]:


# ddof=1 for the same reason as np.var above.
np.std(num_friends, ddof=1)


# ## Covariance, Correlation, and Scatter Plot

# In[32]:


def interquartile_range(x):
    """Return the difference between the 75% and 25% quantiles of x."""
    return quantile(x, 0.75) - quantile(x, 0.25)


print("interquartile_range(num_friends)", interquartile_range(num_friends))


# In[33]:


def covariance(x, y):
    """Return the sample covariance of x and y (divides by n - 1).

    n is taken from len(x); x and y are paired element by element.
    """
    n = len(x)
    return dot(de_mean(x), de_mean(y)) / (n - 1)


print("covariance(num_friends, daily_minutes)",
      covariance(num_friends, daily_minutes))


# In[34]:


np.cov(num_friends, daily_minutes)


# In[35]:


def correlation(x, y):
    """Return the correlation of x and y.

    This is covariance(x, y) divided by both standard deviations.
    Returns 0 when either input has no variation.
    """
    stdev_x = standard_deviation(x)
    stdev_y = standard_deviation(y)
    if stdev_x > 0 and stdev_y > 0:
        return covariance(x, y) / stdev_x / stdev_y
    return 0    # if no variation, correlation is zero


print("correlation(num_friends, daily_minutes)",
      correlation(num_friends, daily_minutes))


# In[38]:


plt.scatter(num_friends, daily_minutes, alpha=.1)
plt.xlabel('number of friends')
plt.ylabel('daily minutes')
plt.title('outliers')
plt.show()


# In[39]:


sns.set(style="white")
# x/y are passed by keyword: newer seaborn releases reject positional data.
g = sns.jointplot(x=num_friends, y=daily_minutes,
                  kind="kde", height=7, space=0)


# In[52]:


sns.set(style="ticks", palette="pastel")
sns.boxplot(x=num_friends, y=daily_minutes)
sns.despine(offset=10, trim=True)


# In[71]:


np.corrcoef(num_friends, daily_minutes)


# In[72]:


pearsonr(num_friends, daily_minutes)


# In[26]:


outlier = num_friends.index(100)    # index of outlier

# Drop the same position from both series so the pairs stay aligned.
num_friends_good = [x
                    for i, x in enumerate(num_friends)
                    if i != outlier]

daily_minutes_good = [x
                      for i, x in enumerate(daily_minutes)
                      if i != outlier]

print("correlation(num_friends_good, daily_minutes_good)",
      correlation(num_friends_good, daily_minutes_good))


# In[ ]: