#!/usr/bin/env python
# coding: utf-8

# # HKBU Library 2019 Workshop @ HKBU
# ## Ep. 2 A flashtalk on data exploration
#
# - **Date & Venue**: 10 April 2019, 4/F HKBU Library
#
# - **Facilitator**: Dr. Xinzhi Zhang (JOUR, Hong Kong Baptist University, [MSc in AI & Digital Media](http://comd.hkbu.edu.hk/masters/en/aidm))
#     - xzzhang2@gmail.com || http://drxinzhizhang.com/ || https://github.com/xzzhang2 || @xin_zhi_zhang || https://scholar.google.com.hk/citations?user=iOFeIDIAAAAJ&hl=en
#
# - **Workshop Outcomes**
#     1. Installing Python
#     2. --> **Data exploration** <--
#     3. Importing and knowing your data
#         0. [Pandas](https://pandas.pydata.org/), [Matplotlib](https://matplotlib.org/), and [Seaborn](https://seaborn.pydata.org/)
#         1. Getting the attributes of your data
#         2. Case selection
#         3. Basic statistics
#         4. Pivot table
#     4. Data exploration: exploring the data by visualization
#         1. Univariate (unidimensional) and bivariate (two-way) data visualization
#         2. Multivariate (multidimensional) problem-driven data visualization and data-driven exploration
#
# NOTE: this file is a notebook export. Bare expressions such as
# `gapminder.head(5)` only display their value in an interactive session
# (Jupyter / IPython); run as a plain script they are evaluated and discarded.

# In[ ]:


# # Importing and knowing your data
# - in this notebook, we will analyze a ready-made dataset by Gapminder https://www.gapminder.org/
# - more data can be found here: https://www.gapminder.org/data/

# In[ ]:


import pandas as pd


# In[ ]:


# A common way to import "ready-made" or harvested data, normally in csv format.
# The data file used in this tutorial is from this tutorial: "https://storage.googleapis.com/learn_pd_like_tidyverse/gapminder.csv"
# Special thanks to the original author for compiling and sharing this dataset.
gapminder = pd.read_csv('data/gapminder_cleaned_2.csv')


# In[ ]:


# check the type and take a glance at the first rows
print(type(gapminder))
gapminder.head(5)


# ## Examining the attributes of the Data Frame (standard procedures)
#
# - ```df.shape``` ("dim" in R)
# - ```df.columns``` (check the variables, like "names" in R)
# - ```df.index``` (check the index of the "rows")
# - ```df.info()```
# - ```df.describe()``` (descriptive statistics for numerical variables)

# In[ ]:


gapminder.shape  # (the number of cases/observations, the number of variables)


# In[ ]:


gapminder.columns


# In[ ]:


gapminder.index


# In[ ]:


gapminder.info()


# In[ ]:


gapminder.describe()  # describe is a method: remember the () after it


# ## Case selection

# ### Selecting cases (rows/observations) using Boolean indexing

# In[ ]:


gapminder[gapminder['country'] == "China"]


# In[ ]:


# combine conditions with & and wrap each condition in parentheses
gapminder[(gapminder['year'] == 2007) & (gapminder['continent'] == 'Asia')]


# ### using a variable list to select multiple variables

# In[ ]:


gapminder[['country', 'continent']].head(30)


# ## Basic statistics

# In[ ]:


# A quick way to get statistics
gapminder.describe()


# In[ ]:


# keep only the mean/std rows and the two columns of interest
gapminder.describe().loc[['mean', 'std'], ['lifeExp', 'gdpPercap']]


# In[ ]:


gapminder[gapminder['continent'] == 'Asia'].describe().loc[['mean', 'std'], ['lifeExp', 'gdpPercap']]


# In[ ]:


gapminder[gapminder['continent'] == 'Africa'].describe().loc[['mean', 'std'], ['lifeExp', 'gdpPercap']]


# In[ ]:


gapminder[gapminder['country'] == 'China'].groupby(by='year')[['lifeExp', 'gdpPercap']].mean()


# ## Pivot table
# - A pivot table is a table of statistics that summarize the data of a more extensive table (such as from a database, spreadsheet, or business intelligence program). This summary might include sums, averages, or other statistics, which the pivot table groups together in a meaningful way (Wiki)
# - for more on Pivot table in Pandas, please check this tutorial carefully: https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html

# In[ ]:


# Import the names the pivot-table cells use explicitly instead of relying on
# `from pandas import *`: a wildcard import hides where `pivot_table` comes
# from, and `np` is not guaranteed to be provided by it.
import numpy as np
from pandas import pivot_table


# In[ ]:


# values: "the dependent variable that you want to observe"
# index: the "row" - as an aggregated level variable
# columns: the "column"
# aggfunc: passed as the string 'mean' (the pandas-native spelling of the
# same aggregation that np.mean requested)
table1 = pivot_table(gapminder, values='lifeExp', index=['country'], columns=['year'], aggfunc='mean')
table1.head(10)


# In[ ]:


# more variables in the index
# what is the difference between table 2a and table 2b?
# Two index variables: the order of the list decides which one becomes the
# outer level of the row index.
# String aggregator names ('mean', 'min', 'max') are used so these cells do not
# depend on `np` or on a wildcard import, and `pd.pivot_table` is called through
# the `pd` alias imported at the top of the file.
table2a = pd.pivot_table(gapminder, values='lifeExp', index=['country', 'continent'], columns=['year'], aggfunc={'lifeExp': 'mean'})
table2a.head()


# In[ ]:


table2b = pd.pivot_table(gapminder, values='lifeExp', index=['continent', 'country'], columns=['year'], aggfunc={'lifeExp': 'mean'})
table2b.head()


# In[ ]:


# several values at once, each with its own aggregation(s)
table3 = pd.pivot_table(gapminder, values=['lifeExp', 'pop'], index=['continent'], columns=['year'], aggfunc={'lifeExp': 'mean', 'pop': ['min', 'max', 'mean']})
table3.head(5)


# In[ ]:


# # Data exploration and visualization

# ## Univariate and bivariate data visualization
# - for more data visualization with Python, please see this **must read** tutorial: https://pandas.pydata.org/docs/user_guide/visualization.html

# In[ ]:


import matplotlib.pyplot as plt
import seaborn as sns


# ### showing the trend: line chart

# In[ ]:


# selecting one country and plotting a single line
gapminder_China = gapminder[gapminder['country'] == 'China']
gapminder_China[['year', 'lifeExp']].plot(kind='line', x='year', y='lifeExp', title='Life expectancy across years in China', legend=False)
plt.show()


# In[ ]:


# plot lines for multiple groups: pivot so that each country becomes a column,
# then DataFrame.plot draws one line per column
gapminder_5asia = gapminder.loc[gapminder['country'].isin(['China', 'Singapore', 'Korea, Rep.', 'Taiwan', 'Hong Kong, China'])]
gapminder_5asia_pivot = gapminder_5asia.pivot_table(values='lifeExp', columns='country', index='year')
gapminder_5asia_pivot.plot(title='Life Expectancies in five Asian societies')
plt.show()


# In[ ]:


# ### Showing the distribution: histogram

# In[ ]:


gapminder_2007 = gapminder[gapminder['year'] == 2007]
gapminder_2007[['pop', 'gdpPercap', 'lifeExp']].hist(bins=20)
plt.show()


# In[ ]:


# showing different groups: one column (and therefore one histogram) per continent
gapminder_continent_pivot = gapminder_2007.pivot_table(values='gdpPercap', columns='continent', index='country')
gapminder_continent_pivot.plot(kind='hist', alpha=0.5, bins=20, title='GDP Per Capita by Continent')
plt.show()


# ### more on distribution: box plot
# - note: what is a [box plot](https://en.wikipedia.org/wiki/Box_plot)?

# In[ ]:


gapminder_continent_pivot.plot(kind='box', title='GDP Per Capita by Continent')
plt.show()


# ### Showing the correlation: scatter plot

# In[ ]:


gapminder_2007.plot(kind='scatter', x='gdpPercap', y='lifeExp', title='Wealth vs. Health in 2007')
plt.show()


# ### Showing the contrast: bar

# In[ ]:


# showing the grouped total (sum) of one variable: population per continent in 2007
# (reuses the gapminder_2007 subset built for the histogram above)
summarized_df = gapminder_2007.groupby(by='continent')['pop'].sum()
summarized_df.plot(kind='bar', rot=0)
plt.show()
# note: always use "color" with caution!


# In[ ]:


# showing the grouped means for multiple variables, one horizontal-bar panel each
summarized_df = gapminder_2007.groupby(by='continent')[['lifeExp', 'gdpPercap']].mean()
summarized_df.plot(kind='barh', subplots=True, layout=(1, 2), sharex=False, sharey=True, legend=False)
plt.show()


# In[ ]:


# ## Multidimensional data visualization
# - a.k.a. problem-driven data visualization and data-driven exploration

# ### Q1: Globally, what is the relationship between life expectancy and GDP per capita?
# 1. Hint: what about a scatter plot?
# 2. Hint: try this: https://pythonspot.com/matplotlib-scatterplot/

# In[ ]:


import matplotlib.pyplot as plt
import numpy as np

gapminder.plot(kind='scatter', x='gdpPercap', y='lifeExp', title='Wealth vs. Health')
plt.show()


# ### Will the relationship stipulated in Q1 differ across different continents?
# In[ ]:


# number of rows per continent; the index holds the distinct continent names
continents = gapminder.groupby('continent').size()
print(continents.index)
print('num of continents:', len(continents))


# In[ ]:


import math

import matplotlib.pyplot as plt
import matplotlib
import numpy as np

fig = plt.figure(figsize=(10, 20))  # create a new figure
fig.subplots_adjust(hspace=0.6)  # gaps between the figures
# One panel per continent, stacked vertically. The number of rows follows the
# data instead of being hard-coded, and iterating the groupby hands us each
# continent's rows directly (no repeated boolean filtering, no manual counter).
for position, (con, con_rows) in enumerate(gapminder.groupby('continent'), start=1):
    ax = fig.add_subplot(len(continents), 1, position)
    ax.set_title(con)
    ax.scatter(con_rows['gdpPercap'], con_rows['lifeExp'], s=5)
    ax.set_xlabel('gdpPercap', fontsize=12)
    ax.set_ylabel('lifeExp', fontsize=12)
plt.show()


# ### Will the relationship stipulated in Q1 differ across different countries?

# In[ ]:


countries = gapminder.groupby('country').size()
print(countries.index)
print('num of countries:', len(countries))


# In[ ]:


# Small-multiples grid: 6 panels per row, with as many rows as the number of
# countries requires (a fixed 25 x 6 grid would fail beyond 150 countries).
grid_cols = 6
grid_rows = math.ceil(len(countries) / grid_cols)

fig = plt.figure(figsize=(20, 80))
fig.subplots_adjust(hspace=0.5, wspace=0.4)
for position, (coun, coun_rows) in enumerate(gapminder.groupby('country'), start=1):
    ax = fig.add_subplot(grid_rows, grid_cols, position)
    ax.scatter(coun_rows['gdpPercap'], coun_rows['lifeExp'], s=5)
    ax.set_xlabel('gdpPercap', fontsize=12)
    ax.set_ylabel('lifeExp', fontsize=12)
    ax.legend([coun], loc='upper right')  # the legend doubles as the panel title
plt.show()


# In[ ]:


# ## Challenge: The myth of social change
#
# A term called “the third-wave of democracy” was once proposed (see Samuel P. Huntington). It argues that starting from 1974, there is a trend of “major surge of democracy in history” in the world. However, it has long been debated whether “democracy”, as a political belief or as a form of governance, could actually bring social well-being (or whether the progress of social change is actually driven by other factors). Putting any ideological arguments or opinions aside, let’s see how the data speaks to us.
#
# The considerations are:
# 1. Suppose the year 1974 is treated as a boundary (a milestone cut-off point): is there any difference in social well-being before and after that year?
# 2. If the general answer to 1. is “yes” (as the world is becoming healthier and richer), then which countries (or continents) benefit the most? (And which the least?)
#

# In[ ]:


import matplotlib.pyplot as plt

gapminder.continent.value_counts()


# In[ ]:


# The growth trend of life expectancy and GDP per capita in different continents
for indicator in ('lifeExp', 'gdpPercap'):
    gapminder_all_pivot_l = gapminder.pivot_table(values=indicator, columns='continent', index='year')
    # the leading space keeps the title readable ("lifeExp in ...", not "lifeExpin ...")
    gapminder_all_pivot_l.plot(title=indicator + ' in Different Continents')
    # dashed red line marking the 1974 cut-off year discussed above
    plt.axvline(1974, color='r', linestyle='--', linewidth=1)
    plt.show()


# In[ ]:


grid_cols = 6
grid_rows = math.ceil(len(countries) / grid_cols)

fig = plt.figure(figsize=(20, 80))
fig.subplots_adjust(hspace=0.5, wspace=0.4)
# Figure-level title: plt.title() here would first create an extra, empty
# full-figure axes behind the grid, because the figure has no axes yet.
fig.suptitle('Wealth Growth Trend in Different Countries', fontsize=15)
for position, (coun, coun_rows) in enumerate(gapminder.groupby('country'), start=1):
    ax = fig.add_subplot(grid_rows, grid_cols, position)
    # mean gdpPercap per year for this country (a single-column table)
    gapminder_pivot_g = coun_rows.pivot_table(values='gdpPercap', columns='country', index='year')
    ax.plot(gapminder_pivot_g)
    ax.set_xlabel('year', fontsize=12)
    ax.set_ylabel('gdpPercap', fontsize=12)
    ax.legend([coun], loc='upper right')  # the legend doubles as the panel title
plt.show()


# In[ ]:


# In[ ]:


# ## Further readings:
# 1. The Art of Effective Visualization of Multi-dimensional Data. Link: https://towardsdatascience.com/the-art-of-effective-visualization-of-multi-dimensional-data-6c7202990c57 (Chinese translation: https://mp.weixin.qq.com/s/mD732PqDtqYdFZSxZWtvvg)
# 2. Seaborn gallery: https://seaborn.pydata.org/ and https://seaborn.pydata.org/examples/index.html
# 3. More topics: https://jakevdp.github.io/PythonDataScienceHandbook/04.14-visualization-with-seaborn.html

# ## **Acknowledgement**
#
# The code in this notebook is modified from various sources, including the [official tutorial](http://pandas.pydata.org/pandas-docs/version/0.15/10min.html) and [tutorial 01](https://medium.com/datainpoint/%E5%BE%9E-pandas-%E9%96%8B%E5%A7%8B-python-%E8%88%87%E8%B3%87%E6%96%99%E7%A7%91%E5%AD%B8%E4%B9%8B%E6%97%85-8dee36796d4a).
#
# Miss He Can and Mr Xu Chen, both from the Department of Computer Science at Hong Kong Baptist University, helped to fine-tune and test the code.