#!/usr/bin/env python
# coding: utf-8

# # HKBU Library 2019 Workshop @ HKBU
# ## Ep. 2 A flashtalk on data exploration
#
# - **Date & Venue**: 10 April 2019, 4/F HKBU Library
#
# - **Facilitator**: Dr. Xinzhi Zhang (JOUR, Hong Kong Baptist University, [MSc in AI & Digital Media](http://comd.hkbu.edu.hk/masters/en/aidm))
#     - xzzhang2@gmail.com || http://drxinzhizhang.com/ || https://github.com/xzzhang2 || @xin_zhi_zhang || https://scholar.google.com.hk/citations?user=iOFeIDIAAAAJ&hl=en
#
# - **Workshop Outcomes**
#     1. Installing Python
#     2. --> **Data exploration** <--
#     3. Importing and knowing your data
#         0. [Pandas](https://pandas.pydata.org/), [Matplotlib](https://matplotlib.org/), and [Seaborn](https://seaborn.pydata.org/)
#         1. Getting the attributes of your data
#         2. Case selection
#         3. Basic statistics
#         4. Pivot table
#     4. Data exploration: exploring the data by visualization
#         1. Univariate (unidimensional) and bivariate (two-way) data visualization
#         2. Multivariate (multidimensional) problem-driven data visualization and data-driven exploration

# In[ ]:


# # Importing and knowing your data
# - in this notebook, we will analyze a ready-made dataset by Gapminder https://www.gapminder.org/
# - more data can be found here: https://www.gapminder.org/data/

# In[ ]:


import pandas as pd


# In[ ]:


# a common way to import "ready-made" or harvested data, normally in csv format
# the data file used in this tutorial comes from: "https://storage.googleapis.com/learn_pd_like_tidyverse/gapminder.csv"
# special thanks to the original author for compiling and sharing this dataset
# NOTE: the path is relative, so run this from the directory that contains `data/`.
gapminder = pd.read_csv('data/gapminder_cleaned_2.csv')


# In[ ]:


# checking the type and taking a glance at the head
print(type(gapminder))
# A bare expression is only displayed in a notebook/REPL; as a plain script
# this line evaluates the first five rows and discards them.
gapminder.head(5)


# ## Examining the attributes of the Data Frame (standard procedures)
#
# - ```df.shape``` ("dim" in R)
# - ```df.columns``` (check the variables, like "names" in R)
# - ```df.index``` (check the index of the "rows")
# -
#   ```df.info()```
# - ```df.describe()``` (descriptive statistics for numerical variables)

# In[ ]:


gapminder.shape  # (the number of cases/observations, the number of variables)


# In[ ]:


gapminder.columns


# In[ ]:


gapminder.index


# In[ ]:


gapminder.info()


# In[ ]:


gapminder.describe()  # describe is a method: remember the () after it


# ## Case selection

# ### Selecting cases (rows/observations) using Boolean indexing

# In[ ]:


gapminder[gapminder['country'] == "China"]


# In[ ]:


# each condition needs its own parentheses: `&` binds tighter than `==`
gapminder[(gapminder['year'] == 2007) & (gapminder['continent'] == 'Asia')]


# ### using a variable list to select multiple variables

# In[ ]:


gapminder[['country', 'continent']].head(30)


# ## Basic statistics

# In[ ]:


# A quick way to get statistics
gapminder.describe()


# In[ ]:


# pick only the rows (statistics) and columns (variables) of interest
gapminder.describe().loc[['mean', 'std'], ['lifeExp', 'gdpPercap']]


# In[ ]:


gapminder[gapminder['continent'] == 'Asia'].describe().loc[['mean', 'std'], ['lifeExp', 'gdpPercap']]


# In[ ]:


gapminder[gapminder['continent'] == 'Africa'].describe().loc[['mean', 'std'], ['lifeExp', 'gdpPercap']]


# In[ ]:


gapminder[gapminder['country'] == 'China'].groupby(by='year')[['lifeExp', 'gdpPercap']].mean()


# ## Pivot table
# - A pivot table is a table of statistics that summarize the data of a more extensive table (such as from a database, spreadsheet, or business intelligence program). This summary might include sums, averages, or other statistics, which the pivot table groups together in a meaningful way (Wiki)
# - for more on Pivot table in Pandas, please check this tutorial carefully: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html

# In[ ]:


# The star import is kept so that the bare name `pivot_table` stays available.
# numpy is imported explicitly because the star import cannot be relied on to
# provide `np`: older pandas leaked it, current releases do not.
from pandas import *
import numpy as np


# In[ ]:


# values: "the dependent variable that you want to observe"
# index: the "row" - as an aggregated level variable
# columns: the "column"
# aggfunc is given by name; 'mean' is the aggregation np.mean resolved to.
table1 = pivot_table(
    gapminder,
    values='lifeExp',
    index=['country'],
    columns=['year'],
    aggfunc='mean',
)
table1.head(10)


# In[ ]:


# more variables in the index
# what is the difference between table 2a and table 2b?
# Aggregations are passed by name ('mean', 'min', 'max') instead of np.mean
# and the builtins: the result is the same, it does not depend on `np` being
# in scope, and it avoids pandas' deprecation of callable aggregators.
table2a = pd.pivot_table(
    gapminder,
    values='lifeExp',
    index=['country', 'continent'],
    columns=['year'],
    aggfunc={'lifeExp': 'mean'},
)
table2a.head()


# In[ ]:


# same data as table2a, but the index levels are nested the other way round
table2b = pd.pivot_table(
    gapminder,
    values='lifeExp',
    index=['continent', 'country'],
    columns=['year'],
    aggfunc={'lifeExp': 'mean'},
)
table2b.head()


# In[ ]:


# a dict aggfunc applies a different (list of) statistic(s) to each variable
table3 = pd.pivot_table(
    gapminder,
    values=['lifeExp', 'pop'],
    index=['continent'],
    columns=['year'],
    aggfunc={'lifeExp': 'mean', 'pop': ['min', 'max', 'mean']},
)
table3.head(5)


# In[ ]:


# # Data exploration and visualization
# ## Univariate and bivariate data visualization
# - for more data visualization with Python, please see this **must read** tutorial: https://pandas.pydata.org/pandas-docs/stable/visualization.html

# In[ ]:


import matplotlib.pyplot as plt
import seaborn as sns


# ### showing the trend: line chart

# In[ ]:


# selecting the country and plotting one single line
gapminder_China = gapminder[gapminder['country'] == 'China']
gapminder_China[['year', 'lifeExp']].plot(
    kind='line',
    x='year',
    y='lifeExp',
    title='Life expectancy across Year in China',
    legend=False,
)
plt.show()


# In[ ]:


# plot lines for "multiple groups"
gapminder_5asia = gapminder.loc[
    gapminder['country'].isin(
        ['China', 'Singapore', 'Korea, Rep.', 'Taiwan', 'Hong Kong, China']
    )
]
# one column per country, one row per year -> one line per country
gapminder_5asia_pivot = gapminder_5asia.pivot_table(
    values='lifeExp', columns='country', index='year'
)
gapminder_5asia_pivot.plot(title='Life Expectancies in five Asian societies')
plt.show()


# In[ ]:


# ### Showing the distribution: histogram

# In[ ]:


gapminder_2007 = gapminder[gapminder['year'] == 2007]
gapminder_2007[['pop', 'gdpPercap', 'lifeExp']].hist(bins=20)
plt.show()


# In[ ]:


# showing different groups
# one column per continent, so each continent is drawn as its own histogram
gapminder_continent_pivot = gapminder_2007.pivot_table(
    values='gdpPercap', columns='continent', index='country'
)
gapminder_continent_pivot.plot(
    kind='hist', alpha=0.5, bins=20, title='GDP Per Capita by Continent'
)
plt.show()


# ### more on distribution: box plot
# - note: what is a [box
#   plot](https://en.wikipedia.org/wiki/Box_plot)?

# In[ ]:


gapminder_continent_pivot.plot(kind='box', title='GDP Per Capita by Continent')
plt.show()


# ### Showing the correlation: scatter plot

# In[ ]:


gapminder_2007.plot(
    kind='scatter', x='gdpPercap', y='lifeExp', title='Wealth vs. Health in 2007'
)
plt.show()


# ### Showing the contrast: bar

# In[ ]:


# showing the grouped totals for one variable (population summed per continent)
# gapminder_2007 is the year == 2007 slice already used for the scatter plot
summarized_df = gapminder_2007.groupby(by='continent')['pop'].sum()
summarized_df.plot(kind='bar', rot=0)
plt.show()
# note: always use "color" with caution!


# In[ ]:


# showing the grouped means for multiple variables
summarized_df = gapminder_2007.groupby(by='continent')[['lifeExp', 'gdpPercap']].mean()
# one panel per variable, side by side; the x axes are independent because
# the two variables are on very different scales
summarized_df.plot(
    kind='barh',
    subplots=True,
    layout=(1, 2),
    sharex=False,
    sharey=True,
    legend=False,
)
plt.show()


# In[ ]:


# ## Multidimensional data visualization
# - a.k.a. problem-driven data visualization and data-driven exploration

# ### Q1: Globally, what is the relationship between life expectancy and GDP per capita?
# 1. Hint: what about a scatter plot?
# 2. Hint: try this: https://pythonspot.com/matplotlib-scatterplot/

# In[ ]:


import matplotlib.pyplot as plt
import numpy as np

gapminder.plot(kind='scatter', x='gdpPercap', y='lifeExp', title='Wealth vs. Health')
plt.show()


# ### Will the relationship stipulated in Q1 differ across different continents?
# In[ ]:


# number of rows per continent; the index holds the distinct continent names
continents = gapminder.groupby('continent').size()
print(continents.index)
print('num of continents:', len(continents))


# In[ ]:


import matplotlib.pyplot as plt
import matplotlib
import numpy as np

fig = plt.figure(figsize=(10, 20))  # create a new figure
fig.subplots_adjust(hspace=0.6)  # gaps between the figures
# One stacked panel per continent. The row count follows the data rather than
# a hard-coded 5, so the grid still fits if the set of continents changes.
n_panels = len(continents)
for n, con in enumerate(continents.index, start=1):
    ax = fig.add_subplot(n_panels, 1, n)
    ax.set_title(con)
    subset = gapminder[gapminder['continent'] == con]  # filter once per panel
    x = subset['gdpPercap']
    y = subset['lifeExp']
    ax.scatter(x, y, s=5)
    ax.set_xlabel('gdpPercap', fontsize=12)
    ax.set_ylabel('lifeExp', fontsize=12)
plt.show()


# ### Will the relationship stipulated in Q1 differ across different countries

# In[ ]:


countries = gapminder.groupby('country').size()
print(countries.index)
print('num of countries:', len(countries))


# In[ ]:


import math

fig = plt.figure(figsize=(20, 80))
fig.subplots_adjust(hspace=0.5, wspace=0.4)
# Six panels per row; the number of rows is derived from the number of
# countries (the fixed 25 x 6 grid used before fails beyond 150 countries).
n_cols = 6
n_rows = math.ceil(len(countries) / n_cols)
for n, coun in enumerate(countries.index, start=1):
    ax = fig.add_subplot(n_rows, n_cols, n)
    subset = gapminder[gapminder['country'] == coun]  # filter once per panel
    x = subset['gdpPercap']
    y = subset['lifeExp']
    ax.scatter(x, y, s=5)
    ax.set_xlabel('gdpPercap', fontsize=12)
    ax.set_ylabel('lifeExp', fontsize=12)
    ax.legend([coun], loc='upper right')  # the legend doubles as the panel title
plt.show()


# In[ ]:


# ## Challenge: The myth of social change
#
# A term called “the third-wave of democracy” was once proposed (see Samuel P. Huntington). It argues that starting from 1974, there is a trend of “major surge of democracy in history” in the world. However, it has long been debated whether “democracy” as a political belief or as a form of governance could actually bring social well-being (or whether the progress of social change is actually driven by other factors). Putting any ideological arguments or opinions aside, let’s see how the data speaks to us.
#
# The considerations are:
# 1.
#    suppose the year of 1974 is treated as a boundary (a milestone cut-off point), is there any difference in social well-being before and after that year?
# 2. If the general answer to 1. is “yes” (as the world is becoming healthier and richer), then which countries (or continents) benefit the most? (and which the least?)
#

# In[ ]:


import matplotlib.pyplot as plt

gapminder.continent.value_counts()


# In[ ]:


# The growth trend of life expectancy and GDP per capita in different continents
for i in ('lifeExp', 'gdpPercap'):
    # rows = years, columns = continents -> one line per continent
    gapminder_all_pivot_l = gapminder.pivot_table(
        values=i, columns='continent', index='year'
    )
    # the leading space keeps the variable name and the rest of the title apart
    gapminder_all_pivot_l.plot(title=i + ' in Different Continents')
    # mark 1974, the boundary year of the challenge
    plt.axvline(1974, color='r', linestyle='--', linewidth=1)
    plt.show()


# In[ ]:


import math

fig = plt.figure(figsize=(20, 80))
fig.subplots_adjust(hspace=0.5, wspace=0.4)
# Figure-level title: plt.title() at this point would create an extra,
# full-figure Axes underneath the grid of subplots.
fig.suptitle('Wealth Growth Trend in Different Countries', fontsize=15)
# Six panels per row; rows are derived from the number of countries instead of
# a fixed 25 x 6 grid, which fails beyond 150 countries.
n_cols = 6
n_rows = math.ceil(len(countries) / n_cols)
for n, coun in enumerate(countries.index, start=1):
    ax = fig.add_subplot(n_rows, n_cols, n)
    x = gapminder[gapminder['country'] == coun]
    # rows = years, single column = this country's GDP per capita
    gapminder_pivot_g = x.pivot_table(
        values='gdpPercap', columns='country', index='year'
    )
    ax.plot(gapminder_pivot_g)
    ax.set_xlabel('year', fontsize=12)
    ax.set_ylabel('gdpPercap', fontsize=12)
    ax.legend([coun], loc='upper right')  # the legend doubles as the panel title
    #ax.grid(True, which='major',axis='x')
plt.show()


# In[ ]:


# In[ ]:


# ## Further readings:
# 1. The Art of Effective Visualization of Multi-dimensional Data. Link: https://towardsdatascience.com/the-art-of-effective-visualization-of-multi-dimensional-data-6c7202990c57 (Chinese translation: https://mp.weixin.qq.com/s/mD732PqDtqYdFZSxZWtvvg)
# 2. Seaborn gallery: https://seaborn.pydata.org/ and https://seaborn.pydata.org/examples/index.html
# 3.
#    More topics: https://jakevdp.github.io/PythonDataScienceHandbook/04.14-visualization-with-seaborn.html

# ## **Acknowledgement**
#
# The code in this notebook is adapted from various sources, including the [official tutorial](http://pandas.pydata.org/pandas-docs/version/0.15/10min.html) and [tutorial 01](https://medium.com/datainpoint/%E5%BE%9E-pandas-%E9%96%8B%E5%A7%8B-python-%E8%88%87%E8%B3%87%E6%96%99%E7%A7%91%E5%AD%B8%E4%B9%8B%E6%97%85-8dee36796d4a).
#
# Miss He Can and Mr Xu Chen, both from the Department of Computer Science at Hong Kong Baptist University, helped to fine-tune and test the code. All code is for educational purposes only and released under the MIT licence.

# In[ ]: