Data Visualization Tools

In this project we will cover:

  • Area plots
  • Histograms
  • Bar Charts
  • Pie Charts
  • Box Plots
  • Scatter Plots
  • Waffle Charts
  • Word Clouds

## The dataset

For the sake of simplicity we will use a simple dataset: the Canadian immigration dataset, which reports immigration data from 1980 to 2013.
In [52]:
# import libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches # needed for waffle Charts
from PIL import Image # converting images into arrays
# use the inline backend to generate the plots within the browser
%matplotlib inline 

# ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
In [2]:
# Load the 'Canada by Citizenship' worksheet of the workbook into a DataFrame.
# skiprows drops the first 20 rows of the sheet and skipfooter the last 2,
# so only the rows in between are parsed.
df = pd.read_excel(
    'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DV0101EN/labs/Data_Files/Canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)
In [3]:
# peek at the first rows to see which columns the sheet provides
df.head()
Out[3]:
Type Coverage OdName AREA AreaName REG RegName DEV DevName 1980 ... 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
0 Immigrants Foreigners Afghanistan 935 Asia 5501 Southern Asia 902 Developing regions 16 ... 2978 3436 3009 2652 2111 1746 1758 2203 2635 2004
1 Immigrants Foreigners Albania 908 Europe 925 Southern Europe 901 Developed regions 1 ... 1450 1223 856 702 560 716 561 539 620 603
2 Immigrants Foreigners Algeria 903 Africa 912 Northern Africa 902 Developing regions 80 ... 3616 3626 4807 3623 4005 5393 4752 4325 3774 4331
3 Immigrants Foreigners American Samoa 909 Oceania 957 Polynesia 902 Developing regions 0 ... 0 0 1 0 0 0 0 0 0 0
4 Immigrants Foreigners Andorra 908 Europe 925 Southern Europe 901 Developed regions 0 ... 0 0 1 1 0 0 0 0 1 1

5 rows × 43 columns

In [4]:
# columns that add nothing to the visualisations below
uninformative_cols = ['AREA', 'REG', 'DEV', 'Type', 'Coverage']
df = df.drop(columns=uninformative_cols)
# sanity check
df.head()
Out[4]:
OdName AreaName RegName DevName 1980 1981 1982 1983 1984 1985 ... 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
0 Afghanistan Asia Southern Asia Developing regions 16 39 39 47 71 340 ... 2978 3436 3009 2652 2111 1746 1758 2203 2635 2004
1 Albania Europe Southern Europe Developed regions 1 0 0 0 0 0 ... 1450 1223 856 702 560 716 561 539 620 603
2 Algeria Africa Northern Africa Developing regions 80 67 71 69 63 44 ... 3616 3626 4807 3623 4005 5393 4752 4325 3774 4331
3 American Samoa Oceania Polynesia Developing regions 0 1 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
4 Andorra Europe Southern Europe Developed regions 0 0 0 0 0 0 ... 0 0 1 1 0 0 0 0 1 1

5 rows × 38 columns

In [5]:
# give the descriptive columns self-explanatory names
clearer_names = {
    'OdName': 'Country',
    'AreaName': 'Continent',
    'RegName': 'Region',
}
df = df.rename(columns=clearer_names)
# sanity check
df.head()
Out[5]:
Country Continent Region DevName 1980 1981 1982 1983 1984 1985 ... 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
0 Afghanistan Asia Southern Asia Developing regions 16 39 39 47 71 340 ... 2978 3436 3009 2652 2111 1746 1758 2203 2635 2004
1 Albania Europe Southern Europe Developed regions 1 0 0 0 0 0 ... 1450 1223 856 702 560 716 561 539 620 603
2 Algeria Africa Northern Africa Developing regions 80 67 71 69 63 44 ... 3616 3626 4807 3623 4005 5393 4752 4325 3774 4331
3 American Samoa Oceania Polynesia Developing regions 0 1 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
4 Andorra Europe Southern Europe Developed regions 0 0 0 0 0 0 ... 0 0 1 1 0 0 0 0 1 1

5 rows × 38 columns

In [6]:
# index the frame by country so that a country's row can be fetched with .loc[name]
df = df.set_index('Country')
In [7]:
# Add a 'Total' column: the immigrants of each country summed over all years.
# numeric_only=True restricts the row sum to the numeric columns; the frame
# still holds text columns (Continent, Region, DevName), and on pandas >= 2.0
# a plain df.sum(axis=1) tries to add those strings to the yearly counts and fails.
# A 'Total' column left over from an earlier run of this cell is excluded first,
# so re-running the cell does not add the old total into the new one.
df['Total'] = df.drop(columns='Total', errors='ignore').sum(axis=1, numeric_only=True)
# check
df.head()
Out[7]:
Continent Region DevName 1980 1981 1982 1983 1984 1985 1986 ... 2005 2006 2007 2008 2009 2010 2011 2012 2013 Total
Country
Afghanistan Asia Southern Asia Developing regions 16 39 39 47 71 340 496 ... 3436 3009 2652 2111 1746 1758 2203 2635 2004 58639
Albania Europe Southern Europe Developed regions 1 0 0 0 0 0 1 ... 1223 856 702 560 716 561 539 620 603 15699
Algeria Africa Northern Africa Developing regions 80 67 71 69 63 44 69 ... 3626 4807 3623 4005 5393 4752 4325 3774 4331 69439
American Samoa Oceania Polynesia Developing regions 0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 6
Andorra Europe Southern Europe Developed regions 0 0 0 0 0 0 2 ... 0 1 1 0 0 0 0 1 1 15

5 rows × 38 columns

In [8]:
# just for fun: rank the countries by their overall total and look at the top 5
df = df.sort_values(by='Total', ascending=False)
df.head()
Out[8]:
Continent Region DevName 1980 1981 1982 1983 1984 1985 1986 ... 2005 2006 2007 2008 2009 2010 2011 2012 2013 Total
Country
India Asia Southern Asia Developing regions 8880 8670 8147 7338 5704 4211 7150 ... 36210 33848 28742 28261 29456 34235 27509 30933 33087 691904
China Asia Eastern Asia Developing regions 5123 6682 3308 1863 1527 1816 1960 ... 42584 33518 27642 30037 29622 30391 28502 33024 34129 659962
United Kingdom of Great Britain and Northern Ireland Europe Northern Europe Developed regions 22045 24796 20620 10015 10170 9564 9470 ... 7258 7140 8216 8979 8876 8724 6204 6195 5827 551500
Philippines Asia South-Eastern Asia Developing regions 6051 5921 5249 4562 3801 3150 4166 ... 18139 18400 19837 24887 28573 38617 36765 34315 29544 511391
Pakistan Asia Southern Asia Developing regions 978 972 1201 900 668 514 691 ... 14314 13127 10124 8994 7217 6811 7468 11227 12603 241600

5 rows × 38 columns

In [9]:
# The year columns are awkward to spell out one by one, so build the list of
# year labels once. range() excludes its stop value, hence 2014 to reach 2013.
years = [year for year in range(1980, 2014)]

Starting Simple: Area Plots

In [10]:
# take the five leading rows, keep only the year columns and flip the frame
# so that every year becomes a row and every country a column
df_top5 = df.iloc[:5][years].T

# check
df_top5.head()
Out[10]:
Country India China United Kingdom of Great Britain and Northern Ireland Philippines Pakistan
1980 8880 5123 22045 6051 978
1981 8670 6682 24796 5921 972
1982 8147 3308 20620 5249 1201
1983 7338 1863 10015 4562 900
1984 5704 1527 10170 3801 668
In [11]:
# the year labels end up on the x axis, so make sure they are integers
df_top5.index = df_top5.index.map(int)

# unstacked, semi-transparent area plot; pandas returns the axes it drew on,
# which is then labelled directly
ax = df_top5.plot(
    kind='area',
    stacked=False,
    alpha=.25,
    figsize=(20, 10),  # (width, height) of the figure
)
ax.set_title('Immigration Trend of Top 5 Countries')
ax.set_ylabel('Number of Immigrants')
ax.set_xlabel('Years')

plt.show()

Now check Histograms!

In [12]:
# distribution of the number of immigrants per country in 1999
df[1999].plot(kind='hist', bins=25, figsize=(8, 5))

# the title names the year that is actually plotted (1999; it used to say 2013)
plt.title('Histogram of Immigration from 195 Countries in 1999')
plt.ylabel('Number of Countries')    # how many countries fall into each bin
plt.xlabel('Number of Immigrants')   # bin position

plt.show()

Next: Bar Charts

In [13]:
# Iceland's row restricted to the year columns; .loc with a single row label
# and a list of column labels returns a Series indexed by year
df_iceland = df.loc['Iceland', years]
df_iceland.head()
Out[13]:
1980    17
1981    33
1982    10
1983     9
1984    13
Name: Iceland, dtype: object
In [14]:
# one bar per year; label the axes returned by pandas directly
ax = df_iceland.plot(kind='bar', figsize=(10, 6))
ax.set_xlabel('Year')
ax.set_ylabel('Number of immigrants')
ax.set_title('Icelandic immigrants to Canada from 1980 to 2013')

plt.show()

Now, let's look at something more advanced: Pie Chart

In [15]:
# Group by continent and sum all immigrants.
# numeric_only=True keeps the aggregation to the year columns and 'Total';
# pandas >= 2.0 no longer drops the text columns (Region, DevName) on its own
# and would otherwise concatenate their strings. The axis=0 keyword was the
# default and is deprecated, so it is omitted.
df_cont = df.groupby('Continent').sum(numeric_only=True)
# check
df_cont.head()
Out[15]:
1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 ... 2005 2006 2007 2008 2009 2010 2011 2012 2013 Total
Continent
Africa 3951 4363 3819 2671 2639 2650 3782 7494 7552 9894 ... 27523 29188 28284 29890 34534 40892 35441 38083 38543 618948
Asia 31025 34314 30214 24696 27274 23850 28739 43203 47454 60256 ... 159253 149054 133459 139894 141434 163845 146894 152218 155075 3317794
Europe 39760 44802 42720 24638 22287 20844 24370 46698 54726 60893 ... 35955 33053 33495 34692 35078 33425 26778 29177 28691 1410947
Latin America and the Caribbean 13081 15215 16769 15427 13678 15171 21179 28471 21924 25060 ... 24747 24676 26011 26547 26867 28818 27856 27173 24950 765148
Northern America 9378 10030 9074 7100 6661 6543 7074 7705 6469 6790 ... 8394 9613 9463 10190 8995 8142 7677 7892 8503 241142

5 rows × 35 columns

In [16]:
# one wedge per continent, sized by its total over all years
df_cont['Total'].plot(kind='pie', figsize=(10, 10))

# title typo fixed: 'Continet' -> 'Continent'
plt.title('Immigration to Canada by Continent (1980-2013)')

plt.show()

Now Box plot

In [17]:
# Selecting with a one-element list of row labels keeps the result a DataFrame;
# transposing it gives one row per year and a single 'Japan' column.
japan_by_year = df.loc[['Japan'], years]
df_japan = japan_by_year.T
# check
df_japan.head()
Out[17]:
Country Japan
1980 701
1981 756
1982 598
1983 309
1984 246
In [18]:
# box plot of the yearly number of immigrants from Japan
df_japan.plot(kind='box')

# title typo fixed: 'Imigration' -> 'Immigration'
plt.title('Box Plot of Japanese Immigration from 1980 to 2013')
plt.ylabel('Number of Immigrants')

plt.show()

Scatter Plots

In [19]:
# total immigration per year, summed over all countries
yearly_totals = df[years].sum()

# turn the Series into a two-column frame: the year labels move out of the
# index into a column, then both columns get readable names
df_total = yearly_totals.to_frame().reset_index(level=0)
df_total = df_total.rename(columns={'index': 'years', 0: 'Total'})

# check
df_total.head()
Out[19]:
years Total
0 1980 99137
1 1981 110563
2 1982 104271
3 1983 75550
4 1984 73417
In [20]:
# year on the x axis, total number of immigrants on the y axis
ax = df_total.plot(kind="scatter", x='years', y='Total')
ax.set_title(' Total Immigrant population to Canada from 1980 to 2013')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Immigrants')
plt.show()

Waffle Charts

A Waffle Chart can be used to easily visualize data in relation to a whole or to highlight progress against a given threshold.

In [26]:
# the three countries compared in the waffle chart, with all of their columns
waffle_countries = ['Denmark', 'Norway', 'Sweden']
df_sub = df.loc[waffle_countries, :]
df_sub
Out[26]:
Continent Region DevName 1980 1981 1982 1983 1984 1985 1986 ... 2005 2006 2007 2008 2009 2010 2011 2012 2013 Total
Country
Denmark Europe Northern Europe Developed regions 272 293 299 106 93 73 93 ... 62 101 97 108 81 92 93 94 81 3901
Norway Europe Northern Europe Developed regions 116 77 106 51 31 54 56 ... 57 53 73 66 75 46 49 53 59 2327
Sweden Europe Northern Europe Developed regions 281 308 222 176 128 158 187 ... 205 139 193 165 167 159 134 140 140 5866

3 rows × 38 columns

In [33]:
# Step 1: determine proportion of each category with respect to the total

# grand total over the selected countries
tot_val = sum(df_sub['Total'])

# share of the grand total contributed by each country
cat_pro = []
for value in df_sub['Total']:
    cat_pro.append(float(value) / tot_val)

# check proportion
for country, share in zip(df_sub.index.values, cat_pro):
    print(country + ": " + str(round(share, 3)))
Denmark: 0.323
Norway: 0.192
Sweden: 0.485
In [34]:
# Step 2: define overall size of the waffle chart
width, height = 40, 10

# total number of tiles in the grid
tot_tiles = height * width
In [36]:
# Step 3: using the proportions, determine the number of tiles for each category
tiles_per_cat = []
for share in cat_pro:
    # each category gets its share of the grid, rounded to whole tiles
    tiles_per_cat.append(round(share * tot_tiles))

# check number of tiles per category
for country, tiles in zip(df_sub.index.values, tiles_per_cat):
    print(country + ": " + str(tiles))
Denmark: 129
Norway: 77
Sweden: 194
In [37]:
# Step 4: create a matrix that resembles the waffle and populate it

# init the waffle chart as an empty matrix
waffle_chart = np.zeros((height, width))

# tile_bounds[k] is the number of tiles used up by the first k categories;
# computing it once avoids re-summing the list for every single tile
tile_bounds = [0]
for n_tiles in tiles_per_cat:
    tile_bounds.append(tile_bounds[-1] + n_tiles)

# define indices to loop through the waffle chart
cat_index = 0
tile_index = 0

# populate the waffle column by column
for col in range(width):
    for row in range(height):
        tile_index += 1

        # Move on while the current category has used up its tiles.
        # The while loop skips categories that were allocated zero tiles, and
        # the bound on cat_index keeps leftover tiles (rounded counts that sum
        # to less than the grid) in the last category instead of numbering
        # them as categories that do not exist.
        while cat_index < len(tiles_per_cat) and tile_index > tile_bounds[cat_index]:
            cat_index += 1

        # set class value as integer, which increases with class (1-based)
        waffle_chart[row, col] = cat_index
In [43]:
# Step 5: map waffle chart to visual

# plt.matshow opens its own figure, so no plt.figure() call precedes it;
# an extra one only leaves an empty "Figure ... with 0 Axes" in the output
colormap = plt.cm.coolwarm
plt.matshow(waffle_chart, cmap=colormap)
plt.colorbar()
plt.show()
<Figure size 432x288 with 0 Axes>
In [45]:
# Step 6: make the waffle chart better

# plt.matshow opens its own figure, so no plt.figure() call precedes it;
# an extra one only leaves an empty "Figure ... with 0 Axes" in the output
colormap = plt.cm.coolwarm
plt.matshow(waffle_chart, cmap=colormap)
plt.colorbar()

# get the axis
ax = plt.gca()

# set minor ticks on the boundaries between tiles
ax.set_xticks(np.arange(-.5, (width), 1), minor=True)
ax.set_yticks(np.arange(-.5, (height), 1), minor=True)

# add white gridlines based on the minor ticks to separate the tiles
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)

# remove the major ticks and their labels
plt.xticks([])
plt.yticks([])

plt.show()
<Figure size 432x288 with 0 Axes>
In [55]:
# Step 7: create a legend for the chart

# plt.matshow opens its own figure, so no plt.figure() call precedes it.
# The returned image is kept: it knows how tile values are mapped to colours.
colormap = plt.cm.coolwarm
image = plt.matshow(waffle_chart, cmap=colormap)
plt.colorbar()

# get the axis
ax = plt.gca()

# set minor ticks on the boundaries between tiles
ax.set_xticks(np.arange(-.5, (width), 1), minor=True)
ax.set_yticks(np.arange(-.5, (height), 1), minor=True)

# add white gridlines based on the minor ticks to separate the tiles
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)

# remove the major ticks and their labels
plt.xticks([])
plt.yticks([])

# create legend
legend_handles = []
# The k-th category is stored in waffle_chart as the value k (1-based), so its
# legend patch gets the colour the image assigns to k. Deriving the colour from
# the cumulative share of the totals instead gives patches that do not match
# the tiles. Iterating over .items() avoids integer look-ups on a Series whose
# index holds country names.
for cat_value, (category, total) in enumerate(df_sub['Total'].items(), start=1):
    label_str = category + ' (' + str(total) + ')'
    color_val = image.cmap(image.norm(float(cat_value)))
    legend_handles.append(mpatches.Patch(color=color_val, label=label_str))

# add legend to chart, centred below the waffle
plt.legend(handles=legend_handles,
           loc='lower center',
           ncol=len(df_sub.index.values),
           bbox_to_anchor=(0., -0.2, 0.95, .1)
          )

plt.show()
<Figure size 432x288 with 0 Axes>
In [63]:
# Step 8: pack everything into a function
def create_waffle_chart(dataset, categories, values, height, width, colormap, value_sign=''):
    """Draw a waffle chart for `values` and print the tile allocation.

    Parameters
    ----------
    dataset : object
        No longer read by the function; kept in the signature so existing
        calls keep working. Labels are taken from `categories`.
    categories : sequence of str
        Label of each category, in the same order as `values`.
    values : sequence of numbers
        Value of each category (for example a pandas Series or a list).
    height, width : int
        Number of tile rows and tile columns of the chart.
    colormap : matplotlib colormap
        Colormap used for the tiles and for the legend patches.
    value_sign : str, optional
        Unit shown with each value in the legend: '%' is appended after the
        value, any other sign is placed in front of it.

    Raises
    ------
    ValueError
        If the values sum to zero (this includes an empty `values`).
    """
    # Work on positional copies: `values` may be a pandas Series indexed by
    # labels, where values[i] is not a reliable way to get the i-th element.
    category_labels = list(categories)
    category_values = list(values)

    # compute the proportion of each category with respect to the total
    total_values = sum(category_values)
    if total_values == 0:
        raise ValueError('values must sum to a non-zero total')
    category_proportions = [float(value) / total_values for value in category_values]

    # compute the total number of tiles
    total_num_tiles = width * height
    print('Total number of tiles is', total_num_tiles)

    # compute the number of tiles for each category
    tiles_per_category = [round(proportion * total_num_tiles) for proportion in category_proportions]

    # print out number of tiles per category
    for label, tiles in zip(category_labels, tiles_per_category):
        print(label + ': ' + str(tiles))

    # initialize the waffle chart as an empty matrix
    waffle_chart = np.zeros((height, width))

    # tile_bounds[k] is the number of tiles used up by the first k categories
    tile_bounds = [0]
    for tiles in tiles_per_category:
        tile_bounds.append(tile_bounds[-1] + tiles)

    # populate the waffle chart column by column
    category_index = 0
    tile_index = 0
    for col in range(width):
        for row in range(height):
            tile_index += 1

            # Move on while the current category has used up its tiles. The
            # loop skips zero-tile categories, and the bound keeps leftover
            # tiles (rounded counts summing to less than the grid) in the last
            # category instead of inventing further category numbers.
            while category_index < len(tiles_per_category) and tile_index > tile_bounds[category_index]:
                category_index += 1

            # set the class value to an integer, which increases with class (1-based)
            waffle_chart[row, col] = category_index

    # plt.matshow opens its own figure (a preceding plt.figure() would only
    # leave an empty figure behind). The caller's colormap is honoured, and the
    # image is kept because it defines the value -> colour mapping.
    image = plt.matshow(waffle_chart, cmap=colormap)
    plt.colorbar()

    # get the axis
    ax = plt.gca()

    # set minor ticks on the boundaries between tiles
    ax.set_xticks(np.arange(-.5, (width), 1), minor=True)
    ax.set_yticks(np.arange(-.5, (height), 1), minor=True)

    # add white gridlines based on the minor ticks to separate the tiles
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2)

    # remove the major ticks and their labels
    plt.xticks([])
    plt.yticks([])

    # create legend: the k-th category is stored in the matrix as the value k,
    # so its patch uses the colour the image assigns to k and matches the tiles
    legend_handles = []
    for category_value, (label, value) in enumerate(zip(category_labels, category_values), start=1):
        if value_sign == '%':
            label_str = label + ' (' + str(value) + value_sign + ')'
        else:
            label_str = label + ' (' + value_sign + str(value) + ')'

        color_val = image.cmap(image.norm(float(category_value)))
        legend_handles.append(mpatches.Patch(color=color_val, label=label_str))

    # add legend to chart, centred below the waffle
    plt.legend(
        handles=legend_handles,
        loc='lower center',
        ncol=len(category_labels),
        bbox_to_anchor=(0., -0.2, 0.95, .1)
    )
In [64]:
# chart geometry: number of tile columns and tile rows
width, height = 40, 10

# category labels and their corresponding values
categories = df_sub.index.values
values = df_sub['Total']

# color map class
colormap = plt.cm.coolwarm

create_waffle_chart(df_sub, categories, values, height, width, colormap)
Total number of tiles is 400
Denmark: 129
Norway: 77
Sweden: 194
<Figure size 432x288 with 0 Axes>
In [ ]: