#!/usr/bin/env python
# coding: utf-8

# fast.ai map of p1v3 participants

# In[1]:

import numpy as np
import pandas as pd

# put the path to your data files here
# PATH = '...'


# In[2]:

df = pd.read_csv(f'{PATH}fastaip1v3.csv')
df.head()


# In[3]:

# one row per rounded coordinate pair, with the number of participants at that location
df = df.groupby(['lat', 'long']).agg({'lat': 'count'})
df.rename(columns={'lat': 'count'}, inplace=True)
df.reset_index(level=['lat', 'long'], inplace=True)
df.head()


# In[4]:

df.shape, df['count'].sum()


# World Cities Database downloaded from Kaggle
#
# https://www.kaggle.com/max-mind/world-cities-database

# In[5]:

df_places = pd.read_csv(f'{PATH}worldcitiespop.csv', low_memory=False)
df_places.head()


# In[6]:

# round coordinates to whole degrees so they can be joined with the participant data
df_places['lat'] = df_places.Latitude.round()
df_places['long'] = df_places.Longitude.round()
df_places.head()


# In[ ]:

# hack (see below): used when manually adding Country, AccentCity and Region
# for rows where there are no population figures
tmp = df_places.loc[df_places['lat'] == 24.0]
tmp = tmp.loc[tmp['long'] == 40.0]
tmp.head()


# Many places have the same `lat` and `long`. Select the place with the highest population.

# In[8]:

idx = df_places.groupby(['lat', 'long'])['Population'].transform('max') == df_places['Population']
df_places = df_places[idx]
df_places.head()


# In[9]:

df_map = df.merge(df_places, how='left', on=['lat', 'long'])
df_map = df_map[['lat', 'long', 'count', 'Country', 'AccentCity', 'Region']]
df_map.head()


# In[10]:

df_map.shape, df_map['count'].sum()  # fails sanity check


# Problem with repeated rows of `lat long count`: ties for the maximum `Population` within a
# coordinate group all survive the filter above, so the left merge duplicates those rows.

# In[11]:

# print the index of every row that repeats the previous row's lat, long and count
for i in range(1, df_map.shape[0]):
    if (df_map.loc[i, ['lat', 'long', 'count']] == df_map.loc[i - 1, ['lat', 'long', 'count']]).all():
        print(i)


# In[12]:

df_map.iloc[108:110]


# In[13]:

df_map.drop(df_map.index[109], inplace=True)


# In[14]:

df_map.iloc[147:149]


# In[15]:

df_map.drop(df_map.index[148], inplace=True)


# In[16]:

df_map.iloc[238:240]


# In[17]:

df_map.drop(df_map.index[239], inplace=True)


# In[18]:

df_map.iloc[440:442]


# In[19]:

df_map.drop(df_map.index[441], inplace=True)


# In[20]:

df_map.iloc[485:487]


# In[21]:

df_map.drop(df_map.index[486], inplace=True)


# In[22]:

df_map.shape, df_map['count'].sum()  # passes sanity check


# Some places had no `Population` values in `df_places`, so the population filter dropped them
# and the merge left `Country`, `AccentCity` and `Region` as `NaN`s.

# In[23]:

# rows where the merge found no matching place
df_map[df_map.isnull().any(axis=1)]


# Manually allocate `Country AccentCity Region` by looking up the coordinates at
#
# https://www.findlatitudeandlongitude.com
#
# and at the data from `df_places` in the `#hack` cell above.
#
# index lat long Country AccentCity Region
#
# 122 24 40 sa Medina 05
# 172 31 122 cn Shanghai 23
# 186 33 77 in Darcha 11
# 200 35 127 kr Gwangju 16
# 215 36 108 cn Qingyang 15
# 216 36 127 kr Jeonju 17
# 217 36 129 kr Yeongcheon 14
# 234 38 0 es San Juan de Alicante 60
# 249 39 39 tr Elazig 62
# 354 47 -89 us Houghton MI

# In[24]:

df_map.loc[122, ['Country', 'AccentCity', 'Region']] = ['sa', 'Medina', '05']
df_map.loc[172, ['Country', 'AccentCity', 'Region']] = ['cn', 'Shanghai', '23']
df_map.loc[186, ['Country', 'AccentCity', 'Region']] = ['in', 'Darcha', '11']
df_map.loc[200, ['Country', 'AccentCity', 'Region']] = ['kr', 'Gwangju', '16']
df_map.loc[215, ['Country', 'AccentCity', 'Region']] = ['cn', 'Qingyang', '15']
df_map.loc[216, ['Country', 'AccentCity', 'Region']] = ['kr', 'Jeonju', '17']
df_map.loc[217, ['Country', 'AccentCity', 'Region']] = ['kr', 'Yeongcheon', '14']
df_map.loc[234, ['Country', 'AccentCity', 'Region']] = ['es', 'San Juan de Alicante', '60']
df_map.loc[249, ['Country', 'AccentCity', 'Region']] = ['tr', 'Elazig', '62']
df_map.loc[354, ['Country', 'AccentCity', 'Region']] = ['us', 'Houghton', 'MI']
df_map.loc[122]
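
# Aside, not part of the original workflow: the duplicated `lat long count` rows dropped by
# index above come from ties for the maximum `Population` within a coordinate group, which the
# filter keeps and the left merge then multiplies. A minimal sketch of an alternative that breaks
# the ties before merging; the names `df_places_unique` and `df_map_alt` are illustrative only
# and are not used elsewhere in this notebook.

# In[ ]:

# keep a single row per rounded coordinate pair (the one with the largest Population),
# so the left merge cannot produce duplicate rows in the mapped data
df_places_unique = (df_places
                    .sort_values('Population', ascending=False)
                    .drop_duplicates(subset=['lat', 'long'], keep='first'))
df_map_alt = df.merge(df_places_unique, how='left', on=['lat', 'long'])
df_map_alt.shape, df_map_alt['count'].sum()
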
# Convert two-letter country codes to upper case

# In[25]:

df_map['Country'] = df_map['Country'].str.upper()
df_map.head()


# ISO country and regional codes downloaded from GitHub
#
# https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes

# In[26]:

df_iso = pd.read_csv(f'{PATH}all.csv')
df_iso.head()


# In[27]:

# join the ISO region / sub-region information onto the map data via the two-letter country code
df_iso.rename(columns={'alpha-2': 'Country'}, inplace=True)
df_map = df_map.merge(df_iso, how='left', on=['Country'])
df_map.head()


# In[28]:

df_map.groupby('sub-region').agg({'count': 'sum'}).sort_values('count', ascending=False)


# In[30]:

df_map.shape, df_map['count'].sum()  # passes sanity check


# In[29]:

df_map.to_csv('map.csv')


# Steps in Tableau
#
# 1. ‘Connect’ to a ‘Text file’, map.csv
# 2. Go to a worksheet
# 3. In ‘Measures’, click on the # symbol to the left of ‘Long’ and set ‘Geographic role’ to ‘Longitude’
# 4. Drag ‘Lat’ and ‘Long’ to ‘Dimensions’
# 5. Drag ‘Lat’ and ‘Long’ to ‘Columns’ and ‘Rows’
# 6. Click on the world map on the right
# 7. Drag to ‘Marks’: ‘Count’, ‘Accent City’, ‘Country’, ‘Country-Code’, ‘Name’, ‘Sub-Region’
# 8. Drag ‘Sub-Region’ to the ‘Color’ box
# 9. Drag ‘SUM(Count)’ to the ‘Size’ box and adjust with the slider
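
# Aside, not part of the original workflow: a quick sanity plot of map.csv before the Tableau
# import. This is only a sketch; it assumes map.csv was written by the cell above and that
# matplotlib (not otherwise used in this notebook) is available in the environment.

# In[ ]:

import matplotlib.pyplot as plt

check = pd.read_csv('map.csv')
plt.figure(figsize=(12, 6))
# marker area scaled by the participant count at each rounded coordinate pair
plt.scatter(check['long'], check['lat'], s=check['count'] * 5, alpha=0.5)
plt.xlabel('long (rounded degrees)')
plt.ylabel('lat (rounded degrees)')
plt.title('fast.ai p1v3 participants per rounded lat/long')
plt.show()
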