#!/usr/bin/env python
# coding: utf-8

# fast.ai map of p1v3 participants

# In[1]:

import numpy as np
import pandas as pd

# put the path to your data files here
# PATH = '...'


# In[2]:

df = pd.read_csv(f'{PATH}fastaip1v3.csv')
df.head()


# In[3]:

# one row per rounded coordinate pair, with the number of participants at that location
df = df.groupby(['lat', 'long']).agg({'lat': 'count'})
df.rename(columns={'lat': 'count'}, inplace=True)
df.reset_index(level=['lat', 'long'], inplace=True)
df.head()


# In[4]:

df.shape, df['count'].sum()


# World Cities Database downloaded from Kaggle
#
# https://www.kaggle.com/max-mind/world-cities-database

# In[5]:

df_places = pd.read_csv(f'{PATH}worldcitiespop.csv', low_memory=False)
df_places.head()


# In[6]:

# round coordinates to whole degrees so they can be joined with the participant data
df_places['lat'] = df_places.Latitude.round()
df_places['long'] = df_places.Longitude.round()
df_places.head()


# In[ ]:

# hack (see below): used when manually adding Country, AccentCity and Region
# for rows where there are no population figures
tmp = df_places.loc[df_places['lat'] == 24.0]
tmp = tmp.loc[tmp['long'] == 40.0]
tmp.head()


# Many places have the same `lat` and `long`. Select the place with the highest population.

# In[8]:

idx = df_places.groupby(['lat', 'long'])['Population'].transform('max') == df_places['Population']
df_places = df_places[idx]
df_places.head()


# In[9]:

df_map = df.merge(df_places, how='left', on=['lat', 'long'])
df_map = df_map[['lat', 'long', 'count', 'Country', 'AccentCity', 'Region']]
df_map.head()


# In[10]:

df_map.shape, df_map['count'].sum()  # fails sanity check


# Problem with repeated rows of `lat long count`: ties for the maximum `Population` within a
# coordinate group all survive the filter above, so the left merge duplicates those rows.

# In[11]:

# print the index of every row that repeats the previous row's lat, long and count
for i in range(1, df_map.shape[0]):
    if (df_map.loc[i, ['lat', 'long', 'count']] == df_map.loc[i - 1, ['lat', 'long', 'count']]).all():
        print(i)


# In[12]:

df_map.iloc[108:110]


# In[13]:

df_map.drop(df_map.index[109], inplace=True)


# In[14]:

df_map.iloc[147:149]


# In[15]:

df_map.drop(df_map.index[148], inplace=True)


# In[16]:

df_map.iloc[238:240]


# In[17]:

df_map.drop(df_map.index[239], inplace=True)


# In[18]:

df_map.iloc[440:442]


# In[19]:

df_map.drop(df_map.index[441], inplace=True)


# In[20]:

df_map.iloc[485:487]


# In[21]:

df_map.drop(df_map.index[486], inplace=True)


# In[22]:

df_map.shape, df_map['count'].sum()  # passes sanity check


# Some places had no `Population` values in `df_places`, so the population filter dropped them
# and the merge left `Country`, `AccentCity` and `Region` as `NaN`s.

# In[23]:

# rows where the merge found no matching place
df_map[df_map.isnull().any(axis=1)]


# Manually allocate `Country AccentCity Region` by looking up the coordinates at
#
# https://www.findlatitudeandlongitude.com
#
# and at the data from `df_places` in the `#hack` cell above.
#
# index lat long Country AccentCity Region
#
# 122 24 40 sa Medina 05
# 172 31 122 cn Shanghai 23
# 186 33 77 in Darcha 11
# 200 35 127 kr Gwangju 16
# 215 36 108 cn Qingyang 15
# 216 36 127 kr Jeonju 17
# 217 36 129 kr Yeongcheon 14
# 234 38 0 es San Juan de Alicante 60
# 249 39 39 tr Elazig 62
# 354 47 -89 us Houghton MI

# In[24]:

df_map.loc[122, ['Country', 'AccentCity', 'Region']] = ['sa', 'Medina', '05']
df_map.loc[172, ['Country', 'AccentCity', 'Region']] = ['cn', 'Shanghai', '23']
df_map.loc[186, ['Country', 'AccentCity', 'Region']] = ['in', 'Darcha', '11']
df_map.loc[200, ['Country', 'AccentCity', 'Region']] = ['kr', 'Gwangju', '16']
df_map.loc[215, ['Country', 'AccentCity', 'Region']] = ['cn', 'Qingyang', '15']
df_map.loc[216, ['Country', 'AccentCity', 'Region']] = ['kr', 'Jeonju', '17']
df_map.loc[217, ['Country', 'AccentCity', 'Region']] = ['kr', 'Yeongcheon', '14']
df_map.loc[234, ['Country', 'AccentCity', 'Region']] = ['es', 'San Juan de Alicante', '60']
df_map.loc[249, ['Country', 'AccentCity', 'Region']] = ['tr', 'Elazig', '62']
df_map.loc[354, ['Country', 'AccentCity', 'Region']] = ['us', 'Houghton', 'MI']
df_map.loc[122]
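
# Aside, not part of the original workflow: the duplicated `lat long count` rows dropped by
# index above come from ties for the maximum `Population` within a coordinate group, which the
# filter keeps and the left merge then multiplies. A minimal sketch of an alternative that breaks
# the ties before merging; the names `df_places_unique` and `df_map_alt` are illustrative only
# and are not used elsewhere in this notebook.

# In[ ]:

# keep a single row per rounded coordinate pair (the one with the largest Population),
# so the left merge cannot produce duplicate rows in the mapped data
df_places_unique = (df_places
                    .sort_values('Population', ascending=False)
                    .drop_duplicates(subset=['lat', 'long'], keep='first'))
df_map_alt = df.merge(df_places_unique, how='left', on=['lat', 'long'])
df_map_alt.shape, df_map_alt['count'].sum()
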
# Convert two-letter country codes to upper case

# In[25]:

df_map['Country'] = df_map['Country'].str.upper()
df_map.head()


# ISO country and regional codes downloaded from GitHub
#
# https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes

# In[26]:

df_iso = pd.read_csv(f'{PATH}all.csv')
df_iso.head()


# In[27]:

# join the ISO region / sub-region information onto the map data via the two-letter country code
df_iso.rename(columns={'alpha-2': 'Country'}, inplace=True)
df_map = df_map.merge(df_iso, how='left', on=['Country'])
df_map.head()


# In[28]:

df_map.groupby('sub-region').agg({'count': 'sum'}).sort_values('count', ascending=False)


# In[30]:

df_map.shape, df_map['count'].sum()  # passes sanity check


# In[29]:

df_map.to_csv('map.csv')


# Steps in Tableau
#
# 1. ‘Connect’ to a ‘Text file’, map.csv
# 2. Go to a worksheet
# 3. In ‘Measures’, click on the # symbol to the left of ‘Long’ and set ‘Geographic role’ to ‘Longitude’
# 4. Drag ‘Lat’ and ‘Long’ to ‘Dimensions’
# 5. Drag ‘Lat’ and ‘Long’ to ‘Columns’ and ‘Rows’
# 6. Click on the world map on the right
# 7. Drag to ‘Marks’: ‘Count’, ‘Accent City’, ‘Country’, ‘Country-Code’, ‘Name’, ‘Sub-Region’
# 8. Drag ‘Sub-Region’ to the ‘Color’ box
# 9. Drag ‘SUM(Count)’ to the ‘Size’ box and adjust with the slider
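
# Aside, not part of the original workflow: a quick sanity plot of map.csv before the Tableau
# import. This is only a sketch; it assumes map.csv was written by the cell above and that
# matplotlib (not otherwise used in this notebook) is available in the environment.

# In[ ]:

import matplotlib.pyplot as plt

check = pd.read_csv('map.csv')
plt.figure(figsize=(12, 6))
# marker area scaled by the participant count at each rounded coordinate pair
plt.scatter(check['long'], check['lat'], s=check['count'] * 5, alpha=0.5)
plt.xlabel('long (rounded degrees)')
plt.ylabel('lat (rounded degrees)')
plt.title('fast.ai p1v3 participants per rounded lat/long')
plt.show()
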