#!/usr/bin/env python
# coding: utf-8

# # Segmenting and Clustering of Neighborhoods in Toronto City
#
# GitHub does not render folium maps; to see the fully rendered notebook, open
# https://nbviewer.jupyter.org/github/Mr-Piyush-Kumar/Data_Science_Projects/blob/master/Toronto_City_Neighborhood_Clustring/TorrontoCityNeighborhoodClustring.ipynb
#
# ### Introduction
#
# This notebook is part of the IBM Data Science Capstone Project. It explores
# the nearby venues of Toronto neighborhoods, clusters those neighborhoods with
# machine learning, and displays the clusters on a map of Toronto.
#
# ### Created By: Piyush Kumar
#
# # Part 1 of this project
# ### Objective
# Display the Toronto neighborhoods dataset after scraping and cleaning it
# from a Wikipedia page.

# In[1]:

# Importing libraries.
import pandas as pd
import numpy as np

# In[2]:

# Fetch every HTML table on the Wikipedia page that lists Toronto ("M")
# postal codes. Each element of `tables` is a DataFrame.
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# In[3]:

len(tables)

# There are three tables; check which one is required.

# In[4]:

# 1st table.
tables[0].head()

# The first table is the required one, so the others need not be checked.

# In[5]:

# Keep table 1 in a separate DataFrame and release the rest.
Toronto_df = tables[0]
del tables  # the table list is not needed anymore
Toronto_df.head()

# In[6]:

# Data wrangling / preprocessing.

# 1 - Rename the DataFrame's columns according to the project instructions.
Toronto_df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# 2 - Remove rows whose Borough is "Not assigned".
Toronto_df = Toronto_df[Toronto_df['Borough'] != 'Not assigned']

# 3 - Group rows by postal code: one row per code, with the (deduplicated)
#     neighborhood names joined by commas.
temp_lst = []
for name, group in Toronto_df.groupby('PostalCode'):
    temp_lst.append([name,
                     group['Borough'].unique()[0],
                     ", ".join(set(group['Neighborhood'].values))])
Toronto_df = pd.DataFrame(temp_lst, columns=['PostalCode', 'Borough', 'Neighborhood'])

# 4 - Replace "Not assigned" neighborhood values with the corresponding
#     borough. A boolean mask with .loc is used instead of chained indexing
#     (df['col'][mask] = ...), which pandas warns about and which may silently
#     fail to write back; this form also fixes every offending row rather than
#     only the first postal code found.
not_assigned = Toronto_df['Neighborhood'].apply(lambda x: 'Not assigned' in str(x))
Toronto_df.loc[not_assigned, 'Neighborhood'] = Toronto_df.loc[not_assigned, 'Borough']

Toronto_df

# In[7]:

print('No. of rows in Toronto Data Frame are ', Toronto_df.shape[0], '.')

# # Part 2 of this project
# ### Objective
# Get the geographical coordinates of each neighborhood.

# In[8]:

coordinates_data = pd.read_csv('http://cocl.us/Geospatial_data')  # per-postal-code coordinates
coordinates_data.columns = ['PostalCode', 'Latitude', 'Longitude']
coordinates_data.head()

# In[9]:

# Merge Toronto_df and coordinates_data together on the postal code.
Toronto_df = Toronto_df.merge(coordinates_data, how='left', on='PostalCode')
Toronto_df

# # Part 3 of this project
# ### Objective
# Explore and cluster the neighborhoods in Toronto, keeping only boroughs
# whose name contains the word "Toronto".

# In[10]:

# Getting latitude and longitude of Toronto City.
get_ipython().system('conda install -c conda-forge geopy --yes # Installing geopy library, this library helps in getting Latitude and Longitude of a given address.')

from geopy.geocoders import Nominatim  # Nominatim converts an address into latitude and longitude values.

# In[11]:

address = 'Toronto, CA'

# Geocode the city once; To_latitude / To_longitude are reused by every map
# below as the initial center.
geolocator = Nominatim(user_agent="Toronto_explorer")
To_location = geolocator.geocode(address)
To_latitude = To_location.latitude
To_longitude = To_location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(To_latitude, To_longitude))

# ### Create a map of Toronto City with all neighborhoods superimposed on top.

# In[12]:

get_ipython().system('conda install -c conda-forge folium=0.5.0 --yes # Installing Folium Library')

import folium  # map rendering library

# In[13]:

Toronto_map = folium.Map(location=[To_latitude, To_longitude], zoom_start=11)

# Add one circle marker per neighborhood, labeled "<neighborhood>, <borough>".
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'],
                                           Toronto_df['Longitude'],
                                           Toronto_df['Borough'],
                                           Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_map)

Toronto_map

# As per the objective, only neighborhoods whose borough name contains the
# word "Toronto" should be shown.

# In[14]:

# Filtering the data set: keep only rows where column Borough contains the
# word "Toronto".
New_Toronto_df = Toronto_df[Toronto_df['Borough'].apply(lambda x: 'Toronto' in str(x))]
New_Toronto_df.head()

# In[15]:

# List of unique boroughs in the filtered data set.
Borough_lst = New_Toronto_df.Borough.unique().tolist()
Borough_lst

# ### Creating a map of Toronto City with the filtered neighborhoods superimposed on top

# In[16]:

Toronto_Map = folium.Map(location=[To_latitude, To_longitude], zoom_start=11)

for lat, lon, borough, neighborhood in zip(New_Toronto_df['Latitude'],
                                           New_Toronto_df['Longitude'],
                                           New_Toronto_df['Borough'],
                                           New_Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_Map)

Toronto_Map

# Next, use the Foursquare API to explore the neighborhoods and segment them.

# ### Define Foursquare Credentials and Version

# In[17]:

# NOTE(review): hard-coding and printing API credentials in a public notebook
# leaks them — these keys should be rotated and loaded from an environment
# variable or config file instead.
CLIENT_ID = 'MUINF3SJELTWX0T2R3GWA5P5R3QYAGI2PDFGFR0HCERWTFNH'  # my Foursquare ID
CLIENT_SECRET = 'TNBO5TIGKMZR0RR1ARMSUHBMPJ2V0JZBNZQ2G2220FAMS05U'  # my Foursquare Secret
VERSION = '20180605'  # Foursquare API version

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

# ### Explore the first neighborhood in our dataframe.
# Get the neighborhood's name.

# In[18]:

New_Toronto_df = New_Toronto_df.reset_index(drop=True)
New_Toronto_df.loc[0, 'Neighborhood']

# Get the latitude and longitude values of The Beaches.

# In[19]:

Beaches_latitude = New_Toronto_df.loc[0, 'Latitude']  # neighborhood latitude value
Beaches_longitude = New_Toronto_df.loc[0, 'Longitude']  # neighborhood longitude value

print('Latitude and longitude values of The Beaches are {}, {}.'.format(
    Beaches_latitude,
    Beaches_longitude))

# ### Now, get the top 100 venues within a radius of 500 meters.

# In[20]:

# First, create the GET request URL.
# Build the Foursquare "explore" request URL (named `url`).
LIMIT = 100  # max no. of venues to return
radius = 500  # search radius in meters

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    Beaches_latitude,
    Beaches_longitude,
    radius,
    LIMIT)
url

# Send the GET request and examine the results.

# In[21]:

import requests  # importing request handling library

results = requests.get(url).json()
results

# In[22]:


def get_category_type(row):
    """Extract the name of the first venue category from a result row.

    Accepts rows keyed either 'categories' (raw JSON item) or
    'venue.categories' (json_normalize-flattened). Returns None when the
    category list is empty.
    """
    try:
        categories_list = row['categories']
    except KeyError:  # narrowed from a bare except: only the missing key is expected
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


# Now clean the JSON and structure it into a pandas dataframe.

# In[23]:

venues = results['response']['groups'][0]['items']

# pd.json_normalize replaces pandas.io.json.json_normalize, which was
# deprecated in pandas 1.0 and removed in 2.0.
nearby_venues = pd.json_normalize(venues)  # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component, e.g. "venue.name" -> "name"
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

# And how many venues were returned by Foursquare?
# In[24]:

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

# # Explore Neighborhoods in The Beaches
# Create a function to repeat the same process for all the neighborhoods.

# In[25]:


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query Foursquare for venues near each (name, lat, lng) triple.

    Relies on the module-level globals CLIENT_ID, CLIENT_SECRET, VERSION and
    LIMIT defined in earlier cells. Returns one DataFrame with a row per
    venue, tagged with the neighborhood it was found near.
    """
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print('Processing ', name, '.....')

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']

    return nearby_venues


# ### Run the above function on each neighborhood and create a new dataframe
# called Filtered_Toronto_Venues.
# In[26]:

Filtered_Toronto_Venues = getNearbyVenues(names=New_Toronto_df['Neighborhood'],
                                          latitudes=New_Toronto_df['Latitude'],
                                          longitudes=New_Toronto_df['Longitude']
                                          )
Filtered_Toronto_Venues.head()

# #### Check the size of the resulting dataframe

# In[27]:

print(Filtered_Toronto_Venues.shape)

# Check how many venues were returned for each neighborhood.

# In[28]:

Filtered_Toronto_Venues.groupby('Neighborhood').count().iloc[:, 0]

# ## How many unique categories can be curated from all the returned venues?

# In[29]:

print('There are {} uniques categories.'.format(len(Filtered_Toronto_Venues['Venue Category'].unique())))

# # Analyze Each Neighborhood

# In[30]:

# one hot encoding: one indicator column per venue category
Toronto_onehot = pd.get_dummies(Filtered_Toronto_Venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Filtered_Toronto_Venues['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

# Group rows by neighborhood, taking the mean of the frequency of occurrence
# of each category.

# In[31]:

Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped

# In[32]:

Toronto_grouped.shape

# ### Print each neighborhood along with the top 5 most common venues

# In[33]:

num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----" + hood + "----")
    # Transpose the single matching row so categories become rows, then rank
    # them by frequency.
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]  # drop the 'Neighborhood' row itself
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

# ### Put that into a pandas dataframe
# First, write a function to sort the venues in descending order.
# In[34]:


def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest values in `row`.

    `row` is a Series whose first element is the neighborhood name and whose
    remaining elements are venue-category frequencies.
    """
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]


# Create the new dataframe and display the top 10 venues for each neighborhood.

# In[35]:

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create columns according to the number of top venues:
# "1st Most Common Venue", ..., "10th Most Common Venue".
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except IndexError:  # ordinals past 3rd all take the "th" suffix
        columns.append('{}th Most Common Venue'.format(ind + 1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

# ## Cluster Neighborhoods
# Run k-means to cluster the neighborhoods into 5 clusters.

# In[36]:

from sklearn.cluster import KMeans  # importing KMeans

# set number of clusters
kclusters = 5

# Drop the label column before clustering; the axis is passed by keyword
# because positional `drop('Neighborhood', 1)` was removed in pandas 2.0.
Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

# Create a new dataframe that includes the cluster as well as the top 10
# venues for each neighborhood.
# In[37]:

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto = New_Toronto_df

# Merge the venue-ranking table (with cluster labels) onto the neighborhood
# data. NOTE(review): neighborhoods that returned no venues are absent from
# neighborhoods_venues_sorted, so a left join would give them NaN labels —
# verify the data has venues for every neighborhood.
Toronto = Toronto.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto.head()

# #### Finally, visualize the resulting clusters

# In[38]:

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[To_latitude, To_longitude], zoom_start=11)

# set color scheme for the clusters: kclusters evenly spaced rainbow colors
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto['Latitude'],
                                  Toronto['Longitude'],
                                  Toronto['Neighborhood'],
                                  Toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Index the palette directly by the 0-based label; the original
        # `rainbow[cluster - 1]` wrapped cluster 0 around to the last color.
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# ## Examining Clusters
# Examine each cluster and determine the discriminating venue categories that
# distinguish it; based on the defining categories, assign a name to each
# cluster.

# In[39]:

Toronto.loc[Toronto['Cluster Labels'] == 0, Toronto.columns[[1] + list(range(5, Toronto.shape[1]))]]

# In cluster 0, nearly all venues relate to food and drinking services.
# Name: Food Services

# In[40]:


def _cluster_view(df, label):
    """Rows of `df` in cluster `label`: the Borough column plus the ranked-venue columns."""
    return df.loc[df['Cluster Labels'] == label, df.columns[[1] + list(range(5, df.shape[1]))]]


_cluster_view(Toronto, 1)

# Cluster 1 — Name: Home Services

# In[41]:

_cluster_view(Toronto, 2)

# Cluster 2 — Name: Garments Store

# In[42]:

_cluster_view(Toronto, 3)

# Cluster 3 — Name: Play Grounds

# In[43]:

_cluster_view(Toronto, 4)

# Cluster 4 — Name: Transports