#!/usr/bin/env python
# coding: utf-8

# # Segmenting and Clustering of Neighborhoods in Toronto City
#
# GitHub does not render folium maps; to see the fully rendered notebook, open
# https://nbviewer.jupyter.org/github/Mr-Piyush-Kumar/Data_Science_Projects/blob/master/Toronto_City_Neighborhood_Clustring/TorrontoCityNeighborhoodClustring.ipynb
#
# ### Introduction
#
# This notebook is part of the IBM Data Science Capstone Project. It explores
# the nearby venues of Toronto neighborhoods, clusters those neighborhoods with
# machine learning, and displays the clusters on a map of Toronto.
#
# ### Created By: Piyush Kumar
#
# # Part 1 of this project
# ### Objective
# Display the Toronto neighborhoods dataset after scraping and cleaning it
# from a Wikipedia page.

# In[1]:

# Importing libraries.
import pandas as pd
import numpy as np

# In[2]:

# Fetch every HTML table on the Wikipedia page that lists Toronto ("M")
# postal codes. Each element of `tables` is a DataFrame.
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# In[3]:

len(tables)

# There are three tables; check which one is required.

# In[4]:

# 1st table.
tables[0].head()

# The first table is the required one, so the others need not be checked.

# In[5]:

# Keep table 1 in a separate DataFrame and release the rest.
Toronto_df = tables[0]
del tables  # the table list is not needed anymore
Toronto_df.head()

# In[6]:

# Data wrangling / preprocessing.

# 1 - Rename the DataFrame's columns according to the project instructions.
Toronto_df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# 2 - Remove rows whose Borough is "Not assigned".
Toronto_df = Toronto_df[Toronto_df['Borough'] != 'Not assigned']

# 3 - Group rows by postal code: one row per code, with the (deduplicated)
#     neighborhood names joined by commas.
temp_lst = []
for name, group in Toronto_df.groupby('PostalCode'):
    temp_lst.append([name,
                     group['Borough'].unique()[0],
                     ", ".join(set(group['Neighborhood'].values))])
Toronto_df = pd.DataFrame(temp_lst, columns=['PostalCode', 'Borough', 'Neighborhood'])

# 4 - Replace "Not assigned" neighborhood values with the corresponding
#     borough. A boolean mask with .loc is used instead of chained indexing
#     (df['col'][mask] = ...), which pandas warns about and which may silently
#     fail to write back; this form also fixes every offending row rather than
#     only the first postal code found.
not_assigned = Toronto_df['Neighborhood'].apply(lambda x: 'Not assigned' in str(x))
Toronto_df.loc[not_assigned, 'Neighborhood'] = Toronto_df.loc[not_assigned, 'Borough']

Toronto_df

# In[7]:

print('No. of rows in Toronto Data Frame are ', Toronto_df.shape[0], '.')

# # Part 2 of this project
# ### Objective
# Get the geographical coordinates of each neighborhood.

# In[8]:

coordinates_data = pd.read_csv('http://cocl.us/Geospatial_data')  # per-postal-code coordinates
coordinates_data.columns = ['PostalCode', 'Latitude', 'Longitude']
coordinates_data.head()

# In[9]:

# Merge Toronto_df and coordinates_data together on the postal code.
Toronto_df = Toronto_df.merge(coordinates_data, how='left', on='PostalCode')
Toronto_df

# # Part 3 of this project
# ### Objective
# Explore and cluster the neighborhoods in Toronto, keeping only boroughs
# whose name contains the word "Toronto".

# In[10]:

# Getting latitude and longitude of Toronto City.
get_ipython().system('conda install -c conda-forge geopy --yes # Installing geopy library, this library helps in getting Latitude and Longitude of a given address.')

from geopy.geocoders import Nominatim  # Nominatim converts an address into latitude and longitude values.

# In[11]:

address = 'Toronto, CA'

# Geocode the city once; To_latitude / To_longitude are reused by every map
# below as the initial center.
geolocator = Nominatim(user_agent="Toronto_explorer")
To_location = geolocator.geocode(address)
To_latitude = To_location.latitude
To_longitude = To_location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(To_latitude, To_longitude))

# ### Create a map of Toronto City with all neighborhoods superimposed on top.

# In[12]:

get_ipython().system('conda install -c conda-forge folium=0.5.0 --yes # Installing Folium Library')

import folium  # map rendering library

# In[13]:

Toronto_map = folium.Map(location=[To_latitude, To_longitude], zoom_start=11)

# Add one circle marker per neighborhood, labeled "<neighborhood>, <borough>".
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'],
                                           Toronto_df['Longitude'],
                                           Toronto_df['Borough'],
                                           Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_map)

Toronto_map

# As per the objective, only neighborhoods whose borough name contains the
# word "Toronto" should be shown.

# In[14]:

# Filtering the data set: keep only rows where column Borough contains the
# word "Toronto".
New_Toronto_df = Toronto_df[Toronto_df['Borough'].apply(lambda x: 'Toronto' in str(x))]
New_Toronto_df.head()

# In[15]:

# List of unique boroughs in the filtered data set.
Borough_lst = New_Toronto_df.Borough.unique().tolist()
Borough_lst

# ### Creating a map of Toronto City with the filtered neighborhoods superimposed on top

# In[16]:

Toronto_Map = folium.Map(location=[To_latitude, To_longitude], zoom_start=11)

for lat, lon, borough, neighborhood in zip(New_Toronto_df['Latitude'],
                                           New_Toronto_df['Longitude'],
                                           New_Toronto_df['Borough'],
                                           New_Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_Map)

Toronto_Map

# Next, use the Foursquare API to explore the neighborhoods and segment them.

# ### Define Foursquare Credentials and Version

# In[17]:

# NOTE(review): hard-coding and printing API credentials in a public notebook
# leaks them — these keys should be rotated and loaded from an environment
# variable or config file instead.
CLIENT_ID = 'MUINF3SJELTWX0T2R3GWA5P5R3QYAGI2PDFGFR0HCERWTFNH'  # my Foursquare ID
CLIENT_SECRET = 'TNBO5TIGKMZR0RR1ARMSUHBMPJ2V0JZBNZQ2G2220FAMS05U'  # my Foursquare Secret
VERSION = '20180605'  # Foursquare API version

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

# ### Explore the first neighborhood in our dataframe.
# Get the neighborhood's name.

# In[18]:

New_Toronto_df = New_Toronto_df.reset_index(drop=True)
New_Toronto_df.loc[0, 'Neighborhood']

# Get the latitude and longitude values of The Beaches.

# In[19]:

Beaches_latitude = New_Toronto_df.loc[0, 'Latitude']  # neighborhood latitude value
Beaches_longitude = New_Toronto_df.loc[0, 'Longitude']  # neighborhood longitude value

print('Latitude and longitude values of The Beaches are {}, {}.'.format(
    Beaches_latitude,
    Beaches_longitude))

# ### Now, get the top 100 venues within a radius of 500 meters.

# In[20]:

# First, create the GET request URL.
# Build the Foursquare "explore" request URL (named `url`).
LIMIT = 100  # max no. of venues to return
radius = 500  # search radius in meters

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    Beaches_latitude,
    Beaches_longitude,
    radius,
    LIMIT)
url

# Send the GET request and examine the results.

# In[21]:

import requests  # importing request handling library

results = requests.get(url).json()
results

# In[22]:


def get_category_type(row):
    """Extract the name of the first venue category from a result row.

    Accepts rows keyed either 'categories' (raw JSON item) or
    'venue.categories' (json_normalize-flattened). Returns None when the
    category list is empty.
    """
    try:
        categories_list = row['categories']
    except KeyError:  # narrowed from a bare except: only the missing key is expected
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


# Now clean the JSON and structure it into a pandas dataframe.

# In[23]:

venues = results['response']['groups'][0]['items']

# pd.json_normalize replaces pandas.io.json.json_normalize, which was
# deprecated in pandas 1.0 and removed in 2.0.
nearby_venues = pd.json_normalize(venues)  # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component, e.g. "venue.name" -> "name"
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

# And how many venues were returned by Foursquare?
# In[24]:

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

# # Explore Neighborhoods in The Beaches
# Create a function to repeat the same process for all the neighborhoods.

# In[25]:


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query Foursquare for venues near each (name, lat, lng) triple.

    Relies on the module-level globals CLIENT_ID, CLIENT_SECRET, VERSION and
    LIMIT defined in earlier cells. Returns one DataFrame with a row per
    venue, tagged with the neighborhood it was found near.
    """
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print('Processing ', name, '.....')

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']

    return nearby_venues


# ### Run the above function on each neighborhood and create a new dataframe
# called Filtered_Toronto_Venues.
# In[26]:

Filtered_Toronto_Venues = getNearbyVenues(names=New_Toronto_df['Neighborhood'],
                                          latitudes=New_Toronto_df['Latitude'],
                                          longitudes=New_Toronto_df['Longitude']
                                          )
Filtered_Toronto_Venues.head()

# #### Check the size of the resulting dataframe

# In[27]:

print(Filtered_Toronto_Venues.shape)

# Check how many venues were returned for each neighborhood.

# In[28]:

Filtered_Toronto_Venues.groupby('Neighborhood').count().iloc[:, 0]

# ## How many unique categories can be curated from all the returned venues?

# In[29]:

print('There are {} uniques categories.'.format(len(Filtered_Toronto_Venues['Venue Category'].unique())))

# # Analyze Each Neighborhood

# In[30]:

# one hot encoding: one indicator column per venue category
Toronto_onehot = pd.get_dummies(Filtered_Toronto_Venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Filtered_Toronto_Venues['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

# Group rows by neighborhood, taking the mean of the frequency of occurrence
# of each category.

# In[31]:

Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped

# In[32]:

Toronto_grouped.shape

# ### Print each neighborhood along with the top 5 most common venues

# In[33]:

num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----" + hood + "----")
    # Transpose the single matching row so categories become rows, then rank
    # them by frequency.
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]  # drop the 'Neighborhood' row itself
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

# ### Put that into a pandas dataframe
# First, write a function to sort the venues in descending order.
# In[34]:


def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest values in `row`.

    `row` is a Series whose first element is the neighborhood name and whose
    remaining elements are venue-category frequencies.
    """
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]


# Create the new dataframe and display the top 10 venues for each neighborhood.

# In[35]:

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create columns according to the number of top venues:
# "1st Most Common Venue", ..., "10th Most Common Venue".
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except IndexError:  # ordinals past 3rd all take the "th" suffix
        columns.append('{}th Most Common Venue'.format(ind + 1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

# ## Cluster Neighborhoods
# Run k-means to cluster the neighborhoods into 5 clusters.

# In[36]:

from sklearn.cluster import KMeans  # importing KMeans

# set number of clusters
kclusters = 5

# Drop the label column before clustering; the axis is passed by keyword
# because positional `drop('Neighborhood', 1)` was removed in pandas 2.0.
Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

# Create a new dataframe that includes the cluster as well as the top 10
# venues for each neighborhood.
# In[37]:

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto = New_Toronto_df

# Merge the venue-ranking table (with cluster labels) onto the neighborhood
# data. NOTE(review): neighborhoods that returned no venues are absent from
# neighborhoods_venues_sorted, so a left join would give them NaN labels —
# verify the data has venues for every neighborhood.
Toronto = Toronto.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto.head()

# #### Finally, visualize the resulting clusters

# In[38]:

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[To_latitude, To_longitude], zoom_start=11)

# set color scheme for the clusters: kclusters evenly spaced rainbow colors
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto['Latitude'],
                                  Toronto['Longitude'],
                                  Toronto['Neighborhood'],
                                  Toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Index the palette directly by the 0-based label; the original
        # `rainbow[cluster - 1]` wrapped cluster 0 around to the last color.
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# ## Examining Clusters
# Examine each cluster and determine the discriminating venue categories that
# distinguish it; based on the defining categories, assign a name to each
# cluster.

# In[39]:

Toronto.loc[Toronto['Cluster Labels'] == 0, Toronto.columns[[1] + list(range(5, Toronto.shape[1]))]]

# In cluster 0, nearly all venues relate to food and drinking services.
# Name: Food Services

# In[40]:


def _cluster_view(df, label):
    """Rows of `df` in cluster `label`: the Borough column plus the ranked-venue columns."""
    return df.loc[df['Cluster Labels'] == label, df.columns[[1] + list(range(5, df.shape[1]))]]


_cluster_view(Toronto, 1)

# Cluster 1 — Name: Home Services

# In[41]:

_cluster_view(Toronto, 2)

# Cluster 2 — Name: Garments Store

# In[42]:

_cluster_view(Toronto, 3)

# Cluster 3 — Name: Play Grounds

# In[43]:

_cluster_view(Toronto, 4)

# Cluster 4 — Name: Transports