An analysis of public transport quality and accessibility in Santiago, Chile
Iacopo Garizio
This notebook uses census information and GTFS data to analyze public transport quality and accessibility to find underserved communities in Santiago, Chile. The findings show different results depending on the metric used. However, no matter the metric, the outskirts are always worse off than other areas.
Sections
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, box
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
import matplotlib.colors as colors
import random
import zipfile
# Defining a bounding box around Santiago. This will be used throughout the code.
BOX_POLY = box(-70.82, -33.645, -70.47, -33.3)
box_gdf = gpd.GeoDataFrame({'geometry': [BOX_POLY]}, crs="EPSG:4326")
blocks = gpd.read_file("zip://./data/chile_city_blocks.zip")
blocks.to_crs("EPSG:4326", inplace=True)
blocks = blocks[blocks.REGION == 'REGIÓN METROPOLITANA DE SANTIAGO'].copy()
blocks = gpd.overlay(blocks, box_gdf)
Because the census data has only information at a city block level, I simulate the position of each person inside each city block (keeping a margin from the border of the block). The points are uniformly distributed.
def generate_random_points(polygon, n):
points = []
min_x, min_y, max_x, max_y = polygon.bounds
while len(points) < n:
point = Point(random.uniform(min_x, max_x), random.uniform(min_y, max_y))
if polygon.contains(point):
points.append(point)
return points
def concat_gdf_list(gdf_list):
crs = gdf_list[0].crs
concat_df = pd.concat(random_points_list, ignore_index=True)
return gpd.GeoDataFrame(concat_df, crs=crs)
SCALE_FACTOR = 0.9 # This is used to scale each block to generate the outer gap
blocks['scaled_geometry'] = blocks.scale(SCALE_FACTOR, SCALE_FACTOR, SCALE_FACTOR)
random_points_list = []
for block in blocks.itertuples():
random_points = generate_random_points(block.scaled_geometry, block.TOTAL_PERS)
gdf_random_points = gpd.GeoDataFrame(geometry=random_points, crs="EPSG:4326")
gdf_random_points['FID'] = block.FID
random_points_list.append(gdf_random_points)
random_population = concat_gdf_list(random_points_list)
random_population.index.name = 'person_id'
# FID corresponds to the ID of each city block.
random_population.sample(5)
geometry | FID | |
---|---|---|
person_id | ||
4207599 | POINT (-70.63573 -33.38863) | 28366 |
698293 | POINT (-70.72513 -33.41886) | 3864 |
1467173 | POINT (-70.58471 -33.51653) | 9299 |
2397640 | POINT (-70.55137 -33.42735) | 15369 |
465182 | POINT (-70.65245 -33.45636) | 1887 |
Here we can see the randomization inside a selected city block and a selected commune.
SEL_COMMUNE = "ÑUÑOA"
SEL_BLOCK_FID = 24146
block_example = blocks[blocks.FID == SEL_BLOCK_FID]
random_population_block_example = random_population[random_population.FID == SEL_BLOCK_FID]
commune_example = blocks[blocks.COMUNA == SEL_COMMUNE]
random_population_commune_example = random_population[random_population.FID.isin(commune_example.FID)]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
block_example.plot(ax=ax1)
random_population_block_example.plot(ax=ax1, markersize=1, color='r')
commune_example.plot(ax=ax2)
random_population_commune_example.plot(ax=ax2, markersize=0.001, color='r')
ax1.set_title("Randomized population in one city block")
ax2.set_title("Randomized population in an entire commune")
ax1.axis('off')
ax2.axis('off')
plt.show()
zf = zipfile.ZipFile("./data/santiago_gtfs.zip")
stops_df = pd.read_csv(zf.open('stops.txt'))
trips_df = pd.read_csv(zf.open('trips.txt'))
stop_times_df = pd.read_csv(zf.open('stop_times.txt'))
The code keeps the trips that occur only in working days (by doing "service_id == 'L'"). It then uses this information to keep the matching stop_times (this means that we will only get stop_times that happen on working days). The code then counts the number of buses (or trains) and the number of unique routes that pass through each stop (or station). These two measures are going to be used as a proxy for quality, where a higher number of buses, trains, or routes imply better quality.
trips_working_day = trips_df[trips_df.service_id == 'L'].copy()
trips_working_day_id = trips_working_day.trip_id.unique()
stop_times_df_working_day = stop_times_df[stop_times_df.trip_id.isin(trips_working_day_id)].copy()
stop_n_buses = stop_times_df_working_day.groupby('stop_id').size().reset_index(name="n_buses")
stop_trips = pd.merge(trips_working_day, stop_times_df_working_day, on='trip_id', how='left')
stop_routes = stop_trips.groupby('stop_id').route_id.apply(set).reset_index(name="routes")
stops_df = pd.merge(stops_df, stop_n_buses, on='stop_id', how='left')
stops_df = pd.merge(stops_df, stop_routes, on='stop_id', how='left')
stops_df.n_buses.fillna(0, inplace=True) # Some stops in the GTFS don't have buses running through.
stops_df['routes'] = stops_df.routes.apply(lambda d: d if pd.notnull(d) else set()) # Can't do fillna(set()) directly.
stops_geometry = gpd.points_from_xy(stops_df.stop_lon, stops_df.stop_lat)
stops = gpd.GeoDataFrame(stops_df, geometry=stops_geometry, crs='EPSG:4326')
stops.drop(columns=['stop_code', 'stop_lat', 'stop_lon', 'stop_url'], inplace=True)
stgo = blocks.to_crs("EPSG:3857").buffer(100).to_crs("EPSG:4326").unary_union
stops = stops[stops.within(stgo)].copy() # Keeps all stops inside the parts of the city we're interested in.
For this process, I load the stops' locations into a kd-tree and find the k nearest stops for each simulated point. I then calculate a haversine distance between each point and each of the nearest stops.
def haversine_distance(point_1, point_2):
"""
Reference: https://stackoverflow.com/a/29546836/7657658
"""
lon1, lat1 = point_1.x.values, point_1.y.values
lon2, lat2 = point_2.x.values, point_2.y.values
lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
dlon = lon2 - lon1
dlat = lat2 - lat1
a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
c = 2 * np.arcsin(np.sqrt(a))
km = 6371 * c
return km
def get_nearest_stops(points, stops, k):
points = points.copy()
stops = stops.copy()
points_coordinates = list(zip(points.geometry.x, points.geometry.y))
stops_coordinates = list(zip(stops.geometry.x, stops.geometry.y))
btree = cKDTree(stops_coordinates)
if k != 1:
dist, idx = btree.query(points_coordinates, n_jobs=-1, k=k)
else:
dist, idx = btree.query(points_coordinates, n_jobs=-1, k=[1]) # Prevents dimentional squeeze
points_nearest_stops_dfs = []
for k_i in range(k):
stops_close_ki = stops.iloc[idx[:,k_i]].reset_index(drop=True).copy()
stops_close_ki.rename(columns={'geometry': 'stop_geometry'}, inplace=True)
points_nearest_stops = pd.concat([points.reset_index(), stops_close_ki], axis=1)
points_nearest_stops['distance_rank'] = k_i
points_nearest_stops['distance_km'] = haversine_distance(points_nearest_stops.stop_geometry, points_nearest_stops.geometry)
points_nearest_stops_dfs.append(points_nearest_stops)
return pd.concat(points_nearest_stops_dfs).reset_index(drop=True)
pop_nearests_k_stops = get_nearest_stops(random_population, stops, k=10)
pop_nearests_k_stops.sample(5)
person_id | geometry | FID | stop_id | stop_name | n_buses | routes | stop_geometry | distance_rank | distance_km | |
---|---|---|---|---|---|---|---|---|---|---|
7430965 | 1280324 | POINT (-70.71861 -33.47129) | 7743 | PI7 | PI7-Avenida 5 De Abril / Esq. El Altarcillo | 81.0 | {506v, 481, I14, I10, I13, 506, 509, 385, 101} | POINT (-70.71792 -33.47039) | 1 | 0.117982 |
29282315 | 4679751 | POINT (-70.62982 -33.47751) | 31528 | PH791 | PH791-Pintor Cicarelli / Esq. Artemio Gutiérrez | 27.0 | {E04, 385, D07} | POINT (-70.63238 -33.47628) | 4 | 0.273662 |
32424595 | 1671390 | POINT (-70.62925 -33.54955) | 10879 | PE617 | PE617-Sofía Eastman De H. / Esq. San Martín | 9.0 | {E09} | POINT (-70.62968 -33.54745) | 5 | 0.236735 |
32712129 | 1958924 | POINT (-70.64424 -33.57747) | 12932 | PG168 | PG168-Lo Martínez / Esq. Punta Horcón | 9.0 | {F20} | POINT (-70.64188 -33.57608) | 5 | 0.268174 |
31914177 | 1160972 | POINT (-70.65192 -33.52426) | 6967 | PG441 | PG441-General Freire / Esq. Iquique | 9.0 | {E02} | POINT (-70.65041 -33.52242) | 5 | 0.248343 |
For each city block, this code calculates:
This information is then merged into a single DataFrame.
def combine_unique_routes(routes):
return set().union(*routes.values)
DIST_MAX = 0.9 # Maximum distance in Kilometers
pop_nearest_stop = pop_nearests_k_stops[pop_nearests_k_stops.distance_rank == 0]
mean_dist_nearest = pop_nearest_stop.groupby('FID').distance_km.agg(mean_distance_nearest='mean', summed_distance_nearest='sum')
mean_dist_k_nearests = pop_nearests_k_stops.groupby('FID').distance_km.mean().to_frame(name="mean_distance_k_nearest")
pop_nearests_max_dist = pop_nearests_k_stops.copy()
farther_than_max = (pop_nearests_max_dist.distance_km > DIST_MAX)
pop_nearests_max_dist.loc[farther_than_max, 'n_buses'] = 0
pop_nearests_max_dist.loc[farther_than_max, 'routes'] = [set() for __ in range(farther_than_max.sum())]
n_buses_close = pop_nearests_max_dist.groupby(['FID', 'person_id'], as_index=False).n_buses.sum()
mean_n_buses_close = n_buses_close.groupby('FID').n_buses.mean().to_frame(name="mean_n_buses_close")
routes_close = pop_nearests_max_dist.groupby(['FID', 'person_id']).routes.apply(combine_unique_routes).reset_index(name="routes_close")
routes_close['n_routes'] = routes_close.routes_close.apply(len)
mean_n_routes_close = routes_close.groupby('FID').n_routes.mean().to_frame(name="mean_n_routes_close")
blocks_transit = blocks.copy()
blocks_transit = pd.merge(blocks_transit, mean_dist_nearest.reset_index(), on='FID', how='left')
blocks_transit = pd.merge(blocks_transit, mean_dist_k_nearests.reset_index(), on='FID', how='left')
blocks_transit = pd.merge(blocks_transit, mean_n_buses_close.reset_index(), on='FID', how='left')
blocks_transit = pd.merge(blocks_transit, mean_n_routes_close.reset_index(), on='FID', how='left')
blocks_transit['density'] = blocks_transit.TOTAL_PERS / blocks_transit.SHAPE_Area # Useful extra metric.
# blocks_transit_f corresponds to the *filtered* version of blocks_transit. Only keeps blocks with 10 or more people
blocks_transit_f = blocks_transit[blocks_transit.TOTAL_PERS >= 10].copy()
This will be used as a reference in the graphs.
communes = gpd.read_file("zip://./data/chile_communes.zip")
communes.to_crs("EPSG:4326", inplace=True)
communes = communes[communes.Region.isin(['Región Metropolitana de Santiago'])].copy()
def annotate_record(gdf_record, annotation_column, ax, annotation_outline_width):
text = gdf_record[annotation_column]
xy = gdf_record.geometry.representative_point().coords[0]
path_effects = [pe.withStroke(linewidth=annotation_outline_width, foreground="white")]
ax.annotate(text=text, xy=xy, path_effects=path_effects, ha='center')
def annotate_map(gdf, annotation_column, ax, annotation_outline_width):
gdf.apply(lambda r: annotate_record(r, annotation_column, ax, annotation_outline_width=annotation_outline_width), axis=1)
def plot_blocks(gdf, color_column=None, vmin=None, vmax=None,
plot_only_borders=False, linewidth=0.5, cmap='coolwarm',
background_color=None, color=None, cbar_colors='black',
figsize=(16, 16),
dpi=100):
fig, ax = plt.subplots(1, figsize=figsize, facecolor=background_color)
clipped_gdf = gdf.copy()
if plot_only_borders:
clipped_gdf['geometry'] = clipped_gdf.boundary
clipped_gdf = gpd.overlay(clipped_gdf, box_gdf)
clipped_gdf.plot(ax=ax, column=color_column, color=color, cmap=cmap, legend=True, vmin=vmin, vmax=vmax, linewidth=linewidth)
if color_column is not None:
cb_ax = fig.axes[1]
cb_ax.tick_params(colors=cbar_colors)
ax.axis('off')
return ax
def plot_communes(communes_gdf, annotate_communes=True, annotation_column='Comuna', linewidth=0.5, background_color=None,
annotation_outline_width=3, color="#0f0f0f", ax=None, figsize=(16, 16)):
if ax is None:
fig, ax = plt.subplots(1, figsize=figsize, facecolor=background_color)
clipped_communes = communes_gdf.copy()
clipped_communes['geometry'] = gpd.clip(clipped_communes.boundary, box_gdf)
clipped_communes.plot(ax=ax, color=color, linewidth=linewidth)
if annotate_communes:
overlayed_communes = gpd.overlay(communes_gdf, box_gdf, how='intersection')
annotate_map(overlayed_communes, annotation_column, ax, annotation_outline_width)
ax.axis('off')
return ax
def add_author_text(ax, pos_x=0.5, pos_y=0, fontsize=12, color='black'):
fontdict = {'fontsize': fontsize, 'color': color}
plt.text(pos_x, pos_y, 'Author: Iacopo Garizio',
fontdict=fontdict,
horizontalalignment='center',
verticalalignment='center',
transform=ax.transAxes)
The average person has to walk 188 meters to get to the nearest bus stop or Metro station, which approximately translates into 3 minutes walking (assuming a conservative speed of 1 m/s)[^1]. However, this number surpasses the 765 meters for the city blocks above the 98th percentile. Most of these are located on the outskirts.
From the map, it is also interesting to see that in almost every commune there are patches of low-accessibility, which highlights the importance of high-granularity analyses.
* It is important to notice that this would only hold if the path taken was a straight line. For most people, this path would not be so direct. To obtain an approximation for the real distance traveled, some studies have obtained a correction factor of around 1.35.
var_name = 'mean_distance_nearest'
vmin = blocks_transit_f[var_name].quantile(0.05)
vmax = blocks_transit_f[var_name].quantile(0.90)
ax = plot_blocks(blocks_transit_f, var_name, vmin, vmax, cmap='coolwarm')
ax = plot_communes(communes, annotate_communes=True, linewidth=0.3, ax=ax)
title = f"Average distance to the nearest bus stop/Metro station"
ax.set_title(title, {'fontsize': 20})
add_author_text(ax)
plt.show()
print(f"Overall average: {blocks_transit_f.mean_distance_nearest.mean():.3}Km")
print(f"98th percentile: {blocks_transit_f.mean_distance_nearest.quantile(0.98):.3}Km")
blocks_transit_f.mean_distance_nearest.hist(bins=100, range=(0,1))
plt.title("Histogram of the average distance to the nearest bus stop/Metro station")
plt.show()
Overall average: 0.188Km 98th percentile: 0.765Km
Following the same logic as before, the typical user would have to walk 338 meters (on average) to reach each of the ten nearest bus stops or Metro stations. When compared to the last map, this one shows a slight increase in the size of the underserved areas. The outskirts of the city are still the ones with the worst accessibility.
var_name = 'mean_distance_k_nearest'
vmin = blocks_transit_f[var_name].quantile(0.05)
vmax = blocks_transit_f[var_name].quantile(0.90)
ax = plot_blocks(blocks_transit_f, var_name, vmin, vmax, cmap='coolwarm')
ax = plot_communes(communes, annotate_communes=True, linewidth=0.3, ax=ax)
title = f"Average distance to the nearest 10 bus stops/Metro stations"
ax.set_title(title, {'fontsize': 20})
add_author_text(ax)
plt.show()
print(f"Overall average: {blocks_transit_f.mean_distance_k_nearest.mean():.3}Km")
print(f"Top 10%: {blocks_transit_f.mean_distance_k_nearest.quantile(0.9):.3}Km")
blocks_transit_f.mean_distance_k_nearest.hist(bins=100, range=(0,1))
plt.title("Histogram of the average distance to the\nnearest 10 bus stops/Metro stations")
plt.show()
Overall average: 0.338Km Top 10%: 0.419Km
As a way to try to weigh in the fact that some city blocks are more populated than others, I included a metric composed of the sum (not the average) of the distances. This means that if for a given city block we see a summed distance of 45Km, the people who live in that city block would walk collectively 45Km to reach the nearest bus stop/Metro station.
var_name = 'summed_distance_nearest'
vmin = blocks_transit_f[var_name].quantile(0.05)
vmax = blocks_transit_f[var_name].quantile(0.90)
ax = plot_blocks(blocks_transit_f, var_name, vmin, vmax, cmap='coolwarm')
ax = plot_communes(communes, annotate_communes=True, linewidth=0.3, ax=ax)
title = f"Summed distance to the nearest bus stop/Metro station for each city block"
ax.set_title(title, {'fontsize': 20})
add_author_text(ax)
plt.show()
As a proxy for "connectivity" (which aims to be a measure for quality of service), I included the average number of unique bus routes or Metro lines that a person can access within 900m (which, in a straight line, is a 15-minute walk), restricting to the nearest ten stops or stations.
On the average city block, a person can access 7.8 different bus routes or Metro lines. However, this is not uniformly distributed across the city. The bottom 1% of city blocks have access to 0 routes or lines and the bottom 5% to only 2, while the top 10% have access to 13.36 or more buses or lines. It should be noted that (as expected from the other maps) the outskirts of the city have the worst connectivity.
var_name = 'mean_n_routes_close'
vmin = blocks_transit_f[var_name].quantile(0)
vmax = blocks_transit_f[var_name].quantile(0.95)
ax = plot_blocks(blocks_transit_f, var_name, vmin, vmax, cmap='coolwarm_r')
ax = plot_communes(communes, annotate_communes=True, linewidth=0.3, ax=ax)
title = f"Average number of unique bus routes/Metro lines within {int(DIST_MAX*1000)}m"
ax.set_title(title, {'fontsize': 20})
add_author_text(ax)
plt.show()
mean = blocks_transit_f.mean_n_routes_close.mean()
q_01 = blocks_transit_f.mean_n_routes_close.quantile(0.01)
q_05 = blocks_transit_f.mean_n_routes_close.quantile(0.05)
q_90 = blocks_transit_f.mean_n_routes_close.quantile(0.90)
print(f"Average: {mean:.4}\nBottom 1%: {q_01:.4}\nBottom 5%: {q_05:.4}\nTop 10%: {q_90:.4}")
blocks_transit_f.mean_n_routes_close.hist(bins=20)
plt.title("Histogram of the average number of unique bus routes/Metro\nlines accessible to each person within 900m")
plt.show()
Average: 7.802 Bottom 1%: 0.0 Bottom 5%: 2.015 Top 10%: 13.36
Similarly to the previous variable, the average number of buses or trains also tries to be a proxy for quality of service.
Comparing with the last map, there are not many marked differences.
var_name = 'mean_n_buses_close'
vmin = blocks_transit_f[var_name].quantile(0)
vmax = blocks_transit_f[var_name].quantile(0.95)
ax = plot_blocks(blocks_transit_f, var_name, vmin, vmax, cmap='coolwarm_r')
ax = plot_communes(communes, annotate_communes=True, linewidth=0.3, ax=ax)
title = f"Average number of buses/trains for stops within {int(DIST_MAX*1000)}m in a working day"
ax.set_title(title, {'fontsize': 20})
add_author_text(ax)
plt.show()
The analysis has shown that the response to the question "Who is being left out by public transport?" depends on what we understand by being left out. If we understand being left out as having a high distance to the nearest bus stop or Metro station, we see that the outskirts of the city and some dispersed low-connectivity patches are the most underserved. If we now consider being left out as having a low number of buses, trains, or routes available, then the only areas well-served are the ones near main roads. However, despite the differences in the results obtained by each metric, there is one group that is constantly shown as having the worst quality and accessibility in terms of public transport: the outskirts of the city.
ax = plot_blocks(blocks, color='black', plot_only_borders=True, linewidth=0.3, cmap=None)
ax = plot_communes(communes, annotate_communes=False, color="black", linewidth=0.5, ax=ax)
clipped_stops = gpd.clip(stops, box_gdf)
clipped_stops.plot(ax=ax, markersize=5, color='red', alpha=0.5)
title = "Bus stops and Metro stations in Santiago"
ax.set_title(title, {'fontsize': 20, 'color': 'black'})
add_author_text(ax)
plt.show()
var_name = 'mean_n_routes_close'
vmin = blocks_transit_f[var_name].quantile(0.05)
vmax = blocks_transit_f[var_name].quantile(0.90)
ax = plot_blocks(blocks_transit_f, var_name, vmin, vmax,
plot_only_borders=True, background_color='black',
cbar_colors='white', cmap='coolwarm_r')
ax = plot_communes(communes, annotate_communes=False, color="white", linewidth=0.3, ax=ax)
title = f"Average number of unique bus routes/bus lines within {int(DIST_MAX*1000)}m"
ax.set_title(title, {'fontsize': 20, 'color': 'white'})
add_author_text(ax, color='white')
plt.show()
var_name = 'density'
vmin = blocks_transit_f[var_name].quantile(0.05)
vmax = blocks_transit_f[var_name].quantile(0.90)
ax = plot_blocks(blocks_transit_f, var_name, vmin, vmax, cmap='coolwarm')
ax = plot_communes(communes, annotate_communes=True, linewidth=0.3, ax=ax)
title = f"Population density"
ax.set_title(title, {'fontsize': 20})
add_author_text(ax)
plt.show()