To see this analysis live, check out my article "Analyzing Last.fm Listening History"
This notebook loads a set of artists from MusicBrainz, created by the musicbrainz_downloader. It then takes each artist's place name (i.e., either where they're from or where they're most associated with, as determined in the other notebook) and geocodes that place name to latitude-longitude coordinates. Finally, it maps the artists.
Nominatim API documentation: https://wiki.openstreetmap.org/wiki/Nominatim
Sample Nominatim query: https://nominatim.openstreetmap.org/search?format=json&q=brixton,london,england
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from geopy.distance import great_circle
from mpl_toolkits.basemap import Basemap
%matplotlib inline
# delay, in seconds, slept before every geocoding request
# (presumably to stay within the geocoding services' rate limits)
pause = 0.75
Nominatim and Google APIs
def geocode_nominatim(address):
    """Geocode an address string with the OpenStreetMap Nominatim search API.

    Sleeps for `pause` seconds before sending the request.

    Returns the first result as a 'lat,lon' string, or None when Nominatim
    returns no results for the address.
    """
    time.sleep(pause)
    url = u'https://nominatim.openstreetmap.org/search'
    # hand the query to requests as params so it gets URL-encoded: characters
    # such as '&', '#' or '+' inside the address would otherwise corrupt the
    # query string
    params = {'format': 'json', 'q': address}
    response = requests.get(url, params=params)
    data = response.json()
    if data:
        return '{},{}'.format(data[0]['lat'], data[0]['lon'])
    return None
def geocode_google(address):
    """Geocode an address string with the Google Maps geocoding web service.

    Sleeps for `pause` seconds before sending the request.

    Returns the first result as a 'lat,lng' string, or None when the response
    holds no results.
    """
    # NOTE(review): current versions of this Google service are understood to
    # require an API key over HTTPS - confirm before relying on this fallback
    time.sleep(pause)
    url = u'http://maps.googleapis.com/maps/api/geocode/json'
    # hand the query to requests as params so the address gets URL-encoded
    params = {'sensor': 'false', 'address': address}
    response = requests.get(url, params=params)
    data = response.json()
    # a payload without a 'results' entry is treated the same as an empty one
    results = data.get('results')
    if results:
        location = results[0]['geometry']['location']
        return '{},{}'.format(location['lat'], location['lng'])
    return None
# geocode one sample address with both services and compare the results
address = u"Brixton, London, England, United Kingdom"
latlng_google = geocode_google(address)
latlng_nominatim = geocode_nominatim(address)

# single-argument print(...) calls behave the same on Python 2 and Python 3
print('{} google'.format(latlng_google))
print('{} nominatim'.format(latlng_nominatim))

# either geocoder returns None when it finds nothing, and a distance can only
# be computed when both returned a 'lat,lng' string
if latlng_google is not None and latlng_nominatim is not None:
    miles_apart = great_circle(latlng_google, latlng_nominatim).miles
    print('{} miles apart'.format(round(miles_apart, 2)))
51.4612794,-0.1156148 google 51.4568044,-0.1167958 nominatim 0.31 miles apart
# load the artists data set
artists = pd.read_csv('data/mb.csv', encoding='utf-8')
# single-argument print(...) calls behave the same on Python 2 and Python 3
print('{:,} total artists'.format(len(artists)))

# drop nans and get the unique set of places
addresses = pd.Series(artists['place_full'].dropna().sort_values().unique())
print('{:,} unique places'.format(len(addresses)))
428 total artists 231 unique places
def get_country_if_more_detail(address):
    """Return the text after the last comma of `address`, whitespace-stripped.

    Returns None when the address contains no comma, i.e. when nothing more
    detailed precedes its final component.
    """
    _, comma, last_token = address.rpartition(',')
    if not comma:
        return None
    return last_token.strip()
# if a place contains only a country name, check whether that country name exists with more detail elsewhere in the list of places
# countries_with_more_detail is a list of all the countries that appear at the end of comma-separated address strings
countries_with_more_detail = pd.Series(addresses.map(get_country_if_more_detail).dropna().sort_values().unique())
# single-argument print(...) calls behave the same on Python 2 and Python 3
print('{:,} countries with more detail'.format(len(countries_with_more_detail)))

# if so, discard the instance that is country name only - this country is represented elsewhere in the list with finer-grained info
# i.e., keep 'estonia' if there is no 'tallinn, estonia' elsewhere in the list,
# but discard 'russia' if 'moscow, russia' exists elsewhere in the list
addresses_to_geocode = addresses[~addresses.isin(countries_with_more_detail)]
print('{:,} unique addresses to geocode'.format(len(addresses_to_geocode)))
24 countries with more detail 217 unique addresses to geocode
# geocode (with nominatim) each retained address (i.e., full place name string)
start_time = time.time()
latlng_dict = {}
for n, address in enumerate(addresses_to_geocode):
    # report progress every 10 addresses, space-separated on one line;
    # sys.stdout.write works the same on Python 2 and Python 3
    if n % 10 == 0:
        sys.stdout.write('{} '.format(n))
    latlng_dict[address] = geocode_nominatim(address)
finish_time = time.time()
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210
# summarize the nominatim pass: how many addresses were sent, how long it took,
# and how many came back with a lat-long
# single-argument print(...) calls behave the same on Python 2 and Python 3
elapsed_seconds = int(finish_time - start_time)
print('nominatim geocoded {:,} addresses in {:,} seconds'.format(len(addresses_to_geocode), elapsed_seconds))
non_null_count = sum(1 for latlng in latlng_dict.values() if latlng is not None)
print('received {:,} non-null lat-longs'.format(non_null_count))
nominatim geocoded 217 addresses in 191 seconds received 208 non-null lat-longs
# which addresses failed to geocode successfully?
addresses_to_geocode = [key for key, latlng in latlng_dict.items() if latlng is None]

# now geocode (with google) each address that failed with nominatim
if len(addresses_to_geocode) < 2500:  # daily google request limit
    start_time = time.time()
    for n, address in enumerate(addresses_to_geocode):
        # report progress every 10 addresses, space-separated on one line;
        # sys.stdout.write works the same on Python 2 and Python 3
        if n % 10 == 0:
            sys.stdout.write('{} '.format(n))
        latlng_dict[address] = geocode_google(address)
    finish_time = time.time()
0
# summarize the google pass: how many addresses were sent, how long it took,
# and how many lat-longs the dict now holds in total
# single-argument print(...) calls behave the same on Python 2 and Python 3
elapsed_seconds = int(finish_time - start_time)
print('google geocoded {:,} addresses in {:,} seconds'.format(len(addresses_to_geocode), elapsed_seconds))
non_null_count = sum(1 for latlng in latlng_dict.values() if latlng is not None)
print('received {:,} non-null lat-longs'.format(non_null_count))
google geocoded 9 addresses in 7 seconds received 217 non-null lat-longs
# for each artist, if their place appears in the geocoded dict, pull the latlng value from dict into new df column
def get_latlng_by_address(address):
    """Return the value stored for `address` in latlng_dict.

    Returns None when the address is not a key of the dict (or when the
    stored value itself is None).
    """
    # dict.get replaces a bare try/except that silently swallowed every error,
    # including KeyboardInterrupt and a NameError on a missing latlng_dict
    return latlng_dict.get(address)
# look up every artist's place in the geocoded dict and store the result in a new column
artists['place_latlng'] = artists['place_full'].map(get_latlng_by_address)

# preview the first rows, ordered by place name
preview_columns = ['name', 'place_full', 'place_latlng']
artists[preview_columns].sort_values(by='place_full').head()
name | place_full | place_latlng | |
---|---|---|---|
306 | Willie Nelson | Abbott, Hill County, Texas, United States | 31.8848809,-97.073336 |
31 | Linkin Park | Agoura Hills, Los Angeles County, California, ... | 34.1363945,-118.7745347 |
332 | Sum 41 | Ajax, Ontario, Canada | 43.8492143,-79.0241784 |
60 | Demi Lovato | Albuquerque, Bernalillo County, New Mexico, Un... | 35.0841034,-106.650985 |
209 | Jeff Buckley | Anaheim, Orange County, California, United States | 33.8347516,-117.9117319 |
# save the artists, including the new lat-long column, to disk
artists.to_csv('data/mb_geocoded.csv', index=False, encoding='utf-8')

# get discrete vectors of lats and lons, for easy x-y scatter-plotting
# (only rows that actually have a 'lat,lng' string are kept)
geocoded_latlngs = artists['place_latlng'].dropna()
lats = geocoded_latlngs.map(lambda latlng: float(latlng.split(',')[0]))
lons = geocoded_latlngs.map(lambda latlng: float(latlng.split(',')[1]))
# define map colors
water_color = '#cdd2d4'
border_color = '#bbbbbb'
# land, coastlines and the parallel/meridian lines all share one light tone
land_color = coastline_color = meridian_color = '#f5f5f3'
# marker styling passed to the scatter plot
marker_fill_color = 'r'
marker_edge_color = 'None'
# create the plot
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, frame_on=False)
# set the background on the axes patch rather than passing the `axisbg`
# keyword, which newer matplotlib releases no longer accept
ax.patch.set_facecolor('#ffffff')
ax.set_title('Last.fm Artist Origins', fontsize=24, color='#333333')

# draw the basemap and its features
m = Basemap(projection='kav7', lon_0=0, resolution='l', area_thresh=10000)
m.drawmapboundary(color=border_color, fill_color=water_color)
m.drawcoastlines(color=coastline_color)
m.drawcountries(color=border_color)
m.fillcontinents(color=land_color, lake_color=water_color)
m.drawparallels(np.arange(-90., 120., 30.), color=meridian_color)
m.drawmeridians(np.arange(0., 420., 60.), color=meridian_color)

# project the artists' lon/lat points into map coordinates and scatter plot them
x, y = m(lons.values, lats.values)
m.scatter(x, y, s=8, color=marker_fill_color, edgecolor=marker_edge_color, alpha=1, zorder=3)

# save the map to an image file, then show it
plt.savefig('images/lastfm_artists_origins_map.png', dpi=96, bbox_inches='tight', pad_inches=0.2)
plt.show()