%matplotlib inline
import os # used for file system operations
import json # prevalent input/output data format
from hashlib import sha256 # cryptographic hashing for personal data for anonymization
import pandas as pd # working with data frames, a versatile tabular data structure
from geopy.geocoders import Nominatim # use Nominatim, the OpenStreetMap geocoder service (from address to geo-location)
from geopy.extra.rate_limiter import RateLimiter # rate-limit support for geocoder services
pd.options.display.max_columns = 50  # show up to 50 columns when displaying data frames
pd.options.display.max_rows = 100  # show up to 100 rows when displaying data frames
The original data set from company X, which is to be anonymized.
# Load the raw bookings export and discard the superfluous 'Source' column.
data_location = 'data/RAW.csv'  # original data in CSV format
df = pd.read_csv(data_location).drop('Source', axis='columns')
print(f'Original data, {df.shape[0]} rows and {df.shape[1]} columns')
Original data, 4108 rows and 20 columns
Fix the `Cost` and `NightlyRate` columns to be numeric. The source data is formatted as Danish kroner.
# Strip the 'kr' currency marker and the thousands separators, then parse as float.
for money_column in ('Cost', 'NightlyRate'):
    df[money_column] = (df[money_column]
                        .str.replace('kr', '')
                        .str.replace(',', '')
                        .astype(float))
Remove duplicate and incomplete rows.
# Duplicate bookings are an artefact of importing from different systems;
# sorting by Status first makes keep='first' pick deterministically.
df = (df.sort_values('Status')
        .drop_duplicates(subset=['Guest Name', 'Rental', 'Arrive', 'Depart'],
                         keep='first'))
# Test bookings lack a guest name and/or rental name; remove them.
df = df.dropna(subset=['Guest Name', 'Rental'], how='any')
print('New shape, rows and columns:', df.shape)
New shape, rows and columns: (4069, 20)
In the original data, the property's exact address is available in the `Rental_Address` column.
# Count rows with no address and the number of distinct addresses.
missing_count = df.Rental_Address.isna().sum()
print('missing property locations:', missing_count)
unique_property_locations = list(df.Rental_Address.dropna().unique())
print('unique property locations:', len(unique_property_locations))
missing property locations: 28 unique property locations: 173
# Set up the OpenStreetMap Nominatim geocoder
# (https://wiki.openstreetmap.org/wiki/Nominatim), rate-limited so that we
# never issue more than 10 queries per second.
geolocator = Nominatim(user_agent='CovidBnB')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0.1)

# Persist geocoding results between runs in a JSON cache on disk;
# start from an empty dictionary when no cache file exists yet.
location_cache_file = 'cache/location_cache.json'
location_cache = {}
if os.path.exists(location_cache_file):
    with open(location_cache_file, 'r') as fd:
        location_cache = json.load(fd)
location_not_found = []  # locations we could not geolocate
def anonymize_address(property_location):
    '''Anonymize a full street address down to its postal code.

    Looks the address up in the module-level ``location_cache`` first; on a
    miss, geocodes it via Nominatim and memoizes the resulting postal code.

    Parameters
    ----------
    property_location : str or NaN
        Full address string from the Rental_Address column.

    Returns
    -------
    str or None
        The postal code, or None when the address is missing or could not be
        geolocated (failed addresses are appended to ``location_not_found``).
    '''
    if pd.isna(property_location):
        return None
    if property_location in location_cache:
        # Cache hit: return the memoized postal code without a network call.
        return location_cache[property_location]
    try:
        # Cache miss: geolocate with address details so the response carries
        # a structured postal code field.
        location_geo = geocode(property_location, addressdetails=True)
        postcode = location_geo.raw['address']['postcode']
    except Exception:
        # geocode() returns None for unknown addresses (AttributeError on
        # .raw) and some responses lack a postcode (KeyError); the previous
        # bare except also swallowed KeyboardInterrupt/SystemExit.
        location_not_found.append(property_location)
        return None
    location_cache[property_location] = postcode  # memoize for future calls/runs
    return postcode
# Map every address to its postal code; astype(str) turns missing values into
# the literal strings 'None'/'nan', which downstream code checks for.
df['postal_code'] = df.Rental_Address.apply(anonymize_address).astype(str)
with open(location_cache_file, 'w') as fd:
    json.dump(location_cache, fd) # write cache to file
print(f'Could not find {len(location_not_found)} locations, {len(set(location_not_found))} unique')
Could not find 52 locations, 4 unique
def anonymize_text(text, length=8):
    '''Hash *text* with SHA-256 and return the first *length* hex characters (default 8).'''
    digest = sha256(text.encode('utf8')).hexdigest()
    return digest[:length]
# Pseudonymize guests and rentals with truncated SHA-256 hashes.
for source_column, anon_column in (('Guest Name', 'Guest Anon'),
                                   ('Rental', 'Rental Anon')):
    df[anon_column] = df[source_column].apply(anonymize_text)
# Drop columns with personally identifiable information (PII)
drop_pii_columns = ['Guest Name', 'First Name', 'Last Name', 'Guest_Name', 'Rental_Name',
                    'Rental', 'ID', 'Altered', 'Phone', 'Rental_Address']
df = df.drop(drop_pii_columns, axis=1).reset_index(drop=True)
df
Arrive | Depart | Received | Status | Adults | Cost | Nights | Last_modified | NightlyRate | Guest Location | postal_code | Guest Anon | Rental Anon | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5/16/2019 | 5/23/2019 | 5/14/2019 13:20 | canceled | 3.0 | 7682.50 | 7 | 8/28/2019 15:12 | 1019.0 | NaN | 2300 | e8d072e5 | 3fd1e124 |
1 | 4/1/2020 | 4/4/2020 | 2/17/2020 00:26 | canceled | 3.0 | 1037.81 | 3 | 3/12/2020 17:17 | 346.0 | NaN | 2200 | 99076ea9 | c74a5829 |
2 | 7/4/2020 | 7/5/2020 | 3/4/2019 13:37 | canceled | 4.0 | 977.50 | 1 | 8/15/2019 15:28 | 978.0 | NaN | 1799 | 794d4098 | d9dacb3e |
3 | 4/9/2020 | 4/14/2020 | 2/17/2020 00:05 | canceled | 3.0 | 419.10 | 5 | 3/15/2020 18:01 | 84.0 | NaN | 2200 | 245bb108 | c74a5829 |
4 | 7/4/2020 | 7/7/2020 | 2/16/2020 22:59 | canceled | 2.0 | 5537.23 | 3 | 7/8/2020 19:28 | 1679.0 | Houston, TX | 1051 | fcac9b2c | 0fd1eb2e |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4064 | 6/24/2019 | 6/26/2019 | 6/21/2019 10:51 | confirmed | 2.0 | 2827.10 | 2 | 6/21/2019 10:52 | 1239.0 | NaN | 1454 | 0948688e | 6361b74c |
4065 | 6/24/2019 | 6/27/2019 | 6/21/2019 13:57 | confirmed | 1.0 | 4028.15 | 3 | 6/21/2019 13:58 | 1243.0 | NaN | 1454 | d9db51e4 | 154dcb23 |
4066 | 7/5/2019 | 7/7/2019 | 6/21/2019 14:53 | confirmed | 6.0 | 5486.24 | 2 | 6/21/2019 14:58 | 2393.0 | Plymouth, United Kingdom | 2200 | d068cc67 | 7f9711ed |
4067 | 9/2/2019 | 9/4/2019 | 9/1/2019 15:48 | confirmed | 2.0 | 2226.22 | 2 | 9/1/2019 15:52 | 863.0 | St Petersburg, FL | 1063 | 0ef3d2da | eeeb0cf5 |
4068 | 8/8/2020 | 8/11/2020 | 8/7/2020 23:45 | confirmed | 2.0 | 6326.97 | 3 | 8/7/2020 23:47 | 1792.0 | NaN | nan | ecd81df9 | e28934a2 |
4069 rows × 13 columns
# Cache postal-code -> (lat, lon) lookups between runs; start empty
# when no cache file exists yet.
postal_code_cache_file = 'cache/postal_code_cache.json'
postal_code_cache = {}
if os.path.exists(postal_code_cache_file):
    with open(postal_code_cache_file, 'r') as fd:
        postal_code_cache = json.load(fd)
def geolocate_postalcode(postalcode):
    '''Geolocate a Danish postal code into a pandas Series with lat and lon.

    Missing codes (None/NaN or the strings 'None'/'nan' produced by an
    earlier astype(str)) yield a Series of Nones. Lookups are memoized
    in the module-level ``postal_code_cache``.
    '''
    if postalcode is None or pd.isnull(postalcode) or postalcode in ('None', 'nan'):
        return pd.Series({'property_lat': None, 'property_lon': None})
    cached = postal_code_cache.get(postalcode)
    if cached:
        lat, lon = cached[0], cached[1]
    else:
        # Cache miss: geocode the postal code (the country is always Denmark).
        geo = geocode(query={'postalcode': postalcode, 'country': 'Denmark'})
        lat, lon = geo.raw['lat'], geo.raw['lon']
        postal_code_cache[postalcode] = (lat, lon)
    return pd.Series({'property_lat': lat, 'property_lon': lon})
# Geolocate each property's postal code into a two-column frame
# (property_lat, property_lon).
lat_lon_df = df.postal_code.apply(geolocate_postalcode) # create data frame for lat lon
with open(postal_code_cache_file, 'w') as fd:
    json.dump(postal_code_cache, fd) # write cache to file
df = pd.concat([df, lat_lon_df], axis='columns') # concatenate lat lon data frame by columns (horizontally)
# Cache guest-location -> (lat, lon, country) lookups between runs;
# start empty when no cache file exists yet.
guest_origin_cache_file = 'cache/guest_origin_cache.json'
guest_origin_cache = {}
if os.path.exists(guest_origin_cache_file):
    with open(guest_origin_cache_file, 'r') as fd:
        guest_origin_cache = json.load(fd)
guest_origin_not_found = []  # locations that failed to geolocate
def geolocate_guest_origin(location):
    '''Geolocate a guest's free-text origin location.

    Parameters
    ----------
    location : str or NaN
        Free-text location from the 'Guest Location' column.

    Returns
    -------
    pandas.Series
        With keys guest_lat, guest_lon and guest_country (Nominatim country
        code); all None when the location is missing or cannot be resolved.
        Successful lookups are memoized in ``guest_origin_cache``; failures
        are recorded in ``guest_origin_not_found`` so they are not retried.
    '''
    unknown = pd.Series({'guest_lat': None, 'guest_lon': None, 'guest_country': None})
    if pd.isnull(location):
        return unknown
    lat_lon = guest_origin_cache.get(location)
    if lat_lon:
        return pd.Series({'guest_lat': lat_lon[0], 'guest_lon': lat_lon[1],
                          'guest_country': lat_lon[2]})
    if location in guest_origin_not_found:
        # Known failure: skip the network call on repeat occurrences.
        return unknown
    try:
        geo = geocode(query=location, addressdetails=True)
        lat, lon = geo.raw['lat'], geo.raw['lon']
        country = geo.raw['address']['country_code']
    except Exception:
        # geocode() returns None for unresolvable locations (AttributeError
        # on .raw) and some responses lack a country_code (KeyError); the
        # previous bare except also swallowed KeyboardInterrupt/SystemExit.
        guest_origin_not_found.append(location)
        print('Problem locating:', location)
        return unknown
    guest_origin_cache[location] = (lat, lon, country)
    print(location, (lat, lon), country)
    return pd.Series({'guest_lat': lat, 'guest_lon': lon, 'guest_country': country})
# Geolocate each guest's origin into a three-column frame
# (guest_lat, guest_lon, guest_country).
guest_lat_lon_df = df['Guest Location'].apply(geolocate_guest_origin)
with open(guest_origin_cache_file, 'w') as fd:
    json.dump(guest_origin_cache, fd) # write cache to file
df = pd.concat([df, guest_lat_lon_df], axis='columns')
Problem locating: Lombardy, Italy, Lombardy, Italy Problem locating: Sundbyberg, Sweden, Toronto, Canada Problem locating: Stoyanka, Ukraine Problem locating: Nes Municipality, Faroe Islands Problem locating: Greatford, United Kingdom, Greatford, United Kingdom, Greatford, United Kingdom, Greatford, United Kingdom, Greatford, United Kingdom Problem locating: Gothenburg, Sweden, Oslo, Norway Problem locating: Barcelona, Spain, Barcelona, Spain Problem locating: Berlin, Germany, Berlin, Germany Problem locating: Hong Kong SAR, China
df  # preview the anonymized, geolocated data frame
Arrive | Depart | Received | Status | Adults | Cost | Nights | Last_modified | NightlyRate | Guest Location | postal_code | Guest Anon | Rental Anon | property_lat | property_lon | guest_lat | guest_lon | guest_country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5/16/2019 | 5/23/2019 | 5/14/2019 13:20 | canceled | 3.0 | 7682.50 | 7 | 8/28/2019 15:12 | 1019.0 | NaN | 2300 | e8d072e5 | 3fd1e124 | 55.651306275214914 | 12.603239181235617 | None | None | None |
1 | 4/1/2020 | 4/4/2020 | 2/17/2020 00:26 | canceled | 3.0 | 1037.81 | 3 | 3/12/2020 17:17 | 346.0 | NaN | 2200 | 99076ea9 | c74a5829 | 55.69475043151038 | 12.550190147092675 | None | None | None |
2 | 7/4/2020 | 7/5/2020 | 3/4/2019 13:37 | canceled | 4.0 | 977.50 | 1 | 8/15/2019 15:28 | 978.0 | NaN | 1799 | 794d4098 | d9dacb3e | 55.66627486937496 | 12.534955500625006 | None | None | None |
3 | 4/9/2020 | 4/14/2020 | 2/17/2020 00:05 | canceled | 3.0 | 419.10 | 5 | 3/15/2020 18:01 | 84.0 | NaN | 2200 | 245bb108 | c74a5829 | 55.69475043151038 | 12.550190147092675 | None | None | None |
4 | 7/4/2020 | 7/7/2020 | 2/16/2020 22:59 | canceled | 2.0 | 5537.23 | 3 | 7/8/2020 19:28 | 1679.0 | Houston, TX | 1051 | fcac9b2c | 0fd1eb2e | 55.679985160377335 | 12.590608396226418 | 29.7589382 | -95.3676974 | us |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4064 | 6/24/2019 | 6/26/2019 | 6/21/2019 10:51 | confirmed | 2.0 | 2827.10 | 2 | 6/21/2019 10:52 | 1239.0 | NaN | 1454 | 0948688e | 6361b74c | 55.67840178857141 | 12.569364442857141 | None | None | None |
4065 | 6/24/2019 | 6/27/2019 | 6/21/2019 13:57 | confirmed | 1.0 | 4028.15 | 3 | 6/21/2019 13:58 | 1243.0 | NaN | 1454 | d9db51e4 | 154dcb23 | 55.67840178857141 | 12.569364442857141 | None | None | None |
4066 | 7/5/2019 | 7/7/2019 | 6/21/2019 14:53 | confirmed | 6.0 | 5486.24 | 2 | 6/21/2019 14:58 | 2393.0 | Plymouth, United Kingdom | 2200 | d068cc67 | 7f9711ed | 55.69475043151038 | 12.550190147092675 | 50.3712659 | -4.1425658 | gb |
4067 | 9/2/2019 | 9/4/2019 | 9/1/2019 15:48 | confirmed | 2.0 | 2226.22 | 2 | 9/1/2019 15:52 | 863.0 | St Petersburg, FL | 1063 | 0ef3d2da | eeeb0cf5 | 55.67796323809524 | 12.583763952380954 | 27.7703796 | -82.6695085 | us |
4068 | 8/8/2020 | 8/11/2020 | 8/7/2020 23:45 | confirmed | 2.0 | 6326.97 | 3 | 8/7/2020 23:47 | 1792.0 | NaN | nan | ecd81df9 | e28934a2 | None | None | None | None | None |
4069 rows × 18 columns
df.to_csv('data/processed.csv', index=False)  # persist the anonymized data set