%matplotlib inline
import os # used for file system operations
import json # prevalent input/output data format
from hashlib import sha256 # cryptographic hashing for personal data for anonymization
import pandas as pd # working with data frames, a versatile tabular data structure
from geopy.geocoders import Nominatim # use Nominatim, the OpenStreetMap geocoder service (from address to geo-location)
from geopy.extra.rate_limiter import RateLimiter # rate-limit support for geocoder services
pd.options.display.max_columns = 50  # show up to 50 columns when displaying data frames
pd.options.display.max_rows = 100  # show up to 100 rows when displaying data frames
The original data set from company X, which is to be anonymized.
# Load the raw bookings export and discard the superfluous 'Source' column.
data_location = 'data/RAW.csv'  # original data in CSV format
df = pd.read_csv(data_location).drop('Source', axis='columns')
print(f'Original data, {df.shape[0]} rows and {df.shape[1]} columns')
Original data, 4108 rows and 20 columns
Fix the `Cost` and `NightlyRate` columns to be numeric. The source data is formatted as Danish kroner.
# Strip the 'kr' currency marker and the thousands separators, then parse as float.
for money_column in ('Cost', 'NightlyRate'):
    df[money_column] = (df[money_column]
                        .str.replace('kr', '')
                        .str.replace(',', '')
                        .astype(float))
Remove duplicate and incomplete rows.
# Duplicate bookings are an artefact of importing from different systems;
# sorting by Status first makes keep='first' pick deterministically.
df = (df.sort_values('Status')
        .drop_duplicates(subset=['Guest Name', 'Rental', 'Arrive', 'Depart'],
                         keep='first'))
# Test bookings lack a guest name and/or rental name; remove them.
df = df.dropna(subset=['Guest Name', 'Rental'], how='any')
print('New shape, rows and columns:', df.shape)
New shape, rows and columns: (4069, 20)
In the original data, the property's exact address is available in the `Rental_Address` column.
# Count rows with no address and the number of distinct addresses.
missing_count = df.Rental_Address.isna().sum()
print('missing property locations:', missing_count)
unique_property_locations = list(df.Rental_Address.dropna().unique())
print('unique property locations:', len(unique_property_locations))
missing property locations: 28 unique property locations: 173
# Set up the OpenStreetMap Nominatim geocoder
# (https://wiki.openstreetmap.org/wiki/Nominatim), rate-limited so that we
# never issue more than 10 queries per second.
geolocator = Nominatim(user_agent='CovidBnB')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0.1)

# Persist geocoding results between runs in a JSON cache on disk;
# start from an empty dictionary when no cache file exists yet.
location_cache_file = 'cache/location_cache.json'
location_cache = {}
if os.path.exists(location_cache_file):
    with open(location_cache_file, 'r') as fd:
        location_cache = json.load(fd)
location_not_found = []  # locations we could not geolocate
def anonymize_address(property_location):
    '''Anonymize a full street address down to its postal code.

    Looks the address up in the module-level ``location_cache`` first; on a
    miss, geocodes it via Nominatim and memoizes the resulting postal code.

    Parameters
    ----------
    property_location : str or NaN
        Full address string from the Rental_Address column.

    Returns
    -------
    str or None
        The postal code, or None when the address is missing or could not be
        geolocated (failed addresses are appended to ``location_not_found``).
    '''
    if pd.isna(property_location):
        return None
    if property_location in location_cache:
        # Cache hit: return the memoized postal code without a network call.
        return location_cache[property_location]
    try:
        # Cache miss: geolocate with address details so the response carries
        # a structured postal code field.
        location_geo = geocode(property_location, addressdetails=True)
        postcode = location_geo.raw['address']['postcode']
    except Exception:
        # geocode() returns None for unknown addresses (AttributeError on
        # .raw) and some responses lack a postcode (KeyError); the previous
        # bare except also swallowed KeyboardInterrupt/SystemExit.
        location_not_found.append(property_location)
        return None
    location_cache[property_location] = postcode  # memoize for future calls/runs
    return postcode
# Map every address to its postal code; astype(str) turns missing values into
# the literal strings 'None'/'nan', which downstream code checks for.
df['postal_code'] = df.Rental_Address.apply(anonymize_address).astype(str)
with open(location_cache_file, 'w') as fd:
    json.dump(location_cache, fd) # write cache to file
print(f'Could not find {len(location_not_found)} locations, {len(set(location_not_found))} unique')
Could not find 52 locations, 4 unique
def anonymize_text(text, length=8):
    '''Hash *text* with SHA-256 and return the first *length* hex characters (default 8).'''
    digest = sha256(text.encode('utf8')).hexdigest()
    return digest[:length]
# Pseudonymize guests and rentals with truncated SHA-256 hashes.
for source_column, anon_column in (('Guest Name', 'Guest Anon'),
                                   ('Rental', 'Rental Anon')):
    df[anon_column] = df[source_column].apply(anonymize_text)
# Drop columns with personally identifiable information (PII)
drop_pii_columns = ['Guest Name', 'First Name', 'Last Name', 'Guest_Name', 'Rental_Name',
                    'Rental', 'ID', 'Altered', 'Phone', 'Rental_Address']
df = df.drop(drop_pii_columns, axis=1).reset_index(drop=True)
df
Arrive | Depart | Received | Status | Adults | Cost | Nights | Last_modified | NightlyRate | Guest Location | postal_code | Guest Anon | Rental Anon | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5/16/2019 | 5/23/2019 | 5/14/2019 13:20 | canceled | 3.0 | 7682.50 | 7 | 8/28/2019 15:12 | 1019.0 | NaN | 2300 | e8d072e5 | 3fd1e124 |
1 | 4/1/2020 | 4/4/2020 | 2/17/2020 00:26 | canceled | 3.0 | 1037.81 | 3 | 3/12/2020 17:17 | 346.0 | NaN | 2200 | 99076ea9 | c74a5829 |
2 | 7/4/2020 | 7/5/2020 | 3/4/2019 13:37 | canceled | 4.0 | 977.50 | 1 | 8/15/2019 15:28 | 978.0 | NaN | 1799 | 794d4098 | d9dacb3e |
3 | 4/9/2020 | 4/14/2020 | 2/17/2020 00:05 | canceled | 3.0 | 419.10 | 5 | 3/15/2020 18:01 | 84.0 | NaN | 2200 | 245bb108 | c74a5829 |
4 | 7/4/2020 | 7/7/2020 | 2/16/2020 22:59 | canceled | 2.0 | 5537.23 | 3 | 7/8/2020 19:28 | 1679.0 | Houston, TX | 1051 | fcac9b2c | 0fd1eb2e |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4064 | 6/24/2019 | 6/26/2019 | 6/21/2019 10:51 | confirmed | 2.0 | 2827.10 | 2 | 6/21/2019 10:52 | 1239.0 | NaN | 1454 | 0948688e | 6361b74c |
4065 | 6/24/2019 | 6/27/2019 | 6/21/2019 13:57 | confirmed | 1.0 | 4028.15 | 3 | 6/21/2019 13:58 | 1243.0 | NaN | 1454 | d9db51e4 | 154dcb23 |
4066 | 7/5/2019 | 7/7/2019 | 6/21/2019 14:53 | confirmed | 6.0 | 5486.24 | 2 | 6/21/2019 14:58 | 2393.0 | Plymouth, United Kingdom | 2200 | d068cc67 | 7f9711ed |
4067 | 9/2/2019 | 9/4/2019 | 9/1/2019 15:48 | confirmed | 2.0 | 2226.22 | 2 | 9/1/2019 15:52 | 863.0 | St Petersburg, FL | 1063 | 0ef3d2da | eeeb0cf5 |
4068 | 8/8/2020 | 8/11/2020 | 8/7/2020 23:45 | confirmed | 2.0 | 6326.97 | 3 | 8/7/2020 23:47 | 1792.0 | NaN | nan | ecd81df9 | e28934a2 |
4069 rows × 13 columns
# Cache postal-code -> (lat, lon) lookups between runs; start empty
# when no cache file exists yet.
postal_code_cache_file = 'cache/postal_code_cache.json'
postal_code_cache = {}
if os.path.exists(postal_code_cache_file):
    with open(postal_code_cache_file, 'r') as fd:
        postal_code_cache = json.load(fd)
def geolocate_postalcode(postalcode):
    '''Geolocate a Danish postal code into a pandas Series with lat and lon.

    Missing codes (None/NaN or the strings 'None'/'nan' produced by an
    earlier astype(str)) yield a Series of Nones. Lookups are memoized
    in the module-level ``postal_code_cache``.
    '''
    if postalcode is None or pd.isnull(postalcode) or postalcode in ('None', 'nan'):
        return pd.Series({'property_lat': None, 'property_lon': None})
    cached = postal_code_cache.get(postalcode)
    if cached:
        lat, lon = cached[0], cached[1]
    else:
        # Cache miss: geocode the postal code (the country is always Denmark).
        geo = geocode(query={'postalcode': postalcode, 'country': 'Denmark'})
        lat, lon = geo.raw['lat'], geo.raw['lon']
        postal_code_cache[postalcode] = (lat, lon)
    return pd.Series({'property_lat': lat, 'property_lon': lon})
# Geolocate each property's postal code into a two-column frame
# (property_lat, property_lon).
lat_lon_df = df.postal_code.apply(geolocate_postalcode) # create data frame for lat lon
with open(postal_code_cache_file, 'w') as fd:
    json.dump(postal_code_cache, fd) # write cache to file
df = pd.concat([df, lat_lon_df], axis='columns') # concatenate lat lon data frame by columns (horizontally)
# Cache guest-location -> (lat, lon, country) lookups between runs;
# start empty when no cache file exists yet.
guest_origin_cache_file = 'cache/guest_origin_cache.json'
guest_origin_cache = {}
if os.path.exists(guest_origin_cache_file):
    with open(guest_origin_cache_file, 'r') as fd:
        guest_origin_cache = json.load(fd)
guest_origin_not_found = []  # locations that failed to geolocate
def geolocate_guest_origin(location):
    '''Geolocate a guest's free-text origin location.

    Parameters
    ----------
    location : str or NaN
        Free-text location from the 'Guest Location' column.

    Returns
    -------
    pandas.Series
        With keys guest_lat, guest_lon and guest_country (Nominatim country
        code); all None when the location is missing or cannot be resolved.
        Successful lookups are memoized in ``guest_origin_cache``; failures
        are recorded in ``guest_origin_not_found`` so they are not retried.
    '''
    unknown = pd.Series({'guest_lat': None, 'guest_lon': None, 'guest_country': None})
    if pd.isnull(location):
        return unknown
    lat_lon = guest_origin_cache.get(location)
    if lat_lon:
        return pd.Series({'guest_lat': lat_lon[0], 'guest_lon': lat_lon[1],
                          'guest_country': lat_lon[2]})
    if location in guest_origin_not_found:
        # Known failure: skip the network call on repeat occurrences.
        return unknown
    try:
        geo = geocode(query=location, addressdetails=True)
        lat, lon = geo.raw['lat'], geo.raw['lon']
        country = geo.raw['address']['country_code']
    except Exception:
        # geocode() returns None for unresolvable locations (AttributeError
        # on .raw) and some responses lack a country_code (KeyError); the
        # previous bare except also swallowed KeyboardInterrupt/SystemExit.
        guest_origin_not_found.append(location)
        print('Problem locating:', location)
        return unknown
    guest_origin_cache[location] = (lat, lon, country)
    print(location, (lat, lon), country)
    return pd.Series({'guest_lat': lat, 'guest_lon': lon, 'guest_country': country})
# Geolocate each guest's origin into a three-column frame
# (guest_lat, guest_lon, guest_country).
guest_lat_lon_df = df['Guest Location'].apply(geolocate_guest_origin)
with open(guest_origin_cache_file, 'w') as fd:
    json.dump(guest_origin_cache, fd) # write cache to file
df = pd.concat([df, guest_lat_lon_df], axis='columns')
Problem locating: Lombardy, Italy, Lombardy, Italy Problem locating: Sundbyberg, Sweden, Toronto, Canada Problem locating: Stoyanka, Ukraine Problem locating: Nes Municipality, Faroe Islands Problem locating: Greatford, United Kingdom, Greatford, United Kingdom, Greatford, United Kingdom, Greatford, United Kingdom, Greatford, United Kingdom Problem locating: Gothenburg, Sweden, Oslo, Norway Problem locating: Barcelona, Spain, Barcelona, Spain Problem locating: Berlin, Germany, Berlin, Germany Problem locating: Hong Kong SAR, China
df  # preview the anonymized, geolocated data frame
Arrive | Depart | Received | Status | Adults | Cost | Nights | Last_modified | NightlyRate | Guest Location | postal_code | Guest Anon | Rental Anon | property_lat | property_lon | guest_lat | guest_lon | guest_country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5/16/2019 | 5/23/2019 | 5/14/2019 13:20 | canceled | 3.0 | 7682.50 | 7 | 8/28/2019 15:12 | 1019.0 | NaN | 2300 | e8d072e5 | 3fd1e124 | 55.651306275214914 | 12.603239181235617 | None | None | None |
1 | 4/1/2020 | 4/4/2020 | 2/17/2020 00:26 | canceled | 3.0 | 1037.81 | 3 | 3/12/2020 17:17 | 346.0 | NaN | 2200 | 99076ea9 | c74a5829 | 55.69475043151038 | 12.550190147092675 | None | None | None |
2 | 7/4/2020 | 7/5/2020 | 3/4/2019 13:37 | canceled | 4.0 | 977.50 | 1 | 8/15/2019 15:28 | 978.0 | NaN | 1799 | 794d4098 | d9dacb3e | 55.66627486937496 | 12.534955500625006 | None | None | None |
3 | 4/9/2020 | 4/14/2020 | 2/17/2020 00:05 | canceled | 3.0 | 419.10 | 5 | 3/15/2020 18:01 | 84.0 | NaN | 2200 | 245bb108 | c74a5829 | 55.69475043151038 | 12.550190147092675 | None | None | None |
4 | 7/4/2020 | 7/7/2020 | 2/16/2020 22:59 | canceled | 2.0 | 5537.23 | 3 | 7/8/2020 19:28 | 1679.0 | Houston, TX | 1051 | fcac9b2c | 0fd1eb2e | 55.679985160377335 | 12.590608396226418 | 29.7589382 | -95.3676974 | us |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4064 | 6/24/2019 | 6/26/2019 | 6/21/2019 10:51 | confirmed | 2.0 | 2827.10 | 2 | 6/21/2019 10:52 | 1239.0 | NaN | 1454 | 0948688e | 6361b74c | 55.67840178857141 | 12.569364442857141 | None | None | None |
4065 | 6/24/2019 | 6/27/2019 | 6/21/2019 13:57 | confirmed | 1.0 | 4028.15 | 3 | 6/21/2019 13:58 | 1243.0 | NaN | 1454 | d9db51e4 | 154dcb23 | 55.67840178857141 | 12.569364442857141 | None | None | None |
4066 | 7/5/2019 | 7/7/2019 | 6/21/2019 14:53 | confirmed | 6.0 | 5486.24 | 2 | 6/21/2019 14:58 | 2393.0 | Plymouth, United Kingdom | 2200 | d068cc67 | 7f9711ed | 55.69475043151038 | 12.550190147092675 | 50.3712659 | -4.1425658 | gb |
4067 | 9/2/2019 | 9/4/2019 | 9/1/2019 15:48 | confirmed | 2.0 | 2226.22 | 2 | 9/1/2019 15:52 | 863.0 | St Petersburg, FL | 1063 | 0ef3d2da | eeeb0cf5 | 55.67796323809524 | 12.583763952380954 | 27.7703796 | -82.6695085 | us |
4068 | 8/8/2020 | 8/11/2020 | 8/7/2020 23:45 | confirmed | 2.0 | 6326.97 | 3 | 8/7/2020 23:47 | 1792.0 | NaN | nan | ecd81df9 | e28934a2 | None | None | None | None | None |
4069 rows × 18 columns
df.to_csv('data/processed.csv', index=False)  # persist the anonymized data set