#!/usr/bin/env python
# coding: utf-8

# # IP anonymization and its impact on visitor localization in Google Analytics
#
#
# ### Tools used
# * [pandas](http://pandas.pydata.org/) and its [Google Analytics connector](http://pandas.pydata.org/pandas-docs/version/0.15.2/remote_data.html#remote-data-ga) to fetch and wrangle the data,
# * [bokeh](http://bokeh.pydata.org) to visualize it.
#
# ### Data sources
# * The traffic data comes from Google Analytics and concerns [blog.liip.ch](http://blog.liip.ch). Two properties track pageviews on that 'tech' blog, one of them with IP anonymization enabled, since September 2015.

# ## Exploration 1: What's the impact of IP anonymizing on the user country dimension?

# In[221]:


import pandas as pd
# NOTE(review): the Google Analytics connector ships only with old pandas
# releases (the documentation linked above is for 0.15.2) -- confirm the
# pandas version before re-running this notebook.
import pandas.io.ga as ga
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')


# In[220]:


import sys
print("PYTHON ", sys.version)
print("PANDAS ", pd.__version__)


# ### Fetching data
#
# Now that our tools are loaded, let us fetch the data from the two Google Analytics properties.
#
# We fetch the traffic data of both properties (with and without anonymization) since September 1st, 2015.

# In[4]:


# One entry per Google Analytics property: 'full' keeps complete IPs,
# 'anon' has IP anonymization enabled.
sources = {
    'full': {
        'property_id': "UA-424540-4",
        'profile_id': "5334921",
    },
    'anon': {
        'property_id': "UA-424540-11",
        'profile_id': "107030134",
    },
}


# In[5]:


# Fetch the session counts per (country, city) for each property and print
# the total number of sessions as a quick sanity check.
source_data = {}
for key, source in sources.items():
    source_data[key] = ga.read_ga(
        metrics="sessions",
        dimensions=['country', 'city'],
        start_date="2015-09-01",
        index_col=['country', 'city'],
        **source  # property_id and profile_id of this property
    )
    print(source_data[key]['sessions'].sum())


# Less than 1% difference in volumes. Since the tracking is not necessarily simultaneous, that was expected. Let's just have a look at one of them.
# In[6]:


source_data['full'].head()


# Let us now join those two dataframes based on their country/city index:

# In[222]:


# Concatenate in an explicit order. Passing the `source_data` dict directly
# leaves the column order up to pandas (older releases sort the keys, i.e.
# 'anon' before 'full'), which would silently swap the two labels assigned
# just below.
data = pd.concat(
    [source_data['full'], source_data['anon']],
    axis=1,
    join='outer',
)
# rename homonymous columns (both frames carry a single 'sessions' column)
data.columns = ['full_ip_sessions', 'anon_ip_sessions']
data.head()


# Let's list the countries where the biggest proportional losses & gains happen.

# In[225]:


# group by level 0 of the index (i.e. countries) and sum columns for groups
# NOTE(review): because of the outer join, a city seen by only one property
# has NaN in the other column; the sums below ignore those NaN values.
country_data = data.groupby(level=0).sum()
# compute delta and its proportion
country_data['delta'] = country_data['anon_ip_sessions'] - country_data['full_ip_sessions']
country_data['dprop'] = country_data['delta'] / country_data['full_ip_sessions']
# sort by prop. delta, ascending
country_data.sort_values(by='dprop', inplace=True)


# Countries with proportionally large losses:

# In[229]:


country_data.query('full_ip_sessions > 200').head(10)


# Countries with proportionally large gains:

# In[231]:


country_data.query('full_ip_sessions > 200').tail(10)


# What's the proportion of the fluctuation?

# In[232]:


# sum of absolute deltas, relative to the total of non-anonymized sessions
country_data['delta'].abs().sum() / country_data['full_ip_sessions'].sum()


# The deltas are below 10%, either positive or negative. And there's globally less than 2% of country attribution mismatch.
#
# One can then say that __Country attribution is largely insensitive to IP anonymization__.

# ## Exploration 2: what does it mean locally in Switzerland?
#
# Let us dive one level deeper: at city level. We will focus on Switzerland since we have enough traffic from it.

# In[233]:


country_data.query('country == "Switzerland"')


# Less than 1 percent loss at country level for Switzerland, rather stable. But what's happening at city level?
# In[234]:


# create a clean subset: an independent copy of the Swiss rows, so that the
# columns added below do not touch `data`
swiss_data = data.query('country == "Switzerland"').copy()
swiss_data.sum()


# In[235]:


# per-city difference between anonymized and full-IP sessions, in absolute
# terms and relative to the full-IP count, sorted by relative difference
swiss_data['delta'] = swiss_data['anon_ip_sessions'].sub(swiss_data['full_ip_sessions'])
swiss_data['dprop'] = swiss_data['delta'].div(swiss_data['full_ip_sessions'])
swiss_data.sort_values(by='dprop', inplace=True)


# In[237]:


# only keep cities with more than 50 full-IP sessions
swiss_data[swiss_data['full_ip_sessions'] > 50]


# In[238]:


# absolute sum of delta, relative to the total of full-IP sessions
swiss_data['delta'].abs().sum() / swiss_data['full_ip_sessions'].sum()


# __Quite some turmoil at city level!__ For example, Fribourg gains 185% of attributions while Basel loses 30%, and something wild is going on in Porrentruy and Ebikon, ...
#
# Overall, we see __more than 25% mismatch in city attribution__ for Switzerland.