#!/usr/bin/env python
# coding: utf-8

# # Whotracks.me May Update
#
# *This post is one of our regular monthly blogs accompanying an update to the data
# displayed on WhoTracks.Me. In these posts we introduce what data has been added as well
# as point out interesting trends and case-studies we found in the last month. Previous
# months' posts can be found here: [April 2018](./update_apr_2018.html),
# [February 2018](./update_feb_2018.html), [January 2018](./update_jan_2018.html),
# [December 2017](./update_dec_2017.html).*
#
#
# This month we update the site with data from 340 million page loads during April 2018. We expand
# the number of trackers shown to 951, and the number of websites to 1330. As this will be the last
# full month before the [GDPR](https://en.wikipedia.org/wiki/General_Data_Protection_Regulation)
# comes into force for European users, this will provide a benchmark to assess whether there is an
# observable difference on the tracking ecosystem.
#
# This month also saw our new paper **"WhoTracks.Me: Monitoring the online tracking landscape at scale"**
# published on [Arxiv](https://arxiv.org/abs/1804.08959). This paper covers the methodology behind
# the data we collect here, and how we ensure no private information can be leaked during this
# process.
#

# In[1]:

from plotly.offline import (
    init_notebook_mode,
    iplot,
    offline,
)
import pandas as pd
import cufflinks as cf

# Set up plotly's notebook mode before any figure is rendered.
init_notebook_mode()

# Cufflinks defaults used by every DataFrame.iplot() call in this notebook.
cf.set_config_file(
    offline=False,
    world_readable=True,
    theme='pearl',
)

# In[2]:

from whotracksme.data.loader import DataSource

# Single entry point to the WhoTracks.Me tables queried by the cells below.
data = DataSource()

# ## Notable Changes
# As customary, here below are the sites with the most notable changes this month. The
# largest increase in the average number of trackers per page load was measured in
# [markt.de](https://whotracks.me/websites/markt.de.html), and the largest decrease in
# [babbel.com](https://whotracks.me/websites/babbel.com.html).
# In[3]:

# Per-site tracker counts for the two months being compared, keyed by site.
apr_trackers = data.sites.get_snapshot('2018-04').set_index('site')['trackers']
mar_trackers = data.sites.get_snapshot('2018-03').set_index('site')['trackers']

# 'trackers' holds the March value; 'change' is April minus March.
site_diffs = pd.DataFrame({
    'trackers': mar_trackers,
    'change': (apr_trackers - mar_trackers),
})

# Keep only sites whose count moved by more than 3 in either direction,
# ordered from the largest decrease to the largest increase.
large_increase = site_diffs.change > 3
large_decrease = site_diffs.change < -3
site_diffs[large_increase | large_decrease].sort_values('change')

# ## Facebook's Tough Month
#
# [Facebook](../trackers/facebook.html) have been in the news a lot in the last month, and with
# `#deletefacebook` trending, will there have been an effect on their operations and bottom
# line? We [already reported](https://www.ghostery.com/blog/ghostery-news/report-have-publishers-banned-facebook-trackers-from-their-pages-after-the-cambridge-analytica-scandal/)
# that despite strong criticism in the press, the same news sites did not stop using Facebook's
# tracking tools. The data we release this month shows that this continues to be the case, with no
# drop in tracking reach for the [Facebook tracker](../trackers/facebook.html).
#

# In[4]:

trackers_df = data.trackers.df
is_facebook = trackers_df.tracker == "facebook"
# Optional restriction to 2018 only (disabled, as in the published notebook):
# is_facebook = is_facebook & (trackers_df.month >= "2018-01")

# Only the columns needed for the reach plot.
facebook_df = trackers_df.loc[is_facebook, ['month', 'reach', 'site_reach']]

# In[12]:

# Two stacked subplots sharing the month axis, with a vertical marker at 2018-03.
reach_plot_options = dict(
    subplots=True,
    shape=(2, 1),
    x='month',
    shared_xaxes=True,
    fill=True,
    title="Reach and Site Reach",
    vline=["2018-03"],
    asFigure=True,
)
fig = facebook_df.iplot(**reach_plot_options)

# fig.iplot()
# To save the image as svg
offline.iplot(fig, image='svg')

# Note that `reach` refers to the percentage of total page loads where the Facebook
# tracker was seen to be present, whereas `site reach` refers to the percentage of
# domains.
#
# ## Google and the Countdown to GDPR
# With GDPR coming into effect on 25th May, we will soon see if it has an impact on the number of
# third-party trackers loaded on web pages.
# [Recent reports indicate](https://adexchanger.com/online-advertising/googles-gdpr-consent-tool-will-limit-publishers-to-12-ad-tech-vendors/)
# that Google will encourage publishers to reduce the number of AdTech vendors they use, in order to
# increase the chance of getting consent for tracking from users. If this is the case, we should
# expect this change to be visible in the WhoTracks.Me data.

# In[ ]:

# Sites in the global April 2018 data where doubleclick has site_proportion > 0.5.
site_tracker_rows = data.sites_trackers.df
doubleclick_heavy = (
    (site_tracker_rows.tracker == "doubleclick")
    & (site_tracker_rows.month == "2018-04")
    & (site_tracker_rows.country == "global")
    & (site_tracker_rows.site_proportion > 0.5)
)
dc_sites = site_tracker_rows.loc[doubleclick_heavy, 'site']

# Site-level rows for those sites from February 2018 onwards.
site_rows = data.sites.df
dc_sites_df = site_rows[
    site_rows.site.isin(dc_sites)
    & (site_rows.month >= "2018-02")
]

# One column of tracker counts per month, overlaid in the histogram below.
# NOTE(review): each Series keeps its original row labels, so the two columns are
# presumably not aligned per site - fine for histograms, confirm before comparing rows.
april_rows = dc_sites_df[dc_sites_df.month == '2018-04']
march_rows = dc_sites_df[dc_sites_df.month == '2018-03']
dc_trackers_df = pd.DataFrame({
    "apr_trackers": april_rows.trackers,
    "mar_trackers": march_rows.trackers,
})

# In[ ]:

# Shaded rectangle drawn between x=12 and x=38 on the histogram.
highlight_band = {
    "kind": "rect",
    "x0": 12,
    "x1": 38,
    "width": 2,
    "fillcolor": "red",
    "opacity": 0.1
}
histogram_options = dict(
    kind="histogram",
    histnorm='percent',
    title="Distribution of the average number of trackers per site",
    opacity=.6,
    bins=20,
    yTitle="Percentage of Sites",
    vline=highlight_band,
    barmode="overlay",
    bargap=0.2,
    line_color="#00000000",
    width=0,
    asFigure=True,
)
fig = dc_trackers_df.iplot(**histogram_options)
fig.iplot()

# To save the image as svg
offline.plot(fig, image='svg')

# As we reported [last month](./update_apr_2018.html), we observe a gradual decline in the average
# number of trackers seen on websites. However, looking at sites which use Google's [Doubleclick](../trackers/doubleclick.html)
# Ad Network, a large proportion are still well above this proposed 12 tracker limit. With only a few
# weeks to go, there will still be a significant number of sites over the limit.
# If we were to consider the most extreme scenario, where Google compels all customers to use their GDPR
# consent system for European users, and enforces a 12 vendor limit in the process, this could
# have a significant impact on the ecosystem. If we extrapolate from WhoTracks.Me data, capping all
# these sites to 12 trackers means that over **1,300 trackers** would disappear from sites. AdTech
# companies deeper in the supply chain may be completely cut out unless they have direct publisher
# relationships which enable them to make the vendor shortlist.
#
# Such a sharp change in the ecosystem is unlikely, but it demonstrates the power of Google's market
# dominance, that they would be able to unilaterally pull the plug on a lot of their competition. We
# will continue to monitor the ecosystem to quantify any changes to tracking, and look forward to
# reporting the changes, if any, caused by the new regulation.

# If you want to delve deeper, the data is open and available on the [Whotracks.me Github Repository](https://github.com/ghostery/whotracks.me/tree/master/whotracksme/data), and as a [pip package](https://pypi.python.org/pypi/whotracksme/).