Cookie syncing is the process by which two different trackers link the IDs each of them has assigned to the same user.
In this notebook we look for different domains whose cookies share the same ID.
import pandas as pd
import sqlite3
import networkx as nx
from bokeh.io import show, output_notebook
from bokeh.palettes import viridis
from bokeh.plotting import figure
from bokeh.models import (
graphs,
Circle,
EdgesAndLinkedNodes,
GlyphRenderer,
HoverTool,
MultiLine,
NodesAndLinkedEdges,
)
from bokeh.resources import INLINE
from bokeh.transform import factor_cmap
# Send Bokeh output to the notebook, using the INLINE resources.
output_notebook(resources=INLINE)

# Firefox profile whose cookie jar is analysed.
firefox_profile_dir = '/home/bird/.mozilla/firefox/old_profiles/iadzfbcv.default/'
cookies_file = '{}/cookies.sqlite'.format(firefox_profile_dir)

# Load the whole cookie table into memory, then release the SQLite handle:
# nothing below queries the database again, and leaving the connection open
# keeps the profile's database file held for the life of the kernel.
cookies_db = sqlite3.connect(cookies_file)
try:
    df = pd.read_sql('SELECT * FROM moz_cookies', cookies_db)
finally:
    cookies_db.close()

# Only the owning base domain and the cookie value are used by the analysis.
df = df[['baseDomain', 'value']]
df.head()
df.value.value_counts().head()
# A cookie value is "shared" when it occurs, as a literal substring, inside
# cookies belonging to more than five distinct base domains.
shared_values = [
    candidate
    for candidate in df.value.unique()
    if df.loc[
        df.value.str.contains(candidate, regex=False), 'baseDomain'
    ].nunique(dropna=False) > 5
]
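Matching with `str.contains`: a cookie value counts as shared when it appears as a substring of cookies set by more than five different base domains.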
# Collect every cookie value that occurs, as a literal substring, in cookies
# from more than five distinct base domains. The scan is repeated here so
# that this cell can be run on its own.
shared_values = []
for v in df.value.unique():
    matches = df[df.value.str.contains(v, regex=False)]
    if len(matches.baseDomain.unique()) > 5:
        shared_values.append(v)

# Keep only values that look like identifiers: longer than ten characters
# and not containing 'com' (which drops domain-like strings).
# Logical `and` is used rather than bitwise `&`: both operands are plain
# booleans, and `and` short-circuits.
potential_ids = [
    x for x in shared_values
    if len(x) > 10 and 'com' not in x
]
print('We have', len(potential_ids), 'potential ids. Things like:')
#potential_ids[0:5]
We have 25 potential ids. Things like:
# Preview the cookies that carry the last candidate ID.
# regex=False: the ID is an arbitrary cookie value, so it must be matched as
# literal text. With the default regex=True, characters such as '.', '+',
# '|' or '(' would be interpreted as a pattern and either match the wrong
# rows or raise.
df[df.value.str.contains(potential_ids[-1], regex=False)].head()
# Build an undirected graph in which two base domains are connected when
# both hold a cookie containing the same candidate ID. Each edge records the
# ID in its `p_id` attribute; because G is a simple Graph, a pair of domains
# sharing several IDs keeps only the last one assigned.
G = nx.Graph()
for p_id in potential_ids:
    # regex=False: match the ID as literal text, consistent with how the
    # shared values were detected. The default regex=True misreads IDs
    # containing regex metacharacters.
    domains = df[df.value.str.contains(p_id, regex=False)].baseDomain.unique()

    # Register every domain explicitly so each one appears as a node even
    # though no self-loop is created for it below.
    G.add_nodes_from(domains)

    # Visit each unordered pair exactly once: this skips the meaningless
    # domain -> same-domain self-loops and avoids inserting every edge twice.
    for position, domain_from in enumerate(domains):
        for domain_to in domains[position + 1:]:
            G.add_edge(domain_from, domain_to, p_id=p_id)
# One colour per candidate ID; edges are coloured by the ID they carry.
palette = viridis(len(potential_ids))

# --- Plot 1: shell layout, all domains placed on a circle -----------------
p = figure(
    x_range=(-1.1, 1.1), y_range=(-1.1, 1.1),
    outline_line_color=None, tools='save, tap', toolbar_location='left',
    background_fill_alpha=0, border_fill_alpha=0
)
# show_arrow is a boolean property: pass False explicitly rather than None,
# which Bokeh releases with non-nullable property validation reject.
p.add_tools(HoverTool(tooltips='@index', show_arrow=False))
p.grid.visible = False
p.axis.visible = False

graph_renderer = graphs.from_networkx(G, nx.shell_layout)

# Expose each edge's ID as a data-source column so factor_cmap can colour by it.
graph_renderer.edge_renderer.data_source.data['p_id'] = [
    attrs['p_id'] for _, _, attrs in G.edges(data=True)
]

# Nodes: faint grey by default, opaque when selected or hovered.
graph_renderer.node_renderer.glyph = Circle(size=10, fill_alpha=0.2, fill_color='gray', line_color=None)
graph_renderer.node_renderer.selection_glyph = Circle(fill_color='gray', fill_alpha=1)
graph_renderer.node_renderer.hover_glyph = Circle(fill_color='gray', fill_alpha=1)

# Edges: the same ID -> colour mapping in every state; only opacity and
# width change between the default, selected and hovered glyphs.
edge_color = factor_cmap('p_id', factors=potential_ids, palette=palette)
graph_renderer.edge_renderer.glyph = MultiLine(line_color=edge_color, line_alpha=0.3)
graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=edge_color, line_width=3)
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=edge_color)

# Hovering or tapping a node also highlights the edges attached to it.
graph_renderer.inspection_policy = NodesAndLinkedEdges()
graph_renderer.selection_policy = NodesAndLinkedEdges()

p.renderers.append(graph_renderer)
show(p)
# --- Plot 2: spring (force-directed) layout with zoom tools ---------------
p = figure(
    x_range=(-1.1, 1.1), y_range=(-1.1, 1.1), toolbar_location='left',
    outline_line_color=None, tools='tap, save, box_zoom, wheel_zoom, reset',
    sizing_mode='scale_width'
)
# show_arrow is a boolean property: pass False explicitly rather than None,
# which Bokeh releases with non-nullable property validation reject.
p.add_tools(HoverTool(tooltips='@index', show_arrow=False))
p.grid.visible = False
p.axis.visible = False

# Extra keyword arguments (center, scale, k) are forwarded to nx.spring_layout.
graph_renderer = graphs.from_networkx(G, nx.spring_layout, center=(0, 0), scale=1, k=0.4)

# Expose each edge's ID as a data-source column so factor_cmap can colour by it.
graph_renderer.edge_renderer.data_source.data['p_id'] = [
    attrs['p_id'] for _, _, attrs in G.edges(data=True)
]

# Nodes use a data-space radius here (not a screen-space size), so they
# scale together with the plot when zooming.
graph_renderer.node_renderer.glyph = Circle(radius=0.01, fill_color='gray', line_color=None, fill_alpha=0.5)
graph_renderer.node_renderer.selection_glyph = Circle(fill_color='gray', fill_alpha=1)
graph_renderer.node_renderer.hover_glyph = Circle(fill_color='gray', fill_alpha=1)

# Edges: the same ID -> colour mapping in every state; selected and hovered
# edges are drawn thicker.
edge_color = factor_cmap('p_id', factors=potential_ids, palette=palette)
graph_renderer.edge_renderer.glyph = MultiLine(line_color=edge_color, line_alpha=0.8)
graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=edge_color, line_width=3)
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=edge_color, line_width=3)

# Hovering or tapping a node also highlights the edges attached to it.
graph_renderer.inspection_policy = NodesAndLinkedEdges()
graph_renderer.selection_policy = NodesAndLinkedEdges()

p.renderers.append(graph_renderer)
show(p)