import os
import json
import pandas as pd
import sqlite3
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
from bokeh.resources import INLINE
output_notebook(resources=INLINE)
firefox_profile_dir = '/home/bird/.mozilla/firefox/old_profiles/iadzfbcv.default/' # Update this for your own system
[x for x in os.listdir(firefox_profile_dir) if x.endswith('.sqlite')]
['content-prefs.sqlite', 'places.sqlite', 'kinto.sqlite', 'permissions.sqlite', 'formhistory.sqlite', 'storage-sync.sqlite', 'favicons.sqlite', 'cookies.sqlite', 'storage.sqlite', 'webappsstore.sqlite']
cookies_file = '{}/cookies.sqlite'.format(firefox_profile_dir)
cookies_db = sqlite3.connect(cookies_file)
def list_tables_in_db(db):
print(db.cursor().execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall())
list_tables_in_db(cookies_db)
[('moz_cookies',)]
df = pd.read_sql('SELECT * FROM moz_cookies', cookies_db)
df = df[['baseDomain', 'creationTime']]
df.head()
baseDomain | creationTime | |
---|---|---|
0 | creativecommons.org | 1519539169476615 |
1 | reddit.com | 1519539181733316 |
2 | reddit.com | 1519539181733577 |
3 | scorecardresearch.com | 1519539184029502 |
4 | scorecardresearch.com | 1519539184029626 |
df.columns
Index(['baseDomain', 'creationTime'], dtype='object')
len(df)
3954
df['creationTime'] = pd.to_datetime(df.creationTime * 1000)
df = df.sort_values('creationTime')
df['counter'] = 1
df['count_total'] = df.counter.cumsum()
df = df.append({'creationTime': df.creationTime.max(), 'count_total': 0}, ignore_index=True)
df.tail()
baseDomain | creationTime | counter | count_total | |
---|---|---|---|---|
3950 | google.com | 2018-06-20 07:25:31.563988 | 1.0 | 3951 |
3951 | zeit.de | 2018-06-20 07:25:32.256212 | 1.0 | 3952 |
3952 | google.com | 2018-06-20 07:25:46.644553 | 1.0 | 3953 |
3953 | google.com | 2018-06-20 07:25:46.647741 | 1.0 | 3954 |
3954 | NaN | 2018-06-20 07:25:46.647741 | NaN | 0 |
p = figure(
x_axis_type='datetime', title='My Firefox Cookies',
height=300, sizing_mode='scale_width',
tools='', toolbar_location=None
)
source = ColumnDataSource({
'x': df.creationTime,
'y': df.count_total
})
p.x_range.range_padding = 0
p.y_range.range_padding = 0
p.patch('x', 'y', alpha=0.6, source=source)
p.circle('x', 'y', source=source)
p.xaxis.axis_label = 'Creation Time'
p.yaxis.axis_label = 'n cookies'
show(p)
df.baseDomain.value_counts().head(10)
insightexpressai.com 68 rubiconproject.com 65 pubmatic.com 51 amazon.com 50 google.com 45 stickyadstv.com 40 demdex.net 37 theadvocate.com 30 cnn.com 28 mozilla.org 28 Name: baseDomain, dtype: int64
with open('disconnect_me.json', 'r') as f:
disconnect_json = json.loads(f.read())
print(disconnect_json['categories'].keys())
dict_keys(['Advertising', 'Content', 'Analytics', 'Social', 'Disconnect'])
lookup = {}
for category in disconnect_json['categories'].keys():
for item in disconnect_json['categories'][category]:
for name in item:
for parent_domain in item[name]:
for child_domain in item[name][parent_domain]:
lookup[child_domain] = category
disconnect = pd.DataFrame.from_dict(lookup, orient='index').rename(
columns={0: 'category'}
)
disconnect.head()
category | |
---|---|
2leep.com | Advertising |
33across.com | Advertising |
365dm.com | Advertising |
365media.com | Advertising |
4info.com | Advertising |
cookie_domains = pd.DataFrame(df.baseDomain.value_counts()).rename(
columns={'baseDomain': 'count'}
).merge(disconnect, left_index=True, right_index=True, how='left').fillna('Uncategorized')
cookie_domains.head(10)
count | category | |
---|---|---|
insightexpressai.com | 68 | Advertising |
rubiconproject.com | 65 | Advertising |
pubmatic.com | 51 | Advertising |
amazon.com | 50 | Content |
google.com | 45 | Uncategorized |
stickyadstv.com | 40 | Uncategorized |
demdex.net | 37 | Advertising |
theadvocate.com | 30 | Uncategorized |
cnn.com | 28 | Uncategorized |
mozilla.org | 28 | Uncategorized |
cookie_domains.groupby('category').sum()
count | |
---|---|
category | |
Advertising | 741 |
Analytics | 103 |
Content | 220 |
Disconnect | 33 |
Social | 52 |
Uncategorized | 2805 |