import os
import pandas as pd
import sqlite3
# Firefox profile directory whose SQLite stores are inspected below.
firefox_profile_dir = '/home/bird/.mozilla/firefox/old_profiles/iadzfbcv.default/'
# Show every entry in the profile directory whose name ends in 'sqlite'.
[entry for entry in os.listdir(firefox_profile_dir) if entry.endswith('sqlite')]
['content-prefs.sqlite', 'places.sqlite', 'kinto.sqlite', 'permissions.sqlite', 'formhistory.sqlite', 'storage-sync.sqlite', 'favicons.sqlite', 'cookies.sqlite', 'storage.sqlite', 'webappsstore.sqlite']
# os.path.join avoids the doubled '/' that plain string formatting produced,
# since firefox_profile_dir already ends with a path separator.
storage_file = os.path.join(firefox_profile_dir, 'webappsstore.sqlite')
# Connection to the webappsstore database; reused by the queries below.
storage_db = sqlite3.connect(storage_file)
# %load '/home/bird/Documents/tracking technologies/notebooks/get_sqlite_tables.py'
def list_tables_in_db(db):
    """Print the names of all tables in an SQLite database.

    Args:
        db: An open ``sqlite3.Connection``.

    The printed value is the raw ``fetchall()`` result, i.e. a list of
    one-element tuples such as ``[('webappsstore2',)]``. Nothing is returned.
    """
    cursor = db.cursor()
    try:
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        print(cursor.fetchall())
    finally:
        # Close the cursor explicitly rather than leaving it to the
        # garbage collector.
        cursor.close()
# Print the tables available in the webappsstore database.
list_tables_in_db(storage_db)
[('webappsstore2',)]
# Pull the whole webappsstore2 table into a DataFrame.
storage_df = pd.read_sql('SELECT * FROM webappsstore2', storage_db)
#storage_df.head()
# Reverse each originKey character-by-character and keep the segment that
# follows the first ':.' separator as a readable origin (e.g. 'people.com').
storage_df['origin'] = storage_df['originKey'].apply(lambda key: key[::-1].split(':.')[1])
#storage_df.head()
# os.path.join avoids the doubled '/' that plain string formatting produced,
# since firefox_profile_dir already ends with a path separator.
cookies_file = os.path.join(firefox_profile_dir, 'cookies.sqlite')
cookies_db = sqlite3.connect(cookies_file)
# Load every row of the moz_cookies table for the analysis below.
cookied_df = pd.read_sql('SELECT * FROM moz_cookies', cookies_db)
# Collect cookie values that appear (as a literal substring) in cookies from
# more than five distinct base domains; a value shared that widely is a
# candidate cross-site identifier.
shared_values = []
# dropna() keeps NULL cookie values out of the substring search, and
# na=False keeps the resulting mask strictly boolean.
for v in cookied_df.value.dropna().unique():
    matches = cookied_df[cookied_df.value.str.contains(v, regex=False, na=False)]
    if len(matches.baseDomain.unique()) > 5:
        shared_values.append(v)
# Keep values longer than 10 characters that do not contain 'com'
# (presumably to drop domain-like strings; verify the intent).
potential_ids = [x for x in shared_values if len(x) > 10 and 'com' not in x]
# From cookie table
print('We have', len(potential_ids), 'potential ids. Things like:')
# potential_ids[0:5]
We have 25 potential ids. Things like:
# Report every candidate id from the cookie table that also occurs inside a
# local-storage value.
for potential_id in potential_ids:
    # regex=False: the id is a literal string, so characters such as '.',
    # '+' or '(' must not be interpreted as a pattern (matches the cookie
    # search, which is also literal). na=False treats NULL values as
    # non-matching instead of producing a non-boolean mask.
    if storage_df.value.str.contains(potential_id, regex=False, na=False).any():
        print(potential_id, 'found in local storage')
67936421072632709762729202117726060613 found in local storage
7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc found in local storage
repeated_id = '67936421072632709762729202117726060613'
# Literal substring match (regex=False), consistent with the cookie search;
# na=False treats NULL values as non-matching. Show the matching
# local-storage rows and the origin that stored them.
storage_df[storage_df.value.str.contains(repeated_id, regex=False, na=False)][['value', 'origin']]
| | value | origin |
|---|---|---|
| 291 | {"email":null,"timeIncId":"53fdd635-5007-4f53-... | people.com |
repeated_id = '7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc'
# Literal substring match (regex=False), consistent with the cookie search;
# na=False treats NULL values as non-matching. Show the matching
# local-storage rows and the origin that stored them.
storage_df[storage_df.value.str.contains(repeated_id, regex=False, na=False)][['value', 'origin']]
| | value | origin |
|---|---|---|
| 27 | 7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc | www.latimes.com |
| 227 | 7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc | www.merriam-webster.com |
| 229 | v2_bf1ca6db22b70291779ce41eb2e5aee5_7620423c-7... | www.merriam-webster.com |
| 232 | v2_bf1ca6db22b70291779ce41eb2e5aee5_7620423c-7... | tpc.googlesyndication.com |
| 236 | 7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc | tpc.googlesyndication.com |
| 357 | v2_9a6c86f52b3f9239067936a17472df9f_7620423c-7... | www.huffingtonpost.com |
| 359 | 7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc | www.huffingtonpost.com |
| 407 | v2_66138b9f6bd0ec8912d19cb714efd912_7620423c-7... | www.latimes.com |
| 984 | 7620423c-7103-4edc-9aee-099c75141b87-tuct18c03dc | www.bloomberg.com |
| 989 | v2_96ea2f0533c59e2312c1f1112ced8f46_7620423c-7... | www.bloomberg.com |
We can see this id being shared across multiple origins in local storage as well.