#!/usr/bin/env python
# coding: utf-8

# In this notebook, we describe how to retrieve our browsing history from Safari (Part 1) and produce some simple plots (Part 2) in order to see which websites we visit most often. (Note: this was run on Mac OS X 10.10.2.)

# ## Part 1: Retrieving our Safari Browsing History

# To access our browsing history, we go to ~/Library/Safari and look for the database *History.db*. We make a copy of it in a folder in our workspace, e.g. to *~/Workspace/web_browsing/hs.db*.

# In[1]:

get_ipython().run_cell_magic('bash', '', 'cp ~/Library/Safari/History.db ~/Workspace/web_browsing/hs.db\n')

# Now let us fire up sqlite3 and see what tables are inside the database.

# In[2]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', '.tables\n')

# Among the four (4) tables above, we'll be using 'history_items' and 'history_visits'. Let's take a look at their schemata.

# In[3]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', '.schema history_items\n')

# In[4]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', '.schema history_visits\n')

# The first thing to note is that each *history visit* is associated with one *history item*, as we can tell from the 'history_item' column of the 'history_visits' table.
#
# `history_item INTEGER NOT NULL REFERENCES history_items(id) ON DELETE CASCADE`
#
# Moreover, as we can tell from the columns of the two tables, 'history_visits' contains information about individual visits to websites, like the *time* of the visit and the *title* of the page, while 'history_items' contains general and aggregate information about the websites we visit, such as their url and total number of visits.
#
# Let's take a look now at the addresses that we visit most often (and despair later)...

# In[5]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', 'SELECT id, url, visit_count FROM history_items ORDER BY visit_count DESC LIMIT 5;\n')

# ...as well as our most recent visits.

# In[6]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', 'SELECT V.title, I.url, V.visit_time\nFROM history_items as I,\n(SELECT history_item, title, visit_time FROM history_visits ORDER BY visit_time DESC LIMIT 5) as V\nWHERE I.id = V.history_item;\n')

# Note that, from what I can tell at least, time is expressed as the number of seconds since the beginning of 2001-01-01.

# Finally, let's dump to a text file ('visits.txt') only the information that interests us: for now, that's just the web address visited (url) and the associated timestamp.

# In[7]:

get_ipython().run_cell_magic('bash', '--out visits', 'sqlite3 hs.db \\\n"SELECT I.url, V.visit_time \\\nFROM history_items as I, \\\n(SELECT history_item, title, visit_time FROM history_visits) as V \\\nWHERE I.id = V.history_item;" \n')

# In[8]:

f = open('visits.txt', 'w')
f.write(visits)
f.close()

# That's it: our browsing history is now in the file 'visits.txt'.

# In[9]:

get_ipython().run_cell_magic('bash', '', 'head -5 visits.txt\necho "... ... ..."\ntail -5 visits.txt\n')

# ## Part 2: Plots

# In[10]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[11]:

import sys
import matplotlib.pyplot as plt
import time
import datetime
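# Before parsing the whole file, here is a quick sanity check of the timestamp convention noted in Part 1. This is a minimal sketch under the assumption that visit_time counts seconds since 2001-01-01; the sample value below is hypothetical, not taken from the database.

# In[ ]:

# Minimal sketch: convert one Safari timestamp to a readable date.
# Assumption: visit_time is seconds since 2001-01-01; the sample value is made up.
import datetime

SAFARI_EPOCH = datetime.datetime(2001, 1, 1)

def safari_time_to_datetime(seconds):
    """Convert a seconds-since-2001-01-01 value to a datetime object."""
    return SAFARI_EPOCH + datetime.timedelta(seconds=seconds)

sample_visit_time = 450000000.0  # hypothetical visit_time value
print(safari_time_to_datetime(sample_visit_time))  # 2015-04-06 08:00:00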
# Now let us parse the text file we created and extract the timestamps of our visits.

# In[12]:

f = open('visits.txt')
tt = []  # stores the timestamps of our visits
for line in f:
    try:
        tokens = line.rstrip().split('|')
        time_sec = float(tokens[-1])
        tt.append(time_sec)
    except:
        sys.stderr.write(line)
        sys.stderr.write('\n')
f.close()

# sort the timestamps
tt.sort()

# turn into standard POSIX timestamps by adding the offset
# of 2001-01-01 (note that mktime interprets it as local time)
zero_sec = time.mktime(datetime.datetime(2001, 1, 1).timetuple())
for i in range(len(tt)):
    tt[i] += zero_sec

# Let us plot the daily number of pages we've visited over that period of time.

# In[17]:

bin_size = 24 * 3600  # one bin per 24 hours
num_of_bins = int((tt[-1] - tt[0]) / bin_size)

# figure out the intervals needed to accommodate the number of labels on the x-axis
num_of_xlabels = 10
label_bin = (tt[-1] - tt[0]) / (num_of_xlabels - 1)
locs = [tt[0] + label_bin * i for i in range(num_of_xlabels)]
labels = [str(datetime.datetime.fromtimestamp(x)).split()[0] for x in locs]

# plot the histogram
plt.figure(figsize=(15, 5))
plt.hist(tt, num_of_bins, range=(tt[0], tt[-1]))
plt.xticks(locs, labels)
plt.ylabel("Daily Visits")

# What are the websites we've visited most often?
# We already have some idea about this, but now let us do some more elaborate processing.

# In[14]:

import re
import numpy as np

website_pattern = re.compile(r'http(s)?://(\w+\.)?(\w+?)\.\w+?/')
verbose = False

f = open('visits.txt')
website_counts = {}  # stores counts for each website
addresses = {}       # stores the different addresses (base-urls) for each website
for line in f:
    try:
        # keep only the url, ignore the timestamp
        tokens = line.rstrip().split('|')
        m = website_pattern.search("".join(tokens[:-1]))
        try:
            # parse the url to extract the website name
            # for example, if the url is http://www.facebook.com/ab238ub
            # the website name is 'facebook'
            website = m.group(3)
            # update the count
            website_counts[website] = website_counts.get(website, 0) + 1
            # keep track of the different base urls that match this website
            # for example, if the url is http://www.facebook.com/ab238ub
            # the base url is http://www.facebook.com/
            if website not in addresses:
                addresses[website] = {}
            addresses[website][m.group(0)] = addresses[website].get(m.group(0), 0) + 1
        except Exception as e:
            if verbose:
                sys.stderr.write(str(e))
                sys.stderr.write('\n')
    except:
        sys.stderr.write(line)
        sys.stderr.write('\n')
f.close()
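# The regular expression above is a heuristic: it takes the second-to-last dotted label of the hostname as the "website name", which works for addresses like http://www.facebook.com/ab238ub but can misfire on other layouts (e.g. .co.uk domains). As a rough alternative sketch (not what the cell above does), the standard-library urlparse module can extract the hostname and base url for us; the example url below is made up for illustration.

# In[ ]:

# Alternative sketch: extract the hostname with the standard library instead of a regex.
# 'urlparse' is the Python 2 module name (it lives in urllib.parse on Python 3).
from urlparse import urlparse

url = 'https://www.facebook.com/ab238ub'  # hypothetical url
parsed = urlparse(url)
hostname = parsed.netloc                                 # 'www.facebook.com'
base_url = '%s://%s/' % (parsed.scheme, parsed.netloc)   # 'https://www.facebook.com/'
website = hostname.split('.')[-2]                        # 'facebook', like m.group(3) above
print(website)
print(base_url)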
# Plot the number of visits to the websites we visit most frequently.

# In[15]:

k = 5  # The number of top websites to present

# Keep the top-k websites
k = min(k, len(website_counts))
sorted_websites = sorted(website_counts.items(), key=lambda x: x[1], reverse=True)
top_websites = sorted_websites[:k]

# The number of visits to websites that are not among the top-k
other_visits = sum(x[1] for x in sorted_websites[k:])

# Let's make a bar chart for the top-k websites
plt.figure(figsize=(10, 5))  # create a new figure
bar_width = 0.5
xpos = np.arange(len(top_websites)) + bar_width                 # bar positions on the x-axis
plt.bar(xpos, [w[1] for w in top_websites], bar_width)          # place the bars
plt.xticks(xpos + bar_width / 2, [w[0] for w in top_websites])  # ticks on the x-axis

# In[16]:

## For each of the top-k websites, show the most popular
## addresses (base urls)

m = 3       # Show at most m addresses...
pct = 0.90  # ... or stop at 90% of the visits

# For each of the top-k websites...
for website_name, website_visits in top_websites:
    # ... store the most frequently visited base urls ...
    website_addresses = sorted(addresses[website_name].items(), key=lambda x: x[1], reverse=True)
    top_addresses = []
    visits = 0.
    for address, address_count in website_addresses:
        visits += address_count
        top_addresses.append((address, address_count))
        if len(top_addresses) >= m or visits / website_visits >= pct:
            break
    # ... and the number of visits to addresses that are not among the top
    other_num = website_visits - visits

    # make a pie-chart for this website
    # if it is associated with many addresses
    if len(top_addresses) >= m:  # TODO change this condition if you want
        plt.figure(figsize=(7, 7))
        labels = [x[0] for x in top_addresses] + ["other"]
        sizes = [x[1] for x in top_addresses] + [other_num]
        # arbitrary choice of colors -- I like blue
        n = float(len(sizes))
        colors = [(p / (n + 1.), 0., 1. - p / (n + 1.)) for p in xrange(len(sizes))]
        colors[-1] = 'grey'
        plt.pie(sizes, labels=labels, colors=colors, labeldistance=1.1,
                autopct='%1.1f%%', startangle=90)

# ### Resources

# For more information on the tools we used in this session, check the following websites.

# 1. Tutorial on the [sqlite3 command line tool](http://zetcode.com/db/sqlite/tool/)
# 2. Python's documentation for the [sqlite3 module](https://docs.python.org/2/library/sqlite3.html)
# 3. [Running scripts in IPython](http://nbviewer.ipython.org/github/ipython/ipython/blob/d835d46dcc50043971b4a9915398bad1b5d63648/docs/examples/notebooks/Script%20Magics.ipynb)
# 4. [Markdown syntax](http://daringfireball.net/projects/markdown/syntax)

# In[ ]:
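# As an aside, the query from Part 1 could also be run without leaving Python, using the standard-library sqlite3 module listed in the resources above. Below is a minimal sketch under the same assumption as Part 1, namely that the copy of Safari's History.db lives at 'hs.db' in the working directory; it is not how the output above was produced.

import sqlite3

# Minimal sketch: the same top-5 query as In[5], via Python's sqlite3 module.
conn = sqlite3.connect('hs.db')
cursor = conn.execute(
    "SELECT id, url, visit_count FROM history_items "
    "ORDER BY visit_count DESC LIMIT 5;")
for row_id, url, visit_count in cursor:
    print("%d | %s | %d" % (row_id, url, visit_count))
conn.close()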