#!/usr/bin/env python
# coding: utf-8

# In this notebook, we describe how to retrieve our browsing history from Safari (Part 1) and produce some simple plots (Part 2) in order to see which websites we visit most often. (Note: this was run on Mac OS X 10.10.2.)

# ## Part 1: Retrieving our Safari Browsing History

# To access our browsing history, we go to ~/Library/Safari and look for the database *History.db*. We make a copy of it in a folder in our workspace, e.g. to *~/Workspace/web_browsing/hs.db*.

# In[1]:

get_ipython().run_cell_magic('bash', '', 'cp ~/Library/Safari/History.db ~/Workspace/web_browsing/hs.db\n')

# Now let us fire up sqlite3 and see what tables are inside the database.

# In[2]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', '.tables\n')

# Among the four (4) tables above, we'll be using 'history_items' and 'history_visits'. Let's take a look at their schemata.

# In[3]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', '.schema history_items\n')

# In[4]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', '.schema history_visits\n')

# The first thing to note is that each *history visit* is associated with one *history item*, as we can tell from the 'history_item' column of the 'history_visits' table.
#
# `history_item INTEGER NOT NULL REFERENCES history_items(id) ON DELETE CASCADE`
#
# Moreover, as we can tell from the columns of the two tables, 'history_visits' contains information about individual visits to websites, like the *time* of the visit and the *title* of the page, while 'history_items' contains general and aggregate information about the websites we visit, such as their url and total number of visits.
#
# Let's take a look now at the addresses that we visit most often (and despair later)...

# In[5]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', 'SELECT id, url, visit_count FROM history_items ORDER BY visit_count DESC LIMIT 5;\n')

# ...as well as our most recent visits.

# In[6]:

get_ipython().run_cell_magic('script', 'sqlite3 hs.db', 'SELECT V.title, I.url, V.visit_time\nFROM history_items as I,\n(SELECT history_item, title, visit_time FROM history_visits ORDER BY visit_time DESC LIMIT 5) as V\nWHERE I.id = V.history_item;\n')

# Note that, from what I can tell at least, time is expressed as the number of seconds since the beginning of 2001-01-01.

# Finally, let's dump to a text file ('visits.txt') only the information that interests us: for now, that's just the web address visited (url) and the associated timestamp.

# In[7]:

get_ipython().run_cell_magic('bash', '--out visits', 'sqlite3 hs.db \\\n"SELECT I.url, V.visit_time \\\nFROM history_items as I, \\\n(SELECT history_item, title, visit_time FROM history_visits) as V \\\nWHERE I.id = V.history_item;" \n')

# In[8]:

f = open('visits.txt', 'w')
f.write(visits)
f.close()

# That's it: our browsing history is now in the file 'visits.txt'.

# In[9]:

get_ipython().run_cell_magic('bash', '', 'head -5 visits.txt\necho "... ... ..."\ntail -5 visits.txt\n')

# ## Part 2: Plots

# In[10]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[11]:

import sys
import matplotlib.pyplot as plt
import time
import datetime
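# Before parsing the whole file, here is a quick sanity check of the timestamp convention noted in Part 1. This is a minimal sketch under the assumption that visit_time counts seconds since 2001-01-01; the sample value below is hypothetical, not taken from the database.

# In[ ]:

# Minimal sketch: convert one Safari timestamp to a readable date.
# Assumption: visit_time is seconds since 2001-01-01; the sample value is made up.
import datetime

SAFARI_EPOCH = datetime.datetime(2001, 1, 1)

def safari_time_to_datetime(seconds):
    """Convert a seconds-since-2001-01-01 value to a datetime object."""
    return SAFARI_EPOCH + datetime.timedelta(seconds=seconds)

sample_visit_time = 450000000.0  # hypothetical visit_time value
print(safari_time_to_datetime(sample_visit_time))  # 2015-04-06 08:00:00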
# Now let us parse the text file we created and extract the timestamps of our visits.

# In[12]:

f = open('visits.txt')
tt = []  # stores the timestamps of our visits
for line in f:
    try:
        tokens = line.rstrip().split('|')
        time_sec = float(tokens[-1])
        tt.append(time_sec)
    except:
        sys.stderr.write(line)
        sys.stderr.write('\n')
f.close()

# sort the timestamps
tt.sort()

# turn into standard POSIX timestamps by adding the offset
# of 2001-01-01 (note that mktime interprets it as local time)
zero_sec = time.mktime(datetime.datetime(2001, 1, 1).timetuple())
for i in range(len(tt)):
    tt[i] += zero_sec

# Let us plot the daily number of pages we've visited over that period of time.

# In[17]:

bin_size = 24 * 3600  # one bin per 24 hours
num_of_bins = int((tt[-1] - tt[0]) / bin_size)

# figure out the intervals needed to accommodate the number of labels on the x-axis
num_of_xlabels = 10
label_bin = (tt[-1] - tt[0]) / (num_of_xlabels - 1)
locs = [tt[0] + label_bin * i for i in range(num_of_xlabels)]
labels = [str(datetime.datetime.fromtimestamp(x)).split()[0] for x in locs]

# plot the histogram
plt.figure(figsize=(15, 5))
plt.hist(tt, num_of_bins, range=(tt[0], tt[-1]))
plt.xticks(locs, labels)
plt.ylabel("Daily Visits")

# What are the websites we've visited most often?
# We already have some idea about this, but now let us do some more elaborate processing.

# In[14]:

import re
import numpy as np

website_pattern = re.compile(r'http(s)?://(\w+\.)?(\w+?)\.\w+?/')
verbose = False

f = open('visits.txt')
website_counts = {}  # stores counts for each website
addresses = {}       # stores the different addresses (base-urls) for each website
for line in f:
    try:
        # keep only the url, ignore the timestamp
        tokens = line.rstrip().split('|')
        m = website_pattern.search("".join(tokens[:-1]))
        try:
            # parse the url to extract the website name
            # for example, if the url is http://www.facebook.com/ab238ub
            # the website name is 'facebook'
            website = m.group(3)
            # update the count
            website_counts[website] = website_counts.get(website, 0) + 1
            # keep track of the different base urls that match this website
            # for example, if the url is http://www.facebook.com/ab238ub
            # the base url is http://www.facebook.com/
            if website not in addresses:
                addresses[website] = {}
            addresses[website][m.group(0)] = addresses[website].get(m.group(0), 0) + 1
        except Exception as e:
            if verbose:
                sys.stderr.write(str(e))
                sys.stderr.write('\n')
    except:
        sys.stderr.write(line)
        sys.stderr.write('\n')
f.close()
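# The regular expression above is a heuristic: it takes the second-to-last dotted label of the hostname as the "website name", which works for addresses like http://www.facebook.com/ab238ub but can misfire on other layouts (e.g. .co.uk domains). As a rough alternative sketch (not what the cell above does), the standard-library urlparse module can extract the hostname and base url for us; the example url below is made up for illustration.

# In[ ]:

# Alternative sketch: extract the hostname with the standard library instead of a regex.
# 'urlparse' is the Python 2 module name (it lives in urllib.parse on Python 3).
from urlparse import urlparse

url = 'https://www.facebook.com/ab238ub'  # hypothetical url
parsed = urlparse(url)
hostname = parsed.netloc                                 # 'www.facebook.com'
base_url = '%s://%s/' % (parsed.scheme, parsed.netloc)   # 'https://www.facebook.com/'
website = hostname.split('.')[-2]                        # 'facebook', like m.group(3) above
print(website)
print(base_url)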
# Plot the number of visits to the websites we visit most frequently.

# In[15]:

k = 5  # The number of top websites to present

# Keep the top-k websites
k = min(k, len(website_counts))
sorted_websites = sorted(website_counts.items(), key=lambda x: x[1], reverse=True)
top_websites = sorted_websites[:k]

# The number of visits to websites that are not among the top-k
other_visits = sum(x[1] for x in sorted_websites[k:])

# Let's make a bar chart for the top-k websites
plt.figure(figsize=(10, 5))  # create a new figure
bar_width = 0.5
xpos = np.arange(len(top_websites)) + bar_width                 # bar positions on the x-axis
plt.bar(xpos, [w[1] for w in top_websites], bar_width)          # place the bars
plt.xticks(xpos + bar_width / 2, [w[0] for w in top_websites])  # ticks on the x-axis

# In[16]:

## For each of the top-k websites, show the most popular
## addresses (base urls)

m = 3       # Show at most m addresses...
pct = 0.90  # ... or stop at 90% of the visits

# For each of the top-k websites...
for website_name, website_visits in top_websites:
    # ... store the most frequently visited base urls ...
    website_addresses = sorted(addresses[website_name].items(), key=lambda x: x[1], reverse=True)
    top_addresses = []
    visits = 0.
    for address, address_count in website_addresses:
        visits += address_count
        top_addresses.append((address, address_count))
        if len(top_addresses) >= m or visits / website_visits >= pct:
            break
    # ... and the number of visits to addresses that are not among the top
    other_num = website_visits - visits

    # make a pie-chart for this website
    # if it is associated with many addresses
    if len(top_addresses) >= m:  # TODO change this condition if you want
        plt.figure(figsize=(7, 7))
        labels = [x[0] for x in top_addresses] + ["other"]
        sizes = [x[1] for x in top_addresses] + [other_num]
        # arbitrary choice of colors -- I like blue
        n = float(len(sizes))
        colors = [(p / (n + 1.), 0., 1. - p / (n + 1.)) for p in xrange(len(sizes))]
        colors[-1] = 'grey'
        plt.pie(sizes, labels=labels, colors=colors, labeldistance=1.1,
                autopct='%1.1f%%', startangle=90)

# ### Resources

# For more information on the tools we used in this session, check the following websites.

# 1. Tutorial on the [sqlite3 command line tool](http://zetcode.com/db/sqlite/tool/)
# 2. Python's documentation for the [sqlite3 module](https://docs.python.org/2/library/sqlite3.html)
# 3. [Running scripts in IPython](http://nbviewer.ipython.org/github/ipython/ipython/blob/d835d46dcc50043971b4a9915398bad1b5d63648/docs/examples/notebooks/Script%20Magics.ipynb)
# 4. [Markdown syntax](http://daringfireball.net/projects/markdown/syntax)

# In[ ]:
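# As an aside, the query from Part 1 could also be run without leaving Python, using the standard-library sqlite3 module listed in the resources above. Below is a minimal sketch under the same assumption as Part 1, namely that the copy of Safari's History.db lives at 'hs.db' in the working directory; it is not how the output above was produced.

import sqlite3

# Minimal sketch: the same top-5 query as In[5], via Python's sqlite3 module.
conn = sqlite3.connect('hs.db')
cursor = conn.execute(
    "SELECT id, url, visit_count FROM history_items "
    "ORDER BY visit_count DESC LIMIT 5;")
for row_id, url, visit_count in cursor:
    print("%d | %s | %d" % (row_id, url, visit_count))
conn.close()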