In [1]:
#coded by Agung, PhD
import pandas as pd
import numpy as np
import os, sqlite3
import json


def read_probe_values(database, probe='BrowserBookmarksProbe'):
    """Return the decoded JSON payloads stored for one probe in one database.

    Runs ``SELECT value FROM data WHERE name=?`` against the SQLite file and
    passes every returned ``value`` through ``json.loads``.

    Parameters
    ----------
    database : str
        Path to the SQLite file.
    probe : str
        Value matched against the ``name`` column.

    Returns
    -------
    list
        One decoded JSON object per matching row, in query order.

    Raises
    ------
    sqlite3.Error
        If the file cannot be opened or queried (not a database, no ``data``
        table, ...).
    ValueError
        If a stored value is not valid JSON.
    """
    conn = sqlite3.connect(database)
    try:
        # Query through sqlite3 directly so that failures surface as
        # sqlite3.Error, which is what the calling loop handles.
        rows = conn.execute(
            "SELECT value FROM data WHERE name=?", (probe,)
        ).fetchall()
    finally:
        # `with sqlite3.connect(...)` only commits/rolls back; the connection
        # has to be closed explicitly to release the file handle.
        conn.close()
    return [json.loads(value) for (value,) in rows]


databases = []
dataku = []
directory = "D:/DATA/example"

# Collect every entry of the data directory as a candidate database path.
# Sorted so that the row order of the resulting frame is reproducible.
if os.path.isdir(directory):
    for filename in sorted(os.listdir(directory)):
        databases.append(os.path.join(directory, filename))
else:
    print("[INFO] directory not found: %s" % directory)

# Best effort: an entry that cannot be read as a database is reported and
# skipped, the remaining ones are still loaded.
for database in databases:
    try:
        dataku.extend(read_probe_values(database))
    except sqlite3.Error as err:
        print("[INFO] %s" % err)

# Every record is expected to carry 'url' and 'visits' keys.
url = [record['url'] for record in dataku]
visits = [record['visits'] for record in dataku]
# Materialise the pairs: zip() is a one-shot iterator on Python 3.
datazip = list(zip(url, visits))
frame = pd.DataFrame(data=datazip, columns=['url', 'visits'])
In [2]:
# Pull the 'url' column out of the bookmarks frame (a pandas Series).
urls = frame['url']
In [ ]:
import pprint
from difflib import SequenceMatcher

# http://python-cluster.sourceforge.net/
from cluster import HierarchicalClustering

def distance(url1, url2):
    """Dissimilarity between two URLs, in the range [0.0, 1.0].

    Computed as one minus difflib's ``SequenceMatcher.ratio()``: identical
    sequences give 0.0, sequences with nothing in common give 1.0.
    """
    matcher = SequenceMatcher(isjunk=None, a=url1, b=url2)
    similarity = matcher.ratio()
    return 1.0 - similarity

# Perform clustering: group the URLs, using `distance` (1 - difflib
# similarity ratio) as the pairwise dissimilarity between two URLs.
hc = HierarchicalClustering(urls, distance)
# NOTE(review): 0.2 is presumably the distance threshold at which the
# hierarchy is cut into clusters -- confirm against the python-cluster docs.
clusters = hc.getlevel(0.2)

# Pretty-print the resulting clusters.
pprint.pprint(clusters)