In [15]:
import pandas as pd
import numpy as np
import os, sqlite3
import json
from urllib import urlencode
from urlparse import urlparse, parse_qs, urlunparse
import urlparse
import pprint
from difflib import SequenceMatcher



def parsed(x):
    parse = urlparse(x)
    return parse.netloc


lines = []

databases = []
dataku = []


directory = "D:/DATA/example"
for filename in os.listdir(directory):
    flname = os.path.join(directory, filename)
    databases.append(flname)

for database in databases:

    try:
        with sqlite3.connect(database) as conn:
            cur = conn.cursor()
            sqlqry = pd.read_sql("SELECT value FROM data WHERE name='BrowserBookmarksProbe'",conn)
            a = sqlqry['value']
            #b = sqlqry['timestamp']
            records = [json.loads(line) for line in a]
            for row in records:
                dataku.append(row)
                

    except sqlite3.Error, err:
        print "[INFO] %s" % err

url = [url['url'] for url in dataku]
visits = [visits['visits'] for visits in dataku]
datazip = zip(url,visits)

df = pd.DataFrame(datazip, columns=['url','visits'])

dfnew= df['url'].apply(parsed)
In [16]:
clusterdata = dfnew.head(100)
In [22]:
clusterdata.head(10)
Out[22]:
0       m.blog.naver.com
1         m.facebook.com
2                details
3    story-api.kakao.com
4        m.cafe.daum.net
5                details
6          cafe.daum.net
7         m.facebook.com
8          j.exit-ad.com
9               asked.kr
Name: url, dtype: object
In [26]:
strdata = clusterdata.apply(str)
In [27]:
strdata.head(10)
Out[27]:
0       m.blog.naver.com
1         m.facebook.com
2                details
3    story-api.kakao.com
4        m.cafe.daum.net
5                details
6          cafe.daum.net
7         m.facebook.com
8          j.exit-ad.com
9               asked.kr
Name: url, dtype: object
In [28]:
import pprint
from difflib import SequenceMatcher

from cluster import HierarchicalClustering

def distance(url1, url2):
    ratio = SequenceMatcher(None, url1, url2).ratio()
    return 1.0 - ratio

# Perform clustering
hc = HierarchicalClustering(strdata, distance)
clusters = hc.getlevel(0.2)
In [29]:
clusters
Out[29]:
[['details', 'details'],
 ['auth.waffle.at',
  'auth.waffle.at',
  'auth.waffle.at',
  'auth.waffle.at',
  'auth.waffle.at',
  'auth.waffle.at',
  'auth.waffle.at',
  'auth.waffle.at',
  'auth.waffle.at'],
 ['bit.ly'],
 ['asked.kr', 'asked.kr', 'asked.kr', 'asked.kr'],
 ['cyber.kepco.co.kr',
  'cyber.kepco.co.kr',
  'cyber.kepco.co.kr',
  'cyber.kepco.co.kr',
  'cyber.kepco.co.kr',
  'cyber.kepco.co.kr',
  'cyber.kepco.co.kr',
  'cyber.kepco.co.kr'],
 ['jesuside.com.ne.kr'],
 ['t.co', 't.co'],
 ['j.exit-ad.com'],
 ['m.event.toast.com'],
 ['twitpic.com'],
 ['i1.media.daumcdn.net'],
 ['cafe.daum.net', 'm.cafe.daum.net', 'm.cafe.daum.net'],
 ['eugenejulia.tistory.com'],
 ['m.imbc.com'],
 ['m.winixcorp.com'],
 ['m.humoruniv.com', 'web.humoruniv.com'],
 ['ggstory.com'],
 ['m.gsshop.com', 'm.gsshop.com'],
 ['m.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com',
  'm.facebook.com'],
 ['instagram.com'],
 ['m.universaltoy.co.kr'],
 ['m.insight.co.kr'],
 ['www.shescoming.co.kr',
  'www.shescoming.co.kr',
  'www.shescoming.co.kr',
  'www.shescoming.co.kr',
  'www.shescoming.co.kr'],
 ['www.lotteconf.co.kr'],
 ['www.skt-lte.co.kr'],
 ['www.ebs.co.kr'],
 ['m.11am.co.kr'],
 ['www.annpiona.co.kr', 'm.hani.co.kr', 'www.hani.co.kr'],
 ['www.pulmuoneamio.com'],
 ['www.threemusic.com'],
 ['www.youtube.com'],
 ['www.gobalnews.com'],
 ['www.google.com'],
 ['www.100tap.com'],
 ['story-api.kakao.com'],
 ['www.korea-ps.com'],
 ['m.blog.naver.com',
  'm.blog.naver.com',
  'm.blog.naver.com',
  'm.blog.naver.com',
  'm.blog.naver.com'],
 ['m.cafe.naver.com',
  'm.cafe.naver.com',
  'm.cafe.naver.com',
  'nid.naver.com',
  'nid.naver.com',
  'www.naver.com',
  'm.news.naver.com',
  'news.naver.com'],
 ['plus.kakao.com'],
 ['www.dalkomm.com'],
 ['www.kakao.com']]
In [30]:
for row in clusters:
    print row
['details', 'details']
['auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at']
['bit.ly']
['asked.kr', 'asked.kr', 'asked.kr', 'asked.kr']
['cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr']
['jesuside.com.ne.kr']
['t.co', 't.co']
['j.exit-ad.com']
['m.event.toast.com']
['twitpic.com']
['i1.media.daumcdn.net']
['cafe.daum.net', 'm.cafe.daum.net', 'm.cafe.daum.net']
['eugenejulia.tistory.com']
['m.imbc.com']
['m.winixcorp.com']
['m.humoruniv.com', 'web.humoruniv.com']
['ggstory.com']
['m.gsshop.com', 'm.gsshop.com']
['m.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com']
['instagram.com']
['m.universaltoy.co.kr']
['m.insight.co.kr']
['www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr']
['www.lotteconf.co.kr']
['www.skt-lte.co.kr']
['www.ebs.co.kr']
['m.11am.co.kr']
['www.annpiona.co.kr', 'm.hani.co.kr', 'www.hani.co.kr']
['www.pulmuoneamio.com']
['www.threemusic.com']
['www.youtube.com']
['www.gobalnews.com']
['www.google.com']
['www.100tap.com']
['story-api.kakao.com']
['www.korea-ps.com']
['m.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com']
['m.cafe.naver.com', 'm.cafe.naver.com', 'm.cafe.naver.com', 'nid.naver.com', 'nid.naver.com', 'www.naver.com', 'm.news.naver.com', 'news.naver.com']
['plus.kakao.com']
['www.dalkomm.com']
['www.kakao.com']