import pprint
from difflib import SequenceMatcher
# http://python-cluster.sourceforge.net/
from cluster import HierarchicalClustering
# input urls to be clustered
# One string per entry. Every entry ends with a comma on purpose: Python
# silently concatenates adjacent string literals, so a single missing comma
# would fuse two URLs into one bogus item.
urls = [
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814385',
    '#articles',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814335',
    'http://yro.slashdot.org/~drDugan/',
    'http://web.sourceforge.com/privacy.php',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815123',
    'http://slashdot.org//slashdot.org/~Darkness404',
    'http://slashdot.org//radio.slashdot.org',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814429',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814457',
    'http://slashdot.org//slashdot.org/article.pl?sid=09/07/24/1545238',
    'http://slashdot.org//slashdot.org/comments.pl?sid=09/07/24/1545238&cid=28810581',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815269',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814657',
    'http://web.sourceforge.com/terms.php',
    'http://slashdot.org//it.slashdot.org/search',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814581',
    'http://xkcd.com/612/',
    'http://web.sourceforge.com/advertising',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814785',
]
# distance function compares two urls and finds the distance
# uses SequenceMatcher from python standard module difflib
def distance(url1, url2):
    """Return a dissimilarity score between two URL strings.

    The score is ``1.0 - SequenceMatcher.ratio()``: 0.0 when the strings are
    identical, 1.0 when they share no matching characters.

    Args:
        url1: First URL string.
        url2: Second URL string.

    Returns:
        A float in the range [0.0, 1.0]; smaller means more similar.
    """
    # autojunk=False: by default difflib treats frequently occurring
    # characters as junk once the second sequence has 200+ items, which
    # would make long, nearly identical URLs look very different.
    ratio = SequenceMatcher(None, url1, url2, autojunk=False).ratio()
    return 1.0 - ratio
# Build the cluster hierarchy over every URL using the custom distance
# function, then extract the clusters at the chosen level and show them.
level = 0.2
hc = HierarchicalClustering(urls, distance)
clusters = hc.getlevel(level)
print(pprint.pformat(clusters))
# Sample output pasted from a run of this listing. The fused entry
# 'http://web.sourceforge.com/terms.phphttp://slashdot.org//it.slashdot.org/search'
# arises when those two adjacent string literals in `urls` are not separated
# by a comma (Python concatenates them into a single string).
#
# [['#articles'],
#  ['http://xkcd.com/612/'],
#  ['http://web.sourceforge.com/advertising'],
#  ['http://web.sourceforge.com/privacy.php'],
#  ['http://web.sourceforge.com/terms.phphttp://slashdot.org//it.slashdot.org/search'],
#  ['http://yro.slashdot.org/~drDugan/'],
#  ['http://slashdot.org//radio.slashdot.org'],
#  ['http://slashdot.org//slashdot.org/~Darkness404'],
#  ['http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814785',
#   'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814429',
#   'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814457'],
#  ['http://slashdot.org//slashdot.org/article.pl?sid=09/07/24/1545238',
#   'http://slashdot.org//slashdot.org/comments.pl?sid=09/07/24/1545238&cid=28810581',
#   'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815123',
#   'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815269',
#   'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814335',
#   'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814385',
#   'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814581',
#   'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814657']]
type(urls)
# Output: list