"""Cluster a list of URLs by textual similarity.

Pairwise distance is derived from ``difflib.SequenceMatcher`` and the
grouping is done with ``HierarchicalClustering`` from the third-party
``python-cluster`` package (http://python-cluster.sourceforge.net/).
"""

import pprint
from difflib import SequenceMatcher

# input urls to be clustered
urls = [
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814385',
    '#articles',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814335',
    'http://yro.slashdot.org/~drDugan/',
    'http://web.sourceforge.com/privacy.php',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815123',
    'http://slashdot.org//slashdot.org/~Darkness404',
    'http://slashdot.org//radio.slashdot.org',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814429',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814457',
    'http://slashdot.org//slashdot.org/article.pl?sid=09/07/24/1545238',
    'http://slashdot.org//slashdot.org/comments.pl?sid=09/07/24/1545238&cid=28810581',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815269',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814657',
    # The comma after this entry was missing, which silently fused it with
    # the next URL through implicit string-literal concatenation.
    'http://web.sourceforge.com/terms.php',
    'http://slashdot.org//it.slashdot.org/search',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814581',
    'http://xkcd.com/612/',
    'http://web.sourceforge.com/advertising',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814785',
]


def distance(url1, url2):
    """Return the dissimilarity of two URLs as a float in [0.0, 1.0].

    The value is ``1.0 - SequenceMatcher(None, url1, url2).ratio()``, so
    identical strings yield 0.0 and strings with nothing in common yield 1.0.
    """
    ratio = SequenceMatcher(None, url1, url2).ratio()
    return 1.0 - ratio


def main(threshold=0.2):
    """Cluster ``urls`` and pretty-print the resulting groups.

    ``threshold`` is passed straight to ``HierarchicalClustering.getlevel``;
    it is expressed in the same units as ``distance``.
    """
    # Imported here so that ``urls`` and ``distance`` stay importable even
    # when the third-party package is not installed.
    # http://python-cluster.sourceforge.net/
    from cluster import HierarchicalClustering

    # Perform clustering
    hc = HierarchicalClustering(urls, distance)
    clusters = hc.getlevel(threshold)
    pprint.pprint(clusters)


if __name__ == "__main__":
    main()