"""Group a list of URLs into clusters of similar-looking strings.

The pairwise distance between two URLs is derived from
``difflib.SequenceMatcher``; the clustering itself is delegated to
``HierarchicalClustering`` from the third-party ``cluster`` package.
"""

import pprint
from difflib import SequenceMatcher

# Input URLs to be clustered.  Every entry is followed by a comma: a
# missing comma between two adjacent string literals makes Python
# concatenate them silently into a single entry.
urls = [
    '',
    '#articles',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
]


def distance(url1, url2):
    """Return the dissimilarity of two URLs as a float.

    The value is ``1.0 - SequenceMatcher(None, url1, url2).ratio()``, so
    identical strings give ``0.0`` and strings with nothing in common
    give ``1.0``.

    Args:
        url1: First URL string.
        url2: Second URL string.

    Returns:
        One minus the ``SequenceMatcher`` similarity ratio of the inputs.
    """
    ratio = SequenceMatcher(None, url1, url2).ratio()
    return 1.0 - ratio


def main():
    """Cluster ``urls`` with ``distance`` and pretty-print the clusters."""
    # Imported here rather than at module level so that ``distance`` and
    # ``urls`` stay importable when the third-party ``cluster`` package
    # is not installed.
    from cluster import HierarchicalClustering

    # Perform clustering; 0.2 is the level passed to ``getlevel``.
    hc = HierarchicalClustering(urls, distance)
    clusters = hc.getlevel(0.2)
    pprint.pprint(clusters)


if __name__ == "__main__":
    main()