In [2]:
import pprint
from difflib import SequenceMatcher

# http://python-cluster.sourceforge.net/
from cluster import HierarchicalClustering

# input urls to be clustered
urls = [
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814385',
    '#articles',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814335',
    'http://yro.slashdot.org/~drDugan/',
    'http://web.sourceforge.com/privacy.php',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815123',
    'http://slashdot.org//slashdot.org/~Darkness404',
    'http://slashdot.org//radio.slashdot.org',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814429',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814457',
    'http://slashdot.org//slashdot.org/article.pl?sid=09/07/24/1545238',
    'http://slashdot.org//slashdot.org/comments.pl?sid=09/07/24/1545238&cid=28810581',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815269',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814657',
    'http://web.sourceforge.com/terms.php'
    'http://slashdot.org//it.slashdot.org/search',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814581',
    'http://xkcd.com/612/',
    'http://web.sourceforge.com/advertising',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814785',
]

# distance function compares two urls and finds the distance
# uses SequenceMatcher from python standard module difflib
def distance(url1, url2):
    ratio = SequenceMatcher(None, url1, url2).ratio()
    return 1.0 - ratio

# Perform clustering
hc = HierarchicalClustering(urls, distance)
clusters = hc.getlevel(0.2)

pprint.pprint(clusters)
[['#articles'],
 ['http://xkcd.com/612/'],
 ['http://web.sourceforge.com/advertising'],
 ['http://web.sourceforge.com/privacy.php'],
 ['http://web.sourceforge.com/terms.phphttp://slashdot.org//it.slashdot.org/search'],
 ['http://yro.slashdot.org/~drDugan/'],
 ['http://slashdot.org//radio.slashdot.org'],
 ['http://slashdot.org//slashdot.org/~Darkness404'],
 ['http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814785',
  'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814429',
  'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814457'],
 ['http://slashdot.org//slashdot.org/article.pl?sid=09/07/24/1545238',
  'http://slashdot.org//slashdot.org/comments.pl?sid=09/07/24/1545238&cid=28810581',
  'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815123',
  'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815269',
  'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814335',
  'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814385',
  'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814581',
  'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814657']]
In [4]:
type(urls)
Out[4]:
list