# Download every startup listed under one AngelList tag, encode each startup
# as a sparse "has market" vector, build a cosine-similarity graph between
# startups, and cluster that graph spectrally.
#
# NOTE(review): spy/plot/scatter/xlabel/ylabel/title are not imported
# anywhere in this script; it presumably runs inside an interactive pylab
# namespace (e.g. IPython ``%pylab``) -- confirm before running it standalone.

from collections import Counter

import numpy as np
import simplejson as json
from eventlet import GreenPool
from eventlet.green import urllib2
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import eigsh
from scipy.spatial.distance import pdist, squareform
# Star import kept as-is: code outside this view may rely on other names
# from sklearn.cluster. Only SpectralClustering is used below.
from sklearn.cluster import *  # noqa: F401,F403


def get_page(url):
    """Fetch *url* and return its body parsed as JSON."""
    response = urllib2.urlopen(url)
    try:
        return json.loads(response.read())
    finally:
        # Release the connection even if reading or parsing fails.
        response.close()


def sparse_encoding(items):
    """Encode an iterable of tag collections as a sparse indicator matrix.

    Each distinct tag is assigned a column id in order of first appearance.
    Entry (i, j) receives a 1 for every occurrence of tag j in item i.

    Args:
        items: iterable whose elements are iterables of hashable tags.

    Returns:
        A ``(matrix, dictionary)`` pair: ``matrix`` is a ``coo_matrix`` with
        one row per item and one column per distinct tag, and ``dictionary``
        maps each tag to its column id.
    """
    dictionary = {}
    rows = []
    cols = []
    values = []
    n_rows = 0
    for i, item in enumerate(items):
        n_rows = i + 1
        for tag in item:
            # setdefault hands out the next free column id to unseen tags.
            cols.append(dictionary.setdefault(tag, len(dictionary)))
            rows.append(i)
            values.append(1)
    # Pass the shape explicitly: letting coo_matrix infer it would drop rows
    # for trailing items without tags and fail outright on empty input.
    shape = (n_rows, len(dictionary))
    return coo_matrix((values, (rows, cols)), shape=shape), dictionary


# --- Download all pages of startups for the tag -----------------------------

base_url = 'https://api.angel.co/1/tags/1695/startups'
response = get_page(base_url)
last_page = response['last_page']

# Fetch the pages concurrently on up to 20 green threads.
pool = GreenPool(20)
page_urls = [base_url + '?page=%d' % (page_num + 1)
             for page_num in range(last_page)]
startups = []
for item in pool.imap(get_page, page_urls):
    startups.extend(item['startups'])

# Keep only visible startups that list more than one market.
non_hidden = [startup for startup in startups
              if not startup['hidden'] and len(startup['markets']) > 1]

# Sanity check; the value is only displayed in an interactive session.
'PeerIndex' in set([item['name'] for item in non_hidden])

# --- Startup x market indicator matrix --------------------------------------

data, dictionary = sparse_encoding(
    [set([market['id'] for market in startup['markets']])
     for startup in non_hidden])
spy(data.T)

# --- Cosine similarity between startups -------------------------------------

A = 1 - squareform(pdist(data.toarray(), 'cosine'))
A[np.isnan(A)] = 0  # treat undefined similarities as "not similar"
spy(A)
xlabel('startups')
ylabel('startups')
title('similarity between startups')

# --- Spectral embedding ------------------------------------------------------

# A small constant is added to every pairwise affinity before building the
# Laplacian L = D - Areg, where D holds the column sums of Areg.
Areg = A + 0.02 * np.ones(A.shape)
D = np.diag(np.sum(Areg, axis=0))
L = D - Areg
# The original called ``eigs``, which is never imported; ``eigsh`` (the
# imported symmetric solver) is used for the generalized problem L x = w D x.
u, v = eigsh(L, 3, D, which='SM')

# Reorder startups by their coordinate in the second returned eigenvector.
order = sorted(range(data.shape[0]), key=lambda x: v[x, 1])
spy(A[order, :][:, order])
plot(v[:, 1], v[:, 2], '.')

# --- Spectral clustering -----------------------------------------------------

n_clusters = 7
clustering = SpectralClustering(affinity='precomputed', n_clusters=n_clusters)
labels = clustering.fit_predict(Areg)
scatter(v[:, 1], v[:, 2], c=labels)

# --- Report: top markets and up to ten example startups per cluster ---------

for l in range(n_clusters):
    members = [startup for startup, label in zip(non_hidden, labels)
               if label == l]
    cnt = Counter([market['name']
                   for startup in members
                   for market in startup['markets']])
    print('Cluster %d, (%s)'
          % (l, ', '.join([item[0] for item in cnt.most_common(3)])))
    for name, markets in [
            (startup['name'],
             [market['name'] for market in startup['markets']])
            for startup in members[:10]]:
        print('\t%s:\t\t%s' % (name, markets))