# Interactive exploration of state crime statistics with Glue, pandas and
# scikit-learn.
#
# This script is a notebook-style session: it loads 'state_crime.csv', opens
# it in a Glue GUI, clusters the rows with KMeans, publishes each cluster back
# to Glue as a subset group, and finally pulls the data (and one subset mask)
# back into pandas.
#
# Bare expressions such as ``states.head()`` are kept from the notebook; they
# are only displayed when evaluated interactively (as the last expression of
# a cell) and are silent when the file is run as a plain script.
#
# ``print`` is always called with a single parenthesised argument, so the
# output is the same on Python 2 and Python 3.
import numpy as np
import pandas as pd
from glue import qglue
from sklearn.cluster import KMeans

# set up IPython/Qt integration
# NOTE: this cell takes a second to run. For some reason,
# IPython will stall if you try to run the next cell before this one completes
#
# This is the function-call spelling of the ``%gui qt4`` line magic, which
# keeps the file valid Python syntax. It still requires an IPython session
# (``get_ipython`` is injected by IPython).
get_ipython().run_line_magic('gui', 'qt4')

# Load the table and peek at the first rows.
states = pd.read_csv('state_crime.csv')
states.head()

# Start Glue with the DataFrame registered under the label "states".
app = qglue(states=states)
dc = app.data_collection
print(dc)

# The first entry of the data collection is the dataset passed in above.
data = dc[0]
print(type(data))
print(data)
data['Murder']

# Number of clusters; also drives how many subset groups are created below.
n_clusters = 3

# extract data into Numpy [N,3] array
X = np.column_stack((data['Robbery'], data['Rape'], data['Murder']))
# NOTE(review): no random_state is passed, so cluster numbering may differ
# between runs -- confirm whether reproducibility matters here.
clusters = KMeans(n_clusters=n_clusters).fit_predict(X)

# add cluster_id as a new attribute
c = data.add_component(clusters, 'cluster_id')

# create one new subset group per cluster, each selecting one value of
# cluster_id; labels are 1-based ('Cluster 1', 'Cluster 2', ...), while the
# cluster ids compared against are 0-based.
for cluster_id in range(n_clusters):
    dc.new_subset_group(label='Cluster %i' % (cluster_id + 1),
                        subset_state=(c == cluster_id))

# Bring the Glue data (including the new cluster_id column) back into pandas.
df = data.to_dataframe()
print(df.columns)

# Mean murder value within 10 bins of the Robbery column.
cuts = pd.cut(df.Robbery, 10)
df.groupby(cuts).Murder.mean()

# Boolean mask of the rows in the first subset attached to this dataset.
# NOTE(review): despite the name, this is simply ``data.subsets[0]`` --
# presumably 'Cluster 1' unless other subsets were created first; confirm.
outliers = data.subsets[0].to_mask()
print("Selected %i rows" % outliers.sum())
print(outliers)
df[outliers].head()