import pandas as pd
import plotly.plotly as py
from plotly.graph_objs import *
import sklearn as skl
import numpy as np
#import xlrd
import matplotlib.pyplot as plt
import glob

# Importing the datasets and combining them.
path = 'raw_data/csv'

# glob returns matches in arbitrary order; sorting makes the merge order
# (and therefore the column order of the combined table) reproducible.
allFiles = sorted(glob.glob(path + '/*.csv'))
if not allFiles:
    # Fail here with a clear message instead of a KeyError on 'CSA2010' below.
    raise IOError('No CSV files found in %s' % path)

# Fold the files together one at a time. pd.merge is called with its
# defaults, i.e. it joins on whatever columns the two frames share.
df = pd.read_csv(allFiles[0])
for filename in allFiles[1:]:
    df = pd.merge(df, pd.read_csv(filename))

# Use the CSA2010 column as the row index. set_index also removes it from the
# columns, so only the indicator columns remain for the numeric conversion.
df = df.set_index('CSA2010')

cols = df.columns
df[cols] = (
    df[cols]
    # Strip every character that is not a digit or a dot (this removes
    # percent signs, and also minus signs) and turn entries that are empty
    # strings into NaN so the type conversion below can succeed.
    .replace({r'[^0-9\.]': '', '': np.nan}, regex=True)
    # Convert every column to float.
    .astype(np.float64)
)
#df.to_csv(path+'full_vital_signs_dataset.csv')

red_orange = '#ff3700'

# One of the rows is an aggregate Baltimore City.
import matplotlib.pyplot as plt
import plotly.plotly as py
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def _declutter_axes(ax, labelsize=16):
    """Hide the frame and tick marks of *ax* but keep its tick labels.

    Booleans are used instead of the 'on'/'off' strings because newer
    matplotlib releases no longer accept those strings.
    """
    for side in ("top", "bottom", "right", "left"):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis="both", which="both",
                   bottom=False, top=False, left=False, right=False,
                   labelbottom=True, labelleft=True, labelsize=labelsize)


def _barh_figure(series, heading):
    """Draw *series* as a tall horizontal bar chart titled *heading*."""
    plt.figure(figsize=[7, 20])
    ax = plt.subplot(1, 1, 1)
    series.plot(kind='barh', grid=False)
    _declutter_axes(ax)
    plt.title(heading, fontsize=20)


def _annotate_points(ax, points, **text_kwargs):
    """Label every row of *points* (columns 0 and 1) with its index value."""
    for label, x_pt, y_pt in zip(points.index, points[0], points[1]):
        ax.annotate('%s' % label, xy=(x_pt, y_pt), xytext=(10, 0),
                    textcoords='offset points', **text_kwargs)


def _blank_axis(axis_cls, half_range):
    """Build a plotly axis spanning +/- *half_range* with no visible decoration."""
    return axis_cls(range=[-half_range, half_range], title='',
                    showgrid=False, showline=False, zeroline=False,
                    autotick=False, showticklabels=False)


def cluster(n_clusters, data=None):
    """Fit K-Means with *n_clusters* clusters and label the fitted points.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to fit.
    data : array-like, optional
        Points to cluster. Defaults to the module-level ``X_reduced``
        (the 2-component PCA projection computed further down).

    Returns
    -------
    (KMeans, ndarray)
        The fitted estimator and the cluster label of every input point.
        No random_state is set, so repeated calls may label differently.
    """
    if data is None:
        data = X_reduced
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(data)
    Z = kmeans.predict(data)
    return kmeans, Z


# Remove the city-wide aggregate row so only the individual areas remain.
df.drop('Baltimore City', inplace=True)

# --- Sorted bar charts of two indicators ------------------------------------
df_white_sorted = df['pwhite10'].sort_values()
_barh_figure(df_white_sorted, '% White')

df_chPov_sorted = df['hhchpov12'].sort_values()
_barh_figure(df_chPov_sorted, '% HH Children in Poverty')

# --- Static scatter: pwhite10 vs. hhchpov12 ---------------------------------
fig = plt.figure(figsize=[8, 8])
ax1 = plt.subplot(1, 1, 1)
x = df['pwhite10']
y = df['hhchpov12']
# Marker size is driven by tpop10 * age18_10 / 100.
schoolage_pop = df['tpop10'] * df['age18_10'] / 100
s = (schoolage_pop / 50) ** 2
plt.scatter(x, y, color=red_orange, marker='.', s=s, alpha=.4)
plt.xlabel('\n% Population White', fontsize=16)
plt.ylabel('% Households w. Children in Poverty\n', fontsize=16)
plt.xlim([-5, 100])
plt.ylim([-5, 100])
plt.title('Baltimore', fontsize=20)
_declutter_axes(ax1)
plt.show()

# --- Same plot, interactive from plotly -------------------------------------
x = df['pwhite10']
y = df['hhchpov12']
schoolage_pop = df['tpop10'] * df['age18_10'] / 100

trace1 = Scatter(x=x, y=y, text=np.array(x.index), mode='markers',
                 marker=Marker(size=schoolage_pop,
                               sizemode='area',
                               sizeref=schoolage_pop.max() / 1000,
                               opacity=0.5,
                               color='blue'))

layout = Layout(
    title='Baltimore: Too Many Non-White Kids in Poverty',
    xaxis=XAxis(range=[-5, 100], title='% Population White (2010)',
                showgrid=False, showline=False, zeroline=False,
                autotick=False, dtick=20),
    yaxis=YAxis(range=[-5, 100],
                title='% Households w. Children in Poverty (2012)',
                showgrid=False, showline=False, zeroline=False,
                autotick=False, dtick=20),
    autosize=False, width=500, height=500,
    font=Font(size=14),
    hovermode='closest'
)

data = Data([trace1])
fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='Baltimore Percent Child Poverty Percent White')

# --- Household poverty vs. phisp10 and paa10 --------------------------------
size = 100
alpha = 0.5
fontsize = 16
fig = plt.figure(figsize=[8, 8])
ax1 = plt.subplot(1, 1, 1)
plt.scatter(df['phisp10'], df['hhpov12'], c='r', alpha=alpha, s=size)
plt.scatter(df['paa10'], df['hhpov12'], c='c', alpha=alpha, s=size)
plt.legend(['Hispanic', 'Black'], fontsize=fontsize)
plt.xlabel('Percent of Population', fontsize=fontsize)
plt.ylabel('Percent of Households in Poverty', fontsize=fontsize)
_declutter_axes(ax1)
plt.xlim([-5, 100])
plt.ylim([-5, 60])
plt.show()

# --- PCA on the standardised indicators -------------------------------------
X = np.array(df)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Full decomposition first, to see how the variance is distributed.
pca = PCA()
pca.fit(X_scaled)

# Start a fresh figure explicitly so the bars never land on an earlier plot.
plt.figure()
plt.bar(np.arange(len(pca.explained_variance_ratio_)),
        pd.Series(pca.explained_variance_ratio_).apply(lambda r: round(r, 2)))
# The cumulative curve is drawn on the same axes as the bars.
pd.Series(pca.explained_variance_ratio_).cumsum().plot()
plt.xlabel('number of dimensions')
plt.ylabel('cumulative fraction of variance explained')
plt.ylim([0, 1])

# Keep only the first two components for plotting and clustering.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_scaled)
# Written so the output is the same text on Python 2 and Python 3.
print('Shape:  %s' % (X_reduced.shape,))
print('Explained Variance Ratio =  %s'
      % round(np.sum(pca.explained_variance_ratio_[:pca.n_components]), 2))

df_X_reduced = pd.DataFrame(X_reduced, index=df.index)

fig = plt.figure(figsize=[8, 8])
ax = plt.subplot(1, 1, 1)
plt.plot(df_X_reduced[0], df_X_reduced[1], marker='o', linestyle='', alpha=1.0)
_annotate_points(ax, df_X_reduced)

# --- Interactive PCA scatter ------------------------------------------------
x = df_X_reduced[0]
y = df_X_reduced[1]
xscale = 22
yscale = 10

trace1 = Scatter(x=x, y=y, text=df.index, mode='markers',
                 marker=Marker(size=df['tpop10'] / 500,
                               sizemode='diameter',
                               opacity=0.5,
                               color='blue'))

layout = Layout(
    title='Baltimore Vital Signs PCA',
    xaxis=_blank_axis(XAxis, xscale),
    yaxis=_blank_axis(YAxis, yscale),
    autosize=False, width=500, height=500,
    font=Font(size=14),
    hovermode='closest'
)

data = Data([trace1])
fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='Baltimore PCA')

# --- Inertia for each candidate number of clusters --------------------------
max_clusters = len(df)
# Plot against the real cluster counts (1 .. max_clusters - 1) so the curve
# has no artificial "0 clusters, inertia 0" point.
cluster_counts = list(range(1, max_clusters))
inertias = [cluster(k)[0].inertia_ for k in cluster_counts]

fig = plt.figure(figsize=[6, 6])
ax1 = plt.subplot(1, 1, 1)
plt.plot(cluster_counts, inertias, marker='.')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# --- Static cluster plot ----------------------------------------------------
n_clusters = 7
model, Z = cluster(n_clusters)

fig = plt.figure(figsize=[10, 10])
ax = plt.subplot(1, 1, 1)
plt.scatter(df_X_reduced[0], df_X_reduced[1], c=Z, marker='o', alpha=.5, s=200)
_annotate_points(ax, df_X_reduced, fontsize=12, alpha=1.0)
plt.show()

# --- Interactive cluster plot -----------------------------------------------
# Fitted again here; KMeans is not seeded, so the labels may differ from the
# static plot above.
n_clusters = 7
model, Z = cluster(n_clusters)
x_c = model.cluster_centers_[:, 0]
y_c = model.cluster_centers_[:, 1]

h = 1  # grid step
x = df_X_reduced[0]
y = df_X_reduced[1]
xscale = 22
yscale = 10

# Cluster label for every point of a regular grid over the projected data.
# Predict with the model fitted just above, not with a leftover estimator
# from the inertia scan. Zc is not used by the figure below.
x_min, x_max = x.min() + 1, x.max() - 1
y_min, y_max = y.min() + 1, y.max() - 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Zc = model.predict(np.c_[xx.ravel(), yy.ravel()])

trace1 = Scatter(x=x, y=y, text=df.index, name='', mode='markers',
                 marker=Marker(size=df['tpop10'] / 400,
                               sizemode='diameter',
                               opacity=0.6,
                               color=Z))

# One 'x' marker per cluster centre, coloured by cluster number.
trace2 = Scatter(x=x_c, y=y_c, mode='markers', text='', name='',
                 marker=Marker(symbol='x',
                               size=15,
                               color=list(range(n_clusters))))

layout = Layout(
    # NOTE(review): the original title literal was broken across two source
    # lines; the halves are joined with a single space here - confirm the
    # intended separator.
    title=('Baltimore Vital Signs PCA w/ K-Means Clustering '
           'Number of Clusters: %d' % n_clusters),
    xaxis=_blank_axis(XAxis, xscale),
    yaxis=_blank_axis(YAxis, yscale),
    autosize=False, width=600, height=600,
    font=Font(size=14),
    hovermode='closest',
    showlegend=False
)

data = Data([trace1, trace2])
fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='Baltimore PCA, K-Means Cluster')