import pandas as pd
import plotly.plotly as py
from plotly.graph_objs import *
import sklearn as skl
import numpy as np
#import xlrd
import matplotlib.pyplot as plt
import glob

# Load every CSV under `path` and merge them into one DataFrame indexed by
# the 'CSA2010' column.

path = 'raw_data/csv'

# Sorted so the merge order (and therefore row/column order) does not depend
# on the arbitrary order in which the filesystem lists the files.
allFiles = sorted(glob.glob(path + '/*.csv'))
if not allFiles:
    # Fail here with a clear message instead of a KeyError on 'CSA2010' below.
    raise IOError('No CSV files found in %s' % path)

df = pd.DataFrame()

for i, filename in enumerate(allFiles):
    df_file = pd.read_csv(filename)
    if i == 0:
        df = df_file
    else:
        # No explicit key is given, so pandas joins on every column the two
        # frames have in common (presumably only 'CSA2010' -- TODO confirm
        # against the CSV headers).
        df = pd.merge(df, df_file)

# Promote 'CSA2010' to the row index and remove it from the data columns in
# one step. The previous code called df.drop('CSA2010') right after setting
# the index, which looks for a *row* labelled 'CSA2010' and raises when there
# is none.
df = df.set_index('CSA2010')

# Convert every column to float64.
cols = df.columns

# First rule: delete every character that is not a digit or a '.'.
# Second rule: turn cells that are the empty string into NaN so that the
# float conversion below is possible.
cleanup_rules = {r'[^0-9\.]': '', '': np.nan}
numeric_values = df[cols].replace(cleanup_rules, regex=True).astype(np.float64)
df[cols] = numeric_values

#df.to_csv(path+'full_vital_signs_dataset.csv')

# Bare expression: only shows the column count in an interactive session.
len(df.columns)

# Marker colour used by the matplotlib scatter plot.
red_orange = '#ff3700'

# The 'Baltimore City' row is a city-wide aggregate rather than a single
# area, so it is removed before plotting and modelling.
df.drop('Baltimore City', inplace=True)

# Horizontal bar chart of the 'pwhite10' column, one bar per row of df,
# sorted in ascending order.
# Series.sort() has been removed from pandas; sort_values() returns the
# sorted copy that sort(inplace=False) used to return.
df_white_sorted = df['pwhite10'].sort_values()

fig = plt.figure(figsize=[7, 20])
ax1 = plt.subplot(1, 1, 1)

# Booleans instead of the legacy 'off'/'on' strings, which newer matplotlib
# no longer interprets as False/True.
df_white_sorted.plot(kind='barh', grid=False, ax=ax1)

# turn off square border around plot
for side in ('top', 'bottom', 'right', 'left'):
    ax1.spines[side].set_visible(False)

# turn off ticks but keep the tick labels
ax1.tick_params(axis='both', which='both', bottom=False, top=False,
                labelbottom=True, left=False, right=False, labelleft=True,
                labelsize=16)

# plt.title rather than a bare title(), which only exists under %pylab.
plt.title('% White', fontsize=20)

# Horizontal bar chart of the 'hhchpov12' column, one bar per row of df,
# sorted in ascending order.
# Series.sort() has been removed from pandas; sort_values() returns the
# sorted copy that sort(inplace=False) used to return.
df_chPov_sorted = df['hhchpov12'].sort_values()

fig = plt.figure(figsize=[7, 20])
ax1 = plt.subplot(1, 1, 1)

# Booleans instead of the legacy 'off'/'on' strings, which newer matplotlib
# no longer interprets as False/True.
df_chPov_sorted.plot(kind='barh', grid=False, ax=ax1)

# turn off square border around plot
for side in ('top', 'bottom', 'right', 'left'):
    ax1.spines[side].set_visible(False)

# turn off ticks but keep the tick labels
ax1.tick_params(axis='both', which='both', bottom=False, top=False,
                labelbottom=True, left=False, right=False, labelleft=True,
                labelsize=16)

# plt.title rather than a bare title(), which only exists under %pylab.
plt.title('% HH Children in Poverty', fontsize=20)

# pandas only
import matplotlib.pyplot as plt

# Bubble scatter of 'hhchpov12' against 'pwhite10', one bubble per row of df.
fig = plt.figure(figsize=[8, 8])
ax1 = plt.subplot(1, 1, 1)

x = df['pwhite10']
y = df['hhchpov12']
# Presumably total population times the percentage under 18 -- TODO confirm
# the meaning of 'tpop10' and 'age18_10' against the data dictionary.
schoolage_pop = df['tpop10'] * df['age18_10'] / 100
# Marker size passed to scatter(): grows with the square of schoolage_pop.
s = (schoolage_pop / 50) ** 2

plt.scatter(x, y,
            color=red_orange,
            marker='.',
            s=s,
            alpha=.4)

# Explicit plt.* calls: the bare xlabel()/ylim()/title() names only exist
# when the script is run under %pylab.
plt.xlabel('\n% Population White', fontsize=16)
plt.ylabel('% Households w. Children in Poverty\n', fontsize=16)
plt.xlim([-5, 100])
plt.ylim([-5, 100])
plt.title('Baltimore', fontsize=20)

# turn off square border around plot
for side in ('top', 'bottom', 'right', 'left'):
    ax1.spines[side].set_visible(False)

# turn off ticks but keep the tick labels (booleans instead of the legacy
# 'off'/'on' strings, which newer matplotlib no longer interprets)
ax1.tick_params(axis='both', which='both', bottom=False, top=False,
                labelbottom=True, left=False, right=False, labelleft=True,
                labelsize=16)

plt.show()


# same plot, interactive from plotly

#df.drop('Baltimore City', inplace=True)

import plotly.plotly as py

# Inputs for the interactive plotly version of the bubble scatter.
x = df['pwhite10']
y = df['hhchpov12']
schoolage_pop = df['tpop10'] * df['age18_10'] / 100

# Bubble size is given as an area and scaled relative to the largest value.
bubble_marker = Marker(
    size=schoolage_pop,
    sizemode='area',
    sizeref=schoolage_pop.max()/1000,
    opacity=0.5,
    color='blue',
)

# Hover text is the index label of each row.
trace1 = Scatter(
    x=x,
    y=y,
    text=np.array(x.index),
    mode='markers',
    marker=bubble_marker,
)

# Settings shared by both axes: no grid, axis line or zero line, and a tick
# every 20 units.
axis_style = dict(
    showgrid=False,
    showline=False,
    zeroline=False,
    autotick=False,
    dtick=20,
)

layout = Layout(
    title='Baltimore: Too Many Non-White Kids in Poverty',
    xaxis=XAxis(range=[-5,100],
                title='% Population White (2010)',
                **axis_style),
    yaxis=YAxis(range=[-5,100],
                title='% Households w. Children in Poverty (2012)',
                **axis_style),
    autosize=False,
    width=500,
    height=500,
    font=Font(size=14),
    hovermode='closest',
)

data = Data([trace1])
fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='Baltimore Percent Child Poverty Percent White')


# Two overlaid scatters of 'hhpov12': one against 'phisp10' (red) and one
# against 'paa10' (cyan).
size = 100
alpha = 0.5
fontsize = 16

fig = plt.figure(figsize=[8, 8])
ax1 = plt.subplot(1, 1, 1)

# Explicit plt.* calls: the bare scatter()/legend()/xlabel()/... names only
# exist when the script is run under %pylab.
plt.scatter(df['phisp10'], df['hhpov12'], c='r', alpha=alpha, s=size)
plt.scatter(df['paa10'], df['hhpov12'], c='c', alpha=alpha, s=size)

# Legend entries follow the order of the scatter calls above.
plt.legend(['Hispanic', 'Black'], fontsize=fontsize)
plt.xlabel('Percent of Population', fontsize=fontsize)
plt.ylabel('Percent of Households in Poverty', fontsize=fontsize)

# turn off square border around plot
for side in ('top', 'bottom', 'right', 'left'):
    ax1.spines[side].set_visible(False)

# turn off ticks but keep the tick labels (booleans instead of the legacy
# 'off'/'on' strings, which newer matplotlib no longer interprets)
ax1.tick_params(axis='both', which='both', bottom=False, top=False,
                labelbottom=True, left=False, right=False, labelleft=True,
                labelsize=16)

plt.xlim([-5, 100])
plt.ylim([-5, 60])

plt.show()

from sklearn.decomposition import PCA as PCA
from sklearn.preprocessing import StandardScaler

# Standardise every column of df, fit a PCA with all components, and draw
# the explained-variance ratio of each component as a bar chart.
X = np.array(df)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA()
pca.fit(X_scaled)

# Bar heights are the ratios rounded to two decimals.
# plt.bar rather than a bare bar(), which only exists under %pylab.
plt.bar(range(len(pca.explained_variance_ratio_)),
        [round(ratio, 2) for ratio in pca.explained_variance_ratio_])

'''
cov_mat = np.cov(X_scaled.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)
print len(eig_vals)
'''

# Running total of the explained-variance ratios, drawn as a line on the
# current axes.
pd.Series(pca.explained_variance_ratio_).cumsum().plot()
# Explicit plt.* calls: the bare xlabel()/ylabel()/ylim() names only exist
# when the script is run under %pylab.
plt.xlabel('number of dimensions')
plt.ylabel('cumulative fraction of variance explained')
plt.ylim([0, 1])

# Refit the PCA with two components and plot every row of df in the reduced
# space, labelled with its index value.
pca.n_components = 2
X_reduced = pca.fit_transform(X_scaled)

# Single-argument print(...) with %-formatting is valid on both Python 2 and
# Python 3 and produces the same text as the former print statements.
print('Shape:  %s' % (X_reduced.shape,))
print('Explained Variance Ratio =  %s'
      % round(sum(pca.explained_variance_ratio_[:pca.n_components]), 2))

# Keep df's index so each projected point can be traced back to its row.
df_X_reduced = pd.DataFrame(X_reduced, index=df.index)

fig = plt.figure(figsize=[8, 8])
ax = plt.subplot(1, 1, 1)
# plt.plot rather than a bare plot(), which only exists under %pylab.
plt.plot(df_X_reduced[0], df_X_reduced[1], marker='o', linestyle='', alpha=1.0)

# Label each point with its index value, offset 10 points to the right.
# Iterating the index directly replaces the manual counter and the removed
# `.ix` indexer.
for name, pc1, pc2 in zip(df_X_reduced.index, df_X_reduced[0], df_X_reduced[1]):
    ax.annotate('%s' % name, xy=(pc1, pc2), xytext=(10, 0),
                textcoords='offset points')

# Interactive plotly chart of the two PCA components.
x = df_X_reduced[0]
y = df_X_reduced[1]
# Half-widths of the plotted x and y ranges.
xscale = 22
yscale = 10

# Marker diameter is the 'tpop10' column divided by 500.
pca_marker = Marker(
    size=df['tpop10']/500,
    sizemode='diameter',
    opacity=0.5,
    color='blue',
)

# Hover text is the index label of each row.
trace1 = Scatter(
    x=x,
    y=y,
    text=df.index,
    mode='markers',
    marker=pca_marker,
)

# Both axes are drawn without title, grid, axis line, zero line or tick
# labels.
hidden_axis = dict(
    title='',
    showgrid=False,
    showline=False,
    zeroline=False,
    autotick=False,
    showticklabels=False,
)

layout = Layout(
    title='Baltimore Vital Signs PCA',
    xaxis=XAxis(range=[-xscale, xscale], **hidden_axis),
    yaxis=YAxis(range=[-1*yscale, yscale], **hidden_axis),
    autosize=False,
    width=500,
    height=500,
    font=Font(size=14),
    hovermode='closest',
)

data = Data([trace1])
fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='Baltimore PCA')


from sklearn.cluster import KMeans

#reduced_data = PCA(n_components=2).fit_transform(data)
def cluster(n_clusters, data=None, random_state=None):
    """Fit a K-Means model and label the samples it was fitted on.

    Args:
        n_clusters: Number of clusters passed to ``KMeans``.
        data: Samples to cluster. Defaults to the module-level ``X_reduced``,
            which is what this function always used before the parameter
            existed.
        random_state: Forwarded to ``KMeans``. The default ``None`` keeps the
            previous unseeded behaviour; pass an int for repeatable clusters.

    Returns:
        A ``(kmeans, Z)`` tuple: the fitted estimator and the cluster label
        predicted for each sample in ``data``.
    """
    if data is None:
        data = X_reduced
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(data)
    Z = kmeans.predict(data)
    return kmeans, Z

# Fit K-Means for 1 .. len(df)-1 clusters and plot the inertia of each fit.
max_clusters = len(df)

# inertias[k] holds the inertia of the k-cluster fit; slot 0 is never filled.
# np.zeros rather than a bare zeros(), which only exists under %pylab.
inertias = np.zeros(max_clusters)

for i in range(1, max_clusters):
    kmeans, Z = cluster(i)
    inertias[i] = kmeans.inertia_

fig = plt.figure(figsize=[6, 6])
ax1 = plt.subplot(1, 1, 1)
# Plot from k=1 so the unused slot 0 is not drawn as a fake
# "0 clusters, inertia 0" point.
plt.plot(range(1, max_clusters), inertias[1:], marker='.')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Scatter of the two PCA components, coloured by K-Means cluster label and
# annotated with each row's index value.
n_clusters = 7
model, Z = cluster(n_clusters)

fig = plt.figure(figsize=[10, 10])
ax = plt.subplot(1, 1, 1)
plt.scatter(df_X_reduced[0], df_X_reduced[1], c=Z, marker='o', alpha=.5, s=200)

# Label each point with its index value, offset 10 points to the right.
# Iterating the index directly replaces the manual counter and the removed
# `.ix` indexer.
for name, pc1, pc2 in zip(df_X_reduced.index, df_X_reduced[0], df_X_reduced[1]):
    ax.annotate('%s' % name, xy=(pc1, pc2), xytext=(10, 0),
                textcoords='offset points', fontsize=12, alpha=1.0)
plt.show()

# Interactive plotly chart of the PCA projection coloured by K-Means cluster,
# with the cluster centres drawn as 'x' markers.
n_clusters = 7
model, Z = cluster(n_clusters)
x_c = model.cluster_centers_[:, 0]
y_c = model.cluster_centers_[:, 1]

# Step size of the prediction grid below.
h = 1

x = df_X_reduced[0]
y = df_X_reduced[1]
# Half-widths of the plotted x and y ranges.
xscale = 22
yscale = 10

# Predict a cluster label for every node of a regular grid over the data.
# Zc is not used by the chart below; it is kept for later inspection.
# NOTE(review): the grid is shrunk by 1 on each side (min + 1, max - 1)
# rather than padded -- confirm that this is intended.
x_min, x_max = x.min() + 1, x.max() - 1
y_min, y_max = y.min() + 1, y.max() - 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Use the model fitted above. The previous code called `kmeans.predict`,
# i.e. the leftover estimator from the inertia loop, which was fitted with a
# different number of clusters.
Zc = model.predict(np.c_[xx.ravel(), yy.ravel()])

# One bubble per row of df: diameter is 'tpop10' / 400, colour is the
# cluster label, hover text is the index value.
trace1 = Scatter(x=x,
                 y=y,
                 text=df.index,
                 name='',
                 mode='markers',
                 marker=Marker(size=df['tpop10']/400,
                               sizemode='diameter',
                               opacity=0.6,
                               color=Z)
)

# Cluster centres; colours are derived from n_clusters instead of a
# hard-coded list so they stay in step with the model.
trace2 = Scatter(x=x_c,
                 y=y_c,
                 mode='markers',
                 text='',
                 name='',
                 marker=Marker(symbol='x',
                               size=15,
                               color=list(range(n_clusters)))
)

layout = Layout(
    # The cluster count in the title follows n_clusters.
    title=('Baltimore Vital Signs PCA w/ K-Means Clustering <br> '
           'Number of Clusters: %d' % n_clusters),
    xaxis=XAxis(range=[-xscale, xscale],
                title='',
                showgrid=False,
                showline=False,
                zeroline=False,
                autotick=False,
                showticklabels=False),
    yaxis=YAxis(range=[-1*yscale, yscale],
                title='',
                showgrid=False,
                showline=False,
                zeroline=False,
                autotick=False,
                showticklabels=False),
    autosize=False,
    width=600,
    height=600,
    font=Font(size=14),
    hovermode='closest',
    showlegend=False
)

data = Data([trace1, trace2])
fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='Baltimore PCA, K-Means Cluster')