#!/usr/bin/env python
# coding: utf-8

# In[51]:


import numpy as np
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import csv


# In[104]:


#this function converts name to a feature vector 
def convert_name(name):
    arr = np.zeros(26*26+5)
    #26*26 all possiable 2-grams + 4 for last 1, 3, 3 excluding last
    name = str(name)
    #Iterate every 2 characters 2gram and caliculate the frequency of the 2gram
    for x in range(len(name)-1):
        ind = (ord(name[x])-ord('a'))*26 + (ord(name[x+1])-ord('a'))
        arr[ind] += 1
        
    # check if Last character is vowel
    if (name[-1]=='a' or name[-1]=='e' or name[-1]=='i' or name[-1]=='o' or name[-1]=='u'):
        arr[-1] = 1
    else:
        arr[-1] = 0
        
    #check last 3 characters and set the value to 1 if the last 3 character key is found in the features dictionary for that index 
    if name[-3:] in my_features.keys():
        arr[-2]=my_features[name[-3:]]

    #check 3 characters from last skiping last 3 
    if name[-6:][:3] in my_features.keys():
        arr[-3]=my_features[name[-6:][:3]]

    #check 3 characters from last skiping last 1     
    if name[-4:][:3] in my_features.keys():
        arr[-4]=my_features[name[-4:][:3]]
    
    #first 3 characters      
    if name[:3] in my_features.keys():
        arr[-5]=my_features[name[:3]]
    return arr


# In[105]:


#load data
my_data = np.genfromtxt('data/gender.csv', 
                        delimiter=',', 
                        dtype=[('name','<U16'), ('gender',int)],
                        converters={0: lambda s:s.lower()})
my_data = np.array([row for row in my_data])

#load most common last 3 character for india names create a features dictionary
features_reader = csv.reader(open('data/features.csv', 'r'))
my_features = {}
for row in features_reader:
   k, v = row
   my_features[k] = int(v)


# In[106]:


name_map = np.vectorize(convert_name, otypes=[np.ndarray])
Xlist = name_map(my_data['name'])
X = np.array(Xlist.tolist())
y = my_data['gender']


# In[107]:


for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))


# In[108]:


#check important features
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
indices = indices[:10]
# Print the feature ranking
print("Feature ranking:")

for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

import matplotlib.pyplot as plt
plt.figure()
plt.title("Feature importances")
plt.bar(range(10), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(10), indices)
plt.xlim([-1, 10])
plt.show()


# In[109]:


idx = np.random.choice(np.arange(len(Xlist)), 10, replace=False)
xs = my_data['name'][idx]
ys = y[idx]
pred = clf.predict(X[idx])
for a,b, p in zip(xs,ys, pred):
     print(a,b,p)


# In[110]:


#give some name not present in test/training data (manual check)
xreal = name_map(['virinchi','samanvi','ashrith','saikeerthi','seol','vistista','kranthi','ramraji','anasuya','mangli','keerthi','reddy','shastry'])
Xreal_conveted = np.array(xreal.tolist())
pred = clf.predict(Xreal_conveted)
pred


# In[ ]: