In [51]:
import numpy as np
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split / cross_val_score now live in sklearn.model_selection.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import csv
In [104]:
#this function converts name to a feature vector
def convert_name(name, features=None):
    """Encode a name as a numeric feature vector of length 26*26 + 5.

    Vector layout:
      [0:676]  count of each lowercase-letter 2-gram in the name
      [-5]     feature value for the first 3 characters
      [-4]     feature value for 3 characters from the end, skipping the last 1
      [-3]     feature value for 3 characters from the end, skipping the last 3
      [-2]     feature value for the last 3 characters
      [-1]     1 if the last character is a vowel, else 0
    (slots -5..-2 stay 0 when the substring is not a known feature key)

    Parameters
    ----------
    name : str-like
        Name to encode; coerced to str and lowercased.
    features : dict, optional
        Mapping of 3-character substrings to integer feature values.
        Defaults to the module-level ``my_features`` dictionary, preserving
        the original call signature.

    Returns
    -------
    numpy.ndarray of shape (681,)
    """
    if features is None:
        features = my_features

    arr = np.zeros(26 * 26 + 5)
    name = str(name).lower()
    if not name:
        # Empty input: nothing to encode (original raised IndexError here).
        return arr

    # Count every adjacent character pair. Skip pairs containing characters
    # outside a-z: the original computed out-of-range/negative indices there.
    for a, b in zip(name, name[1:]):
        if 'a' <= a <= 'z' and 'a' <= b <= 'z':
            arr[(ord(a) - ord('a')) * 26 + (ord(b) - ord('a'))] += 1

    # Is the last character a vowel?
    arr[-1] = 1 if name[-1] in 'aeiou' else 0

    # Known 3-character features at several positions in the name.
    arr[-2] = features.get(name[-3:], 0)      # last 3 characters
    arr[-3] = features.get(name[-6:][:3], 0)  # 3 from the end, skipping last 3
    arr[-4] = features.get(name[-4:][:3], 0)  # 3 from the end, skipping last 1
    arr[-5] = features.get(name[:3], 0)       # first 3 characters
    return arr
In [105]:
#load data: names are lowercased at read time by the column-0 converter
my_data = np.genfromtxt('data/gender.csv',
                        delimiter=',',
                        dtype=[('name','<U16'), ('gender',int)],
                        converters={0: lambda s: s.lower()})
# Re-wrap the structured array row-by-row (keeps the original behavior).
my_data = np.array([row for row in my_data])

#load the most common 3-character substrings of Indian names into a
#feature dictionary: {substring: integer feature value}
my_features = {}
# Use a context manager so the file handle is closed (original leaked it).
with open('data/features.csv', 'r') as fh:
    for k, v in csv.reader(fh):
        my_features[k] = int(v)
In [106]:
# Vectorize convert_name so it can be applied element-wise to the name column;
# otypes=[np.ndarray] makes each output element a whole feature vector.
name_map = np.vectorize(convert_name, otypes=[np.ndarray])
Xlist = name_map(my_data['name'])  # 1-D object array of per-name feature vectors
X = np.array(Xlist.tolist())       # stacked into a 2-D (n_samples, 681) matrix
y = my_data['gender']              # integer labels (0/1 in the sample output below)
In [107]:
# Evaluate the random forest over 5 independent train/test splits.
# Seeding both the split and the model makes each trial reproducible
# (the original was unseeded, so the printed accuracies varied per run).
for trial in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33,
                                          random_state=trial)
    clf = RandomForestClassifier(n_estimators=100, min_samples_split=2,
                                 random_state=trial)
    clf.fit(Xtr, ytr)
    # clf.score is mean accuracy on the held-out split
    # (equivalent to np.mean(clf.predict(Xte) == yte))
    print(clf.score(Xte, yte))
0.818355640535
0.821906582901
0.830920513521
0.824638077028
0.821360284075
In [108]:
#check important features of the last-fitted forest
importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees,
# used as the error bars below.
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1][:10]  # top-10 features, best first

# Print the feature ranking
print("Feature ranking:")

for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# NOTE(review): this import belongs in the top imports cell.
import matplotlib.pyplot as plt

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(range(10), importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xlabel("feature index")
ax.set_ylabel("importance")
ax.set_xticks(range(10))
ax.set_xticklabels(indices)
ax.set_xlim([-1, 10])
plt.show()
Feature ranking:
1. feature 680 (0.185898)
2. feature 679 (0.071854)
3. feature 677 (0.050360)
4. feature 678 (0.042615)
5. feature 676 (0.041021)
6. feature 13 (0.010510)
7. feature 442 (0.010436)
8. feature 182 (0.010124)
9. feature 338 (0.009793)
10. feature 346 (0.009638)
In [109]:
# Spot-check: predict 10 random training rows and compare with true labels.
sample = np.random.choice(np.arange(len(Xlist)), 10, replace=False)
names = my_data['name'][sample]
labels = y[sample]
predictions = clf.predict(X[sample])
for name, label, guess in zip(names, labels, predictions):
    print(name, label, guess)
vaibahv 1 1
sagrika 0 0
ramashray 1 1
karekar 1 1
harshika 0 0
deepaak 1 1
fulo 0 0
mahiya 0 0
punyasloka 1 1
pemaram 1 1
In [110]:
#give some name not present in test/training data (manual check)
unseen_names = ['virinchi', 'samanvi', 'ashrith', 'saikeerthi', 'seol',
                'vistista', 'kranthi', 'ramraji', 'anasuya', 'mangli',
                'keerthi', 'reddy', 'shastry']
# Encode the names with the vectorized converter, stack into a 2-D matrix,
# and predict a gender code for each.
encoded = np.array(name_map(unseen_names).tolist())
pred = clf.predict(encoded)
pred
Out[110]:
array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1])