import csv

import numpy as np
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; the same
# helpers have lived in sklearn.model_selection since 0.18.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
def convert_name(name, features=None):
    """Convert a name into a fixed-length numeric feature vector.

    The vector has 26*26 + 5 entries:
      * positions 0..675 : counts of every possible lowercase 2-gram
      * position  -5     : weight of the first 3 characters (from `features`)
      * position  -4     : weight of the 3 characters skipping the last 1
      * position  -3     : weight of the 3 characters skipping the last 3
      * position  -2     : weight of the last 3 characters
      * position  -1     : 1 if the last character is a vowel, else 0

    Parameters
    ----------
    name : str
        Name to featurize; assumed lowercase a-z (other characters would
        index outside the 2-gram table — TODO confirm inputs are clean).
    features : dict, optional
        Mapping of 3-character substrings to integer weights. Defaults to
        the module-level ``my_features`` dictionary loaded from features.csv.

    Returns
    -------
    numpy.ndarray
        Float vector of shape (681,).
    """
    if features is None:
        features = my_features  # module-level dict loaded at import time
    arr = np.zeros(26 * 26 + 5)
    name = str(name)
    base = ord('a')
    # Count every adjacent character pair (2-gram) of the name.
    for i in range(len(name) - 1):
        arr[(ord(name[i]) - base) * 26 + (ord(name[i + 1]) - base)] += 1
    # Flag whether the name ends in a vowel.
    arr[-1] = 1 if name[-1] in 'aeiou' else 0
    # Weights for selected 3-character windows, if present in the dictionary.
    # Short names degrade gracefully: slicing never raises.
    if name[-3:] in features:
        arr[-2] = features[name[-3:]]
    if name[-6:][:3] in features:
        arr[-3] = features[name[-6:][:3]]
    if name[-4:][:3] in features:
        arr[-4] = features[name[-4:][:3]]
    if name[:3] in features:
        arr[-5] = features[name[:3]]
    return arr
# Load the labelled (name, gender) rows; names are lower-cased while reading.
my_data = np.genfromtxt(
    'data/gender.csv',
    delimiter=',',
    dtype=[('name', '<U16'), ('gender', int)],
    converters={0: lambda s: s.lower()},
)
# Re-wrap as a plain structured array (copy of the genfromtxt result).
my_data = np.array(list(my_data))
# Load the most common 3-character substrings for Indian names into a
# {substring: integer weight} dictionary used by convert_name.
# Fix: the original csv.reader(open(...)) leaked the file handle; use `with`.
with open('data/features.csv', 'r') as features_file:
    features_reader = csv.reader(features_file)
    my_features = {k: int(v) for k, v in features_reader}
# Vectorized wrapper: applies convert_name element-wise over an array of
# names; otypes=[np.ndarray] keeps each 681-element vector as one object.
name_map = np.vectorize(convert_name, otypes=[np.ndarray])
Xlist = name_map(my_data['name'])  # object array of per-name feature vectors
X = np.array(Xlist.tolist())  # stack into a 2-D (n_samples, 681) matrix
y = my_data['gender']  # integer target labels
# Train a 100-tree random forest on five random 67/33 splits and print the
# held-out accuracy of each run. `clf` from the final run is reused below.
for trial in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))
# sample output: 0.818355640535 0.821906582901 0.830920513521 0.824638077028 0.821360284075
# Rank features by forest importance; std-dev across trees measures spread.
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
# Top 10 feature indices, most important first.
indices = np.argsort(importances)[::-1][:10]
# Print the feature ranking
print("Feature ranking:")
for rank, feat in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feat, importances[feat]))
import matplotlib.pyplot as plt

# Bar chart of the ten most important features, with inter-tree std-dev bars.
plt.figure()
plt.title("Feature importances")
plt.bar(range(10), importances[indices], color="r",
        yerr=std[indices], align="center")
plt.xticks(range(10), indices)
plt.xlim([-1, 10])
plt.show()
# sample output: Feature ranking: 1. feature 680 (0.185898) 2. feature 679 (0.071854) 3. feature 677 (0.050360) 4. feature 678 (0.042615) 5. feature 676 (0.041021) 6. feature 13 (0.010510) 7. feature 442 (0.010436) 8. feature 182 (0.010124) 9. feature 338 (0.009793) 10. feature 346 (0.009638)
# Spot-check: predict on 10 random rows and show them next to true labels.
idx = np.random.choice(np.arange(len(Xlist)), 10, replace=False)
xs = my_data['name'][idx]
ys = y[idx]
pred = clf.predict(X[idx])
for sample_name, truth, guess in zip(xs, ys, pred):
    print(sample_name, truth, guess)
# sample output: vaibahv 1 1 sagrika 0 0 ramashray 1 1 karekar 1 1 harshika 0 0 deepaak 1 1 fulo 0 0 mahiya 0 0 punyasloka 1 1 pemaram 1 1
# Manual check: predict genders for names not present in the train/test data.
xreal = name_map(['virinchi','samanvi','ashrith','saikeerthi','seol','vistista','kranthi','ramraji','anasuya','mangli','keerthi','reddy','shastry'])
Xreal_converted = np.array(xreal.tolist())  # fixed typo: "conveted"
pred = clf.predict(Xreal_converted)
# A bare `pred` expression shows nothing outside a notebook; print it.
print(pred)
# sample output: array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1])