def convert_name(name, features=None):
    """Convert a name into a fixed-length numeric feature vector.

    The returned ``numpy`` array has ``26 * 26 + 5 = 681`` entries:

    * ``[0:676]`` -- occurrence counts of every lowercase 2-gram, stored at
      index ``(first - 'a') * 26 + (second - 'a')``.
    * ``[-5]`` -- looked-up value for the first three characters.
    * ``[-4]`` -- looked-up value for ``name[-4:][:3]`` (the three characters
      just before the last one).
    * ``[-3]`` -- looked-up value for ``name[-6:][:3]`` (the three characters
      before the last three).
    * ``[-2]`` -- looked-up value for the last three characters.
    * ``[-1]`` -- 1 if the last character is one of ``a e i o u``, else 0.

    A lookup slot stays 0 when its substring is not a key of ``features``.

    NOTE: for names shorter than 6 (resp. 4) characters the ``[-3]``
    (resp. ``[-4]``) slice degenerates to the leading characters of the
    name. That long-standing behaviour is kept unchanged so existing
    feature vectors stay comparable.

    Args:
        name: The name to encode. Converted with ``str()``; ``bytes`` values
            are decoded as UTF-8 first so Python 3 does not encode the
            literal ``"b'...'"`` representation.
        features: Optional mapping from substring to numeric feature value.
            When omitted, the module-level ``my_features`` mapping (defined
            elsewhere in this file) is used, as before.

    Returns:
        A 1-D ``numpy`` float array of length 681. An empty name yields an
        all-zero vector.
    """
    if features is None:
        # Backward-compatible default: resolve the global at call time.
        features = my_features

    alphabet_size = 26
    bigram_slots = alphabet_size * alphabet_size
    arr = np.zeros(bigram_slots + 5)

    # On Python 2 ``bytes is str``, so this branch is skipped there.
    if isinstance(name, bytes) and not isinstance(name, str):
        name = name.decode('utf-8', 'replace')
    name = str(name)

    if not name:
        # Nothing to count and no last character to inspect.
        return arr

    base = ord('a')
    for first, second in zip(name, name[1:]):
        row = ord(first) - base
        col = ord(second) - base
        # Only a-z pairs own a 2-gram slot.  Anything else would index
        # out of range, wrap around through a negative index, or overwrite
        # one of the five trailing feature slots.
        if 0 <= row < alphabet_size and 0 <= col < alphabet_size:
            arr[row * alphabet_size + col] += 1

    # Flag whether the name ends in a (lowercase) vowel.
    arr[-1] = 1 if name[-1] in 'aeiou' else 0

    # (slot, substring) pairs for the dictionary-backed features.
    lookups = (
        (-2, name[-3:]),        # last three characters
        (-3, name[-6:][:3]),    # three characters before the last three
        (-4, name[-4:][:3]),    # three characters before the last one
        (-5, name[:3]),         # first three characters
    )
    for slot, key in lookups:
        # Direct membership test: no intermediate key list is built.
        if key in features:
            arr[slot] = features[key]

    return arr