The data comes from Kaggle's "Gender Recognition by Voice" dataset.
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the voice dataset, then separate the target column ('label') from the
# remaining columns, which serve as the predictors.
xy = pd.read_csv('data/voice.csv')
y = xy['label']
X = xy.drop('label', axis=1)

# Hold out part of the rows for evaluation. No sizes or seed are passed, so
# the library defaults apply and the split differs from run to run.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y)
We'll train a random forest classifier on the training split and keep the held-out test split for evaluation.
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees; every other hyperparameter is left at its default.
forest_params = {'n_estimators': 100}
rf = RandomForestClassifier(**forest_params)

# Fit on the training split only, so the test split stays unseen.
rf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
from sklearn.metrics import accuracy_score

# Predict labels for the held-out rows, then compute the fraction that match
# the true labels.
test_predictions = rf.predict(X_test)
accuracy_score(y_test, test_predictions)
0.98232323232323238
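Nice! We got over 98% accuracy on the held-out test set. (The exact figure varies slightly from run to run, because neither the train/test split nor the random forest uses a fixed random seed.)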
from lime.lime_tabular import LimeTabularExplainer

features = list(X_train.columns)

# LIME attaches class_names to the columns of predict_proba's output by
# position, so read the names from the fitted model instead of hard-coding
# them: rf.classes_ is in the same order as the probability columns.
explainer = LimeTabularExplainer(
    X_train.values,
    feature_names=features,
    class_names=[str(c) for c in rf.classes_],
)


def predict_proba_named(rows):
    """Return rf's class probabilities for a 2-D array of feature rows.

    rf was fitted on a DataFrame (X_train), while LIME passes plain NumPy
    arrays to its prediction function. Re-attaching the column names keeps the
    prediction input consistent with the training input and avoids the
    feature-name mismatch warning emitted by recent scikit-learn releases.
    """
    return rf.predict_proba(pd.DataFrame(rows, columns=features))


# randomly pick an example
example = X_train.sample(1).values[0]
exp = explainer.explain_instance(example, predict_proba_named)
exp.show_in_notebook()
This person's mean fundamental frequency is below 0.12. According to the LIME explanation, that is the main reason the model classified this person as male.
dreamgonfly@gmail.com