%matplotlib inline import matplotlib.pyplot as plt import numpy as np import pandas as pd import sqlite3 as sql db_read = 'data/cba.sqlite' sql_query = '''SELECT tot.*, adv.*, misc.*, info.ht, info.wt, info.nationality FROM team_info_realgm tinfo JOIN player_season_totals tot ON tinfo.team_id = tot.team JOIN player_season_advanced adv ON tot.player = adv.player AND tot.season = adv.season AND tinfo.team_id = adv.team JOIN player_season_misc misc ON tot.player = misc.player AND tot.season = misc.season AND tinfo.team_id = misc.team JOIN player_season_info info ON tot.player = info.player AND tot.season = info.season AND tinfo.team = info.team''' with sql.connect(db_read) as con: df = pd.read_sql(sql_query, con) stats = df.T.drop_duplicates().T stats.shape cols_retained = stats.columns - ['sys_id', 'season', 'player', 'team', 'birth_city', 'draft_status'] stats = stats[cols_retained] ht_inches = [] for height in stats['ht']: h = height.split('-') if h[0] and h[1]: ht = 12.0 * float(h[0]) + 1.0 * float(h[1]) ht_inches.append(ht) else: ht_inches.append('') stats['ht'] = ht_inches stats = stats.convert_objects(convert_numeric=True) stats = stats.fillna(stats.mean()[['ht' , 'wt']]) # [(type(stats[col]), col) for col in stats.columns] # [(stats[col].dtype, col) for col in stats.columns] # [(stats[col].isnull().sum(), col) for col in stats.columns] stats.shape stats['nationality'].value_counts() stats['nationality'] = stats['nationality'].replace(to_replace='-', value=np.nan) stats = stats.dropna(subset=['nationality']) #domestic for China, Taiwan, Hong Kong, non-domestic for anything else #1 for domestic, 0 for foreign stats['domestic'] = stats['nationality'].apply(lambda x: 1 if x in ['China', 'Taiwan', 'Hong Kong'] else 0) stats['nationality_class'] = stats['nationality'].apply(lambda x: x if x in ['China', 'United States'] else "Other") stats['nationality_class'].value_counts() stats_nopos = stats.drop(['pos'], axis=1) stats_nopos = stats_nopos.dropna() stats.shape stats = stats.dropna() pos_binarized = pd.get_dummies(stats['pos'], prefix='pos') stats_final = pd.concat([stats, pos_binarized], axis=1).drop(['pos'], axis=1) from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA y_nat = stats_final.pop('nationality').values y_nat_class = stats_final.pop('nationality_class').values y_dom = stats_final.pop('domestic').values x = stats_final.values y_nat = y_nat.astype(str) y_nat_class = y_nat_class.astype(str) feature_names = stats_final.columns.values scaler = StandardScaler() x_scaled = scaler.fit_transform(x) def show_exp_var(decomper, data): var_exp = [] for n in xrange(1, 63+1): dc = decomper(n_components=n) dc.fit(data) var_exp.append(dc.explained_variance_.sum()) plt.figure(figsize=(16,10)) plt.grid(b=True, which='major', color='black', linestyle='-') plt.plot(np.array(var_exp)) plt.tight_layout() show_exp_var(PCA, x_scaled) pca = PCA(n_components=30) x_reduced = pca.fit_transform(x_scaled) x_reduced.shape from sklearn.cross_validation import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_curve, roc_auc_score from sklearn.grid_search import GridSearchCV xtrain, xtest, ytrain, ytest = train_test_split(x_reduced, y_dom, random_state=13) def get_scores(x, y, model): print "model:", model print "cf matrix:\n", confusion_matrix(y, model.predict(x)) print "accuracy:", model.score(x, y) print "precision (tp/tp+fp):", precision_score(y, model.predict(x)) print "recall (tp/p):", recall_score(y, model.predict(x)) print "f1:", f1_score(y, model.predict(x)) lg = LogisticRegression() lg.fit(xtrain, ytrain) get_scores(xtest, ytest, lg) rf = RandomForestClassifier(n_jobs=-1) pg = {'n_estimators': np.arange(10, 101, 10), 'criterion': ['gini', 'entropy']} gs = GridSearchCV(rf, param_grid=pg, scoring='f1') gs.fit(x_reduced, y_dom) gs.grid_scores_ gs.best_params_ rf = RandomForestClassifier(n_jobs=-1, n_estimators=60, criterion='gini') rf.fit(xtrain, ytrain) get_scores(xtest, ytest, rf) rf.classes_ # xtrain_full, xtest_full, ytrain_full, ytest_full = train_test_split(x, y_dom, random_state=13) xtrain_full, xtest_full, ytrain_full, ytest_full = train_test_split(x, y_dom) rf_full = RandomForestClassifier(n_jobs=-1, n_estimators=60, criterion='gini') rf_full.fit(xtrain_full, ytrain_full) get_scores(xtest_full, ytest_full, rf_full) rf_full.feature_importances_ feature_ranks = [feature_names[i] for i in np.argsort(rf_full.feature_importances_)[::-1]] feature_ranks feature_indices = np.arange(len(feature_names)) fig = plt.figure(figsize=(16,10)) ax = fig.add_axes([0, 0, 1, 1]) ax.set_title("Feature Importance in Classifying Domestic vs Foreign CBA Players") ax.barh(feature_indices, np.sort(rf_full.feature_importances_)[::-1], align="center") ax.set_yticks(feature_indices) ax.set_yticklabels(feature_ranks) ax.autoscale() fig.show() from statsmodels.stats.outliers_influence import variance_inflation_factor vif = [variance_inflation_factor(x_scaled, col) for col in xrange(x_scaled.shape[1])] zip(vif, feature_names) corr_matrix = pd.DataFrame(x_scaled, columns=feature_names).corr().abs() corr_top = corr_matrix.unstack().order(ascending=False) corr_top[corr_top != 1][:750:2] #no values/information in this feature stats_pruned = stats_final.drop('_20_ast', axis=1) # y = stats_pruned.pop('nationality').values x = stats_pruned.values scaler = StandardScaler() x_pruned = scaler.fit_transform(x) feature_names_pruned = stats_pruned.columns.values