import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium as fm
import geopy
from sklearn import linear_model, naive_bayes, feature_selection, metrics, tree
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer sklearn
%matplotlib inline

tips_with_adjectives_1 = pd.read_pickle('./dumps/tips_with_adjectives.pkl')
adj_dummies = pd.read_pickle('./dumps/adjective_dataframe.pkl')
adj_df = pd.read_csv('./dumps/adjective_count_list.csv')

# Keep only adjectives that appear in more than 10 tips
adjective_list = list(adj_df[adj_df['count'] > 10]['word'])

print "Number of tips: ", len(tips_with_adjectives_1)
print "Number of adjectives: ", len(adj_df)
print "Number of significant adjectives (appears in more than 10 tips): ", len(adjective_list)

# tips_with_adjectives['address'] = tips_with_adjectives.apply(
#     lambda x: "{0} {1} {2}, {3}, New York, NY".format(
#         x['BUILDING'].strip(), x['STREET'].strip(), int(x['ZIPCODE']), x['BORO']),
#     axis=1)

latlong_df = pd.read_pickle('./dumps/with_lat_long.pkl')[['foursquare_id', 'lat_long']]

# Attach the adjective dummy columns, drop duplicate tips, and merge in coordinates
tips_adj_df = tips_with_adjectives_1.join(adj_dummies)
tips_adj_df.drop_duplicates(subset=['foursquare_id', 'description'], inplace=True)
tips_adj_df = tips_adj_df.merge(latlong_df, on='foursquare_id', how='left')
len(tips_adj_df)

# desired_columns = [
#     'foursquare_id',
#     'DBA',
#     'description',
#     'tip_words',
#     'tip_adjs',
#     'adj_string',
#     'foursquare_rating',
#     'foursquare_num_of_users',
#     'foursquare_price_tier',
#     'grade_A',
#     'grade_C',
#     'GRADE',
#     'lat_long'
# ]
# tips_df = tips_with_adjectives[desired_columns]
# len(tips_df)

def score_and_predict(model, x_features, y_targets, columns, model_type):
    """Print a fitted model's score, AUC, and its most significant adjective coefficients."""
    score = model.score(x_features, y_targets)
    y_pred = model.predict(x_features)
    # AUC here is computed from hard 0/1 predictions for the classifiers,
    # which understates what probability scores would give
    auc = metrics.roc_auc_score(y_targets, y_pred)
    # f_classif returns (F-statistics, p-values); we want the p-values
    f_scores, p_values = feature_selection.f_classif(x_features, y_targets)
    if model_type == 'naive-bayes' or model_type == 'logistic':
        # exponentiate the log-space coefficients so they read as multiplicative effects
        coef_list = [round(np.exp(x), 4) for x in model.coef_[0]]
    elif model_type == 'linear':
        coef_list = [round(x, 4) for x in model.coef_]
    df_dict = {'adjective': columns, 'p-value': p_values, 'coef': coef_list}
    model_df = pd.DataFrame(df_dict)
    model_df.sort_values(['p-value', 'coef'], ascending=[True, False], inplace=True)  # .sort() in older pandas
    print 'MODEL: ', model
    print 'SCORE: ', score
    print 'AUC: ', auc
    print '\n'
    print 'TOP PREDICTORS (p-value <= 0.05):'
    print model_df[model_df['p-value'] <= 0.05]
    print '\n'
    # return model_df

# for index, column in enumerate(tips_adj_df.columns.values):
#     print column, index

# Adjective dummy columns run from position 34 through the second-to-last column
X_adjs = tips_adj_df.iloc[:, 34:-1]  # .ix in older pandas

# Venue-level features: rating, number of users, and price tier
# (defined here but not used by the models below; note dropna(axis=1) drops
# columns containing NaN -- use axis=0 to drop incomplete rows instead)
X_foursquare_info = tips_adj_df[['foursquare_rating', 'foursquare_num_of_users', 'foursquare_price_tier']].dropna(axis=1)

y = tips_adj_df['grade_A']
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)

# Multinomial Naive Bayes: do tip adjectives predict an A grade?
clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb = clf_multi_nb.fit(X_train, y_train)

print 'Using training set'
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print 'Using testing set'
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print 'Using all data'
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')
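# Not in the original notebook: a minimal cross-validation sanity check on the
# Naive Bayes accuracy, assuming the 0/1 grade_A target above. A single train/test
# split can be noisy; 5-fold CV gives a steadier read. cross_val_score comes from
# the same (old) sklearn.cross_validation module imported above.
from sklearn.cross_validation import cross_val_score

cv_scores = cross_val_score(naive_bayes.MultinomialNB(), X_adjs.values, y.values, cv=5)
print 'CV accuracy: %0.3f (+/- %0.3f)' % (cv_scores.mean(), cv_scores.std() * 2)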
# Linear regression on the same 0/1 target, for comparison
clf_linear = linear_model.LinearRegression()
clf_linear = clf_linear.fit(X_train, y_train)

print 'Using training set'
score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')
print 'Using testing set'
score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')
print 'Using all data'
score_and_predict(clf_linear, X_adjs.values, y.values, X_adjs.columns, 'linear')

clf_logistic = linear_model.LogisticRegression()
clf_logistic = clf_logistic.fit(X_train, y_train)

print 'Using training set'
score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')
print 'Using testing set'
score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')
print 'Using all data'
score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')

# Repeat all three models with grade_C as the target
y = tips_adj_df['grade_C']
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)

clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb = clf_multi_nb.fit(X_train, y_train)

print 'Using training set'
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print 'Using testing set'
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print 'Using all data'
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')

clf_linear = linear_model.LinearRegression()
clf_linear = clf_linear.fit(X_train, y_train)

print 'Using training set'
score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')
print 'Using testing set'
score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')
print 'Using all data'
score_and_predict(clf_linear, X_adjs.values, y.values, X_adjs.columns, 'linear')

clf_logistic = linear_model.LogisticRegression()
clf_logistic = clf_logistic.fit(X_train, y_train)

print 'Using training set'
score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')
print 'Using testing set'
score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')
print 'Using all data'
score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')

# Decision tree on grade_C; fit first, then evaluate on each slice of the data
clf_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
clf_tree = clf_tree.fit(X_train, y_train)

def print_classification_metrics(model, x_features, y_targets):
    """Print accuracy, classification report, and confusion matrix for a fitted classifier."""
    y_pred = model.predict(x_features)
    print "Accuracy: {0:.3f}".format(metrics.accuracy_score(y_targets, y_pred)), "\n"
    print "Classification report"
    print metrics.classification_report(y_targets, y_pred), "\n"
    print "Confusion matrix"
    print metrics.confusion_matrix(y_targets, y_pred), "\n"

print 'Using training set'
print_classification_metrics(clf_tree, X_train, y_train)
print 'Using testing set'
print_classification_metrics(clf_tree, X_test, y_test)
print 'Using all data'
print_classification_metrics(clf_tree, X_adjs.values, y.values)
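# Not in the original notebook: a sketch exporting the fitted tree for visual
# inspection, using the adjective column names as feature labels. 'tree.dot' is an
# assumed output path; render it with Graphviz, e.g. `dot -Tpng tree.dot -o tree.png`.
tree.export_graphviz(clf_tree, out_file='tree.dot',
                     feature_names=[str(c) for c in X_adjs.columns])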
# Restrict to the cheapest venues (Foursquare price tier 1) and re-run Naive Bayes on grade_C
cheap_df = tips_adj_df[tips_adj_df['foursquare_price_tier'] == 1]
y = cheap_df['grade_C']
X_adjs = cheap_df.iloc[:, 34:-1]  # .ix in older pandas
X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)

clf_multi_nb = naive_bayes.MultinomialNB()
clf_multi_nb = clf_multi_nb.fit(X_train, y_train)

print 'Using training set'
score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')
print 'Using testing set'
score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')
print 'Using all data'
score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')

# tips_adj_df.to_pickle('./dumps/tips_complete_features.pkl')
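# Not in the original notebook: a minimal folium sketch putting graded venues on a
# map, using the otherwise-unused `fm` import and the merged 'lat_long' column.
# Assumes each 'lat_long' entry is a (latitude, longitude) pair, grade_A is a 0/1
# dummy, and the folium 0.2+ CircleMarker API; the 500-row cap keeps the map light.
nyc_map = fm.Map(location=[40.73, -73.99], zoom_start=12)
for _, row in tips_adj_df.dropna(subset=['lat_long']).head(500).iterrows():
    lat, lon = row['lat_long']
    marker_color = 'green' if row['grade_A'] else 'red'
    fm.CircleMarker(location=[lat, lon], radius=3, color=marker_color).add_to(nyc_map)
nyc_map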