#!/usr/bin/env python
# coding: utf-8
"""Train a soft-voting text classifier on survey comments and write a submission.

Pipeline (exported from a notebook, one section per original cell):

1. Load ``train_universidad.xlsx`` (second sheet) and ``test_universidad.xlsx``.
2. Concatenate both so the cleaning/encoding steps see the same data.
3. Clean the indicator columns, label-encode ``NIVEL ACTUAL`` and normalise
   the free-text ``COMENTARIO`` column (stop words, punctuation, stemming).
4. Fit a TF-IDF vectorizer on the training split and stack the token matrix
   with the tabular features.
5. Fit a soft ``VotingClassifier`` (logistic regression, XGBoost, one-vs-rest
   multinomial naive Bayes), report hold-out accuracy and log-loss.
6. Write the class probabilities for the test rows to ``submission.csv``.
"""

# ### Import Packages

# In[10]:

import string

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SpanishStemmer
from scipy.sparse import hstack
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


# ### Load Data

# In[7]:

# `sheet_name` replaces the removed `sheetname` keyword; index 1 is the
# second sheet of the workbook (sheets are 0-indexed).
train_data = pd.read_excel('train_universidad.xlsx', sheet_name=1)
test_data = pd.read_excel('test_universidad.xlsx')

# Number of labelled rows; used later to split the combined frame back into
# train and test instead of relying on a hard-coded row count.
n_train = len(train_data)


# ### Get Columns

# In[8]:

train_data.columns


# ### First 5 rows

# In[9]:

train_data.head()


# ### Append Data

# In[11]:

# DataFrame.append was removed from pandas; pd.concat is the equivalent.
all_data = pd.concat([train_data, test_data])


# ### Adding 'Comentario' length

# In[12]:

all_data['COMENTARIO_LEN'] = all_data['COMENTARIO'].str.len()


# ### Columns Sets

# In[13]:

data_cols = [
    'NIVEL ACTUAL',
    'Ciclo',
    'COMENTARIO',
    'COMENTARIO_LEN',
    'IND_GEA',
    'IND_DELEGADO',
    'CANT_CURSOS_MATRICU_SIN_INGLES',
    'UOD_depostista_ind_deportista',
]

model_cols = [
    'NIVEL ACTUAL',
    'Ciclo',
    'IND_GEA',
    'IND_DELEGADO',
    'CANT_CURSOS_MATRICU_SIN_INGLES',
    'UOD_depostista_ind_deportista',
]

model_log_cols = [
    'NIVEL ACTUAL',
    'Ciclo',
    'COMENTARIO_LEN',
    'IND_GEA',
    'IND_DELEGADO',
    'CANT_CURSOS_MATRICU_SIN_INGLES',
    'UOD_depostista_ind_deportista',
]


# ### Cleaning Data

# In[14]:

# Each indicator column becomes 1 where it holds the mapped marker value and
# 0 everywhere else (unmapped values turn into NaN, then are filled with 0).
all_data['IND_GEA'] = all_data['IND_GEA'].map({'IND_GEA': 1}).fillna(0)
all_data['IND_DELEGADO'] = (
    all_data['IND_DELEGADO'].map({'Delegado': 1}).fillna(0)
)
all_data['UOD_depostista_ind_deportista'] = (
    all_data['UOD_depostista_ind_deportista'].map({'Deportista': 1}).fillna(0)
)
# NOTE(review): 4 is presumably a typical course load used as the default
# for missing values -- confirm against the data.
all_data['CANT_CURSOS_MATRICU_SIN_INGLES'] = (
    all_data['CANT_CURSOS_MATRICU_SIN_INGLES'].fillna(4)
)


# ### Encoding 'Nivel Actual'

# In[15]:

le = LabelEncoder()

# In[16]:

all_data['NIVEL ACTUAL'] = le.fit_transform(all_data['NIVEL ACTUAL'])

# In[17]:

# After this the index is 0..len-1, so the first `n_train` positions are the
# training rows and the remainder are the test rows.
all_data = all_data.reset_index(drop=True)


# ### Removing stopwords, punctuation, stemming.

# In[18]:

# A set gives O(1) membership tests; the list was scanned for every token.
stop = set(stopwords.words('spanish'))
stemmer = SpanishStemmer()

# In[19]:

# Translation table that deletes every ASCII punctuation character.
pretable = dict.fromkeys(string.punctuation)
table = str.maketrans(pretable)

# In[20]:

# Missing comments would raise on `.split()`; treat them as empty text.
all_data['COMENTARIO'] = all_data['COMENTARIO'].fillna('')

all_data['COMENTARIO'] = all_data['COMENTARIO'].apply(
    lambda x: ' '.join(
        word.translate(table) for word in x.split() if word not in stop
    )
)

# In[21]:

# Both corrections must go through the `.str` accessor: a bare
# `Series.replace` only matches whole cell values, so the second fix was
# never applied inside a comment.
# NOTE(review): the 'Ʊ' in the first pair looks like a mis-encoded 'ñ' --
# confirm against the raw data before changing the literal.
all_data['COMENTARIO'] = (
    all_data['COMENTARIO']
    .str.replace('enseƱansa', 'enseƱanza', regex=False)
    .str.replace('pencion', 'pension', regex=False)
)

# In[22]:

# Second stop-word pass: tokens that only matched a stop word once their
# punctuation was stripped are dropped here, then the rest are stemmed.
all_data['COMENTARIO'] = all_data['COMENTARIO'].apply(
    lambda x: ' '.join(
        stemmer.stem(word) for word in x.split() if word not in stop
    )
)

# In[23]:

all_data.head()


# ### Split Data

# In[24]:

X = all_data.iloc[:n_train][data_cols]
y = all_data['NPS'].iloc[:n_train]
X_final = all_data.iloc[n_train:][data_cols]

# In[25]:

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)


# ### Vectorize 'Comentarios'

# In[26]:

vec = TfidfVectorizer(
    ngram_range=(1, 3), min_df=0.001, max_df=0.6, strip_accents='unicode'
)

# In[27]:

vec

# In[28]:

# Fit on the training split only so the hold-out vocabulary does not leak.
vec.fit(x_train['COMENTARIO'])

# In[29]:

# `get_feature_names` was removed from scikit-learn; this is its successor.
vec.get_feature_names_out()

# In[30]:

x_train_tokens = vec.transform(x_train['COMENTARIO'])

# In[31]:

x_test_tokens = vec.transform(x_test['COMENTARIO'])


# ### Stacking together tokens and categorical features

# In[32]:

# `as_matrix` was removed from pandas; `to_numpy` is the replacement.
# CSR output avoids each estimator converting from COO on its own.
full_x_train = hstack(
    (x_train[model_cols].to_numpy(), x_train_tokens), format='csr'
)
full_x_test = hstack(
    (x_test[model_cols].to_numpy(), x_test_tokens), format='csr'
)


# ### Voting Classifier

# In[33]:

clf_log2 = LogisticRegression(
    C=1, class_weight=None, solver='newton-cg', random_state=1
)

clf_xgb = XGBClassifier(
    objective='multi:softprob',
    scale_pos_weight=1,
    max_depth=9,
    gamma=0.3,
    colsample_bytree=0.9,
    subsample=0.8,
    seed=27,
)

clf_nb2 = OneVsRestClassifier(MultinomialNB())

# In[36]:

# Soft voting averages the predicted class probabilities of the members.
clf_voting = VotingClassifier(
    estimators=[('lr', clf_log2), ('xgb', clf_xgb), ('nb', clf_nb2)],
    voting='soft',
)

# In[37]:

clf_voting.fit(full_x_train, y_train)

# In[39]:

predict_clf_voting = clf_voting.predict(full_x_test)

# In[42]:

# Metrics take (y_true, y_pred). Passing `labels` keeps log_loss aligned with
# the probability columns even if a class is absent from the hold-out split.
print('accuracy: %s' % accuracy_score(y_test, predict_clf_voting))
print('log_loss: %s' % log_loss(
    y_test,
    clf_voting.predict_proba(full_x_test),
    labels=clf_voting.classes_,
))


# ### Results for submission

# In[43]:

x_final_tokens = vec.transform(X_final['COMENTARIO'])

# In[44]:

final_x_test = hstack(
    (X_final[model_cols].to_numpy(), x_final_tokens), format='csr'
)

# In[45]:

final_predict = clf_voting.predict_proba(final_x_test)

# In[46]:

# Re-index from 0 so the ids line up row-by-row with the probability frame.
final_cod = (
    all_data['COD_ENCUESTADO'].iloc[n_train:].copy().reset_index(drop=True)
)

# In[47]:

final_predict_df = pd.concat(
    [
        final_cod,
        pd.DataFrame(
            final_predict, columns=['NPS1', 'NPS2', 'NPS3', 'NPS4']
        ),
    ],
    axis=1,
)

# In[49]:

final_predict_df.to_csv('submission.csv', index=False)

# In[ ]: