In [1]:
# Load IPython's "autoreload" extension so edits to the project modules are
# picked up without restarting the kernel.
%load_ext autoreload

# Mode 1: only modules explicitly registered with "%aimport" are reloaded
# before each cell runs; everything else is imported once as usual.
%autoreload 1

import os
import sys
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

# Make the project's 'src' directory (a sibling of the directory the notebook
# is run from) importable, so the `data`, `features`, `models` and
# `visualization` packages can be imported below.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Register the project modules with autoreload (mode 1 reloads only modules
# marked via %aimport), then import the helpers used in this notebook.
# These imports rely on the 'src' directory having been added to sys.path.
%aimport data.read_data
%aimport models.train_model
%aimport features.build_features
%aimport visualization.visualize
from data.read_data import read_data, get_stopwords
from features.build_features import get_vec, to_categorical, replace_na, to_tfidf, stack_sparse, to_sparse_int, get_fasttext
from models.train_model import split_train, score_function, model_ridge, model_xgb, model_ensembler
from visualization.visualize import plot_roc, plot_scatter
[16:55:32] DEBUG Fast version of gensim.models.doc2vec is being used
[16:55:32] DEBUG Fast version of Fasttext is being used
[16:55:32] INFO 'pattern' package not found; tag filters are not available for English
[nltk_data] Downloading package punkt to /home/cris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [2]:
# Load the labelled training set. The recorded output shows the columns
# ID, review_content, review_title, review_stars, product and Target.
train = read_data(test=False)
# 'Target' is the label column used as y for the split and scoring below.
y = train['Target']
# Stopword list passed to the tf-idf and fastText feature helpers below.
stopwords = get_stopwords()
# Preview the first rows as a sanity check.
train.head()
Out[2]:
ID review_content review_title review_stars product Target
0 0 En appelant un acheteur pour demander si l'écr... La Police s'inscrit en acheteur privé sur Pric... 5 2fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5... 0
1 1 Alors, là, on a affaire au plus grand Navet ja... Chef D'Oeuvre Absolu en vue... 5 7b56d9d378d9e999d293f301ac43d044cd7b4786d09afb... 1
2 2 Effet garanti sur la terrase. Ils donnent immé... Effet garanti sur la terrase. Ils donnent immé... 3 7b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c... 0
3 3 tres bon rapport qualite prix tre pratique en ... bon produit 4 77d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88... 1
4 4 Ordinateur de bureau trés bien pour quelqu'un ... Apple Power MAC G4 3 f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b... 1
In [3]:
# Feature engineering: fill missing text, then build one feature block per
# source column and stack them into a single matrix.
train = replace_na(train, ['review_content', 'review_title'])

X_dummies = to_categorical(train, 'review_stars')
# Same tf-idf helper applied to the review body first, then to the title.
X_content, X_title = (
    to_tfidf(train, text_column, stopwords)
    for text_column in ('review_content', 'review_title')
)
X_length = to_sparse_int(train, 'review_content')

# Order of the blocks fixes the column layout of the merged matrix.
feature_blocks = [X_dummies, X_content, X_title, X_length]
sparse_merge = stack_sparse(feature_blocks)
In [4]:
# Load the fastText model. In the recorded run this read projection weights
# from ../data/external/wiki.fr.bin (a 1152449 x 300 matrix) and took
# roughly half a minute.
model_fasttext = get_fasttext()
# Build the second feature set from the raw review text.
# NOTE(review): presumably one embedding vector per review, with stopwords
# filtered out — confirm in features.build_features.get_vec.
xtrain = get_vec(train['review_content'].values, model_fasttext, stopwords)
[16:55:47] INFO loading projection weights from ../data/external/wiki.fr.bin
[16:55:47] DEBUG {'kw': {}, 'mode': 'rb', 'uri': '../data/external/wiki.fr.bin'}
[16:55:47] DEBUG encoding_wrapper: {'errors': 'strict', 'encoding': None, 'mode': 'rb', 'fileobj': <_io.BufferedReader name='../data/external/wiki.fr.bin'>}
[16:56:20] INFO loaded (1152449, 300) matrix from ../data/external/wiki.fr.bin
In [5]:
# Split both feature sets and the labels in one call so the train/test rows
# stay aligned across them; 33% is held out and the seed is fixed.
(
    X_train_tfv, X_test_tfv,
    X_train_ft, X_test_ft,
    y_train, y_test,
) = train_test_split(
    sparse_merge,
    xtrain,
    y,
    test_size=0.33,
    random_state=7,
)
In [6]:
# Fit the stacked ensemble on the training split of both feature sets.
# The recorded run logged two level-0 models and one level-1 model, each
# validated over 3 folds, and took about six and a half minutes.
ens = model_ensembler(X_train_tfv, X_train_ft, y_train)
[16:57:12] INFO Found 2 classes
[16:57:12] INFO Training Level 0 Fold # 1. Model # 0
[16:58:48] INFO Predicting Level 0. Fold # 1. Model # 0
[16:58:49] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.707828
[16:58:49] INFO Training Level 0 Fold # 2. Model # 0
[17:00:29] INFO Predicting Level 0. Fold # 2. Model # 0
[17:00:30] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.701661
[17:00:30] INFO Training Level 0 Fold # 3. Model # 0
[17:02:09] INFO Predicting Level 0. Fold # 3. Model # 0
[17:02:09] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.708444
[17:02:09] INFO Level 0. Model # 0. Mean Score = 0.705978. Std Dev = 0.003062
[17:02:09] INFO Training Level 0 Fold # 1. Model # 1
[17:02:34] INFO Predicting Level 0. Fold # 1. Model # 1
[17:02:35] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.700502
[17:02:35] INFO Training Level 0 Fold # 2. Model # 1
[17:03:00] INFO Predicting Level 0. Fold # 2. Model # 1
[17:03:02] INFO Level 0. Fold # 2. Model # 1. Validation Score = 0.693814
[17:03:02] INFO Training Level 0 Fold # 3. Model # 1
[17:03:27] INFO Predicting Level 0. Fold # 3. Model # 1
[17:03:29] INFO Level 0. Fold # 3. Model # 1. Validation Score = 0.701992
[17:03:29] INFO Level 0. Model # 1. Mean Score = 0.698769. Std Dev = 0.003556
[17:03:29] INFO Saving predictions for level # 0
[17:03:29] INFO Training Level 1 Fold # 1. Model # 0
[17:03:32] INFO Predicting Level 1. Fold # 1. Model # 0
[17:03:32] INFO Level 1. Fold # 1. Model # 0. Validation Score = 0.691023
[17:03:32] INFO Training Level 1 Fold # 2. Model # 0
[17:03:35] INFO Predicting Level 1. Fold # 2. Model # 0
[17:03:35] INFO Level 1. Fold # 2. Model # 0. Validation Score = 0.688373
[17:03:35] INFO Training Level 1 Fold # 3. Model # 0
[17:03:37] INFO Predicting Level 1. Fold # 3. Model # 0
[17:03:37] INFO Level 1. Fold # 3. Model # 0. Validation Score = 0.696277
[17:03:37] INFO Level 1. Model # 0. Mean Score = 0.691891. Std Dev = 0.003284
[17:03:37] INFO Saving predictions for level # 1
In [7]:
import numpy as np
In [8]:
# Held-out inputs for the ensemble, keyed the same way as at fit time:
# key 0 carries the sparse matrix twice, key 1 carries the fastText features.
n_test_rows = X_test_ft.shape[0]
test_data_dict = {
    0: [X_test_tfv, X_test_tfv],
    1: [X_test_ft],
}

# Per the recorded log, predict() refits each model on the full training data
# before predicting, so this cell takes a few minutes.
preds = ens.predict(test_data_dict, lentest=n_test_rows)

# Average column 1 of the first two prediction arrays element-wise.
# NOTE(review): column 1 is presumably the positive-class probability — confirm.
column_one_scores = [preds[idx][:, 1] for idx in (0, 1)]
preds1 = np.mean(column_one_scores, axis=0)

# Score the averaged predictions against the held-out labels.
score_function(y_test, preds1)
[17:03:38] INFO Training Fulldata Level 0. Model # 0
[17:05:45] INFO Predicting Test Level 0. Model # 0
[17:05:46] INFO Training Fulldata Level 0. Model # 1
[17:06:23] INFO Predicting Test Level 0. Model # 1
[17:06:27] INFO Training Fulldata Level 1. Model # 0
[17:06:31] INFO Predicting Test Level 1. Model # 0
Out[8]:
0.71845231796402476