In [100]:
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import umap
import umap.plot

from imblearn.under_sampling import RandomUnderSampler
In [101]:
df = pd.read_pickle("dataframe_survey_2018-01-23_enriched.pickle")
df
Out[101]:
url typealyzer actual e s t sntf_s sntf_n sntf_t sntf_f ... sad you cogmech auxverb they incl money feel we hear
1 http://adropofcolour.tumblr.com ISFP INFJ 0.291281 0.787844 0.460961 0.663515 0.178565 0.069282 0.088638 ... 0.000000 0.019704 0.098522 0.147783 0.000000 0.039409 0.009852 0.019704 0.044335 0.009852
2 http://godheadcomplex.tumblr.com ESFP INFP 0.883579 0.951693 0.238407 0.855921 0.046931 0.021850 0.075297 ... 0.000000 0.017513 0.201401 0.084063 0.001751 0.056042 0.007005 0.017513 0.047285 0.003503
3 http://chaotikaeon2.tumblr.com INTJ INTP 0.332444 0.357863 0.591322 0.147668 0.252326 0.339831 0.260175 ... 0.003283 0.014540 0.181989 0.114916 0.000938 0.071295 0.010319 0.008912 0.054409 0.014540
5 http://perpetually-in-transit.blogspot.com ESFP ENFJ 0.944394 0.943192 0.105527 0.778825 0.051134 0.017299 0.152742 ... 0.002497 0.018727 0.207241 0.104869 0.002497 0.049938 0.014981 0.011236 0.041199 0.017478
10 http://museofmystery.wordpress.com/2012/08/29/... ISTP INFP 0.073352 0.850472 0.608812 0.628322 0.112762 0.149270 0.109646 ... 0.001031 0.005155 0.215464 0.122680 0.005155 0.043299 0.019588 0.002062 0.021649 0.012371
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
25432 http://pistoche.tumblr.com ESFP INTJ 0.685653 0.969891 0.480241 0.960824 0.029758 0.004220 0.005199 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25433 http://lokh.tumblr.com ISTP INTP 0.201637 0.553602 0.662618 0.468074 0.374926 0.099968 0.057033 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25435 http://readerdye.tumblr.com ISTP INFP 0.375704 0.756593 0.740688 0.697536 0.229456 0.051684 0.021324 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25436 http://loveisart.tumblr.com ISTP ENFP 0.002516 0.848823 0.661502 0.584138 0.118812 0.192779 0.104271 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25437 http://angelalll.tumblr.com ESTP INFP 0.814616 0.652280 0.832608 0.518149 0.281291 0.163392 0.037168 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

22919 rows × 115 columns

In [102]:
df.func # the four cognitive functions (s, n, t, f) without their attitudinal direction (introversion/extraversion)
Out[102]:
1        f
2        t
3        f
5        f
10       t
        ..
25432    f
25433    f
25435    t
25436    f
25437    n
Name: func, Length: 22919, dtype: object
In [144]:
liwc_cols = ["negate","ppron","nonfl","i","relativ","percept","quant","affect","shehe","achieve","bio","leisure","conj","motion","posemo","adverb","home","future","negemo","number","inhib","humans","pronoun","excl","space","tentat","see","past","anx","family","present","health","verb","certain","anger","preps","swear","ingest","discrep","friend","relig","time","cause","article","body","social","assent","work","sexual","insight","ipron","filler","death","funct","sad","you","cogmech","auxverb","they","incl","money","feel","we","hear"]
data = df[liwc_cols].copy()  # explicit copy so adding the label column below does not raise SettingWithCopyWarning
data["y"] = df.func
data = data.dropna()
data
Out[144]:
negate ppron nonfl i relativ percept quant affect shehe achieve ... you cogmech auxverb they incl money feel we hear y
1 0.034483 0.428571 0.049261 0.334975 0.197044 0.039409 0.024631 0.088670 0.029557 0.024631 ... 0.019704 0.098522 0.147783 0.000000 0.039409 0.009852 0.019704 0.044335 0.009852 f
2 0.040280 0.416813 0.063047 0.285464 0.316988 0.031524 0.029772 0.082312 0.064799 0.012259 ... 0.017513 0.201401 0.084063 0.001751 0.056042 0.007005 0.017513 0.047285 0.003503 t
3 0.017824 0.439962 0.068949 0.277674 0.353189 0.040807 0.031895 0.090994 0.092402 0.018293 ... 0.014540 0.181989 0.114916 0.000938 0.071295 0.010319 0.008912 0.054409 0.014540 f
5 0.038702 0.377029 0.049938 0.233458 0.223471 0.046192 0.041199 0.072409 0.081149 0.024969 ... 0.018727 0.207241 0.104869 0.002497 0.049938 0.014981 0.011236 0.041199 0.017478 f
10 0.014433 0.479381 0.091753 0.364948 0.319588 0.026804 0.047423 0.102062 0.082474 0.030928 ... 0.005155 0.215464 0.122680 0.005155 0.043299 0.019588 0.002062 0.021649 0.012371 t
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
22914 0.091241 0.390511 0.040146 0.244526 0.306569 0.025547 0.021898 0.065693 0.025547 0.021898 ... 0.040146 0.244526 0.069343 0.003650 0.091241 0.007299 0.000000 0.076642 0.007299 t
22915 0.020309 0.508530 0.053615 0.351340 0.287571 0.049553 0.045491 0.119415 0.084890 0.017059 ... 0.019090 0.211210 0.150690 0.012998 0.045085 0.016247 0.016653 0.040211 0.010154 n
22916 0.029458 0.482904 0.061021 0.307207 0.262493 0.039979 0.028406 0.110994 0.124671 0.025776 ... 0.017885 0.197791 0.162020 0.006312 0.042083 0.010521 0.013677 0.026828 0.010521 n
22917 0.113971 0.139706 0.025735 0.084559 0.136029 0.022059 0.018382 0.040441 0.025735 0.000000 ... 0.014706 0.113971 0.069853 0.007353 0.011029 0.000000 0.007353 0.007353 0.000000 f
22918 0.003861 0.455598 0.057915 0.333977 0.322394 0.019305 0.036680 0.073359 0.075290 0.017375 ... 0.027027 0.133205 0.113900 0.003861 0.055985 0.003861 0.003861 0.015444 0.011583 n

20819 rows × 65 columns

In [145]:
print(len(data.columns))
y = data.iloc[:, [64]]   # the "y" label column
X = data.iloc[:, 0:63]   # the first 63 LIWC columns; note this leaves out the last feature, "hear"
65
In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2045)
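The split above is purely random. The label distribution (shown a few cells below) is skewed towards n, so one option, not used in this run, is to stratify the split so train and test keep the same class proportions. A minimal sketch with the same seed:

# hypothetical variant: stratified split, keeping class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2045, stratify=y)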
In [159]:
X_test.values.shape
Out[159]:
(4164, 63)
In [160]:
y_test.values.ravel().shape
Out[160]:
(4164,)
In [161]:
y_train.y.value_counts()
Out[161]:
n    6883
f    4489
t    3278
s    2005
Name: y, dtype: int64
In [170]:
type(y_train.iloc[:,0])
Out[170]:
pandas.core.series.Series
In [171]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train.iloc[:,0])
Out[171]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
In [175]:
y_pred = model.predict(X_test)
In [176]:
accuracy = accuracy_score(y_test, y_pred)
accuracy
Out[176]:
0.3547070124879923
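The fitted booster exposes per-feature importances, which could hint at which LIWC categories carry most of the signal. A sketch using xgboost's plotting helper, not executed as part of this run:

# hypothetical follow-up: top LIWC features by importance in the fitted booster
fig, ax = plt.subplots(figsize=(8, 12))
xgb.plot_importance(model, ax=ax, max_num_features=20)  # 20 highest-ranked features
plt.show()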
In [178]:
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test, y_test, ax=ax)

plt.show()
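cross_val_score and RepeatedStratifiedKFold are imported at the top but never used below; the 0.35 figure above comes from a single 80/20 split. A sketch of a less split-dependent estimate, assuming the same X and y defined earlier:

# hypothetical: repeated stratified cross-validation of the baseline booster
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=2045)
scores = cross_val_score(xgb.XGBClassifier(), X, y.values.ravel(),
                         scoring="accuracy", cv=cv, n_jobs=-1)
print(scores.mean(), scores.std())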

With balanced classes

In [179]:
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=1)
data_balanced, balanced_y = rus.fit_resample(data, data['y'])
In [180]:
data_balanced
Out[180]:
negate ppron nonfl i relativ percept quant affect shehe achieve ... you cogmech auxverb they incl money feel we hear y
0 0.050562 0.252809 0.033708 0.188202 0.160112 0.025281 0.016854 0.070225 0.030899 0.008427 ... 0.002809 0.115169 0.115169 0.002809 0.025281 0.005618 0.016854 0.028090 0.002809 f
1 0.038462 0.378205 0.054487 0.272436 0.285256 0.054487 0.038462 0.108974 0.057692 0.006410 ... 0.012821 0.227564 0.108974 0.000000 0.083333 0.003205 0.019231 0.035256 0.022436 f
2 0.018786 0.552023 0.046243 0.341040 0.339595 0.052023 0.066474 0.114162 0.135838 0.026012 ... 0.027457 0.225434 0.167630 0.020231 0.049133 0.010116 0.018786 0.027457 0.020231 f
3 0.039113 0.413299 0.061278 0.320730 0.251630 0.053455 0.032595 0.067797 0.053455 0.002608 ... 0.010430 0.170795 0.088657 0.001304 0.045632 0.005215 0.013038 0.027379 0.007823 f
4 0.026490 0.394702 0.079470 0.270199 0.241060 0.046358 0.027815 0.092715 0.070199 0.023841 ... 0.027815 0.143046 0.092715 0.000000 0.018543 0.006623 0.010596 0.026490 0.025166 f
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10031 0.027867 0.454984 0.065380 0.274920 0.310289 0.032154 0.029475 0.088424 0.138800 0.012326 ... 0.013398 0.190782 0.150054 0.003751 0.043944 0.008039 0.007503 0.024116 0.013934 t
10032 0.052083 0.322917 0.050000 0.229167 0.237500 0.018750 0.006250 0.093750 0.041667 0.010417 ... 0.008333 0.129167 0.079167 0.002083 0.012500 0.002083 0.006250 0.041667 0.004167 t
10033 0.128889 0.306667 0.017778 0.213333 0.284444 0.022222 0.013333 0.075556 0.022222 0.017778 ... 0.026667 0.155556 0.053333 0.000000 0.031111 0.008889 0.008889 0.044444 0.000000 t
10034 0.048276 0.439655 0.047414 0.289655 0.238793 0.046552 0.056034 0.106034 0.076724 0.015517 ... 0.020690 0.199138 0.129310 0.000862 0.052586 0.008621 0.012069 0.051724 0.022414 t
10035 0.060134 0.443207 0.044543 0.293987 0.360802 0.020045 0.042316 0.122494 0.073497 0.026726 ... 0.022272 0.200445 0.113586 0.006682 0.069042 0.002227 0.011136 0.046771 0.004454 t

10036 rows × 65 columns

In [181]:
balanced_y.value_counts()
Out[181]:
t    2509
n    2509
f    2509
s    2509
Name: y, dtype: int64
In [182]:
data_balanced.y.value_counts()
Out[182]:
t    2509
n    2509
f    2509
s    2509
Name: y, dtype: int64
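Undersampling with sampling_strategy='not minority' cuts every class down to the size of the smallest one (s, 2509 rows), which discards roughly half of the 20819 rows. An alternative that keeps all rows, sketched here but not used in this notebook, is to train on the original split with balanced sample weights:

# hypothetical alternative to undersampling: per-sample weights inverse to class frequency
from sklearn.utils.class_weight import compute_sample_weight

weights = compute_sample_weight("balanced", y_train.iloc[:, 0])
model_weighted = xgb.XGBClassifier()
model_weighted.fit(X_train, y_train.iloc[:, 0], sample_weight=weights)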
In [183]:
print(len(data_balanced.columns))
y = data_balanced.iloc[:, [64]]   # the "y" label column
X = data_balanced.iloc[:, 0:63]   # first 63 LIWC features, as above (again without "hear")
65
In [184]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2045)
In [185]:
y_train.y.value_counts()
Out[185]:
s    2033
t    2017
f    1994
n    1984
Name: y, dtype: int64
In [186]:
y_test.y.value_counts()
Out[186]:
n    525
f    515
t    492
s    476
Name: y, dtype: int64
In [188]:
model = xgb.XGBClassifier()
model.fit(X_train.values, y_train.iloc[:,0])
Out[188]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
In [194]:
X_test
Out[194]:
negate ppron nonfl i relativ percept quant affect shehe achieve ... funct sad you cogmech auxverb they incl money feel we
7343 0.020151 0.458438 0.076826 0.332494 0.328715 0.030227 0.036524 0.095718 0.044081 0.028967 ... 1.358942 0.006297 0.021411 0.152393 0.119647 0.010076 0.035264 0.013854 0.003778 0.050378
2127 0.034139 0.470128 0.034139 0.301565 0.281650 0.046230 0.044808 0.088193 0.076814 0.014936 ... 1.494310 0.002845 0.013514 0.256046 0.148649 0.008535 0.047653 0.009957 0.014936 0.069701
2225 0.030955 0.484260 0.073977 0.351522 0.317419 0.049318 0.037775 0.107030 0.076600 0.031480 ... 1.606506 0.003148 0.021511 0.178909 0.136411 0.006821 0.038300 0.013641 0.018363 0.027807
5473 0.067358 0.393782 0.058722 0.260794 0.303972 0.034542 0.058722 0.074266 0.072539 0.010363 ... 1.298791 0.003454 0.005181 0.127807 0.100173 0.001727 0.032815 0.015544 0.013817 0.053541
1515 0.037975 0.379747 0.042194 0.299578 0.248945 0.059072 0.046414 0.033755 0.063291 0.016878 ... 1.059072 0.000000 0.004219 0.168776 0.067511 0.008439 0.025316 0.021097 0.021097 0.004219
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
550 0.053942 0.448133 0.049793 0.319502 0.242739 0.020747 0.024896 0.080913 0.068465 0.006224 ... 1.307054 0.014523 0.020747 0.155602 0.105809 0.008299 0.037344 0.020747 0.006224 0.031120
3477 0.025516 0.520656 0.102066 0.375456 0.284933 0.060753 0.034629 0.116039 0.078372 0.027339 ... 1.597813 0.007290 0.029162 0.157959 0.156744 0.002430 0.033414 0.019441 0.026124 0.035237
3042 0.010163 0.443089 0.052846 0.239837 0.241870 0.034553 0.038618 0.083333 0.095528 0.010163 ... 1.355691 0.008130 0.026423 0.172764 0.095528 0.012195 0.069106 0.000000 0.006098 0.069106
9734 0.045028 0.360225 0.061914 0.250469 0.275797 0.055347 0.034709 0.110694 0.059099 0.014071 ... 1.252345 0.016886 0.009381 0.214822 0.119137 0.000938 0.045966 0.024390 0.037523 0.040338
9261 0.034934 0.425036 0.055313 0.216885 0.229985 0.039301 0.024745 0.090247 0.085881 0.011645 ... 1.199418 0.001456 0.080058 0.177584 0.109170 0.001456 0.046579 0.008734 0.013100 0.040757

2008 rows × 63 columns

In [196]:
y_pred = model.predict(X_test.values)
In [197]:
accuracy = accuracy_score(y_test, y_pred)
accuracy
Out[197]:
0.24651394422310757
In [198]:
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test.values, y_test, ax=ax)
plt.show()
In [199]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           f       0.27      0.27      0.27       515
           n       0.26      0.25      0.25       525
           s       0.24      0.26      0.25       476
           t       0.21      0.21      0.21       492

    accuracy                           0.25      2008
   macro avg       0.25      0.25      0.25      2008
weighted avg       0.25      0.25      0.25      2008
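With four balanced classes, uniform guessing sits at 0.25, so the 0.2465 accuracy and the per-class scores above are essentially at chance level. A quick way to make that baseline explicit, sketched with scikit-learn's DummyClassifier and not part of the original run:

# hypothetical sanity check: a classifier that guesses labels uniformly at random
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy="uniform", random_state=2045)
dummy.fit(X_train, y_train.iloc[:, 0])
print(accuracy_score(y_test, dummy.predict(X_test)))  # expected to land near 0.25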

With reduced dimensionality

In [200]:
reducer = umap.UMAP()
mapper = umap.UMAP().fit(X) # for plotting
embedding = reducer.fit_transform(X)
In [201]:
embedding.shape
Out[201]:
(10036, 2)
In [202]:
y.values.ravel()
Out[202]:
array(['f', 'f', 'f', ..., 't', 't', 't'], dtype=object)
In [203]:
umap.plot.points(mapper, labels=y.values.ravel())
Out[203]:
<AxesSubplot:>
In [204]:
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2, random_state=2045)
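One caveat here: the reducer was fitted on the full balanced feature matrix before this split, so the test rows have already shaped the embedding. A leakage-free variant, sketched below with the same split seed, would fit UMAP on the training rows only and project the test rows afterwards:

# hypothetical leakage-free variant: fit the reducer on training rows only
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=2045)
reducer = umap.UMAP(random_state=2045)
emb_train = reducer.fit_transform(X_tr)  # embedding learned from training rows
emb_test = reducer.transform(X_te)       # test rows projected into the learned space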
In [205]:
embedding.shape
Out[205]:
(10036, 2)
In [206]:
y.values.shape
Out[206]:
(10036, 1)
In [207]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train.values.ravel())  # ravel to a 1-D array to avoid the column-vector DataConversionWarning
y_pred = model.predict(X_test)
y_pred
Out[207]:
array(['n', 't', 'f', ..., 's', 's', 's'], dtype=object)
In [208]:
X_train.shape
Out[208]:
(8028, 2)
In [209]:
X_test.shape
Out[209]:
(2008, 2)
In [210]:
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test, y_test, ax=ax)
plt.show()
In [211]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           f       0.26      0.23      0.24       515
           n       0.23      0.23      0.23       525
           s       0.21      0.24      0.23       476
           t       0.24      0.24      0.24       492

    accuracy                           0.23      2008
   macro avg       0.23      0.23      0.23      2008
weighted avg       0.24      0.23      0.23      2008
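The embedding-only model also stays at chance level (0.23 accuracy on four balanced classes), so two dimensions of unsupervised UMAP do not retain much class signal. RFE, DecisionTreeClassifier and Pipeline are imported at the top but never used in this section; a possible next step would be to let recursive feature elimination pick a subset of LIWC categories before boosting. A rough sketch, not executed here:

# hypothetical: recursive feature elimination ahead of the booster, evaluated by cross-validation
rfe = RFE(estimator=DecisionTreeClassifier(random_state=2045), n_features_to_select=20)
pipe = Pipeline([("select", rfe), ("xgb", xgb.XGBClassifier())])

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=2045)
scores = cross_val_score(pipe, X, y.values.ravel(), scoring="accuracy", cv=cv, n_jobs=-1)
print(scores.mean())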