Notebook

In [1]:

!pip install catboost==0.17.5

Collecting catboost==0.17.5
  Downloading https://files.pythonhosted.org/packages/21/41/75b28629b9f2668548f431efe0236062aec12cd0a9a647313d7f2d1c9221/catboost-0.17.5-cp36-none-manylinux1_x86_64.whl (62.7MB)
     |████████████████████████████████| 62.7MB 43.8MB/s 
Requirement already satisfied: matplotlib in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (3.0.3)
Requirement already satisfied: scipy in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (1.2.1)
Requirement already satisfied: pandas>=0.24.0 in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (0.25.1)
Requirement already satisfied: plotly in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (4.1.1)
Requirement already satisfied: graphviz in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (0.8.4)
Requirement already satisfied: numpy>=1.16.0 in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (1.16.4)
Requirement already satisfied: six in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (1.12.0)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.6/site-packages (from matplotlib->catboost==0.17.5) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib->catboost==0.17.5) (1.1.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib->catboost==0.17.5) (2.4.2)
Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib->catboost==0.17.5) (2.8.0)
Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.6/site-packages (from pandas>=0.24.0->catboost==0.17.5) (2019.2)
Requirement already satisfied: retrying>=1.3.3 in /opt/conda/lib/python3.6/site-packages (from plotly->catboost==0.17.5) (1.3.3)
Requirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib->catboost==0.17.5) (41.2.0)
Installing collected packages: catboost
  Found existing installation: catboost 0.17.3
    Uninstalling catboost-0.17.3:
      Successfully uninstalled catboost-0.17.3
Successfully installed catboost-0.17.5

In [2]:

import catboost
catboost.__version__

Out[2]:

'0.17.5'

In [3]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/tcdml1920-income-ind/tcd ml 2019-20 income prediction submission file example (random solutions).csv
/kaggle/input/tcdml1920-income-ind/tcd ml 2019-20 income prediction submission file.csv
/kaggle/input/tcdml1920-income-ind/tcd ml 2019-20 income prediction test (without labels).csv
/kaggle/input/tcdml1920-income-ind/tcd ml 2019-20 income prediction training (with labels).csv
/kaggle/input/submission/Submission.csv

In [4]:

# Load Training & Test data
data = pd.read_csv('../input/tcdml1920-income-ind/tcd ml 2019-20 income prediction training (with labels).csv', index_col='Instance')
trainData = deepcopy(data.drop(['Income in EUR', 'Hair Color'], axis=1))
data1 = pd.read_csv('../input/tcdml1920-income-ind/tcd ml 2019-20 income prediction test (without labels).csv', index_col='Instance')
testData = deepcopy(data1.drop(['Income', 'Hair Color'], axis=1))

In [5]:

trainData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111993 entries, 1 to 111993
Data columns (total 9 columns):
Year of Record       111552 non-null float64
Gender               104561 non-null object
Age                  111499 non-null float64
Country              111993 non-null object
Size of City         111993 non-null int64
Profession           111671 non-null object
University Degree    104623 non-null object
Wears Glasses        111993 non-null int64
Body Height [cm]     111993 non-null int64
dtypes: float64(2), int64(3), object(4)
memory usage: 8.5+ MB

In [6]:

testData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73230 entries, 111994 to 185223
Data columns (total 9 columns):
Year of Record       72935 non-null float64
Gender               68368 non-null object
Age                  72951 non-null float64
Country              73230 non-null object
Size of City         73230 non-null int64
Profession           73035 non-null object
University Degree    68373 non-null object
Wears Glasses        73230 non-null int64
Body Height [cm]     73230 non-null int64
dtypes: float64(2), int64(3), object(4)
memory usage: 5.6+ MB

In [7]:

pd.options.display.float_format = '{:.2f}'.format
data['Income in EUR'].describe()

Out[7]:

count    111993.00
mean     109213.81
std      149802.35
min       -5696.91
25%       30771.69
50%       57339.17
75%      126093.60
max     5285252.36
Name: Income in EUR, dtype: float64

In [8]:

# Row Duplication check Train & Test
duplicateRowsTrain = trainData[trainData.duplicated()]
print('train' +duplicateRowsTrain)

duplicateRowsTest = testData[testData.duplicated()]
print('test' + duplicateRowsTest)

Empty DataFrame
Columns: [Year of Record, Gender, Age, Country, Size of City, Profession, University Degree, Wears Glasses, Body Height [cm]]
Index: []
Empty DataFrame
Columns: [Year of Record, Gender, Age, Country, Size of City, Profession, University Degree, Wears Glasses, Body Height [cm]]
Index: []

In [9]:

# Null check Train
trainData.isnull().sum().sort_values()

Out[9]:

Country                 0
Size of City            0
Wears Glasses           0
Body Height [cm]        0
Profession            322
Year of Record        441
Age                   494
University Degree    7370
Gender               7432
dtype: int64

In [10]:

# Replace train missing values with 'unknown', 'others' and dropping NaN
trainData1 = deepcopy(trainData)
trainData1['Gender']            = trainData1['Gender'].replace({np.nan: 'unknown'})
trainData1['University Degree'] = trainData1['University Degree'].replace({np.nan: 'unknown'})
trainData1['Profession']        = trainData1['Profession'].replace({np.nan: 'others'})
trainData1 = trainData1.dropna()

In [11]:

# Null check Test
testData.isnull().sum().sort_values()

Out[11]:

Country                 0
Size of City            0
Wears Glasses           0
Body Height [cm]        0
Profession            195
Age                   279
Year of Record        295
University Degree    4857
Gender               4862
dtype: int64

In [12]:

# Replace test missing values with 'unknown', 'others' and replace NaN with mean
testData1 = deepcopy(testData)
testData1['Gender']            = testData1['Gender'].replace({np.nan: 'unknown'})
testData1['University Degree'] = testData1['University Degree'].replace({np.nan: 'unknown'})
testData1['Profession']        = testData1['Profession'].replace({np.nan: 'others'})
testData1['Year of Record']    = testData1['Year of Record'].replace({np.nan: testData1['Year of Record'].mean()})
testData1['Age']               = testData1['Age'].replace({np.nan: testData1['Age'].mean()})
testData1 = testData1.dropna()

In [13]:

testData1['University Degree'].value_counts()

Out[13]:

Bachelor    27914
Master      17890
No          17705
unknown      4857
PhD          4405
0             459
Name: University Degree, dtype: int64

In [14]:

# unknown and zero mapping to downsize categories

trainData1['University Degree'].where(trainData1['University Degree'] == 0, 'unknown')
trainData1['Gender'].where(trainData1['Gender'] == 0, 'unknown')

testData1['University Degree'].where(testData1['University Degree'] == 0, 'unknown')
testData1['Gender'].where(testData1['Gender'] == 0, 'unknown')

Out[14]:

Instance
111994    unknown
111995    unknown
111996    unknown
111997    unknown
111998    unknown
           ...   
185219    unknown
185220    unknown
185221    unknown
185222    unknown
185223    unknown
Name: Gender, Length: 73230, dtype: object

In [15]:

trainData1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111060 entries, 1 to 111993
Data columns (total 9 columns):
Year of Record       111060 non-null float64
Gender               111060 non-null object
Age                  111060 non-null float64
Country              111060 non-null object
Size of City         111060 non-null int64
Profession           111060 non-null object
University Degree    111060 non-null object
Wears Glasses        111060 non-null int64
Body Height [cm]     111060 non-null int64
dtypes: float64(2), int64(3), object(4)
memory usage: 8.5+ MB

In [16]:

testData1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73230 entries, 111994 to 185223
Data columns (total 9 columns):
Year of Record       73230 non-null float64
Gender               73230 non-null object
Age                  73230 non-null float64
Country              73230 non-null object
Size of City         73230 non-null int64
Profession           73230 non-null object
University Degree    73230 non-null object
Wears Glasses        73230 non-null int64
Body Height [cm]     73230 non-null int64
dtypes: float64(2), int64(3), object(4)
memory usage: 5.6+ MB

In [17]:

normColumn = ['Year of Record', 'Age', 'Size of City', 'Body Height [cm]']
for col in normColumn:
    trainData1[col] = (trainData1[col] - trainData1[col].mean()) / trainData1[col].std()
    testData1[col] = (testData1[col] - testData1[col].mean()) / testData1[col].std()

In [18]:

#Creating a training set for modeling and validation set to check model performance
X = trainData1
y = data.loc[trainData1.index, 'Income in EUR']


X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.9, random_state=18313172)
categorical_features_indices = np.array([1, 3, 5, 6,7])


model=CatBoostRegressor(
    iterations=20000,
    depth=10,
    learning_rate=0.001,
    loss_function='RMSE',
    task_type='GPU',
    border_count=32,
    verbose=500,
    random_seed=18313172
)
model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation),plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 149595.7260333	test: 150914.1420584	best: 150914.1420584 (0)	total: 45.1ms	remaining: 15m 2s
500:	learn: 110975.2304253	test: 112377.2620384	best: 112377.2620384 (500)	total: 21.5s	remaining: 13m 55s
1000:	learn: 89339.2754408	test: 90639.6342533	best: 90639.6342533 (1000)	total: 43.9s	remaining: 13m 52s
1500:	learn: 77254.4124608	test: 78299.9275946	best: 78299.9275946 (1500)	total: 1m 7s	remaining: 13m 46s
2000:	learn: 70341.2129667	test: 71191.7507614	best: 71191.7507614 (2000)	total: 1m 31s	remaining: 13m 43s
2500:	learn: 66168.7933118	test: 66919.4712290	best: 66919.4712290 (2500)	total: 1m 56s	remaining: 13m 34s
3000:	learn: 63474.5356981	test: 64223.0711685	best: 64223.0711685 (3000)	total: 2m 19s	remaining: 13m 11s
3500:	learn: 61651.9567450	test: 62439.5555315	best: 62439.5555315 (3500)	total: 2m 42s	remaining: 12m 48s
4000:	learn: 60289.2185402	test: 61212.8852321	best: 61212.8852321 (4000)	total: 3m 9s	remaining: 12m 36s
4500:	learn: 59215.0575239	test: 60318.1784353	best: 60318.1784353 (4500)	total: 3m 33s	remaining: 12m 15s
5000:	learn: 58337.4992729	test: 59639.8253793	best: 59639.8253793 (5000)	total: 3m 56s	remaining: 11m 50s
5500:	learn: 57607.3371942	test: 59114.6178474	best: 59114.6178474 (5500)	total: 4m 20s	remaining: 11m 27s
6000:	learn: 56989.9768403	test: 58729.5125448	best: 58729.5125448 (6000)	total: 4m 45s	remaining: 11m 7s
6500:	learn: 56466.0037300	test: 58425.0518813	best: 58425.0518813 (6500)	total: 5m 11s	remaining: 10m 47s
7000:	learn: 55987.5413825	test: 58176.3524208	best: 58176.3524208 (7000)	total: 5m 36s	remaining: 10m 24s
7500:	learn: 55549.8738628	test: 57979.4582681	best: 57979.4582681 (7500)	total: 6m 2s	remaining: 10m 3s
8000:	learn: 55159.0726597	test: 57811.8757428	best: 57811.8757428 (8000)	total: 6m 31s	remaining: 9m 46s
8500:	learn: 54786.7366582	test: 57669.8035927	best: 57669.7315572 (8498)	total: 7m	remaining: 9m 29s
9000:	learn: 54447.2023644	test: 57553.7260230	best: 57553.7260230 (9000)	total: 7m 26s	remaining: 9m 5s
9500:	learn: 54136.2131406	test: 57448.7746479	best: 57448.7746479 (9500)	total: 7m 54s	remaining: 8m 44s
10000:	learn: 53841.9821730	test: 57347.6879779	best: 57347.6879779 (10000)	total: 8m 22s	remaining: 8m 22s
10500:	learn: 53548.1813576	test: 57252.3928831	best: 57252.3928831 (10500)	total: 8m 52s	remaining: 8m 1s
11000:	learn: 53268.1231175	test: 57182.1044128	best: 57182.1044128 (11000)	total: 9m 23s	remaining: 7m 41s
11500:	learn: 52984.5461628	test: 57105.3114793	best: 57105.3114793 (11500)	total: 9m 51s	remaining: 7m 17s
12000:	learn: 52711.9940093	test: 57032.4281303	best: 57032.3056257 (11999)	total: 10m 18s	remaining: 6m 52s
12500:	learn: 52450.7681290	test: 56979.0507795	best: 56979.0507795 (12500)	total: 10m 45s	remaining: 6m 27s
13000:	learn: 52202.4485121	test: 56924.0842024	best: 56924.0842024 (13000)	total: 11m 13s	remaining: 6m 2s
13500:	learn: 51953.0728618	test: 56872.9791663	best: 56872.9791663 (13500)	total: 11m 39s	remaining: 5m 36s
14000:	learn: 51700.6633886	test: 56825.3174241	best: 56825.3174241 (14000)	total: 12m 7s	remaining: 5m 11s
14500:	learn: 51465.6708007	test: 56773.9339014	best: 56773.7775788 (14498)	total: 12m 35s	remaining: 4m 46s
15000:	learn: 51239.0322292	test: 56735.2758564	best: 56735.2758564 (15000)	total: 13m 3s	remaining: 4m 21s
15500:	learn: 51018.4113180	test: 56703.4519602	best: 56703.2155196 (15499)	total: 13m 35s	remaining: 3m 56s
16000:	learn: 50801.4646706	test: 56677.4975544	best: 56677.4975544 (16000)	total: 14m 4s	remaining: 3m 31s
16500:	learn: 50584.6577016	test: 56645.1144961	best: 56645.1144961 (16500)	total: 14m 33s	remaining: 3m 5s
17000:	learn: 50375.4930260	test: 56620.6608040	best: 56620.6374589 (16992)	total: 15m	remaining: 2m 38s
17500:	learn: 50179.0605668	test: 56597.4543749	best: 56597.3109105 (17496)	total: 15m 29s	remaining: 2m 12s
18000:	learn: 49991.8944681	test: 56574.4887572	best: 56574.4887572 (18000)	total: 15m 57s	remaining: 1m 46s
18500:	learn: 49797.1038142	test: 56551.9879618	best: 56551.8978071 (18499)	total: 16m 29s	remaining: 1m 20s
19000:	learn: 49598.0592227	test: 56532.0569331	best: 56532.0569331 (19000)	total: 17m 3s	remaining: 53.8s
19500:	learn: 49418.0998164	test: 56514.5446915	best: 56514.5112787 (19498)	total: 17m 33s	remaining: 26.9s
19999:	learn: 49235.4781576	test: 56497.9561760	best: 56497.8492237 (19984)	total: 18m 1s	remaining: 0us
bestTest = 56497.84922
bestIteration = 19984
Shrink model to first 19985 iterations.

Out[18]:

<catboost.core.CatBoostRegressor at 0x7ff538191a90>

In [19]:

print(model.get_feature_importance(prettified=True))

          Feature Id  Importances
0            Country        30.65
1                Age        18.44
2     Year of Record        16.90
3         Profession        12.47
4  University Degree         7.72
5   Body Height [cm]         6.08
6       Size of City         3.95
7             Gender         3.36
8      Wears Glasses         0.42

In [20]:

submission = pd.DataFrame()
submission['Instance'] = testData1.index
submission['Income'] = model.predict(testData1)
submission.to_csv('Submission.csv', index = False)