!pip install catboost==0.17.5
Collecting catboost==0.17.5 Downloading https://files.pythonhosted.org/packages/21/41/75b28629b9f2668548f431efe0236062aec12cd0a9a647313d7f2d1c9221/catboost-0.17.5-cp36-none-manylinux1_x86_64.whl (62.7MB) |████████████████████████████████| 62.7MB 43.8MB/s Requirement already satisfied: matplotlib in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (3.0.3) Requirement already satisfied: scipy in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (1.2.1) Requirement already satisfied: pandas>=0.24.0 in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (0.25.1) Requirement already satisfied: plotly in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (4.1.1) Requirement already satisfied: graphviz in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (0.8.4) Requirement already satisfied: numpy>=1.16.0 in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (1.16.4) Requirement already satisfied: six in /opt/conda/lib/python3.6/site-packages (from catboost==0.17.5) (1.12.0) Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.6/site-packages (from matplotlib->catboost==0.17.5) (0.10.0) Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib->catboost==0.17.5) (1.1.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib->catboost==0.17.5) (2.4.2) Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib->catboost==0.17.5) (2.8.0) Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.6/site-packages (from pandas>=0.24.0->catboost==0.17.5) (2019.2) Requirement already satisfied: retrying>=1.3.3 in /opt/conda/lib/python3.6/site-packages (from plotly->catboost==0.17.5) (1.3.3) Requirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib->catboost==0.17.5) (41.2.0) Installing collected packages: catboost Found existing installation: catboost 0.17.3 Uninstalling catboost-0.17.3: Successfully uninstalled catboost-0.17.3 Successfully installed catboost-0.17.5
import catboost
catboost.__version__
'0.17.5'
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# Any results you write to the current directory are saved as output.
/kaggle/input/tcdml1920-income-ind/tcd ml 2019-20 income prediction submission file example (random solutions).csv /kaggle/input/tcdml1920-income-ind/tcd ml 2019-20 income prediction submission file.csv /kaggle/input/tcdml1920-income-ind/tcd ml 2019-20 income prediction test (without labels).csv /kaggle/input/tcdml1920-income-ind/tcd ml 2019-20 income prediction training (with labels).csv /kaggle/input/submission/Submission.csv
# Load Training & Test data
data = pd.read_csv('../input/tcdml1920-income-ind/tcd ml 2019-20 income prediction training (with labels).csv', index_col='Instance')
trainData = deepcopy(data.drop(['Income in EUR', 'Hair Color'], axis=1))
data1 = pd.read_csv('../input/tcdml1920-income-ind/tcd ml 2019-20 income prediction test (without labels).csv', index_col='Instance')
testData = deepcopy(data1.drop(['Income', 'Hair Color'], axis=1))
trainData.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 111993 entries, 1 to 111993 Data columns (total 9 columns): Year of Record 111552 non-null float64 Gender 104561 non-null object Age 111499 non-null float64 Country 111993 non-null object Size of City 111993 non-null int64 Profession 111671 non-null object University Degree 104623 non-null object Wears Glasses 111993 non-null int64 Body Height [cm] 111993 non-null int64 dtypes: float64(2), int64(3), object(4) memory usage: 8.5+ MB
testData.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 73230 entries, 111994 to 185223 Data columns (total 9 columns): Year of Record 72935 non-null float64 Gender 68368 non-null object Age 72951 non-null float64 Country 73230 non-null object Size of City 73230 non-null int64 Profession 73035 non-null object University Degree 68373 non-null object Wears Glasses 73230 non-null int64 Body Height [cm] 73230 non-null int64 dtypes: float64(2), int64(3), object(4) memory usage: 5.6+ MB
pd.options.display.float_format = '{:.2f}'.format
data['Income in EUR'].describe()
count 111993.00 mean 109213.81 std 149802.35 min -5696.91 25% 30771.69 50% 57339.17 75% 126093.60 max 5285252.36 Name: Income in EUR, dtype: float64
# Row Duplication check Train & Test
duplicateRowsTrain = trainData[trainData.duplicated()]
print('train' +duplicateRowsTrain)
duplicateRowsTest = testData[testData.duplicated()]
print('test' + duplicateRowsTest)
Empty DataFrame Columns: [Year of Record, Gender, Age, Country, Size of City, Profession, University Degree, Wears Glasses, Body Height [cm]] Index: [] Empty DataFrame Columns: [Year of Record, Gender, Age, Country, Size of City, Profession, University Degree, Wears Glasses, Body Height [cm]] Index: []
# Null check Train
trainData.isnull().sum().sort_values()
Country 0 Size of City 0 Wears Glasses 0 Body Height [cm] 0 Profession 322 Year of Record 441 Age 494 University Degree 7370 Gender 7432 dtype: int64
# Replace train missing values with 'unknown', 'others' and dropping NaN
trainData1 = deepcopy(trainData)
trainData1['Gender'] = trainData1['Gender'].replace({np.nan: 'unknown'})
trainData1['University Degree'] = trainData1['University Degree'].replace({np.nan: 'unknown'})
trainData1['Profession'] = trainData1['Profession'].replace({np.nan: 'others'})
trainData1 = trainData1.dropna()
# Null check Test
testData.isnull().sum().sort_values()
Country 0 Size of City 0 Wears Glasses 0 Body Height [cm] 0 Profession 195 Age 279 Year of Record 295 University Degree 4857 Gender 4862 dtype: int64
# Replace test missing values with 'unknown', 'others' and replace NaN with mean
testData1 = deepcopy(testData)
testData1['Gender'] = testData1['Gender'].replace({np.nan: 'unknown'})
testData1['University Degree'] = testData1['University Degree'].replace({np.nan: 'unknown'})
testData1['Profession'] = testData1['Profession'].replace({np.nan: 'others'})
testData1['Year of Record'] = testData1['Year of Record'].replace({np.nan: testData1['Year of Record'].mean()})
testData1['Age'] = testData1['Age'].replace({np.nan: testData1['Age'].mean()})
testData1 = testData1.dropna()
testData1['University Degree'].value_counts()
Bachelor 27914 Master 17890 No 17705 unknown 4857 PhD 4405 0 459 Name: University Degree, dtype: int64
# unknown and zero mapping to downsize categories
trainData1['University Degree'].where(trainData1['University Degree'] == 0, 'unknown')
trainData1['Gender'].where(trainData1['Gender'] == 0, 'unknown')
testData1['University Degree'].where(testData1['University Degree'] == 0, 'unknown')
testData1['Gender'].where(testData1['Gender'] == 0, 'unknown')
Instance 111994 unknown 111995 unknown 111996 unknown 111997 unknown 111998 unknown ... 185219 unknown 185220 unknown 185221 unknown 185222 unknown 185223 unknown Name: Gender, Length: 73230, dtype: object
trainData1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 111060 entries, 1 to 111993 Data columns (total 9 columns): Year of Record 111060 non-null float64 Gender 111060 non-null object Age 111060 non-null float64 Country 111060 non-null object Size of City 111060 non-null int64 Profession 111060 non-null object University Degree 111060 non-null object Wears Glasses 111060 non-null int64 Body Height [cm] 111060 non-null int64 dtypes: float64(2), int64(3), object(4) memory usage: 8.5+ MB
testData1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 73230 entries, 111994 to 185223 Data columns (total 9 columns): Year of Record 73230 non-null float64 Gender 73230 non-null object Age 73230 non-null float64 Country 73230 non-null object Size of City 73230 non-null int64 Profession 73230 non-null object University Degree 73230 non-null object Wears Glasses 73230 non-null int64 Body Height [cm] 73230 non-null int64 dtypes: float64(2), int64(3), object(4) memory usage: 5.6+ MB
normColumn = ['Year of Record', 'Age', 'Size of City', 'Body Height [cm]']
for col in normColumn:
trainData1[col] = (trainData1[col] - trainData1[col].mean()) / trainData1[col].std()
testData1[col] = (testData1[col] - testData1[col].mean()) / testData1[col].std()
#Creating a training set for modeling and validation set to check model performance
X = trainData1
y = data.loc[trainData1.index, 'Income in EUR']
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.9, random_state=18313172)
categorical_features_indices = np.array([1, 3, 5, 6,7])
model=CatBoostRegressor(
iterations=20000,
depth=10,
learning_rate=0.001,
loss_function='RMSE',
task_type='GPU',
border_count=32,
verbose=500,
random_seed=18313172
)
model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation),plot=True)
MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
0: learn: 149595.7260333 test: 150914.1420584 best: 150914.1420584 (0) total: 45.1ms remaining: 15m 2s 500: learn: 110975.2304253 test: 112377.2620384 best: 112377.2620384 (500) total: 21.5s remaining: 13m 55s 1000: learn: 89339.2754408 test: 90639.6342533 best: 90639.6342533 (1000) total: 43.9s remaining: 13m 52s 1500: learn: 77254.4124608 test: 78299.9275946 best: 78299.9275946 (1500) total: 1m 7s remaining: 13m 46s 2000: learn: 70341.2129667 test: 71191.7507614 best: 71191.7507614 (2000) total: 1m 31s remaining: 13m 43s 2500: learn: 66168.7933118 test: 66919.4712290 best: 66919.4712290 (2500) total: 1m 56s remaining: 13m 34s 3000: learn: 63474.5356981 test: 64223.0711685 best: 64223.0711685 (3000) total: 2m 19s remaining: 13m 11s 3500: learn: 61651.9567450 test: 62439.5555315 best: 62439.5555315 (3500) total: 2m 42s remaining: 12m 48s 4000: learn: 60289.2185402 test: 61212.8852321 best: 61212.8852321 (4000) total: 3m 9s remaining: 12m 36s 4500: learn: 59215.0575239 test: 60318.1784353 best: 60318.1784353 (4500) total: 3m 33s remaining: 12m 15s 5000: learn: 58337.4992729 test: 59639.8253793 best: 59639.8253793 (5000) total: 3m 56s remaining: 11m 50s 5500: learn: 57607.3371942 test: 59114.6178474 best: 59114.6178474 (5500) total: 4m 20s remaining: 11m 27s 6000: learn: 56989.9768403 test: 58729.5125448 best: 58729.5125448 (6000) total: 4m 45s remaining: 11m 7s 6500: learn: 56466.0037300 test: 58425.0518813 best: 58425.0518813 (6500) total: 5m 11s remaining: 10m 47s 7000: learn: 55987.5413825 test: 58176.3524208 best: 58176.3524208 (7000) total: 5m 36s remaining: 10m 24s 7500: learn: 55549.8738628 test: 57979.4582681 best: 57979.4582681 (7500) total: 6m 2s remaining: 10m 3s 8000: learn: 55159.0726597 test: 57811.8757428 best: 57811.8757428 (8000) total: 6m 31s remaining: 9m 46s 8500: learn: 54786.7366582 test: 57669.8035927 best: 57669.7315572 (8498) total: 7m remaining: 9m 29s 9000: learn: 54447.2023644 test: 57553.7260230 best: 57553.7260230 (9000) total: 7m 26s remaining: 9m 5s 9500: learn: 54136.2131406 test: 57448.7746479 best: 57448.7746479 (9500) total: 7m 54s remaining: 8m 44s 10000: learn: 53841.9821730 test: 57347.6879779 best: 57347.6879779 (10000) total: 8m 22s remaining: 8m 22s 10500: learn: 53548.1813576 test: 57252.3928831 best: 57252.3928831 (10500) total: 8m 52s remaining: 8m 1s 11000: learn: 53268.1231175 test: 57182.1044128 best: 57182.1044128 (11000) total: 9m 23s remaining: 7m 41s 11500: learn: 52984.5461628 test: 57105.3114793 best: 57105.3114793 (11500) total: 9m 51s remaining: 7m 17s 12000: learn: 52711.9940093 test: 57032.4281303 best: 57032.3056257 (11999) total: 10m 18s remaining: 6m 52s 12500: learn: 52450.7681290 test: 56979.0507795 best: 56979.0507795 (12500) total: 10m 45s remaining: 6m 27s 13000: learn: 52202.4485121 test: 56924.0842024 best: 56924.0842024 (13000) total: 11m 13s remaining: 6m 2s 13500: learn: 51953.0728618 test: 56872.9791663 best: 56872.9791663 (13500) total: 11m 39s remaining: 5m 36s 14000: learn: 51700.6633886 test: 56825.3174241 best: 56825.3174241 (14000) total: 12m 7s remaining: 5m 11s 14500: learn: 51465.6708007 test: 56773.9339014 best: 56773.7775788 (14498) total: 12m 35s remaining: 4m 46s 15000: learn: 51239.0322292 test: 56735.2758564 best: 56735.2758564 (15000) total: 13m 3s remaining: 4m 21s 15500: learn: 51018.4113180 test: 56703.4519602 best: 56703.2155196 (15499) total: 13m 35s remaining: 3m 56s 16000: learn: 50801.4646706 test: 56677.4975544 best: 56677.4975544 (16000) total: 14m 4s remaining: 3m 31s 16500: learn: 50584.6577016 test: 56645.1144961 best: 56645.1144961 (16500) total: 14m 33s remaining: 3m 5s 17000: learn: 50375.4930260 test: 56620.6608040 best: 56620.6374589 (16992) total: 15m remaining: 2m 38s 17500: learn: 50179.0605668 test: 56597.4543749 best: 56597.3109105 (17496) total: 15m 29s remaining: 2m 12s 18000: learn: 49991.8944681 test: 56574.4887572 best: 56574.4887572 (18000) total: 15m 57s remaining: 1m 46s 18500: learn: 49797.1038142 test: 56551.9879618 best: 56551.8978071 (18499) total: 16m 29s remaining: 1m 20s 19000: learn: 49598.0592227 test: 56532.0569331 best: 56532.0569331 (19000) total: 17m 3s remaining: 53.8s 19500: learn: 49418.0998164 test: 56514.5446915 best: 56514.5112787 (19498) total: 17m 33s remaining: 26.9s 19999: learn: 49235.4781576 test: 56497.9561760 best: 56497.8492237 (19984) total: 18m 1s remaining: 0us bestTest = 56497.84922 bestIteration = 19984 Shrink model to first 19985 iterations.
<catboost.core.CatBoostRegressor at 0x7ff538191a90>
print(model.get_feature_importance(prettified=True))
Feature Id Importances 0 Country 30.65 1 Age 18.44 2 Year of Record 16.90 3 Profession 12.47 4 University Degree 7.72 5 Body Height [cm] 6.08 6 Size of City 3.95 7 Gender 3.36 8 Wears Glasses 0.42
submission = pd.DataFrame()
submission['Instance'] = testData1.index
submission['Income'] = model.predict(testData1)
submission.to_csv('Submission.csv', index = False)