In this competition we have data about tube assemblies which suppliers deliver to Caterpillar. The data is comprised of many files describing these tube assemblies.
The goal of competition if to predict prices of tube assemblies based on this information.
The main challenge of this competition is combining the data and choosing/creating features. After this I use XGBoost for prediction.
The metric to calculate the accuracy of predictions is Root Mean Squared Logarithmic Error (it penalizes an under-predicted estimate greater than an over-predicted estimate).
The RMSLE is calculated as
$$ \epsilon = \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $$Where:
$\epsilon$ is the RMSLE value (score); $n$ is the number of observations; $p_i$ is prediction; $a_i$ is the actual response for $i$; $\log(x)$ is the natural logarithm of $x$
1.2 Comp Adaptor
1.3 Comp Boss
1.4 Comp Hfl
1.5 Comp Elbow
1.6 Comp Float
1.7 Comp Nut
1.8 Comp Other
1.9 Comp Sleeve
1.10 Comp Straight
1.11 Comp Tee
1.12 Comp Threaded
1.13 Tube
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
import glob
An easy way to read all the files in separate variables is to get the list of files and loop through it.
glob.glob('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/*.csv')
['Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\bill_of_materials.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\components.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_adaptor.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_boss.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_elbow.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_float.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_hfl.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_nut.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_other.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_sleeve.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_straight.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_tee.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\comp_threaded.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\sample_submission.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\specs.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\test_set.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\train_set.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\tube.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\tube_end_form.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\type_component.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\type_connection.csv', 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing\\type_end_form.csv']
#Read each file in a separate data frame.
bill_of_materials = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/bill_of_materials.csv')
components = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/components.csv')
comp_adaptor = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_adaptor.csv')
comp_boss = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_boss.csv')
comp_elbow = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_elbow.csv')
comp_float = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_float.csv')
comp_hfl = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_hfl.csv')
comp_nut = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_nut.csv')
comp_other = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_other.csv')
comp_sleeve = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_sleeve.csv')
comp_straight = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_straight.csv')
comp_tee = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_tee.csv')
comp_threaded = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/comp_threaded.csv')
specs = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/specs.csv')
tube = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/tube.csv')
tube_end_form = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/tube_end_form.csv')
type_component = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/type_component.csv')
type_connection = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/type_connection.csv')
type_end_form = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/type_end_form.csv')
train = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/train_set.csv', parse_dates=[2,])
test = pd.read_csv('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/test_set.csv', parse_dates=[3,])
#The file contains information about components of tube assemblies. All information is necessary.
#Missing values could be filled only with 0, but it isn't necessary.
bill_of_materials.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21198 entries, 0 to 21197 Data columns (total 17 columns): tube_assembly_id 21198 non-null object component_id_1 19149 non-null object quantity_1 19149 non-null float64 component_id_2 14786 non-null object quantity_2 14786 non-null float64 component_id_3 4791 non-null object quantity_3 4798 non-null float64 component_id_4 607 non-null object quantity_4 608 non-null float64 component_id_5 92 non-null object quantity_5 92 non-null float64 component_id_6 26 non-null object quantity_6 26 non-null float64 component_id_7 7 non-null object quantity_7 7 non-null float64 component_id_8 1 non-null object quantity_8 1 non-null float64 dtypes: float64(8), object(9) memory usage: 2.7+ MB
#Simply to see the line with all 8 components.
bill_of_materials.loc[bill_of_materials.quantity_8.notnull() == True]
tube_assembly_id | component_id_1 | quantity_1 | component_id_2 | quantity_2 | component_id_3 | quantity_3 | component_id_4 | quantity_4 | component_id_5 | quantity_5 | component_id_6 | quantity_6 | component_id_7 | quantity_7 | component_id_8 | quantity_8 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
11523 | TA-11524 | C-1253 | 1.0 | C-1496 | 2.0 | C-1500 | 1.0 | C-1708 | 1.0 | C-1811 | 1.0 | C-1920 | 1.0 | C-1921 | 1.0 | C-1981 | 1.0 |
#There are columns with too few non-null values. But it is necessary to see more.
comp_adaptor.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 25 entries, 0 to 24 Data columns (total 20 columns): component_id 25 non-null object component_type_id 25 non-null object adaptor_angle 1 non-null float64 overall_length 24 non-null float64 end_form_id_1 25 non-null object connection_type_id_1 24 non-null object length_1 1 non-null float64 thread_size_1 17 non-null float64 thread_pitch_1 17 non-null float64 nominal_size_1 8 non-null float64 end_form_id_2 25 non-null object connection_type_id_2 24 non-null object length_2 1 non-null float64 thread_size_2 23 non-null float64 thread_pitch_2 23 non-null float64 nominal_size_2 2 non-null float64 hex_size 17 non-null float64 unique_feature 25 non-null object orientation 25 non-null object weight 23 non-null float64 dtypes: float64(12), object(8) memory usage: 4.0+ KB
comp_adaptor
component_id | component_type_id | adaptor_angle | overall_length | end_form_id_1 | connection_type_id_1 | length_1 | thread_size_1 | thread_pitch_1 | nominal_size_1 | end_form_id_2 | connection_type_id_2 | length_2 | thread_size_2 | thread_pitch_2 | nominal_size_2 | hex_size | unique_feature | orientation | weight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C-0005 | CP-028 | NaN | 58.40 | A-001 | B-001 | NaN | 1.312 | 12.0 | NaN | A-001 | B-004 | NaN | 1.000 | 11.5 | NaN | 34.93 | No | No | 0.206 |
1 | C-0006 | CP-028 | NaN | 34.80 | A-001 | B-001 | NaN | 0.437 | 20.0 | NaN | A-001 | B-005 | NaN | 0.750 | 16.0 | NaN | 22.20 | No | No | 0.083 |
2 | C-1435 | CP-028 | NaN | 20.30 | A-007 | B-004 | NaN | NaN | NaN | 15.88 | A-001 | B-007 | NaN | 0.875 | 18.0 | NaN | 22.22 | No | No | 0.023 |
3 | C-1546 | CP-028 | NaN | 26.40 | A-007 | B-004 | NaN | 0.125 | 27.0 | NaN | A-001 | B-004 | NaN | 0.125 | 27.0 | NaN | 15.88 | No | No | 0.026 |
4 | C-1583 | CP-028 | NaN | 44.50 | A-001 | B-005 | NaN | 1.312 | 12.0 | NaN | A-007 | B-005 | NaN | 1.062 | 12.0 | NaN | 38.10 | No | No | 0.256 |
5 | C-1634 | CP-028 | NaN | 34.50 | A-001 | B-005 | NaN | 0.750 | 16.0 | NaN | A-001 | B-002 | NaN | 0.687 | 16.0 | NaN | 22.23 | No | No | 0.060 |
6 | C-1975 | CP-028 | NaN | 13.20 | A-007 | B-007 | NaN | NaN | NaN | 3.18 | A-001 | B-007 | NaN | 0.312 | 28.0 | NaN | NaN | No | No | 0.005 |
7 | C-0428 | CP-028 | NaN | 26.99 | A-001 | B-004 | NaN | 0.250 | 18.0 | NaN | A-007 | NaN | NaN | NaN | NaN | 9.52 | 17.46 | No | No | 0.032 |
8 | C-0443 | CP-028 | NaN | 22.35 | A-007 | B-007 | NaN | NaN | NaN | 19.05 | 9999 | 9999 | NaN | 1.062 | 16.0 | NaN | 26.97 | No | No | NaN |
9 | C-0823 | CP-028 | NaN | 16.80 | A-007 | B-007 | NaN | NaN | NaN | 9.52 | A-001 | 9999 | NaN | 0.625 | 18.0 | 9.52 | 15.75 | No | No | 0.014 |
10 | C-0354 | CP-028 | NaN | 36.30 | A-001 | B-004 | NaN | 1.250 | 11.5 | NaN | A-007 | B-004 | NaN | 1.000 | 11.5 | NaN | 44.45 | No | No | 0.136 |
11 | C-0361 | CP-028 | NaN | 41.00 | A-005 | NaN | NaN | NaN | NaN | 6.35 | A-006 | B-002 | NaN | 0.562 | 18.0 | NaN | 15.88 | No | No | 0.005 |
12 | C-1312 | CP-028 | NaN | 14.20 | A-002 | B-007 | NaN | 0.437 | 24.0 | NaN | 9999 | 9999 | NaN | NaN | NaN | NaN | 11.11 | No | No | 0.009 |
13 | C-1313 | CP-028 | NaN | 15.70 | A-007 | B-007 | NaN | NaN | NaN | 7.94 | A-001 | B-007 | NaN | 0.500 | 20.0 | NaN | 12.57 | No | No | 0.010 |
14 | C-1194 | CP-028 | NaN | 15.00 | A-007 | B-004 | NaN | 0.125 | 27.0 | NaN | A-007 | B-004 | NaN | 0.125 | 27.0 | NaN | NaN | No | No | 0.014 |
15 | C-1195 | CP-028 | NaN | 30.00 | A-007 | B-004 | NaN | 0.250 | 18.0 | NaN | A-007 | B-004 | NaN | 0.250 | 18.0 | NaN | NaN | Yes | No | 0.031 |
16 | C-1196 | CP-028 | NaN | 40.00 | A-007 | B-004 | NaN | 0.500 | 14.0 | NaN | A-007 | B-004 | NaN | 0.500 | 14.0 | NaN | NaN | No | No | 0.095 |
17 | C-1197 | CP-028 | NaN | 41.00 | A-007 | B-004 | NaN | 0.750 | 14.0 | NaN | A-007 | B-004 | NaN | 0.750 | 14.0 | NaN | NaN | No | No | 0.091 |
18 | C-1198 | CP-028 | NaN | 51.00 | A-007 | B-004 | NaN | 1.000 | 11.5 | NaN | A-007 | B-004 | NaN | 1.000 | 11.5 | NaN | NaN | No | No | 0.181 |
19 | C-1229 | CP-028 | NaN | 18.80 | A-007 | B-004 | NaN | NaN | NaN | 12.70 | A-001 | B-007 | NaN | 0.750 | 18.0 | NaN | 18.92 | No | No | 0.009 |
20 | C-1230 | CP-028 | NaN | 14.20 | A-007 | B-007 | NaN | NaN | NaN | 4.76 | A-001 | B-005 | NaN | 0.375 | 24.0 | NaN | 9.40 | No | No | 0.009 |
21 | C-1695 | CP-028 | NaN | 30.00 | A-001 | B-006 | NaN | 0.750 | 16.0 | NaN | A-007 | 9999 | NaN | 9999.000 | 9999.0 | NaN | 25.40 | No | No | NaN |
22 | C-1812 | CP-028 | NaN | 35.10 | A-007 | B-004 | NaN | 0.375 | 18.0 | NaN | A-007 | B-004 | NaN | 0.375 | 18.0 | NaN | NaN | No | No | 0.091 |
23 | C-1828 | CP-028 | NaN | 28.40 | A-001 | B-005 | NaN | 1.312 | 12.0 | NaN | A-007 | B-005 | NaN | 0.562 | 18.0 | NaN | 38.10 | No | No | 0.181 |
24 | C-1868 | CP-029 | 90.0 | NaN | A-001 | B-005 | 65.5 | 0.750 | 16.0 | NaN | A-001 | B-002 | 28.0 | 0.812 | 16.0 | NaN | NaN | No | Yes | 0.226 |
component_type_id, end_form_id_1, connection_type_id_1, end_form_id_2, connection_type_id_2 - descriptive information, it is useless in all files. Drop.
adaptor_angle - only one value. Drop.
unique_feature, orientation - categorical description. Drop.
C-1695 has abnormal values. And only one tube assembly has it. So I'll drop it.
C-0443 has NaN weight. And it belongs to only one tube assembly, which has two components, both with NaN weight. So drop.
C-1868. It seems that for some reason overall_length wasn't calculated. I'll calculate it as a sum of length_1 and length_2. Drop length_1 and length_2 after using.
comp_adaptor.drop(['adaptor_angle', 'component_type_id', 'end_form_id_1', 'connection_type_id_1', 'length_1', 'length_2',
'unique_feature', 'orientation', 'end_form_id_2', 'connection_type_id_2'], axis=1, inplace=True)
#Could input a formula, but it single value.
comp_adaptor.loc[comp_adaptor['overall_length'].isnull(), 'overall_length'] = 93.5
comp_adaptor.drop(comp_adaptor.index[[8, 21]], inplace=True)
comp_adaptor
component_id | overall_length | thread_size_1 | thread_pitch_1 | nominal_size_1 | thread_size_2 | thread_pitch_2 | nominal_size_2 | hex_size | weight | |
---|---|---|---|---|---|---|---|---|---|---|
0 | C-0005 | 58.40 | 1.312 | 12.0 | NaN | 1.000 | 11.5 | NaN | 34.93 | 0.206 |
1 | C-0006 | 34.80 | 0.437 | 20.0 | NaN | 0.750 | 16.0 | NaN | 22.20 | 0.083 |
2 | C-1435 | 20.30 | NaN | NaN | 15.88 | 0.875 | 18.0 | NaN | 22.22 | 0.023 |
3 | C-1546 | 26.40 | 0.125 | 27.0 | NaN | 0.125 | 27.0 | NaN | 15.88 | 0.026 |
4 | C-1583 | 44.50 | 1.312 | 12.0 | NaN | 1.062 | 12.0 | NaN | 38.10 | 0.256 |
5 | C-1634 | 34.50 | 0.750 | 16.0 | NaN | 0.687 | 16.0 | NaN | 22.23 | 0.060 |
6 | C-1975 | 13.20 | NaN | NaN | 3.18 | 0.312 | 28.0 | NaN | NaN | 0.005 |
7 | C-0428 | 26.99 | 0.250 | 18.0 | NaN | NaN | NaN | 9.52 | 17.46 | 0.032 |
9 | C-0823 | 16.80 | NaN | NaN | 9.52 | 0.625 | 18.0 | 9.52 | 15.75 | 0.014 |
10 | C-0354 | 36.30 | 1.250 | 11.5 | NaN | 1.000 | 11.5 | NaN | 44.45 | 0.136 |
11 | C-0361 | 41.00 | NaN | NaN | 6.35 | 0.562 | 18.0 | NaN | 15.88 | 0.005 |
12 | C-1312 | 14.20 | 0.437 | 24.0 | NaN | NaN | NaN | NaN | 11.11 | 0.009 |
13 | C-1313 | 15.70 | NaN | NaN | 7.94 | 0.500 | 20.0 | NaN | 12.57 | 0.010 |
14 | C-1194 | 15.00 | 0.125 | 27.0 | NaN | 0.125 | 27.0 | NaN | NaN | 0.014 |
15 | C-1195 | 30.00 | 0.250 | 18.0 | NaN | 0.250 | 18.0 | NaN | NaN | 0.031 |
16 | C-1196 | 40.00 | 0.500 | 14.0 | NaN | 0.500 | 14.0 | NaN | NaN | 0.095 |
17 | C-1197 | 41.00 | 0.750 | 14.0 | NaN | 0.750 | 14.0 | NaN | NaN | 0.091 |
18 | C-1198 | 51.00 | 1.000 | 11.5 | NaN | 1.000 | 11.5 | NaN | NaN | 0.181 |
19 | C-1229 | 18.80 | NaN | NaN | 12.70 | 0.750 | 18.0 | NaN | 18.92 | 0.009 |
20 | C-1230 | 14.20 | NaN | NaN | 4.76 | 0.375 | 24.0 | NaN | 9.40 | 0.009 |
22 | C-1812 | 35.10 | 0.375 | 18.0 | NaN | 0.375 | 18.0 | NaN | NaN | 0.091 |
23 | C-1828 | 28.40 | 1.312 | 12.0 | NaN | 0.562 | 18.0 | NaN | 38.10 | 0.181 |
24 | C-1868 | 93.50 | 0.750 | 16.0 | NaN | 0.812 | 16.0 | NaN | NaN | 0.226 |
#Descriptive and categorical features will be dropped.
comp_boss.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 147 entries, 0 to 146 Data columns (total 15 columns): component_id 147 non-null object component_type_id 147 non-null object type 124 non-null object connection_type_id 147 non-null object outside_shape 124 non-null object base_type 124 non-null object height_over_tube 147 non-null float64 bolt_pattern_long 23 non-null float64 bolt_pattern_wide 17 non-null float64 groove 147 non-null object base_diameter 57 non-null float64 shoulder_diameter 30 non-null float64 unique_feature 147 non-null object orientation 147 non-null object weight 145 non-null float64 dtypes: float64(6), object(9) memory usage: 17.3+ KB
#Use only important information.
comp_boss = comp_boss[['component_id', 'height_over_tube', 'weight']]
comp_boss.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 147 entries, 0 to 146 Data columns (total 3 columns): component_id 147 non-null object height_over_tube 147 non-null float64 weight 145 non-null float64 dtypes: float64(2), object(1) memory usage: 3.5+ KB
I wrote the following to check for outliers. Sometimes there are values which are too big. It is either mistake or it is due to unknown reason. I'll drop such values. Threshold is arbitrary.
for i in comp_boss.columns:
if comp_boss[i].dtype != object:
if comp_boss[i][comp_boss[i] > 4 * comp_boss[i].mean()].any() == True:
print(i)
print(comp_boss.loc[comp_boss[i] == comp_boss[i].max()])
height_over_tube component_id height_over_tube weight 31 C-0307 9999.0 1.05 weight component_id height_over_tube weight 21 C-0243 36.0 4.034
#Drop row with too big value. I don't drop weight, because it could be reasonable
comp_boss.drop(comp_boss.index[31], inplace=True)
comp_boss.head()
component_id | height_over_tube | weight | |
---|---|---|---|
0 | C-0008 | 17.0 | 0.032 |
1 | C-0009 | 13.0 | 0.033 |
2 | C-0020 | 28.4 | 0.070 |
3 | C-0054 | 27.1 | 0.180 |
4 | C-0071 | 20.0 | 0.080 |
comp_hfl.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6 entries, 0 to 5 Data columns (total 9 columns): component_id 6 non-null object component_type_id 6 non-null object hose_diameter 6 non-null float64 corresponding_shell 6 non-null object coupling_class 6 non-null object material 6 non-null object plating 6 non-null object orientation 6 non-null object weight 6 non-null float64 dtypes: float64(2), object(7) memory usage: 512.0+ bytes
comp_hfl
component_id | component_type_id | hose_diameter | corresponding_shell | coupling_class | material | plating | orientation | weight | |
---|---|---|---|---|---|---|---|---|---|
0 | C-0872 | CP-023 | 4.8 | C-0855 | SP-0098 | SP-0016 | Yes | No | 0.010 |
1 | C-0873 | CP-023 | 4.8 | C-0856 | SP-0098 | SP-0016 | Yes | No | 0.010 |
2 | C-0874 | CP-023 | 4.8 | C-0857 | SP-0098 | SP-0038 | Yes | No | 0.001 |
3 | C-1039 | CP-023 | 15.9 | C-1040 | SP-0097 | SP-0095 | No | No | 0.052 |
4 | C-1041 | CP-023 | 15.9 | C-1042 | SP-0099 | SP-0095 | No | No | 0.065 |
5 | C-1043 | CP-023 | 25.4 | C-1044 | SP-0099 | SP-0095 | No | No | 0.196 |
#It seems that only three columns are necessary.
comp_hfl = comp_hfl[['component_id', 'hose_diameter', 'weight']]
comp_hfl
component_id | hose_diameter | weight | |
---|---|---|---|
0 | C-0872 | 4.8 | 0.010 |
1 | C-0873 | 4.8 | 0.010 |
2 | C-0874 | 4.8 | 0.001 |
3 | C-1039 | 15.9 | 0.052 |
4 | C-1041 | 15.9 | 0.065 |
5 | C-1043 | 25.4 | 0.196 |
comp_elbow.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 178 entries, 0 to 177 Data columns (total 16 columns): component_id 178 non-null object component_type_id 178 non-null object bolt_pattern_long 171 non-null float64 bolt_pattern_wide 138 non-null float64 extension_length 170 non-null float64 overall_length 175 non-null float64 thickness 171 non-null float64 drop_length 171 non-null float64 elbow_angle 130 non-null float64 mj_class_code 41 non-null object mj_plug_class_code 40 non-null object plug_diameter 7 non-null float64 groove 178 non-null object unique_feature 178 non-null object orientation 178 non-null object weight 176 non-null float64 dtypes: float64(9), object(7) memory usage: 22.3+ KB
comp_elbow.head()
component_id | component_type_id | bolt_pattern_long | bolt_pattern_wide | extension_length | overall_length | thickness | drop_length | elbow_angle | mj_class_code | mj_plug_class_code | plug_diameter | groove | unique_feature | orientation | weight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C-0013 | CP-008 | 152.4 | 92.08 | 105.0 | 185.0 | 113.0 | 75.0 | 90.0 | NaN | NaN | NaN | Yes | No | Yes | 8.890 |
1 | C-0016 | CP-009 | 57.2 | 27.80 | 42.0 | 69.0 | 44.0 | 24.0 | 90.0 | NaN | NaN | NaN | No | No | Yes | 1.172 |
2 | C-0017 | CP-009 | 57.2 | 27.80 | 42.0 | 69.0 | 47.0 | 26.0 | 90.0 | NaN | NaN | NaN | Yes | No | Yes | 1.245 |
3 | C-0018 | CP-009 | 66.6 | 31.80 | 50.0 | 80.0 | 57.0 | 31.5 | 90.0 | NaN | NaN | NaN | Yes | No | Yes | 1.863 |
4 | C-0021 | CP-010 | 75.0 | NaN | 31.5 | 70.0 | 25.0 | 12.5 | 90.0 | NaN | NaN | NaN | No | Yes | Yes | 0.903 |
#Most of the columns aren't necessary.
comp_elbow.drop(['component_type_id', 'mj_class_code', 'mj_plug_class_code', 'plug_diameter', 'groove', 'unique_feature',
'orientation',], axis=1, inplace=True)
for i in comp_elbow.columns:
if comp_elbow[i].dtype != object:
if comp_elbow[i][comp_elbow[i] > 4 * comp_elbow[i].mean()].any() == True:
print(i)
print(comp_elbow.loc[comp_elbow[i] == comp_elbow[i].max()])
drop_length component_id bolt_pattern_long bolt_pattern_wide extension_length \ 52 C-0537 78.2 NaN 32.0 overall_length thickness drop_length elbow_angle weight 52 64.0 36.0 9999.0 169.25 0.42 weight component_id bolt_pattern_long bolt_pattern_wide extension_length \ 62 C-0646 106.38 61.93 55.5 overall_length thickness drop_length elbow_angle weight 62 111.0 115.0 57.5 90.0 10.19
comp_elbow.drop(comp_elbow.index[52], inplace=True)
comp_float.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16 entries, 0 to 15 Data columns (total 7 columns): component_id 16 non-null object component_type_id 16 non-null object bolt_pattern_long 16 non-null float64 bolt_pattern_wide 16 non-null float64 thickness 16 non-null float64 orientation 16 non-null object weight 16 non-null float64 dtypes: float64(4), object(3) memory usage: 976.0+ bytes
#Drop description.
comp_float.drop(['component_type_id', 'orientation'], axis=1, inplace=True)
comp_float
component_id | bolt_pattern_long | bolt_pattern_wide | thickness | weight | |
---|---|---|---|---|---|
0 | C-0027 | 148.00 | 96.00 | 18.00 | 2.230 |
1 | C-0454 | 58.72 | 30.18 | 28.00 | 0.590 |
2 | C-0455 | 58.72 | 30.18 | 28.00 | 0.525 |
3 | C-0494 | 52.40 | 26.20 | 15.85 | 0.230 |
4 | C-0496 | 58.80 | 30.20 | 14.20 | 0.284 |
5 | C-0508 | 77.76 | 42.88 | 25.40 | 1.144 |
6 | C-0572 | 69.85 | 35.71 | 15.70 | 0.447 |
7 | C-0797 | 120.65 | 69.85 | 20.00 | 1.983 |
8 | C-0891 | 47.62 | 22.22 | 28.00 | 0.465 |
9 | C-1096 | 76.40 | 69.90 | 25.00 | 2.565 |
10 | C-1102 | 130.20 | 77.80 | 36.50 | 4.060 |
11 | C-1135 | 116.00 | 17.00 | 22.00 | 1.880 |
12 | C-1149 | 52.36 | 26.19 | 36.00 | 0.783 |
13 | C-1452 | 52.38 | 26.19 | 22.00 | 0.276 |
14 | C-1453 | 58.72 | 30.18 | 22.00 | 0.389 |
15 | C-1584 | 106.38 | 61.93 | 25.40 | 0.516 |
comp_nut.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 65 entries, 0 to 64 Data columns (total 11 columns): component_id 65 non-null object component_type_id 65 non-null object hex_nut_size 42 non-null float64 seat_angle 15 non-null float64 length 65 non-null float64 thread_size 65 non-null object thread_pitch 65 non-null float64 diameter 23 non-null float64 blind_hole 23 non-null object orientation 65 non-null object weight 64 non-null float64 dtypes: float64(6), object(5) memory usage: 5.7+ KB
comp_nut.drop(['component_type_id', 'seat_angle', 'diameter', 'blind_hole', 'orientation'], axis=1, inplace=True)
comp_nut.head()
component_id | hex_nut_size | length | thread_size | thread_pitch | weight | |
---|---|---|---|---|---|---|
0 | C-1621 | 20.64 | 17.0 | .687 | 16.0 | 0.015 |
1 | C-1624 | 34.92 | 26.5 | 1.187 | 12.0 | 0.035 |
2 | C-1623 | 28.58 | 23.5 | 1.000 | 14.0 | 0.044 |
3 | C-1622 | 23.81 | 20.0 | .812 | 16.0 | 0.036 |
4 | C-1625 | 41.28 | 27.5 | 1.437 | 12.0 | 0.129 |
for i in comp_nut.columns:
if comp_nut[i].dtype != object:
if comp_nut[i][comp_nut[i] > 4 * comp_nut[i].mean()].any() == True:
print(i)
print(comp_nut.loc[comp_nut[i] == comp_nut[i].max()])
weight component_id hex_nut_size length thread_size thread_pitch weight 27 C-1442 57.15 36.1 1.875 12.0 0.343
comp_other.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1001 entries, 0 to 1000 Data columns (total 3 columns): component_id 1001 non-null object part_name 1001 non-null object weight 945 non-null float64 dtypes: float64(1), object(2) memory usage: 23.5+ KB
#Dtop description.
comp_other.drop(['part_name'], axis=1, inplace=True)
comp_other.head()
component_id | weight | |
---|---|---|
0 | C-1385 | 0.014 |
1 | C-1386 | 0.005 |
2 | C-1369 | 0.003 |
3 | C-0422 | 0.003 |
4 | C-1817 | 0.014 |
comp_sleeve.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50 entries, 0 to 49 Data columns (total 10 columns): component_id 50 non-null object component_type_id 50 non-null object connection_type_id 50 non-null object length 50 non-null float64 intended_nut_thread 50 non-null float64 intended_nut_pitch 50 non-null int64 unique_feature 50 non-null object plating 50 non-null object orientation 50 non-null object weight 50 non-null float64 dtypes: float64(3), int64(1), object(6) memory usage: 4.0+ KB
comp_sleeve.drop(['component_type_id', 'connection_type_id', 'unique_feature', 'plating', 'orientation'], axis=1, inplace=True)
comp_sleeve.head()
component_id | length | intended_nut_thread | intended_nut_pitch | weight | |
---|---|---|---|---|---|
0 | C-0001 | 17.3 | 1.062 | 12 | 0.013 |
1 | C-0002 | 11.2 | 0.500 | 20 | 0.005 |
2 | C-0003 | 19.3 | 1.187 | 12 | 0.014 |
3 | C-0048 | 9.5 | 0.562 | 18 | 0.006 |
4 | C-0049 | 9.5 | 0.812 | 16 | 0.012 |
for i in comp_sleeve.columns:
if comp_sleeve[i].dtype != object:
if comp_sleeve[i][comp_sleeve[i] > 4 * comp_sleeve[i].mean()].any() == True:
print(i)
print(comp_sleeve.loc[comp_sleeve[i] == comp_sleeve[i].max()])
length component_id length intended_nut_thread intended_nut_pitch weight 28 C-1627 9999.0 0.562 18 0.005 29 C-1628 9999.0 0.687 16 0.006 30 C-1629 9999.0 0.812 16 0.012 31 C-1630 9999.0 1.000 14 0.018 32 C-1631 9999.0 1.187 12 0.026 33 C-1632 9999.0 1.437 12 0.033 34 C-1633 9999.0 1.687 12 0.040 48 C-1866 9999.0 2.000 12 0.035
comp_sleeve.drop(comp_sleeve.index[[28, 29, 30, 31, 32, 33, 34, 48]], inplace=True)
comp_straight.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 361 entries, 0 to 360 Data columns (total 12 columns): component_id 361 non-null object component_type_id 361 non-null object bolt_pattern_long 291 non-null float64 bolt_pattern_wide 204 non-null float64 head_diameter 70 non-null float64 overall_length 41 non-null float64 thickness 361 non-null float64 mj_class_code 120 non-null object groove 361 non-null object unique_feature 361 non-null object orientation 361 non-null object weight 354 non-null float64 dtypes: float64(6), object(6) memory usage: 33.9+ KB
comp_straight.drop(['component_type_id', 'overall_length', 'mj_class_code', 'head_diameter', 'unique_feature', 'groove',
'orientation'], axis=1, inplace=True)
comp_straight.head()
component_id | bolt_pattern_long | bolt_pattern_wide | thickness | weight | |
---|---|---|---|---|---|
0 | C-0012 | 66.68 | 31.75 | 20.0 | 0.788 |
1 | C-0014 | 47.60 | 22.20 | 15.0 | 0.339 |
2 | C-0015 | 66.70 | 31.80 | 20.0 | 0.788 |
3 | C-0019 | 77.80 | 42.90 | 36.5 | 1.533 |
4 | C-0029 | 47.63 | 22.23 | 16.0 | 0.286 |
for i in comp_straight.columns:
if comp_straight[i].dtype != object:
if comp_straight[i][comp_straight[i] > 4 * comp_straight[i].mean()].any() == True:
print(i)
print(comp_straight.loc[comp_straight[i] == comp_straight[i].max()])
weight component_id bolt_pattern_long bolt_pattern_wide thickness weight 171 C-1066 38.1 NaN 13.0 9.693
comp_tee.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4 entries, 0 to 3 Data columns (total 14 columns): component_id 4 non-null object component_type_id 4 non-null object bolt_pattern_long 4 non-null float64 bolt_pattern_wide 4 non-null float64 extension_length 4 non-null float64 overall_length 4 non-null float64 thickness 4 non-null int64 drop_length 4 non-null float64 mj_class_code 4 non-null object mj_plug_class_code 4 non-null object groove 4 non-null object unique_feature 4 non-null object orientation 4 non-null object weight 4 non-null float64 dtypes: float64(6), int64(1), object(7) memory usage: 528.0+ bytes
comp_tee.drop(['component_type_id', 'mj_class_code', 'mj_plug_class_code', 'groove', 'unique_feature', 'orientation'],
axis=1, inplace=True)
comp_tee
component_id | bolt_pattern_long | bolt_pattern_wide | extension_length | overall_length | thickness | drop_length | weight | |
---|---|---|---|---|---|---|---|---|
0 | C-0271 | 58.70 | 30.20 | 57.10 | 93.0 | 57 | 28.5 | 1.526 |
1 | C-1809 | 58.72 | 30.18 | 57.09 | 108.0 | 57 | 28.5 | 2.184 |
2 | C-1830 | 52.40 | 26.20 | 43.50 | 78.5 | 51 | 25.5 | 1.135 |
3 | C-1865 | 58.70 | 30.20 | 57.10 | 107.0 | 57 | 28.5 | 1.953 |
for i in comp_tee.columns:
if comp_tee[i].dtype != object:
if comp_tee[i][comp_tee[i] > 4 * comp_tee[i].mean()].any() == True:
print(i)
print(comp_tee.loc[comp_tee[i] == comp_tee[i].max()])
comp_threaded.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 194 entries, 0 to 193 Data columns (total 32 columns): component_id 194 non-null object component_type_id 194 non-null object adaptor_angle 54 non-null float64 overall_length 121 non-null float64 hex_size 112 non-null float64 end_form_id_1 194 non-null object connection_type_id_1 135 non-null object length_1 73 non-null float64 thread_size_1 135 non-null float64 thread_pitch_1 135 non-null float64 nominal_size_1 60 non-null object end_form_id_2 194 non-null object connection_type_id_2 63 non-null object length_2 73 non-null float64 thread_size_2 63 non-null float64 thread_pitch_2 63 non-null float64 nominal_size_2 131 non-null float64 end_form_id_3 19 non-null object connection_type_id_3 10 non-null object length_3 19 non-null float64 thread_size_3 10 non-null float64 thread_pitch_3 10 non-null float64 nominal_size_3 9 non-null float64 end_form_id_4 1 non-null object connection_type_id_4 1 non-null object length_4 1 non-null float64 thread_size_4 1 non-null float64 thread_pitch_4 1 non-null float64 nominal_size_4 0 non-null float64 unique_feature 194 non-null object orientation 194 non-null object weight 193 non-null float64 dtypes: float64(19), object(13) memory usage: 48.6+ KB
comp_threaded.drop(['component_type_id', 'adaptor_angle', 'end_form_id_1', 'connection_type_id_1', 'end_form_id_2',
'connection_type_id_2', 'end_form_id_3', 'connection_type_id_3', 'end_form_id_4', 'connection_type_id_4',
'nominal_size_4', 'unique_feature', 'orientation'], axis=1, inplace=True)
comp_threaded.head()
component_id | overall_length | hex_size | length_1 | thread_size_1 | thread_pitch_1 | nominal_size_1 | length_2 | thread_size_2 | thread_pitch_2 | nominal_size_2 | length_3 | thread_size_3 | thread_pitch_3 | nominal_size_3 | length_4 | thread_size_4 | thread_pitch_4 | weight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C-0007 | 24.4 | 12.70 | NaN | 0.437 | 20.0 | NaN | NaN | NaN | NaN | 6.35 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.028 |
1 | C-0030 | NaN | NaN | 47.0 | NaN | NaN | 42 | 47.0 | NaN | NaN | 42.00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.914 |
2 | C-0041 | 28.0 | 32.00 | NaN | NaN | NaN | 21.7 | NaN | 0.812 | 16.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.120 |
3 | C-0043 | 38.0 | 27.00 | NaN | NaN | NaN | 21.7 | NaN | 0.812 | 16.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.090 |
4 | C-0044 | 30.5 | 41.28 | NaN | NaN | NaN | 34 | NaN | 1.437 | 12.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.182 |
#There are five columns with length. So I fill NA with 0, summarize length and drop excessive columns.
comp_threaded['length_1'] = comp_threaded['length_1'].fillna(0)
comp_threaded['length_2'] = comp_threaded['length_2'].fillna(0)
comp_threaded['length_3'] = comp_threaded['length_3'].fillna(0)
comp_threaded['length_4'] = comp_threaded['length_4'].fillna(0)
comp_threaded['overall_length'] = comp_threaded['overall_length'].fillna(0)
comp_threaded['overall_length'] = comp_threaded['overall_length'] + comp_threaded['length_1'] + comp_threaded['length_2'] \
+ comp_threaded['length_3'] + comp_threaded['length_4']
comp_threaded.drop(['length_1', 'length_2', 'length_3', 'length_4'], axis=1, inplace=True)
for i in comp_threaded.columns:
if comp_threaded[i].dtype != object:
if comp_threaded[i][comp_threaded[i] > 4 * comp_threaded[i].mean()].any() == True:
print(i)
print(comp_threaded.loc[comp_threaded[i] == comp_threaded[i].max()])
nominal_size_2 component_id overall_length hex_size thread_size_1 thread_pitch_1 \ 40 C-0386 42.7 60.3 2.0 12.0 nominal_size_1 thread_size_2 thread_pitch_2 nominal_size_2 \ 40 NaN NaN NaN 9999.0 thread_size_3 thread_pitch_3 nominal_size_3 thread_size_4 \ 40 NaN NaN NaN NaN thread_pitch_4 weight 40 NaN 0.543 nominal_size_3 component_id overall_length hex_size thread_size_1 thread_pitch_1 \ 90 C-0971 124.1 NaN 1.187 12.0 nominal_size_1 thread_size_2 thread_pitch_2 nominal_size_2 \ 90 NaN 1.187 12.0 NaN thread_size_3 thread_pitch_3 nominal_size_3 thread_size_4 \ 90 NaN NaN 9999.0 NaN thread_pitch_4 weight 90 NaN 0.545 weight component_id overall_length hex_size thread_size_1 thread_pitch_1 \ 91 C-0974 112.7 NaN 1.437 12.0 nominal_size_1 thread_size_2 thread_pitch_2 nominal_size_2 \ 91 NaN NaN NaN 31.75 thread_size_3 thread_pitch_3 nominal_size_3 thread_size_4 \ 91 NaN NaN NaN NaN thread_pitch_4 weight 91 NaN 1.17
comp_threaded.drop(comp_threaded.index[[40, 90]], inplace=True)
tube.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21198 entries, 0 to 21197 Data columns (total 16 columns): tube_assembly_id 21198 non-null object material_id 20919 non-null object diameter 21198 non-null float64 wall 21198 non-null float64 length 21198 non-null float64 num_bends 21198 non-null int64 bend_radius 21198 non-null float64 end_a_1x 21198 non-null object end_a_2x 21198 non-null object end_x_1x 21198 non-null object end_x_2x 21198 non-null object end_a 21198 non-null object end_x 21198 non-null object num_boss 21198 non-null int64 num_bracket 21198 non-null int64 other 21198 non-null int64 dtypes: float64(4), int64(4), object(8) memory usage: 2.6+ MB
tube.drop(['material_id', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'end_a', 'end_x', 'num_boss', 'num_bracket', 'other'],
axis=1, inplace=True)
tube.head()
tube_assembly_id | diameter | wall | length | num_bends | bend_radius | |
---|---|---|---|---|---|---|
0 | TA-00001 | 12.70 | 1.65 | 164.0 | 5 | 38.10 |
1 | TA-00002 | 6.35 | 0.71 | 137.0 | 8 | 19.05 |
2 | TA-00003 | 6.35 | 0.71 | 127.0 | 7 | 19.05 |
3 | TA-00004 | 6.35 | 0.71 | 137.0 | 9 | 19.05 |
4 | TA-00005 | 19.05 | 1.24 | 109.0 | 4 | 50.80 |
for i in tube.columns:
if tube[i].dtype != object:
if tube[i][tube[i] > 4 * tube[i].mean()].any() == True:
print(i)
print(tube.loc[tube[i] == tube[i].max()])
diameter tube_assembly_id diameter wall length num_bends bend_radius 11306 TA-11307 203.2 1.9 62.0 0 0.0 11309 TA-11310 203.2 1.9 48.0 0 0.0 11341 TA-11342 203.2 1.9 40.0 0 0.0 13049 TA-13050 203.2 1.9 70.0 0 0.0 14709 TA-14710 203.2 1.9 64.0 0 0.0 15619 TA-15620 203.2 1.9 26.0 0 0.0 16762 TA-16763 203.2 1.9 129.0 2 203.2 17421 TA-17422 203.2 1.9 22.0 0 0.0 wall tube_assembly_id diameter wall length num_bends bend_radius 389 TA-00390 66.7 7.9 310.0 2 177.80 390 TA-00391 66.7 7.9 393.0 2 177.80 391 TA-00392 66.7 7.9 174.0 2 177.80 392 TA-00393 66.7 7.9 188.0 2 177.80 431 TA-00432 66.7 7.9 146.0 2 177.80 432 TA-00433 66.7 7.9 146.0 2 177.80 1228 TA-01229 66.7 7.9 430.0 5 177.80 1620 TA-01621 66.7 7.9 160.0 2 177.80 3835 TA-03836 66.7 7.9 114.0 3 180.00 4184 TA-04185 66.7 7.9 466.0 5 177.80 4185 TA-04186 66.7 7.9 149.0 2 177.80 4186 TA-04187 66.7 7.9 130.0 2 177.80 4613 TA-04614 66.7 7.9 139.0 2 177.80 5289 TA-05290 66.7 7.9 225.0 0 0.00 5290 TA-05291 66.7 7.9 161.0 2 180.00 5644 TA-05645 66.7 7.9 134.0 1 180.00 6446 TA-06447 66.7 7.9 211.0 2 180.00 6447 TA-06448 66.7 7.9 211.0 2 180.00 6448 TA-06449 66.7 7.9 351.0 2 180.00 6449 TA-06450 66.7 7.9 351.0 2 180.00 6518 TA-06519 66.7 7.9 141.0 2 180.00 8602 TA-08603 66.7 7.9 37.0 1 180.00 8932 TA-08933 66.7 7.9 221.0 2 180.00 9026 TA-09027 66.7 7.9 68.0 2 171.45 9027 TA-09028 66.7 7.9 68.0 2 171.45 11542 TA-11543 66.7 7.9 158.0 2 180.00 11543 TA-11544 66.7 7.9 158.0 2 180.00 11546 TA-11547 66.7 7.9 78.0 2 180.00 11637 TA-11638 66.7 7.9 78.0 2 180.00 11690 TA-11691 66.7 7.9 112.0 2 180.00 11691 TA-11692 66.7 7.9 110.0 2 180.00 15323 TA-15324 66.7 7.9 195.0 3 180.00 17688 TA-17689 66.7 7.9 88.0 2 9999.00 17689 TA-17690 66.7 7.9 71.0 2 9999.00 18002 TA-18003 66.7 7.9 63.0 2 9999.00 18003 TA-18004 66.7 7.9 81.0 2 9999.00 18093 TA-18094 66.7 7.9 172.0 2 180.00 18094 TA-18095 66.7 7.9 96.0 2 180.00 18095 TA-18096 66.7 7.9 83.0 2 180.00 18096 TA-18097 66.7 7.9 124.0 2 180.00 18097 TA-18098 66.7 7.9 130.0 2 180.00 18098 TA-18099 66.7 7.9 124.0 2 180.00 18099 TA-18100 66.7 7.9 130.0 2 180.00 length tube_assembly_id diameter wall length num_bends bend_radius 4216 TA-04217 15.88 1.65 1333.0 6 31.75 num_bends tube_assembly_id diameter wall length num_bends bend_radius 20087 TA-20089 6.35 2.000 93.0 17 19.05 20105 TA-20107 6.35 2.375 83.0 17 19.05 bend_radius tube_assembly_id diameter wall length num_bends bend_radius 15132 TA-15133 31.75 1.65 49.0 2 9999.0 15174 TA-15175 31.75 1.65 36.0 1 9999.0 15175 TA-15176 31.75 1.65 76.0 1 9999.0 17688 TA-17689 66.70 7.90 88.0 2 9999.0 17689 TA-17690 66.70 7.90 71.0 2 9999.0 18002 TA-18003 66.70 7.90 63.0 2 9999.0 18003 TA-18004 66.70 7.90 81.0 2 9999.0 19320 TA-19321 6.35 0.71 32.0 3 9999.0
tube.drop(tube.index[[15132, 15174, 15175, 17688, 17689, 18002, 18003, 19320]], inplace=True)
These files contain only descriptions, so I don't use them:
tube_end_form
type_component
type_connection
type_end_form
components
#Create several features from dates for additional information.
train['year'] = train.quote_date.dt.year
train['month'] = train.quote_date.dt.month
train['dayofyear'] = train.quote_date.dt.dayofyear
train['dayofweek'] = train.quote_date.dt.dayofweek
train['day'] = train.quote_date.dt.day
test['year'] = test.quote_date.dt.year
test['month'] = test.quote_date.dt.month
test['dayofyear'] = test.quote_date.dt.dayofyear
test['dayofweek'] = test.quote_date.dt.dayofweek
test['day'] = test.quote_date.dt.day
train = train.drop('quote_date',axis=1)
test = test.drop('quote_date',axis=1)
#I combine all files with info on components in one file.
all_comp = pd.concat([comp_adaptor, comp_boss, comp_elbow, comp_float, comp_hfl, comp_nut, comp_other,
comp_sleeve, comp_straight, comp_tee, comp_threaded])
all_comp.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2033 entries, 0 to 193 Data columns (total 29 columns): bolt_pattern_long 481 non-null float64 bolt_pattern_wide 362 non-null float64 component_id 2033 non-null object drop_length 174 non-null float64 elbow_angle 129 non-null float64 extension_length 173 non-null float64 height_over_tube 146 non-null float64 hex_nut_size 42 non-null float64 hex_size 126 non-null float64 hose_diameter 6 non-null float64 intended_nut_pitch 42 non-null float64 intended_nut_thread 42 non-null float64 length 107 non-null float64 nominal_size_1 67 non-null object nominal_size_2 132 non-null float64 nominal_size_3 8 non-null float64 overall_length 393 non-null float64 thickness 551 non-null float64 thread_pitch 65 non-null float64 thread_pitch_1 149 non-null float64 thread_pitch_2 83 non-null float64 thread_pitch_3 10 non-null float64 thread_pitch_4 1 non-null float64 thread_size 65 non-null object thread_size_1 149 non-null float64 thread_size_2 83 non-null float64 thread_size_3 10 non-null float64 thread_size_4 1 non-null float64 weight 1964 non-null float64 dtypes: float64(26), object(3) memory usage: 476.5+ KB
#Some columns have little values, some have strings and integers, so I use only general parameters
all_comp = all_comp[['component_id', 'weight', 'length', 'overall_length', 'thickness']]
all_comp.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2033 entries, 0 to 193 Data columns (total 5 columns): component_id 2033 non-null object weight 1964 non-null float64 length 107 non-null float64 overall_length 393 non-null float64 thickness 551 non-null float64 dtypes: float64(4), object(1) memory usage: 95.3+ KB
#Combine two length columns.
all_comp['overall_length'] = all_comp['overall_length'].fillna(0)
all_comp['length'] = all_comp['length'].fillna(0)
all_comp['length'] = all_comp['length'] + all_comp['overall_length']
all_comp = all_comp.drop(['overall_length'], axis=1)
all_comp['weight'] = all_comp['weight'].fillna(0)
all_comp['thickness'] = all_comp['thickness'].fillna(0)
#This is how file with components looks like
all_comp.head()
component_id | weight | length | thickness | |
---|---|---|---|---|
0 | C-0005 | 0.206 | 58.4 | 0.0 |
1 | C-0006 | 0.083 | 34.8 | 0.0 |
2 | C-1435 | 0.023 | 20.3 | 0.0 |
3 | C-1546 | 0.026 | 26.4 | 0.0 |
4 | C-1583 | 0.256 | 44.5 | 0.0 |
#Add information about tube itself and the list of components to main files.
train = pd.merge(train, tube, on='tube_assembly_id', how='left')
train = pd.merge(train, bill_of_materials, on ='tube_assembly_id', how='left')
test = pd.merge(test, tube, on='tube_assembly_id', how='left')
test = pd.merge(test, bill_of_materials, on ='tube_assembly_id', how='left')
#Rename columns so that they will be different from length of components.
train.rename(columns={'length': 'length_t'}, inplace = True)
test.rename(columns={'length': 'length_t'}, inplace = True)
#Merging to get information about components
for i in range(1, 9, 2):
suffix1 = '_' + str(i)
suffix2 = '_' + str(i + 1)
component_1 = 'component_id' + suffix1
component_2 = 'component_id' + suffix2
train = pd.merge(train, all_comp, left_on = component_1, right_on = 'component_id', how='left')
train = pd.merge(train, all_comp, left_on = component_2, right_on = 'component_id', suffixes=(suffix1, suffix2), how='left')
test = pd.merge(test, all_comp, left_on = component_1, right_on = 'component_id', how='left')
test = pd.merge(test, all_comp, left_on = component_2, right_on = 'component_id', suffixes=(suffix1, suffix2), how='left')
#Drop unnecessary columns
train.drop(['component_id_1', 'component_id_2', 'component_id_3', 'component_id_4', 'component_id_5', 'component_id_6',
'component_id_7', 'component_id_8'], axis=1, inplace=True)
test.drop(['component_id_1', 'component_id_2', 'component_id_3', 'component_id_4', 'component_id_5', 'component_id_6',
'component_id_7', 'component_id_8'], axis=1, inplace=True)
train.head()
tube_assembly_id | supplier | annual_usage | min_order_quantity | bracket_pricing | quantity | cost | year | month | dayofyear | ... | thickness_5 | weight_6 | length_6 | thickness_6 | weight_7 | length_7 | thickness_7 | weight_8 | length_8 | thickness_8 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TA-00002 | S-0066 | 0 | 0 | Yes | 1 | 21.905933 | 2013 | 7 | 188 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | TA-00002 | S-0066 | 0 | 0 | Yes | 2 | 12.341214 | 2013 | 7 | 188 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | TA-00002 | S-0066 | 0 | 0 | Yes | 5 | 6.601826 | 2013 | 7 | 188 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | TA-00002 | S-0066 | 0 | 0 | Yes | 10 | 4.687770 | 2013 | 7 | 188 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | TA-00002 | S-0066 | 0 | 0 | Yes | 25 | 3.541561 | 2013 | 7 | 188 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 49 columns
#Add descriptive information about specs.
train = pd.merge(train, specs, on='tube_assembly_id', how='left')
test = pd.merge(test, specs, on='tube_assembly_id', how='left')
#Maybe it is strange, but it turned out that tube id is quite a good feature. It seems to be data leak
train['ta_id'] = train['tube_assembly_id'].apply(lambda x: int(x.split('-')[1]))
test['ta_id'] = test['tube_assembly_id'].apply(lambda x: int(x.split('-')[1]))
train.drop(['tube_assembly_id'], axis=1, inplace=True)
test.drop(['tube_assembly_id'], axis=1, inplace=True)
I created a lot of derivative features adn tried them. The following features turned out to be good
[col for col in list(train.columns) if 'thickness' in col]
['thickness_1', 'thickness_2', 'thickness_3', 'thickness_4', 'thickness_5', 'thickness_6', 'thickness_7', 'thickness_8']
#Calculate various additional features on physical parameters. They turned out to be useful.
length_columns = [col for col in list(train.columns) if 'length' in col]
weight_columns = [col for col in list(train.columns) if 'weight' in col]
thickness_columns = [col for col in list(train.columns) if 'thickness' in col]
train['avg_w'] = train[weight_columns].mean(axis=1)
train['avg_l'] = train[length_columns].mean(axis=1)
train['avg_th'] = train[thickness_columns].mean(axis=1)
train['min_w'] = train[weight_columns].min(axis=1)
train['min_l'] = train[length_columns].min(axis=1)
train['min_th'] = train[thickness_columns].min(axis=1)
train['max_w'] = train[weight_columns].max(axis=1)
train['max_l'] = train[length_columns].max(axis=1)
train['max_th'] = train[thickness_columns].max(axis=1)
test['avg_w'] = test[weight_columns].mean(axis=1)
test['avg_l'] = test[length_columns].mean(axis=1)
test['avg_th'] = test[thickness_columns].mean(axis=1)
test['min_w'] = test[weight_columns].min(axis=1)
test['min_l'] = test[length_columns].min(axis=1)
test['min_th'] = test[thickness_columns].min(axis=1)
test['max_w'] = test[weight_columns].max(axis=1)
test['max_l'] = test[length_columns].max(axis=1)
test['max_th'] = test[thickness_columns].max(axis=1)
train['tot_w'] = train[weight_columns].sum(axis=1)
train['tot_l'] = train[length_columns].sum(axis=1)
test['tot_w'] = test[weight_columns].sum(axis=1)
test['tot_l'] = test[length_columns].sum(axis=1)
#Take log of skewered columns to smooth them and fill NA.
for col in train.columns:
if train[col].dtype != 'object':
if skew(train[col]) > 0.75:
train[col] = np.log1p(train[col])
train[col] = train[col].apply(lambda x: 0 if x == -np.inf else x)
train[col] = train[col].fillna(0)
for col in test.columns:
if test[col].dtype != 'object':
if skew(test[col]) > 0.75:
test[col] = np.log1p(test[col])
test[col] = test[col].apply(lambda x: 0 if x == -np.inf else x)
test[col] = test[col].fillna(0)
for col in train.columns:
if train[col].dtype == 'object':
train[col].replace(np.nan,' ', regex=True, inplace= True)
for col in test.columns:
if test[col].dtype == 'object':
test[col].replace(np.nan,' ', regex=True, inplace= True)
X_train = train.drop('cost',axis=1)
Y_train = train['cost']
X_test = test.drop('id', axis=1)
#Check that the columns are the same
(X_test.columns == X_train.columns).all()
True
#Convert to arrays for easier transformation
X_train = np.array(X_train)
X_test = np.array(X_test)
#Label encode the categorical variables
for i in range(X_train.shape[1]):
if i in [0, 3, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56]:
lbl = preprocessing.LabelEncoder()
lbl.fit(list(X_train[:,i]) + list(X_test[:,i]))
X_train[:,i] = lbl.transform(X_train[:,i])
X_test[:,i] = lbl.transform(X_test[:,i])
#XGB need float.
X_train = X_train.astype(float)
X_test = X_test.astype(float)
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.1
params['min_child_weight'] = 5
params['subsample'] = 1.0
params['scale_pos_weight'] = 1.0
params['silent'] = 1
params['max_depth'] = 7
param = list(params.items())
xgtrain = xgb.DMatrix(X_train, label=Y_train)
xgtest = xgb.DMatrix(X_test)
num_rounds = 1200
model = xgb.train(param, xgtrain, num_rounds)
preds = np.expm1(model.predict(xgtest))
preds_df = pd.DataFrame({'id': test['id'], 'cost': preds})
preds_df.to_csv('Caterpillar.csv', index=False)
#0.229153 from ~0.19
This competition has already ended, but people still can submit their solutions and see their scores. First places have a score ~0.19.
My model got a score of 0.229153.