#!/usr/bin/env python
# coding: utf-8

# ## Caterpillar Tube Pricing
#
# In this competition we have data about tube assemblies which suppliers deliver to Caterpillar. The data is comprised of many files describing these tube assemblies.
#
# - train_set and test_set contain information on price quotes from suppliers. Prices can be quoted in 2 ways: bracket and non-bracket pricing. Bracket pricing has multiple levels of purchase based on quantity (in other words, the cost is given assuming a purchase of quantity tubes). Non-bracket pricing has a minimum order amount (min_order) for which the price would apply. Each quote is issued with an annual_usage, an estimate of how many tube assemblies will be purchased in a given year;
# - tube contains information on physical parameters of tube assemblies;
# - bill_of_materials contains the list of components, and their quantities, used on each tube assembly;
# - specs contains the list of unique specifications for the tube assembly;
# - tube_end_form contains the list of end types which are physically formed utilizing only the wall of the tube;
# - components contains the list of all of the components used;
# - type... contain the names for each feature;
# - comp... contains information on physical parameters of components by their type;
#
# The goal of the competition is to predict prices of tube assemblies based on this information.
#
# The main challenge of this competition is combining the data and choosing/creating features. After this I use XGBoost for prediction.
#
# The metric to calculate the accuracy of predictions is Root Mean Squared Logarithmic Error (it penalizes an under-predicted estimate more heavily than an over-predicted estimate).
# # The RMSLE is calculated as
#
# $$
# \epsilon = \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 }
# $$
#
# Where:
#
# \\(\epsilon\\) is the RMSLE value (score);
# \\(n\\) is the number of observations;
# \\(p_i\\) is the prediction;
# \\(a_i\\) is the actual response for \\(i\\);
# \\(\log(x)\\) is the natural logarithm of \\(x\\)

# 1. [Data exploration](#datex)
#
# 1.1 [Bill of materials](#bom)
#
# 1.2 [Comp Adaptor](#comad)
#
# 1.3 [Comp Boss](#combo)
#
# 1.4 [Comp Hfl](#comhf)
#
# 1.5 [Comp Elbow](#comel)
#
# 1.6 [Comp Float](#comfl)
#
# 1.7 [Comp Nut](#comnu)
#
# 1.8 [Comp Other](#comot)
#
# 1.9 [Comp Sleeve](#comsl)
#
# 1.10 [Comp Straight](#comst)
#
# 1.11 [Comp Tee](#comte)
#
# 1.12 [Comp Threaded](#comth)
#
# 1.13 [Tube](#tube)
#
# 2. [Data preparation](#datprep)
#
# 3. [Model](#model)

# In[1]:

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
import glob

# ## Data exploration
#
# An easy way to read all the files into separate variables is to get the list
# of files and loop through it.

# In[2]:

glob.glob('Kaggle/Data_Kaggle/Caterpillar Tube Pricing/*.csv')

# In[3]:

# Read each file into a separate data frame.
# All the competition files live in one folder, so build each path from a
# single base directory.
data_dir = 'Kaggle/Data_Kaggle/Caterpillar Tube Pricing/'

bill_of_materials = pd.read_csv(data_dir + 'bill_of_materials.csv')
components = pd.read_csv(data_dir + 'components.csv')
comp_adaptor = pd.read_csv(data_dir + 'comp_adaptor.csv')
comp_boss = pd.read_csv(data_dir + 'comp_boss.csv')
comp_elbow = pd.read_csv(data_dir + 'comp_elbow.csv')
comp_float = pd.read_csv(data_dir + 'comp_float.csv')
comp_hfl = pd.read_csv(data_dir + 'comp_hfl.csv')
comp_nut = pd.read_csv(data_dir + 'comp_nut.csv')
comp_other = pd.read_csv(data_dir + 'comp_other.csv')
comp_sleeve = pd.read_csv(data_dir + 'comp_sleeve.csv')
comp_straight = pd.read_csv(data_dir + 'comp_straight.csv')
comp_tee = pd.read_csv(data_dir + 'comp_tee.csv')
comp_threaded = pd.read_csv(data_dir + 'comp_threaded.csv')
specs = pd.read_csv(data_dir + 'specs.csv')
tube = pd.read_csv(data_dir + 'tube.csv')
tube_end_form = pd.read_csv(data_dir + 'tube_end_form.csv')
type_component = pd.read_csv(data_dir + 'type_component.csv')
type_connection = pd.read_csv(data_dir + 'type_connection.csv')
type_end_form = pd.read_csv(data_dir + 'type_end_form.csv')

# quote_date sits at a different position in the two files (presumably test_set
# carries an extra leading 'id' column -- verify against the raw CSVs).
train = pd.read_csv(data_dir + 'train_set.csv', parse_dates=[2, ])
test = pd.read_csv(data_dir + 'test_set.csv', parse_dates=[3, ])

# ## Bill of materials

# In[4]:

# The file contains information about components of tube assemblies.
# All information is necessary.
# Missing values could be filled only with 0, but it isn't necessary.
bill_of_materials.info()

# In[5]:

# Simply to see the line with all 8 components.
bill_of_materials.loc[bill_of_materials.quantity_8.notnull()]

# ## Comp Adaptor

# In[6]:

# There are columns with too few non-null values. But it is necessary to see more.
comp_adaptor.info()

# In[7]:

comp_adaptor

# component_type_id, end_form_id_1, connection_type_id_1, end_form_id_2,
# connection_type_id_2 - descriptive information, it is useless in all files. Drop.
#
# adaptor_angle - only one value. Drop.
#
# unique_feature, orientation - categorical description. Drop.
#
# C-1695 has abnormal values. And only one tube assembly has it. So I'll drop it.
#
# C-0443 has NaN weight. And it belongs to only one tube assembly, which has two
# components, both with NaN weight. So drop.
#
# C-1868. It seems that for some reason overall_length wasn't calculated. I'll
# calculate it as a sum of length_1 and length_2.
# Drop length_1 and length_2 after using.

# In[8]:

comp_adaptor.drop(['adaptor_angle', 'component_type_id', 'end_form_id_1',
                   'connection_type_id_1', 'length_1', 'length_2',
                   'unique_feature', 'orientation', 'end_form_id_2',
                   'connection_type_id_2'], axis=1, inplace=True)
# Could use the length_1 + length_2 formula, but it is a single value.
comp_adaptor.loc[comp_adaptor['overall_length'].isnull(), 'overall_length'] = 93.5
# Positional rows 8 and 21 -- presumably C-1695 and C-0443 (see notes above).
comp_adaptor.drop(comp_adaptor.index[[8, 21]], inplace=True)
comp_adaptor

# ## Comp Boss

# In[9]:

# Descriptive and categorical features will be dropped.
comp_boss.info()

# In[10]:

# Use only important information.
comp_boss = comp_boss[['component_id', 'height_over_tube', 'weight']]
comp_boss.info()

# I wrote the following to check for outliers. Sometimes there are values which
# are too big. It is either a mistake or it is due to an unknown reason. I'll
# drop such values. The threshold is arbitrary.
# In[11]:

def report_outliers(df, factor=4):
    """Print each numeric column of *df* containing a value above ``factor``
    times the column mean, plus the row(s) holding the column maximum.

    Purely a manual-review aid: the actual row drops below are done by hand
    from this output.  Uses a boolean mask directly (instead of selecting the
    values and relying on their truthiness) so a legitimate 0 value passing
    the filter would still be reported.
    """
    for col in df.columns:
        if df[col].dtype != object:
            if (df[col] > factor * df[col].mean()).any():
                print(col)
                print(df.loc[df[col] == df[col].max()])

report_outliers(comp_boss)

# In[12]:

# Drop the row with a too-big value. I don't drop weight, because it could be reasonable.
comp_boss.drop(comp_boss.index[31], inplace=True)
comp_boss.head()

# ## Comp Hfl

# In[13]:

comp_hfl.info()

# In[14]:

comp_hfl

# In[15]:

# It seems that only three columns are necessary.
comp_hfl = comp_hfl[['component_id', 'hose_diameter', 'weight']]
comp_hfl

# ## Comp Elbow

# In[16]:

comp_elbow.info()

# In[17]:

comp_elbow.head()

# In[18]:

# Most of the columns aren't necessary.
comp_elbow.drop(['component_type_id', 'mj_class_code', 'mj_plug_class_code',
                 'plug_diameter', 'groove', 'unique_feature', 'orientation'],
                axis=1, inplace=True)

# In[19]:

report_outliers(comp_elbow)

# In[20]:

comp_elbow.drop(comp_elbow.index[52], inplace=True)

# ## Comp Float

# In[21]:

comp_float.info()

# In[22]:

# Drop description.
comp_float.drop(['component_type_id', 'orientation'], axis=1, inplace=True)
comp_float

# ## Comp Nut

# In[23]:

comp_nut.info()

# In[24]:

comp_nut.drop(['component_type_id', 'seat_angle', 'diameter', 'blind_hole',
               'orientation'], axis=1, inplace=True)
comp_nut.head()

# In[25]:

report_outliers(comp_nut)

# ## Comp Other

# In[26]:

comp_other.info()

# In[27]:

# Drop description.
comp_other.drop(['part_name'], axis=1, inplace=True)
comp_other.head()

# ## Comp Sleeve

# In[28]:

comp_sleeve.info()

# In[29]:

comp_sleeve.drop(['component_type_id', 'connection_type_id', 'unique_feature',
                  'plating', 'orientation'], axis=1, inplace=True)
comp_sleeve.head()

# In[30]:

# Flag numeric columns holding values above four times the column mean and show
# the row with the column maximum (manual outlier review).
for col in comp_sleeve.columns:
    if comp_sleeve[col].dtype == object:
        continue
    if comp_sleeve[col][comp_sleeve[col] > 4 * comp_sleeve[col].mean()].any():
        print(col)
        print(comp_sleeve.loc[comp_sleeve[col] == comp_sleeve[col].max()])

# In[31]:

comp_sleeve.drop(comp_sleeve.index[[28, 29, 30, 31, 32, 33, 34, 48]], inplace=True)

# ## Comp Straight

# In[32]:

comp_straight.info()

# In[33]:

comp_straight.drop(['component_type_id', 'overall_length', 'mj_class_code',
                    'head_diameter', 'unique_feature', 'groove', 'orientation'],
                   axis=1, inplace=True)
comp_straight.head()

# In[34]:

for col in comp_straight.columns:
    if comp_straight[col].dtype == object:
        continue
    if comp_straight[col][comp_straight[col] > 4 * comp_straight[col].mean()].any():
        print(col)
        print(comp_straight.loc[comp_straight[col] == comp_straight[col].max()])

# ## Comp Tee

# In[35]:

comp_tee.info()

# In[36]:

comp_tee.drop(['component_type_id', 'mj_class_code', 'mj_plug_class_code',
               'groove', 'unique_feature', 'orientation'], axis=1, inplace=True)
comp_tee

# In[37]:

for col in comp_tee.columns:
    if comp_tee[col].dtype == object:
        continue
    if comp_tee[col][comp_tee[col] > 4 * comp_tee[col].mean()].any():
        print(col)
        print(comp_tee.loc[comp_tee[col] == comp_tee[col].max()])

# ## Comp Threaded

# In[38]:

comp_threaded.info()

# In[39]:

comp_threaded.drop(['component_type_id', 'adaptor_angle', 'end_form_id_1',
                    'connection_type_id_1', 'end_form_id_2', 'connection_type_id_2',
                    'end_form_id_3', 'connection_type_id_3', 'end_form_id_4',
                    'connection_type_id_4', 'nominal_size_4', 'unique_feature',
                    'orientation'], axis=1, inplace=True)
comp_threaded.head()

# In[40]:

# There are five columns with length, so fill NA with 0, sum the lengths, and
# drop the excess columns.
length_parts = ['length_1', 'length_2', 'length_3', 'length_4']
for col in length_parts + ['overall_length']:
    comp_threaded[col] = comp_threaded[col].fillna(0)
# With NA already replaced by 0, the row-wise sum equals the chained addition.
comp_threaded['overall_length'] = (comp_threaded['overall_length']
                                   + comp_threaded[length_parts].sum(axis=1))
comp_threaded.drop(length_parts, axis=1, inplace=True)

# In[41]:

# Manual outlier review (values above four times the column mean).
for col in comp_threaded.columns:
    if comp_threaded[col].dtype == object:
        continue
    if comp_threaded[col][comp_threaded[col] > 4 * comp_threaded[col].mean()].any():
        print(col)
        print(comp_threaded.loc[comp_threaded[col] == comp_threaded[col].max()])

# In[42]:

comp_threaded.drop(comp_threaded.index[[40, 90]], inplace=True)

# ## Tube

# In[43]:

tube.info()

# In[44]:

tube.drop(['material_id', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x',
           'end_a', 'end_x', 'num_boss', 'num_bracket', 'other'],
          axis=1, inplace=True)
tube.head()

# In[45]:

for col in tube.columns:
    if tube[col].dtype == object:
        continue
    if tube[col][tube[col] > 4 * tube[col].mean()].any():
        print(col)
        print(tube.loc[tube[col] == tube[col].max()])

# In[46]:

tube.drop(tube.index[[15132, 15174, 15175, 17688, 17689, 18002, 18003, 19320]],
          inplace=True)

# These files contain only descriptions, so I don't use them:
#
# tube_end_form
#
# type_component
#
# type_connection
#
# type_end_form
#
# components

# # Data preparation

# In[47]:

# Create several features from dates for additional information.
# Expand the quote date into calendar features, then drop the raw date.  The
# attribute loop adds the columns in the same order for both frames.
for date_part in ('year', 'month', 'dayofyear', 'dayofweek', 'day'):
    train[date_part] = getattr(train.quote_date.dt, date_part)
    test[date_part] = getattr(test.quote_date.dt, date_part)
train = train.drop('quote_date', axis=1)
test = test.drop('quote_date', axis=1)

# In[48]:

# Stack all the per-type component tables into a single table.
all_comp = pd.concat([comp_adaptor, comp_boss, comp_elbow, comp_float, comp_hfl,
                      comp_nut, comp_other, comp_sleeve, comp_straight,
                      comp_tee, comp_threaded])

# In[49]:

all_comp.info()

# In[50]:

# Some columns have few values, some mix strings and integers, so keep only the
# general physical parameters.
all_comp = all_comp[['component_id', 'weight', 'length', 'overall_length', 'thickness']]
all_comp.info()

# In[51]:

# Combine the two length columns into one, then fill the remaining NA with 0.
all_comp['overall_length'] = all_comp['overall_length'].fillna(0)
all_comp['length'] = all_comp['length'].fillna(0)
all_comp['length'] = all_comp['length'] + all_comp['overall_length']
all_comp = all_comp.drop(['overall_length'], axis=1)
all_comp['weight'] = all_comp['weight'].fillna(0)
all_comp['thickness'] = all_comp['thickness'].fillna(0)

# In[52]:

# This is how the components table looks now.
all_comp.head()

# In[53]:

# Attach the tube parameters and the component list to the main tables.
train = pd.merge(train, tube, on='tube_assembly_id', how='left')
train = pd.merge(train, bill_of_materials, on='tube_assembly_id', how='left')
test = pd.merge(test, tube, on='tube_assembly_id', how='left')
test = pd.merge(test, bill_of_materials, on='tube_assembly_id', how='left')

# In[54]:

# Rename columns so that they will be different from the length of components.
train.rename(columns={'length': 'length_t'}, inplace=True)
test.rename(columns={'length': 'length_t'}, inplace=True)

# In[55]:

# Merge component parameters for the eight component_id_* slots, two slots at a
# time: the first merge of each pair leaves its new columns unsuffixed, and the
# second merge then attaches the slot-number suffixes to the overlapping
# columns, yielding weight_i / length_i / thickness_i per slot.
for slot in range(1, 9, 2):
    odd_suffix = '_' + str(slot)
    even_suffix = '_' + str(slot + 1)
    odd_key = 'component_id' + odd_suffix
    even_key = 'component_id' + even_suffix
    train = pd.merge(train, all_comp, left_on=odd_key,
                     right_on='component_id', how='left')
    train = pd.merge(train, all_comp, left_on=even_key,
                     right_on='component_id',
                     suffixes=(odd_suffix, even_suffix), how='left')
    test = pd.merge(test, all_comp, left_on=odd_key,
                    right_on='component_id', how='left')
    test = pd.merge(test, all_comp, left_on=even_key,
                    right_on='component_id',
                    suffixes=(odd_suffix, even_suffix), how='left')

# In[56]:

# Drop the raw component ids; their parameters are already merged in.
slot_id_columns = ['component_id_' + str(i) for i in range(1, 9)]
train.drop(slot_id_columns, axis=1, inplace=True)
test.drop(slot_id_columns, axis=1, inplace=True)
train.head()

# In[57]:

# Add descriptive information about specs.
train = pd.merge(train, specs, on='tube_assembly_id', how='left')
test = pd.merge(test, specs, on='tube_assembly_id', how='left')

# In[58]:

# Maybe it is strange, but it turned out that the tube id is quite a good
# feature. It seems to be a data leak.
train['ta_id'] = train['tube_assembly_id'].apply(lambda x: int(x.split('-')[1]))
test['ta_id'] = test['tube_assembly_id'].apply(lambda x: int(x.split('-')[1]))
train.drop(['tube_assembly_id'], axis=1, inplace=True)
test.drop(['tube_assembly_id'], axis=1, inplace=True)

# I created a lot of derivative features and tried them. The following features
# turned out to be good.

# In[59]:

[col for col in list(train.columns) if 'thickness' in col]

# In[60]:

# Calculate various additional features on physical parameters.
# They turned out to be useful.
length_columns = [col for col in list(train.columns) if 'length' in col]
weight_columns = [col for col in list(train.columns) if 'weight' in col]
thickness_columns = [col for col in list(train.columns) if 'thickness' in col]

# Aggregate the per-component physical parameters.  The column lists are built
# from train and reused for test; the new columns are appended in the same
# order for both frames (the positional label-encoding below depends on the
# column order).
for frame in (train, test):
    frame['avg_w'] = frame[weight_columns].mean(axis=1)
    frame['avg_l'] = frame[length_columns].mean(axis=1)
    frame['avg_th'] = frame[thickness_columns].mean(axis=1)
    frame['min_w'] = frame[weight_columns].min(axis=1)
    frame['min_l'] = frame[length_columns].min(axis=1)
    frame['min_th'] = frame[thickness_columns].min(axis=1)
    frame['max_w'] = frame[weight_columns].max(axis=1)
    frame['max_l'] = frame[length_columns].max(axis=1)
    frame['max_th'] = frame[thickness_columns].max(axis=1)
    frame['tot_w'] = frame[weight_columns].sum(axis=1)
    frame['tot_l'] = frame[length_columns].sum(axis=1)

# In[61]:

# Take the log of skewed columns to smooth them and fill NA.
# Log-transform skewed numeric columns and fill NA.
#
# NOTE(review): the original applied the skew test to train and test
# independently, so a feature could end up log-scaled in one frame but raw in
# the other.  The transform decision is now made once (on train, for columns
# present there) and applied to both frames so they stay on the same scale.
skewed_columns = set()
for col in train.columns:
    # scipy.stats.skew propagates NaN, so columns containing NaN come out as
    # nan > 0.75 == False and are left untransformed (matches the original).
    if train[col].dtype != 'object' and skew(train[col]) > 0.75:
        skewed_columns.add(col)
for col in test.columns:
    # Columns that exist only in test (e.g. 'id') keep their own decision.
    if col not in train.columns and test[col].dtype != 'object' \
            and skew(test[col]) > 0.75:
        skewed_columns.add(col)

for frame in (train, test):
    for col in frame.columns:
        if frame[col].dtype != 'object':
            if col in skewed_columns:
                frame[col] = np.log1p(frame[col])
                # log1p(-1) gives -inf; map it back to 0.
                frame[col] = frame[col].apply(lambda x: 0 if x == -np.inf else x)
            frame[col] = frame[col].fillna(0)

# In[62]:

# Replace NA in the categorical columns with a blank placeholder.
for frame in (train, test):
    for col in frame.columns:
        if frame[col].dtype == 'object':
            frame[col].replace(np.nan, ' ', regex=True, inplace=True)

# In[63]:

X_train = train.drop('cost', axis=1)
Y_train = train['cost']
X_test = test.drop('id', axis=1)

# In[64]:

# Check that the columns are the same.
(X_test.columns == X_train.columns).all()

# In[65]:

# Convert to arrays for easier transformation.
X_train = np.array(X_train)
X_test = np.array(X_test)

# In[66]:

# Label-encode the categorical variables.  The hard-coded indices are the
# positions of the object-typed columns in the current column order -- fragile;
# re-check them whenever the feature set changes.
for i in range(X_train.shape[1]):
    if i in [0, 3, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56]:
        lbl = preprocessing.LabelEncoder()
        # Fit on train and test together so labels unseen in train still encode.
        lbl.fit(list(X_train[:, i]) + list(X_test[:, i]))
        X_train[:, i] = lbl.transform(X_train[:, i])
        X_test[:, i] = lbl.transform(X_test[:, i])

# In[67]:

# XGBoost needs float.
X_train = X_train.astype(float)
X_test = X_test.astype(float)

# ## Model

# In[68]:

# XGBoost hyper-parameters.  'reg:linear' is the historical name of the
# squared-error regression objective (called 'reg:squarederror' in newer
# xgboost releases).
params = {
    'objective': 'reg:linear',
    'eta': 0.1,
    'min_child_weight': 5,
    'subsample': 1.0,
    'scale_pos_weight': 1.0,
    'silent': 1,
    'max_depth': 7,
}
param = list(params.items())

# In[69]:

xgtrain = xgb.DMatrix(X_train, label=Y_train)
xgtest = xgb.DMatrix(X_test)

# In[70]:

num_rounds = 1200
model = xgb.train(param, xgtrain, num_rounds)
# Predictions come out on the log1p scale (assumes 'cost' was log-transformed
# by the skew-smoothing step earlier -- confirm), so invert with expm1.
preds = np.expm1(model.predict(xgtest))

# In[71]:

preds_df = pd.DataFrame({'id': test['id'], 'cost': preds})
preds_df.to_csv('Caterpillar.csv', index=False)
# Scored 0.229153; first places are ~0.19.

# This competition has already ended, but people still can submit their
# solutions and see their scores. First places have a score of ~0.19.
#
# My model got a score of 0.229153.