Import the pandas, scikit-learn, numpy and category_encoders libraries.
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import category_encoders as ce
Need to define the headers ourselves, since the data file does not contain any.
# The 26 column names for the auto dataset, in file order.
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
    "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
    "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
    "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price",
]
Read in the data from the url, add headers and convert ? to nan values
# Fetch the raw dataset, attach our column names, and treat "?" as missing.
data_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/"
            "autos/imports-85.data")
df = pd.read_csv(data_url, header=None, names=headers, na_values="?")
df.head()
symboling | normalized_losses | make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | wheel_base | ... | engine_size | fuel_system | bore | stroke | compression_ratio | horsepower | peak_rpm | city_mpg | highway_mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | NaN | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 | 13495.0 |
1 | 3 | NaN | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 | 16500.0 |
2 | 1 | NaN | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154.0 | 5000.0 | 19 | 26 | 16500.0 |
3 | 2 | 164.0 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102.0 | 5500.0 | 24 | 30 | 13950.0 |
4 | 2 | 164.0 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115.0 | 5500.0 | 18 | 22 | 17450.0 |
5 rows × 26 columns
Look at the data types contained in the dataframe
# Inspect each column's dtype; the text columns show up as ``object``.
df.dtypes
symboling int64 normalized_losses float64 make object fuel_type object aspiration object num_doors object body_style object drive_wheels object engine_location object wheel_base float64 length float64 width float64 height float64 curb_weight int64 engine_type object num_cylinders object engine_size int64 fuel_system object bore float64 stroke float64 compression_ratio float64 horsepower float64 peak_rpm float64 city_mpg int64 highway_mpg int64 price float64 dtype: object
Create a copy of the data with only the object columns.
# Work on a standalone copy holding only the text (object-dtype) columns.
object_columns = df.select_dtypes(include=['object'])
obj_df = object_columns.copy()
obj_df.head()
make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | engine_type | num_cylinders | fuel_system | |
---|---|---|---|---|---|---|---|---|---|---|
0 | alfa-romero | gas | std | two | convertible | rwd | front | dohc | four | mpfi |
1 | alfa-romero | gas | std | two | convertible | rwd | front | dohc | four | mpfi |
2 | alfa-romero | gas | std | two | hatchback | rwd | front | ohcv | six | mpfi |
3 | audi | gas | std | four | sedan | fwd | front | ohc | four | mpfi |
4 | audi | gas | std | four | sedan | 4wd | front | ohc | five | mpfi |
Check for null values in the data
# Show every row that still contains at least one missing value.
obj_df.loc[obj_df.isnull().any(axis=1)]
make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | engine_type | num_cylinders | fuel_system | |
---|---|---|---|---|---|---|---|---|---|---|
27 | dodge | gas | turbo | NaN | sedan | fwd | front | ohc | four | mpfi |
63 | mazda | diesel | std | NaN | sedan | fwd | front | ohc | four | idi |
Since the num_doors column contains the null values, look at what the current value options are.
obj_df["num_doors"].value_counts()
four 114 two 89 Name: num_doors, dtype: int64
We will fill in the doors value with the most common element - four.
# Impute the missing door counts with the most common value, "four",
# then re-run the null scan to confirm nothing is left.
obj_df["num_doors"] = obj_df["num_doors"].fillna("four")
obj_df.loc[obj_df.isnull().any(axis=1)]
make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | engine_type | num_cylinders | fuel_system |
---|
Convert the num_cylinders and num_doors values to numbers
obj_df["num_cylinders"].value_counts()
four 159 six 24 five 11 eight 5 two 4 three 1 twelve 1 Name: num_cylinders, dtype: int64
cleanup_nums = {"num_doors": {"four": 4, "two": 2},
"num_cylinders": {"four": 4, "six": 6, "five": 5, "eight": 8,
"two": 2, "twelve": 12, "three":3 }}
obj_df = obj_df.replace(cleanup_nums)
obj_df.head()
make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | engine_type | num_cylinders | fuel_system | |
---|---|---|---|---|---|---|---|---|---|---|
0 | alfa-romero | gas | std | 2 | convertible | rwd | front | dohc | 4 | mpfi |
1 | alfa-romero | gas | std | 2 | convertible | rwd | front | dohc | 4 | mpfi |
2 | alfa-romero | gas | std | 2 | hatchback | rwd | front | ohcv | 6 | mpfi |
3 | audi | gas | std | 4 | sedan | fwd | front | ohc | 4 | mpfi |
4 | audi | gas | std | 4 | sedan | 4wd | front | ohc | 5 | mpfi |
# Confirm num_doors and num_cylinders are now int64 after the replace().
obj_df.dtypes
make object fuel_type object aspiration object num_doors int64 body_style object drive_wheels object engine_location object engine_type object num_cylinders int64 fuel_system object dtype: object
One approach to encoding labels is to convert the values to a pandas category
obj_df["body_style"].value_counts()
sedan 96 hatchback 70 wagon 25 hardtop 8 convertible 6 Name: body_style, dtype: int64
obj_df["body_style"] = obj_df["body_style"].astype('category')
obj_df.dtypes
make object fuel_type object aspiration object num_doors int64 body_style category drive_wheels object engine_location object engine_type object num_cylinders int64 fuel_system object dtype: object
We can assign the category codes to a new column so we have a clean numeric representation
obj_df["body_style_cat"] = obj_df["body_style"].cat.codes
obj_df.head()
make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | engine_type | num_cylinders | fuel_system | body_style_cat | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | alfa-romero | gas | std | 2 | convertible | rwd | front | dohc | 4 | mpfi | 0 |
1 | alfa-romero | gas | std | 2 | convertible | rwd | front | dohc | 4 | mpfi | 0 |
2 | alfa-romero | gas | std | 2 | hatchback | rwd | front | ohcv | 6 | mpfi | 2 |
3 | audi | gas | std | 4 | sedan | fwd | front | ohc | 4 | mpfi | 3 |
4 | audi | gas | std | 4 | sedan | 4wd | front | ohc | 5 | mpfi | 3 |
# body_style is category dtype and body_style_cat holds its int8 codes.
obj_df.dtypes
make object fuel_type object aspiration object num_doors int64 body_style category drive_wheels object engine_location object engine_type object num_cylinders int64 fuel_system object body_style_cat int8 dtype: object
In order to do one hot encoding, use pandas get_dummies
# One-hot encode drive_wheels: each category becomes its own indicator column.
drive_dummies = pd.get_dummies(obj_df, columns=["drive_wheels"])
drive_dummies.head()
make | fuel_type | aspiration | num_doors | body_style | engine_location | engine_type | num_cylinders | fuel_system | body_style_cat | drive_wheels_4wd | drive_wheels_fwd | drive_wheels_rwd | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | alfa-romero | gas | std | 2 | convertible | front | dohc | 4 | mpfi | 0 | 0 | 0 | 1 |
1 | alfa-romero | gas | std | 2 | convertible | front | dohc | 4 | mpfi | 0 | 0 | 0 | 1 |
2 | alfa-romero | gas | std | 2 | hatchback | front | ohcv | 6 | mpfi | 2 | 0 | 0 | 1 |
3 | audi | gas | std | 4 | sedan | front | ohc | 4 | mpfi | 3 | 0 | 1 | 0 |
4 | audi | gas | std | 4 | sedan | front | ohc | 5 | mpfi | 3 | 1 | 0 | 0 |
get_dummies has options for selecting the columns and adding prefixes to make the resulting data easier to understand.
# Encode two columns at once, shortening the generated names with prefixes.
encoded = pd.get_dummies(obj_df,
                         columns=["body_style", "drive_wheels"],
                         prefix=["body", "drive"])
encoded.head()
make | fuel_type | aspiration | num_doors | engine_location | engine_type | num_cylinders | fuel_system | body_style_cat | body_convertible | body_hardtop | body_hatchback | body_sedan | body_wagon | drive_4wd | drive_fwd | drive_rwd | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | alfa-romero | gas | std | 2 | front | dohc | 4 | mpfi | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | alfa-romero | gas | std | 2 | front | dohc | 4 | mpfi | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | alfa-romero | gas | std | 2 | front | ohcv | 6 | mpfi | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
3 | audi | gas | std | 4 | front | ohc | 4 | mpfi | 3 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
4 | audi | gas | std | 4 | front | ohc | 5 | mpfi | 3 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
obj_df["engine_type"].value_counts()
ohc 148 ohcf 15 ohcv 13 l 12 dohc 12 rotor 4 dohcv 1 Name: engine_type, dtype: int64
Use np.where and the str accessor to flag the overhead-cam (ohc) engine variants in one efficient line.
obj_df["OHC_Code"] = np.where(obj_df["engine_type"].str.contains("ohc"), 1, 0)
obj_df[["make", "engine_type", "OHC_Code"]].head(20)
make | engine_type | OHC_Code | |
---|---|---|---|
0 | alfa-romero | dohc | 1 |
1 | alfa-romero | dohc | 1 |
2 | alfa-romero | ohcv | 1 |
3 | audi | ohc | 1 |
4 | audi | ohc | 1 |
5 | audi | ohc | 1 |
6 | audi | ohc | 1 |
7 | audi | ohc | 1 |
8 | audi | ohc | 1 |
9 | audi | ohc | 1 |
10 | bmw | ohc | 1 |
11 | bmw | ohc | 1 |
12 | bmw | ohc | 1 |
13 | bmw | ohc | 1 |
14 | bmw | ohc | 1 |
15 | bmw | ohc | 1 |
16 | bmw | ohc | 1 |
17 | bmw | ohc | 1 |
18 | chevrolet | l | 0 |
19 | chevrolet | ohc | 1 |
Instantiate the OrdinalEncoder
# Fit an OrdinalEncoder on the make column: each manufacturer gets a float code.
ord_enc = OrdinalEncoder()
make_codes = ord_enc.fit_transform(obj_df[["make"]])
obj_df["make_code"] = make_codes
obj_df[["make", "make_code"]].head(11)
make | make_code | |
---|---|---|
0 | alfa-romero | 0.0 |
1 | alfa-romero | 0.0 |
2 | alfa-romero | 0.0 |
3 | audi | 1.0 |
4 | audi | 1.0 |
5 | audi | 1.0 |
6 | audi | 1.0 |
7 | audi | 1.0 |
8 | audi | 1.0 |
9 | audi | 1.0 |
10 | bmw | 2.0 |
To accomplish something similar to pandas get_dummies, use OneHotEncoder
# OneHotEncoder mirrors get_dummies but plugs into scikit-learn pipelines.
oe_style = OneHotEncoder()
# Fit and transform in one step; keep the fitted encoder for its categories_.
oe_results = oe_style.fit_transform(obj_df[["body_style"]])
The results are an array that needs to be converted to a DataFrame
# fit_transform returns a sparse matrix; densify it to see the indicator values.
oe_results.toarray()
array([[1., 0., 0., 0., 0.], [1., 0., 0., 0., 0.], [0., 0., 1., 0., 0.], ..., [0., 0., 0., 1., 0.], [0., 0., 0., 1., 0.], [0., 0., 0., 1., 0.]])
# Wrap the dense array in a DataFrame. categories_ is a list holding one array
# per encoded column, so take element 0 to get a flat column Index rather than
# an accidental single-level MultiIndex.
pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_[0]).head()
convertible | hardtop | hatchback | sedan | wagon | |
---|---|---|---|---|---|
0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
category_encoders library
# Get a new clean dataframe: keep only the object-dtype (text) columns so the
# helper columns added above don't leak into the next examples.
text_cols = df.select_dtypes(include=['object'])
obj_df = text_cols.copy()
obj_df.head()
make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | engine_type | num_cylinders | fuel_system | |
---|---|---|---|---|---|---|---|---|---|---|
0 | alfa-romero | gas | std | two | convertible | rwd | front | dohc | four | mpfi |
1 | alfa-romero | gas | std | two | convertible | rwd | front | dohc | four | mpfi |
2 | alfa-romero | gas | std | two | hatchback | rwd | front | ohcv | six | mpfi |
3 | audi | gas | std | four | sedan | fwd | front | ohc | four | mpfi |
4 | audi | gas | std | four | sedan | 4wd | front | ohc | five | mpfi |
Try out the Backward Difference Encoder on the engine_type column
# Specify the columns to encode then fit and transform
encoder = ce.BackwardDifferenceEncoder(cols=["engine_type"])
encoder.fit(obj_df, verbose=1)
/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead elif pd.api.types.is_categorical(cols):
BackwardDifferenceEncoder(cols=['engine_type'], mapping=[{'col': 'engine_type', 'mapping': engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \ 1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 4 0.142857 0.285714 0.428571 -0.428571 -0.285714 5 0.142857 0.285714 0.428571 0.571429 -0.285714 6 0.142857 0.285714 0.428571 0.571429 0.714286 7 0.142857 0.285714 0.428571 0.571429 0.714286 -1 0.000000 0.000000 0.000000 0.000000 0.000000 -2 0.000000 0.000000 0.000000 0.000000 0.000000 engine_type_5 1 -0.142857 2 -0.142857 3 -0.142857 4 -0.142857 5 -0.142857 6 -0.142857 7 0.857143 -1 0.000000 -2 0.000000 }])
# Transform and slice out just the generated engine_type contrast columns.
encoder.fit_transform(obj_df).iloc[:,8:14].head()
/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead elif pd.api.types.is_categorical(cols):
engine_type_0 | engine_type_1 | engine_type_2 | engine_type_3 | engine_type_4 | engine_type_5 | |
---|---|---|---|---|---|---|
0 | -0.857143 | -0.714286 | -0.571429 | -0.428571 | -0.285714 | -0.142857 |
1 | -0.857143 | -0.714286 | -0.571429 | -0.428571 | -0.285714 | -0.142857 |
2 | 0.142857 | -0.714286 | -0.571429 | -0.428571 | -0.285714 | -0.142857 |
3 | 0.142857 | 0.285714 | -0.571429 | -0.428571 | -0.285714 | -0.142857 |
4 | 0.142857 | 0.285714 | -0.571429 | -0.428571 | -0.285714 | -0.142857 |
Another approach is to use a polynomial encoding.
# Polynomial (orthogonal) contrasts: another contrast-coding scheme, most
# meaningful when the category levels have a natural ordering.
encoder = ce.polynomial.PolynomialEncoder(cols=["engine_type"])
transformed = encoder.fit_transform(obj_df, verbose=1)
transformed.iloc[:, 8:14].head()
/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead elif pd.api.types.is_categorical(cols):
engine_type_0 | engine_type_1 | engine_type_2 | engine_type_3 | engine_type_4 | engine_type_5 | |
---|---|---|---|---|---|---|
0 | -0.566947 | 0.545545 | -0.408248 | 0.241747 | -0.109109 | 0.032898 |
1 | -0.566947 | 0.545545 | -0.408248 | 0.241747 | -0.109109 | 0.032898 |
2 | -0.377964 | 0.000000 | 0.408248 | -0.564076 | 0.436436 | -0.197386 |
3 | -0.188982 | -0.327327 | 0.408248 | 0.080582 | -0.545545 | 0.493464 |
4 | -0.188982 | -0.327327 | 0.408248 | 0.080582 | -0.545545 | 0.493464 |
Show an example of how to incorporate the encoding strategies into a scikit-learn pipeline
# for the purposes of this analysis, only use a small subset of features
feature_cols = ['fuel_type', 'make', 'aspiration', 'highway_mpg',
                'city_mpg', 'curb_weight', 'drive_wheels']
# Remove the empty price rows so the regression target has no missing values.
df_ml = df.dropna(subset=['price'])
X = df_ml[feature_cols]
y = df_ml['price']
# One-hot the nominal columns, ordinal-encode aspiration, and pass the
# remaining numeric columns through untouched.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['fuel_type', 'make', 'drive_wheels']),
    (OrdinalEncoder(), ['aspiration']),
    remainder='passthrough',
)
linreg = LinearRegression()
# Bundle the encoding and the model so both travel together during CV.
pipe = make_pipeline(column_trans, linreg)
# 10-fold CV scored as negative MAE (closer to 0 is better).
cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error')
array([-4476.0937653 , -1014.54842052, -4227.68553953, -4936.79899194, -1591.8291911 , -3716.06617255, -4293.79197464, -1390.00486495, -1600.57946369, -2124.30041954])
# Get the average of the errors after 10 iterations
fold_scores = cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error')
fold_scores.mean().round(2)
-2937.17