In [1]:

import pandas as pd
import numpy as np
import sklearn

In [2]:

cols = ['symboling', 
        'normalized-losses', 
        'make', 
        'fuel-type', 
        'aspiration', 
        'num-of-doors', 
        'body-style', 
        'drive-wheels', 
        'engine-location', 
        'wheel-base', 
        'length', 
        'width', 
        'height', 
        'curb-weight', 
        'engine-type', 
        'num-of-cylinders', 
        'engine-size', 
        'fuel-system', 
        'bore', 
        'stroke', 
        'compression-rate', 
        'horsepower', 
        'peak-rpm', 
        'city-mpg', 
        'highway-mpg', 
        'price']
cars = pd.read_csv('imports-85.data', names = cols)
print(cars.head())

   symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0          3                 ?  alfa-romero       gas        std          two   
1          3                 ?  alfa-romero       gas        std          two   
2          1                 ?  alfa-romero       gas        std          two   
3          2               164         audi       gas        std         four   
4          2               164         audi       gas        std         four   

    body-style drive-wheels engine-location  wheel-base  ...    engine-size  \
0  convertible          rwd           front        88.6  ...            130   
1  convertible          rwd           front        88.6  ...            130   
2    hatchback          rwd           front        94.5  ...            152   
3        sedan          fwd           front        99.8  ...            109   
4        sedan          4wd           front        99.4  ...            136   

   fuel-system  bore  stroke compression-rate horsepower  peak-rpm city-mpg  \
0         mpfi  3.47    2.68              9.0        111      5000       21   
1         mpfi  3.47    2.68              9.0        111      5000       21   
2         mpfi  2.68    3.47              9.0        154      5000       19   
3         mpfi  3.19    3.40             10.0        102      5500       24   
4         mpfi  3.19    3.40              8.0        115      5500       18   

  highway-mpg  price  
0          27  13495  
1          27  16500  
2          26  16500  
3          30  13950  
4          22  17450  

[5 rows x 26 columns]

In [3]:

continuous_cols = [ 
    'normalized-losses', 
    'wheel-base', 
    'length', 
    'width',     
    'height', 
    'curb-weight', 
    'engine-size', 
    'bore', 
    'stroke', 
    'compression-rate', 
    'horsepower', 
    'peak-rpm', 
    'city-mpg', 
    'highway-mpg', 
    'price']
numeric_cars = cars[continuous_cols]
numeric_cars = numeric_cars.replace('?', np.nan)
numeric_cars.head()

Out[3]:

	normalized-losses	wheel-base	length	width	height	curb-weight	engine-size	bore	stroke	compression-rate	horsepower	peak-rpm	city-mpg	highway-mpg	price
0	NaN	88.6	168.8	64.1	48.8	2548	130	3.47	2.68	9.0	111	5000	21	27	13495
1	NaN	88.6	168.8	64.1	48.8	2548	130	3.47	2.68	9.0	111	5000	21	27	16500
2	NaN	94.5	171.2	65.5	52.4	2823	152	2.68	3.47	9.0	154	5000	19	26	16500
3	164	99.8	176.6	66.2	54.3	2337	109	3.19	3.40	10.0	102	5500	24	30	13950
4	164	99.4	176.6	66.4	54.3	2824	136	3.19	3.40	8.0	115	5500	18	22	17450

In [4]:

numeric_cars.astype(float)
numeric_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
normalized-losses    164 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-size          205 non-null int64
bore                 201 non-null object
stroke               201 non-null object
compression-rate     205 non-null float64
horsepower           203 non-null object
peak-rpm             203 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                201 non-null object
dtypes: float64(5), int64(4), object(6)
memory usage: 24.1+ KB

In [5]:

print(numeric_cars['normalized-losses'].isna().sum(),' | ', 
      len(numeric_cars['normalized-losses']))

41  |  205

In [6]:

numeric_cars.isna().sum()

Out[6]:

normalized-losses    41
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [7]:

numeric_cars = numeric_cars.dropna(subset=['price'])

In [13]:

numeric_cars.isna().sum()

Out[13]:

normalized-losses    37
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [18]:

numeric_cars = numeric_cars.fillna(numeric_cars.mean())

In [19]:

numeric_cars.isna().sum()

Out[19]:

normalized-losses    37
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [ ]: