import pandas as pd
import numpy as np
import sklearn
# Labels for the 26 columns of 'imports-85.data'. Because `names=` is supplied,
# pandas reads the file's first line as data rather than as a header row.
cols = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]

# Load the raw table and show its first five rows as a sanity check.
cars = pd.read_csv('imports-85.data', header=None, names=cols)
print(cars.head(5))
symboling normalized-losses make fuel-type aspiration num-of-doors \ 0 3 ? alfa-romero gas std two 1 3 ? alfa-romero gas std two 2 1 ? alfa-romero gas std two 3 2 164 audi gas std four 4 2 164 audi gas std four body-style drive-wheels engine-location wheel-base ... engine-size \ 0 convertible rwd front 88.6 ... 130 1 convertible rwd front 88.6 ... 130 2 hatchback rwd front 94.5 ... 152 3 sedan fwd front 99.8 ... 109 4 sedan 4wd front 99.4 ... 136 fuel-system bore stroke compression-rate horsepower peak-rpm city-mpg \ 0 mpfi 3.47 2.68 9.0 111 5000 21 1 mpfi 3.47 2.68 9.0 111 5000 21 2 mpfi 2.68 3.47 9.0 154 5000 19 3 mpfi 3.19 3.40 10.0 102 5500 24 4 mpfi 3.19 3.40 8.0 115 5500 18 highway-mpg price 0 27 13495 1 27 16500 2 26 16500 3 30 13950 4 22 17450 [5 rows x 26 columns]
# Subset of columns kept for the numeric frame (per the variable name, the
# continuous-valued ones).
continuous_cols = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

# Select those columns and turn every '?' placeholder into NaN in one chain.
numeric_cars = cars[continuous_cols].replace('?', np.nan)
numeric_cars.head()
| | normalized-losses | wheel-base | length | width | height | curb-weight | engine-size | bore | stroke | compression-rate | horsepower | peak-rpm | city-mpg | highway-mpg | price |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 130 | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
1 | NaN | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 130 | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
2 | NaN | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | 152 | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
3 | 164 | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | 109 | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
4 | 164 | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | 136 | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
# DataFrame.astype returns a new frame rather than converting in place, so the
# result must be assigned back. Without the assignment the columns that held
# '?' stay `object` dtype, and the mean-based fillna further down cannot
# impute them.
numeric_cars = numeric_cars.astype(float)
numeric_cars.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 205 entries, 0 to 204 Data columns (total 15 columns): normalized-losses 164 non-null object wheel-base 205 non-null float64 length 205 non-null float64 width 205 non-null float64 height 205 non-null float64 curb-weight 205 non-null int64 engine-size 205 non-null int64 bore 201 non-null object stroke 201 non-null object compression-rate 205 non-null float64 horsepower 203 non-null object peak-rpm 203 non-null object city-mpg 205 non-null int64 highway-mpg 205 non-null int64 price 201 non-null object dtypes: float64(5), int64(4), object(6) memory usage: 24.1+ KB
# Missing 'normalized-losses' entries shown next to the total row count.
print(
    numeric_cars['normalized-losses'].isna().sum(),
    ' | ',
    len(numeric_cars),
)
41 | 205
# Per-column count of missing values.
numeric_cars.isnull().sum()
normalized-losses 41 wheel-base 0 length 0 width 0 height 0 curb-weight 0 engine-size 0 bore 4 stroke 4 compression-rate 0 horsepower 2 peak-rpm 2 city-mpg 0 highway-mpg 0 price 4 dtype: int64
# Rows with no 'price' value are discarded outright; the per-column missing
# counts are then recomputed to confirm 'price' has none left.
numeric_cars = numeric_cars.dropna(axis='index', how='any', subset=['price'])
numeric_cars.isnull().sum()
normalized-losses 37 wheel-base 0 length 0 width 0 height 0 curb-weight 0 engine-size 0 bore 4 stroke 4 compression-rate 0 horsepower 2 peak-rpm 2 city-mpg 0 highway-mpg 0 price 0 dtype: int64
# Fill each remaining gap with the mean of its own column, then recount.
# NOTE(review): only columns with a numeric dtype contribute a mean here; any
# column still stored as `object` at this point is not imputed.
numeric_cars = numeric_cars.fillna(value=numeric_cars.mean())
numeric_cars.isnull().sum()
normalized-losses 37 wheel-base 0 length 0 width 0 height 0 curb-weight 0 engine-size 0 bore 4 stroke 4 compression-rate 0 horsepower 2 peak-rpm 2 city-mpg 0 highway-mpg 0 price 0 dtype: int64