First, create a small CSV file to serve as the example dataset.
import os
# Write a tiny example dataset to disk: four house records in CSV form,
# where the literal text "NA" marks a missing value.
os.makedirs(os.path.join('..', 'data'), exist_ok=True)  # no error if the directory already exists
data_file = os.path.join('..', 'data', 'house_tiny.csv')
with open(data_file, 'w') as f:
    # The write must be indented inside the `with` block so it runs while
    # the file is open; the file is closed automatically on exit.
    f.write('''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000''')
import pandas as pd
# Load the CSV into a DataFrame and display it; the "NA" fields show up as NaN.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)
   NumRooms RoofType   Price
0       NaN      NaN  127500
1       2.0      NaN  106000
2       4.0    Slate  178100
3       NaN      NaN  140000
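For categorical input fields, we can treat NaN as a category of its own.
Since the RoofType column takes the values Slate and NaN,
it is converted into two indicator columns, RoofType_Slate and RoofType_nan.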
# Split the table by column position: the first two columns are the input
# features, the third column is the target.
inputs = data.iloc[:, :2]
targets = data.iloc[:, 2]
# One-hot encode the categorical column; dummy_na=True adds an extra
# indicator column so that a missing value counts as its own category.
inputs = pd.get_dummies(data=inputs, dummy_na=True)
print(inputs)
   NumRooms  RoofType_Slate  RoofType_nan
0       NaN           False          True
1       2.0           False          True
2       4.0            True         False
3       NaN           False          True
For missing numerical values, replace the NaN entries with
the mean value of the corresponding column.
# Impute each remaining NaN with the mean of the column it belongs to.
column_means = inputs.mean()
inputs = inputs.fillna(column_means)
print(inputs)
   NumRooms  RoofType_Slate  RoofType_nan
0       3.0           False          True
1       2.0           False          True
2       4.0            True         False
3       3.0           False          True
Now that all the entries in inputs and targets are numerical,
we can load them into tensors.
import torch
# Convert the feature table and the target column to float NumPy arrays,
# then wrap each array in a tensor.
feature_array = inputs.to_numpy(dtype=float)
target_array = targets.to_numpy(dtype=float)
X = torch.tensor(feature_array)
y = torch.tensor(target_array)
X, y
(tensor([[3., 0., 1.],
         [2., 0., 1.],
         [4., 1., 0.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))