数据预处理
import os
os.makedirs(os.path.join( 'data'), exist_ok=True)
data_file = os.path.join( 'data', 'house_tiny.csv')
with open(data_file, 'w') as f:
f.write('NumRooms,Alley,Price\n')
f.write('NA,Pave,127500\n')
f.write('2,NA,106000\n')
f.write('4,NA,178100\n')
f.write('NA,NA,140000\n')
import pandas as pd
data = pd.read_csv(data_file)
data
| NumRooms | Alley | Price |
---|
0 | NaN | Pave | 127500 |
---|
1 | 2.0 | NaN | 106000 |
---|
2 | 4.0 | NaN | 178100 |
---|
3 | NaN | NaN | 140000 |
---|
处理缺失值
inputs,outputs=data.iloc[:,0:2],data.iloc[:,2]
inputs=inputs.fillna(inputs.mean())
print(inputs)
NumRooms Alley
0 3.0 Pave
1 2.0 NaN
2 4.0 NaN
3 3.0 NaN
C:\Users\lucifer\AppData\Local\Temp/ipykernel_17604/2223495382.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
inputs=inputs.fillna(inputs.mean())
inputs = pd.get_dummies(inputs,dummy_na=True)
print(inputs)
NumRooms Alley_Pave Alley_nan
0 3.0 1 0
1 2.0 0 1
2 4.0 0 1
3 3.0 0 1
转换为张量
import torch
X, y = torch.tensor(inputs.values), torch.tensor(outputs.values)
X, y
(tensor([[3., 1., 0.],
[2., 0., 1.],
[4., 0., 1.],
[3., 0., 1.]], dtype=torch.float64),
tensor([127500, 106000, 178100, 140000]))
|