导读
这篇文章我们介绍如何使用paddlepaddle来预测UCI房价。UCI房价数据集一共包含了506条数据,每条数据包含13个特征和该类房价的中位数。
paddlepaddle预测房价
import os,paddle
import numpy as np
# Number of samples per mini-batch for both loaders.
BATCH_SIZE = 32

# UCI housing dataset splits; each item is a (features, label) pair of float32 arrays.
train_datasets = paddle.text.datasets.UCIHousing(mode="train")
val_datasets = paddle.text.datasets.UCIHousing(mode="test")

# Training batches are reshuffled every epoch; the incomplete last batch is dropped.
train_dataloader = paddle.io.DataLoader(
    train_datasets, batch_size=BATCH_SIZE, shuffle=True, drop_last=True
)
# Validation must score every sample in a fixed order, otherwise the loss is
# computed on a different random subset each epoch and epochs are not comparable.
val_dataloader = paddle.io.DataLoader(
    val_datasets, batch_size=BATCH_SIZE, shuffle=False, drop_last=False
)

# Show one raw sample to illustrate the data layout.
print(train_datasets[0])
"""
(array([-0.0405441 , 0.06636363, -0.32356226, -0.06916996, -0.03435197,
0.05563625, -0.03475696, 0.02682186, -0.37171334, -0.21419305,
-0.33569506, 0.10143217, -0.21172912], dtype=float32),
array([24.], dtype=float32))
"""
- 定义预测网络结构
这里我们使用一个全连接层来构建房价的预测网络,即全连接层的输入是13个节点,输出是1个节点,实现的数学公式如下:
$$z = \omega_1 x_1 + \omega_2 x_2 + \cdots + \omega_{13} x_{13} + b$$
# Single fully connected layer: 13 input features -> 1 predicted price.
model = paddle.nn.Linear(in_features=13, out_features=1)
- 定义损失函数
对于回归问题,我们通常都是采用平方误差函数作为损失函数
# Illustrative call only: `predict` and `true_label` are placeholder names that
# are not defined at this point in the file. The training and evaluation code
# below applies the same function to real batches and averages it with paddle.mean.
paddle.nn.functional.square_error_cost(predict,true_label)
# Plain SGD over the parameters of the linear network. The network is bound to
# the name `model` in this file (there is no `net`), so that is what must be
# passed here.
optimizer = paddle.optimizer.SGD(learning_rate=0.001, parameters=model.parameters())
import matplotlib.pyplot as plt
def draw_loss_change(epoch_list, loss_list, title):
    """Plot a loss curve against the epoch number and display it.

    Args:
        epoch_list: x-axis values (epoch numbers).
        loss_list: y-axis values (loss per epoch), same length as ``epoch_list``.
        title: Figure title.
    """
    font_size = 24
    plt.plot(epoch_list, loss_list, color="red", label="loss")
    plt.title(title, fontsize=font_size)
    plt.xlabel("epoch", fontsize=font_size)
    plt.ylabel("loss", fontsize=font_size)
    plt.grid()
    plt.show()
def evaluate(model):
    """Return the average per-batch squared-error loss of ``model`` on ``val_dataloader``.

    Each batch from the module-level ``val_dataloader`` is passed through
    ``model``; the mean squared error of the batch is accumulated and the
    result is the mean over all batches.

    Args:
        model: Callable network mapping a batch of features to predictions.

    Returns:
        The mean of the per-batch losses as a Python float.

    Raises:
        ZeroDivisionError: If ``val_dataloader`` yields no batches.
    """
    val_total_loss = 0.0
    val_batch_num = 0
    # Evaluation never calls backward(), so skip autograd bookkeeping.
    with paddle.no_grad():
        for batch_data in val_dataloader:
            batch_input = paddle.to_tensor(batch_data[0])
            batch_label = paddle.to_tensor(batch_data[1])
            batch_output = model(batch_input)
            batch_loss = paddle.mean(
                paddle.nn.functional.square_error_cost(batch_output, batch_label)
            )
            # float() handles both shape-[1] and 0-D loss tensors; `.numpy()[0]`
            # raises on the 0-D tensors returned by newer Paddle releases.
            val_total_loss += float(batch_loss)
            val_batch_num += 1
    return val_total_loss / val_batch_num
def train():
    """Train the module-level ``model`` and checkpoint the best validation epoch.

    Runs 100 epochs over ``train_dataloader``. For every batch the mean
    squared error is back-propagated and ``optimizer`` takes one step. After
    each epoch the model is scored with ``evaluate``; whenever the validation
    loss improves on the best value seen so far, the weights are saved to
    ``best_fit.pdparams``. Per-epoch losses are printed, and the training loss
    curve is plotted once training finishes.
    """
    epoch_list = []
    train_loss_list = []
    # Start at +inf so the first epoch always produces a checkpoint, even when
    # the initial validation loss is very large.
    best_val_loss = float("inf")
    EPOCH_NUM = 100
    for epoch in range(EPOCH_NUM):
        epoch_total_loss = 0.0
        epoch_batch_num = 0
        for batch_data in train_dataloader:
            batch_input = paddle.to_tensor(batch_data[0])
            batch_label = paddle.to_tensor(batch_data[1])
            batch_output = model(batch_input)
            batch_loss = paddle.mean(
                paddle.nn.functional.square_error_cost(batch_output, batch_label)
            )
            batch_loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # float() handles both shape-[1] and 0-D loss tensors; `.numpy()[0]`
            # raises on the 0-D tensors returned by newer Paddle releases.
            epoch_total_loss += float(batch_loss)
            epoch_batch_num += 1
        epoch_loss = epoch_total_loss / epoch_batch_num
        epoch_list.append(epoch + 1)
        train_loss_list.append(epoch_loss)
        val_loss = evaluate(model)
        if val_loss < best_val_loss:
            # Record the new best so later, worse epochs cannot overwrite the
            # checkpoint.
            best_val_loss = val_loss
            paddle.save(model.state_dict(), "best_fit.pdparams")
        print("epoch num:{},train loss:{:.2f},val loss:{:.2f}".format(epoch+1,epoch_loss,val_loss))
    draw_loss_change(epoch_list, train_loss_list, title="train loss")
def draw_predict_grounds(ground_truths, predict_results, title="Boston"):
    """Scatter predictions against ground truth, with a y = x reference line.

    Points on the red diagonal are perfect predictions. The diagonal is drawn
    for values 1 through 29.

    Args:
        ground_truths: True target values (x-axis).
        predict_results: Predicted values (y-axis), same length as ``ground_truths``.
        title: Figure title.
    """
    diagonal = np.arange(1, 30)
    axis_font_size = 14

    plt.title(title, fontsize=24)
    plt.plot(diagonal, diagonal, color="red")
    plt.xlabel("ground truth", fontsize=axis_font_size)
    plt.ylabel("predict result", fontsize=axis_font_size)
    plt.scatter(
        ground_truths,
        predict_results,
        color="green",
        label="ground-predict cost",
    )
    plt.grid()
    plt.show()
import paddle
# Load the test split; a batch size of 200 is used for inference.
valid_dataset = paddle.text.UCIHousing(mode="test")
valid_data_loader = paddle.io.DataLoader(valid_dataset, batch_size=200)

# Rebuild the same 13 -> 1 linear network and restore the saved weights.
model = paddle.nn.Linear(13, 1)
param = paddle.load("best_fit.pdparams")
model.set_dict(param)

# Collect per-sample predictions and labels across all batches.
predict_results = []
groud_truths = []
for features, labels in valid_data_loader:
    predict_results.extend(model(paddle.to_tensor(features)).numpy())
    groud_truths.extend(labels.numpy())

# Drop the trailing size-1 axis so both are flat arrays for plotting.
predict_results = np.squeeze(np.array(predict_results))
groud_truths = np.squeeze(np.array(groud_truths))
draw_predict_grounds(groud_truths, predict_results)
|