一元线性回归模型
一元线性回归,顾名思义,自变量的个数为1个,因变量的个数也是1个。需要我们根据训练数据学得一个线性模型,以尽可能准确地预测实值输出。
$$f(x) = wx + b$$
那么我们如何去确定w和b的值呢?根据衡量预测值与实际值之间的区别,我们可以令均方误差最小化。基于这种均方误差最小化来进行模型求解的方法称为最小二乘法。
对目标函数分别对w和b求导,并令导数为0,即可解出w和b的值。也可以使用梯度下降法,求出w和b。案例1为使用梯度下降法求解,案例2(鸢尾花)和案例3(波士顿房价)为使用sklearn封装好的函数去拟合。
import matplotlib.pyplot as plt
# Case 1: fit y = w*x + b by batch gradient descent on the squared-error loss.
x = [14.1, 15.1, 16.1, 15.3, 14.2, 14.5, 14.6, 14.8, 16.3, 16.6, 15.6, 16.7, 14.7, 16.2, 16.7, 14.8, 14.9, 15.9, 16.3, 17.0, 15.0, 16.3, 17.1, 16.0, 17.2, 17.4, 15.5, 16.8, 17.4, 16.9]
y = [15.8, 16.3, 14.5, 16.6, 16.7, 15.0, 14.9, 15.2, 15.3, 16.6, 17.0, 16.4, 16.5, 15.4, 15.8, 14.8, 16.0, 17.3, 15.7, 14.4, 15.8, 16.7, 14.9, 16.9, 15.5, 16.6, 17.7, 18.0, 15.9, 18.0]
# First 20 points are the training set, the last 10 the test set.
x_train = x[:20]
y_train = y[:20]
x_test = x[20:]
y_test = y[20:]
w_start = 1.0    # initial slope
b_start = -1.0   # initial intercept
learn = 0.0001   # learning rate
times = 10000    # number of gradient-descent iterations
for _ in range(times):
    # Gradients of sum((y - (w*x + b))**2) w.r.t. w and b, without the factor
    # of 2 — that factor is applied in the update step below.
    # (The original code reused `i` for both the outer and inner loops,
    # shadowing the iteration counter.)
    grad_w = sum((yi - w_start * xi - b_start) * (-xi)
                 for xi, yi in zip(x_train, y_train))
    grad_b = sum((yi - w_start * xi - b_start) * (-1)
                 for xi, yi in zip(x_train, y_train))
    w_start -= 2 * learn * grad_w
    b_start -= 2 * learn * grad_b
plt.figure(figsize=(20, 8))
plt.scatter(x_train, y_train, c='r')
# Draw the fitted line over the span of the training data (roughly 14-17)
# instead of the original hard-coded range(0, 20), which placed most of the
# line far outside the data. A straight line needs only its two endpoints.
line_x = [min(x_train), max(x_train)]
plt.plot(line_x, [w_start * xi + b_start for xi in line_x])
plt.show()
# Report the sum of squared errors on the training and test sets.
total_train_loss = sum((yi - (w_start * xi + b_start)) ** 2
                       for xi, yi in zip(x_train, y_train))
print("训练集的误差:" + str(total_train_loss))
total_test_loss = sum((yi - (w_start * xi + b_start)) ** 2
                      for xi, yi in zip(x_test, y_test))
print("测试集的误差:" + str(total_test_loss))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
# Case 2: predict petal width from petal length on the iris data with
# sklearn's LinearRegression.
np.set_printoptions(precision=2)  # print array floats with 2 decimals
iris = load_iris()
x = iris.data[:, 2].reshape(-1, 1)  # petal length as the single feature (column vector)
y = iris.data[:, 3]                 # petal width as the target
lr = LinearRegression()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
lr.fit(x_train, y_train)
print('权重:', lr.coef_)
print('截距:', lr.intercept_)
y_hat = lr.predict(x_test)
print('实际值:', y_test[:45])
print('预测值:', y_hat[:45])
# Sum of squared errors on the test set. (Renamed from `sum`, which
# shadowed the builtin.)
sse = sum((p - t) ** 2 for p, t in zip(y_hat, y_test))
print('误差平方和为:', sse)
# Configure fonts before drawing so the Chinese labels render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.scatter(x_train, y_train, c='r', label='训练集')
plt.scatter(x_test, y_test, c='g', label='测试集')
plt.plot(x, x * lr.coef_ + lr.intercept_, c='orange')
plt.xlabel('花瓣长度')
plt.ylabel('花瓣宽度')
# legend() must come after the labelled artists are plotted; the original
# called it first, which produced no legend.
plt.legend()
plt.show()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np;
# Case 3: multiple linear regression on the Boston housing data.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# installed version still provides it.
boston = load_boston()
x = boston.data    # feature matrix
y = boston.target  # median house value (MEDV)
# Preview the data as a DataFrame: the features plus the target column.
df = pd.DataFrame(np.concatenate([x, y.reshape(-1, 1)], axis=1),
                  columns=boston.feature_names.tolist() + ['MEDV'])
print(df.head())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
lr = LinearRegression()
lr.fit(x_train, y_train)
y_hat = lr.predict(x_test)
# Sum of squared errors on the test set. (Renamed from `sum`, which
# shadowed the builtin.)
sse = sum((p - t) ** 2 for p, t in zip(y_hat, y_test))
print('误差平方和为:', sse)
# Configure fonts before drawing so the Chinese labels render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.plot(y_test, c='r', label='实际值')
plt.plot(y_hat, c='b', label='预测值')
# legend() must come after the labelled artists are plotted; the original
# called it first, which produced no legend.
plt.legend()
plt.show()