Linear regression model expression: y_hat = theta_0 + theta_1*x_1 + theta_2*x_2 + ... + theta_m*x_m, i.e. y_hat = x·theta once a constant 1 is prepended to x for the bias term.
- Here theta.shape = (1, m), where m is the number of features (in the code below, theta is stored as a column vector of shape (m+1, 1), with the intercept folded in as theta_0).
- Differentiating the loss directly gives a closed-form optimum for theta (the normal equation, theta = (X^T X)^(-1) X^T y), but this runs into matrix inversion, and not every matrix is invertible; see the sketch after this list.
- Because matrix inversion is not always possible, gradient descent is used instead to approach the optimum step by step. It comes in three variants: BGD, SGD and MBGD.
- BGD (batch gradient descent) uses the full dataset for every update, which is computationally heavy and slow. SGD (stochastic gradient descent) uses a single sample per update, which is cheap and fast, but it may oscillate around the minimum without converging. MBGD (mini-batch gradient descent) uses a subset of the data (batch_size) per update; each step is not much slower than SGD, and updating on a whole mini-batch greatly reduces the number of iterations needed to converge.
- batch_size is chosen to suit the GPU, typically a multiple of 32.
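The closed-form route can be sketched in a few lines. This is a minimal illustration rather than code from the post; the helper name `normal_equation` is made up, and `np.linalg.pinv` (the Moore-Penrose pseudo-inverse) is used precisely because it is defined even when X^T X is singular:

```python
import numpy as np

def normal_equation(X, y):
    """Closed-form fit: theta = pinv(X^T X) @ X^T @ y (illustrative helper)."""
    X = np.hstack((np.ones((len(X), 1)), X))  # prepend a bias column, as fit() does below
    # The pseudo-inverse sidesteps the "not every matrix is invertible" problem
    return np.linalg.pinv(X.T @ X) @ X.T @ y
```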
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

np.random.seed(4)


class LR:
    def __init__(self, iter_n=30, learn_rate=0.001, batch_size=2):
        self.iter_n = iter_n          # maximum number of epochs
        self.learn_rate = learn_rate  # learning rate (step size)
        self.batch_size = batch_size  # mini-batch size for MBGD

    def model(self, x, theta):
        # Linear model: y_hat = x . theta (x already carries a bias column)
        return np.dot(x, theta)

    def mse(self, y, y_hat):
        return np.mean((y - y_hat) ** 2)

    def cal_gred(self, y, y_hat, x):
        # Per-feature gradient; entry j is sum((y - y_hat) * x_j),
        # i.e. the j-th component of x.T @ (y - y_hat)
        gred = np.zeros((x.shape[-1], 1))
        for index in range(len(gred)):
            gred[index, 0] = np.sum((y - y_hat) * x[:, index:index + 1])
        return gred

    def shuffle(self, x, y, y_hat):
        # Shuffle x, y and y_hat together so the rows stay aligned
        all_data = np.hstack((x, y, y_hat))
        np.random.shuffle(all_data)
        n = x.shape[-1]
        return all_data[:, :n], all_data[:, n:n + 1], all_data[:, -1:]

    def draw_loss(self, count, loss):
        plt.plot(range(count + 1), loss, color='r', marker='o')
        plt.show()

    def predict(self, x):
        x = np.array([1] + list(x))  # prepend the bias term
        return self.model(x, self.theta)

    def fit(self, x, y):
        x = np.hstack((np.ones((len(x), 1)), x))  # bias column of ones
        theta = np.zeros((x.shape[-1], 1))
        y_hat = self.model(x, theta)
        loss = [self.mse(y, y_hat)]
        count = 0
        while True:
            # one epoch of mini-batch updates
            for n in range(len(x) // self.batch_size):
                batch = slice(n * self.batch_size, (n + 1) * self.batch_size)
                gred = self.cal_gred(y[batch], y_hat[batch], x[batch])
                theta = theta + self.learn_rate * gred  # gradient step
                y_hat = self.model(x, theta)
            loss.append(self.mse(y, y_hat))
            count += 1
            if count >= self.iter_n:        # epoch budget exhausted
                break
            if loss[-2] - loss[-1] < 1e-5:  # improvement too small (or loss rose): stop
                break
            x, y, y_hat = self.shuffle(x, y, y_hat)  # reshuffle between epochs
        self.draw_loss(count, loss)
        self.theta = theta
```
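As an aside, the per-feature loop in cal_gred computes the matrix product x.T @ (y - y_hat) one entry at a time. Assuming y and y_hat have shape (batch, 1) and x has shape (batch, features), a vectorized drop-in replacement would be:

```python
def cal_gred(self, y, y_hat, x):
    # Single matrix product instead of the per-feature loop
    return np.dot(x.T, y - y_hat)
```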
```python
if __name__ == '__main__':
    # sheet_name is the sheet title in the original workbook ("training data")
    train_data = pd.read_excel('lr.xlsx', sheet_name='训练数据')
    train_data_x = np.array(train_data.iloc[:, :-1])
    train_data_y = np.array(train_data.iloc[:, -1:])
    print('------ Hand-written version: results of the hand-written linear regression ------')
    lr1 = LR(iter_n=30, learn_rate=0.001, batch_size=4)
    lr1.fit(train_data_x, train_data_y)
    print('Prediction:', lr1.predict([2, 4, 3]))
    print('Intercept + weights:', lr1.theta.T)
    print("------ Library version: results of sklearn's linear regression ------")
    lr2 = linear_model.LinearRegression()
    lr2.fit(train_data_x, train_data_y)
    print('Prediction:', lr2.predict(np.array([[2, 4, 3]])))
    print('Weights:', lr2.coef_)
    print('Intercept:', lr2.intercept_)
```
Note: the data in lr.xlsx was made up on the spot and has no reference value.
| x1 | x2 | x3 | y   |
|----|----|----|-----|
| 2  | 4  | 3  | 1.3 |
| 3  | 4  | 3  | 2.3 |
| 4  | 5  | 6  | 3.4 |
| 5  | 5  | 0  | 2.1 |
| 6  | 4  | 3  | 1   |
| 7  | 4  | 7  | 5   |
| 8  | 3  | 7  | 4.5 |
| 9  | 4  | 7  | 3   |
| 10 | 7  | 7  | 4   |
| 11 | 7  | 6  | 7   |
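Since the workbook values are arbitrary, a quick sanity check on synthetic data is more telling. The sketch below is not from the original post: the true weights, noise level, and hyperparameters are all made-up assumptions, chosen so that the LR class above should recover roughly intercept 0.5 and weights [1, 2, 3]:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(200, 3))                     # 200 samples, 3 features
true_w = np.array([[1.0], [2.0], [3.0]])
y = X @ true_w + 0.5 + rng.normal(0, 0.1, size=(200, 1))  # intercept 0.5, small noise

lr = LR(iter_n=200, learn_rate=0.0001, batch_size=32)
lr.fit(X, y)                                         # also pops up the loss curve
print('Recovered intercept + weights:', lr.theta.T)  # expect roughly [0.5, 1, 2, 3]
```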