import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset; the relative path is resolved against the current
# working directory of the process running this script.
data = pd.read_csv('../data_test/Salary_Data2.csv')
x, y = data['YearsExperience'], data['Salary']
# Scatter plot of the raw observations.
plt.scatter(x, y, s=50, color='dodgerblue')
# Linear regression
import sklearn.linear_model as lm
# scikit-learn estimators expect a 2-D feature matrix, so wrap the feature
# column in a DataFrame; the target is used as-is.
train_x = pd.DataFrame(x)
train_y = y

# Ordinary linear regression: fit, then predict on the training inputs.
model = lm.LinearRegression()
model.fit(train_x, train_y)
pred_y = model.predict(train_x)

# Ridge regression with regularization strength alpha=100, fitted and
# evaluated on the same inputs for comparison.
model_ridge = lm.Ridge(alpha=100)
model_ridge.fit(train_x, train_y)
pred_y_ridge = model_ridge.predict(train_x)

# Visualize both regression lines on top of the data points.
plt.scatter(x, y, s=50, color='dodgerblue')
plt.plot(x, pred_y, color='orangered', label='LR')
plt.plot(x, pred_y_ridge, color='red', label='ridge')
plt.legend()
import sklearn.metrics as sm
### Search for the best regularization strength (alpha) for ridge regression.
# Hold out every 4th row among the first 30 as a test set. Both frames are
# sliced with `.iloc` so features and targets are selected positionally and
# stay aligned with each other.
test_x = train_x.iloc[:30:4]
test_y = train_y.iloc[:30:4]
# Fit only on the rows that are NOT in the test set. Scoring a model on rows
# it was trained on is biased towards the weakest regularization, which makes
# the comparison between alphas meaningless.
# Assumes index labels are unique (e.g. the default index from read_csv);
# with duplicate labels, `drop` would remove every matching row.
fit_x = train_x.drop(index=test_x.index)
fit_y = train_y.drop(index=test_y.index)
# Candidate alphas: 90, 95, 100, 105, 110.
params = np.arange(90, 111, 5)
for param in params:
    # NOTE: this rebinds the module-level name `model`, as the original loop did.
    model = lm.Ridge(alpha=param)
    model.fit(fit_x, fit_y)
    pred_test_y = model.predict(test_x)
    # R^2 on the held-out rows; higher is better.
    print(param, ':', sm.r2_score(test_y, pred_test_y))