Logistic Regression + Regularization
Plotting the decision boundary is the trickiest part and still needs more study 🤦‍♀️😭 (note: the intercept column must be added exactly once in get_X, or the boundary coefficients come out wrong).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
from sklearn.metrics import classification_report
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
data.head()
data.describe()
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2))
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
           height=6,  # `size=` was renamed to `height=` in seaborn 0.9
           fit_reg=False,
           scatter_kws={"s": 50}
           )
plt.show()
def get_X(df):
    # prepend the intercept column of ones (exactly once)
    ones = pd.DataFrame({'ones': np.ones(len(df))})
    data = pd.concat([ones, df], axis=1)
    return data.iloc[:, :-1].values  # everything except the label column

def get_y(df):
    # the label is the last column
    return np.array(df.iloc[:, -1])

def normalize_feature(df):
    # z-score normalization, column by column
    return df.apply(lambda column: (column - column.mean()) / column.std())
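For reference, normalize_feature applies the standard z-score to each column (defined here for completeness; the exercise below never actually calls it):

$$x_{\text{norm}} = \frac{x - \mu}{\sigma}$$

where $\mu$ is the column mean and $\sigma$ the column standard deviation.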
X = get_X(data)
print(X.shape)
y = get_y(data)
print(y.shape)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
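The sigmoid (logistic) function maps any real input into $(0, 1)$:

$$g(z) = \frac{1}{1 + e^{-z}}$$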
fig, ax = plt.subplots(figsize=(8, 6))
z = np.arange(-10, 10, step=0.01)
ax.plot(z, sigmoid(z))
ax.set_ylim((-0.1, 1.1))
ax.set_xlabel('z', fontsize=18)
ax.set_ylabel('g(z)', fontsize=18)
ax.set_title('sigmoid function', fontsize=18)
plt.show()
theta = np.zeros(3)  # one parameter per column of X: intercept, exam1, exam2
theta
def cost(theta, X, y):
    # average cross-entropy loss over all m examples
    return np.mean(-y * np.log(sigmoid(X @ theta)) - (1 - y) * np.log(1 - sigmoid(X @ theta)))
cost(theta,X,y)
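This implements the (unregularized) logistic regression cost, with $h_\theta(x) = g(\theta^T x)$:

$$J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \left[ -y^{(i)} \log\left(h_\theta(x^{(i)})\right) - \left(1 - y^{(i)}\right) \log\left(1 - h_\theta(x^{(i)})\right) \right]$$

With $\theta = \mathbf{0}$, every prediction is $0.5$, so the initial cost is $-\log(0.5) \approx 0.693$.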
def gradient(theta, X, y):
    # vectorized batch gradient (no loop over examples)
    return (1 / len(X)) * X.T @ (sigmoid(X @ theta) - y)
gradient(theta,X,y)
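The gradient has the same form as in linear regression, only with the sigmoid hypothesis:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}, \qquad \nabla J(\theta) = \frac{1}{m} X^T \left( g(X\theta) - y \right)$$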
import scipy.optimize as opt
res = opt.minimize(fun=cost, x0=theta, args=(X, y), method='Newton-CG', jac=gradient)
print(res)
def predict(x, theta):
    # classify as positive when the predicted probability reaches 0.5
    prob = sigmoid(x @ theta)
    return (prob >= 0.5).astype(int)
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
print(res.x)
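Since $g(\theta^T x) \ge 0.5$ exactly when $\theta^T x \ge 0$, the decision boundary is the line $\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0$. Solving for $x_2$:

$$x_2 = -\frac{\theta_0}{\theta_2} - \frac{\theta_1}{\theta_2} x_1$$

which is why the coefficients below are obtained by dividing $\theta$ by $-\theta_2$.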
coef = -(res.x / res.x[2])
print(coef)
x_plot = np.arange(130, step=0.1)
y_plot = coef[0] + coef[1] * x_plot  # the boundary line x2 = coef[0] + coef[1] * x1
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 25}
           )
plt.plot(x_plot, y_plot, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
df.head()
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 50}
           )
plt.title('Regularized Logistic Regression')
plt.show()
def feature_mapping(x, y, power, as_ndarray=False):
    # build every polynomial term x^(i-p) * y^p for 0 <= p <= i <= power
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)
            for p in np.arange(i + 1)
            }
    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
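With power=6 this maps the two raw scores into all monomials up to degree 6, 28 features in total:

$$\text{mapFeature}(x_1, x_2) = \left[\, 1,\; x_1,\; x_2,\; x_1^2,\; x_1 x_2,\; x_2^2,\; x_1^3,\; \ldots,\; x_1 x_2^5,\; x_2^6 \,\right]$$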
x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1, x2, power=6)
print(data.shape)
data.head()
theta = np.zeros(data.shape[1])
X = feature_mapping(x1, x2, power=6, as_ndarray=True)
print(X.shape)
y = get_y(df)
print(y.shape)
def regularized_cost(theta, X, y, l=1):
    # theta_0 (the intercept) is not regularized
    theta_j1_to_n = theta[1:]
    regularized_term = (l / (2 * len(X))) * np.power(theta_j1_to_n, 2).sum()
    return cost(theta, X, y) + regularized_term
regularized_cost(theta, X, y, l=1)
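This adds the standard L2 penalty to the cost, skipping $\theta_0$:

$$J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \left[ -y^{(i)} \log\left(h_\theta(x^{(i)})\right) - \left(1 - y^{(i)}\right) \log\left(1 - h_\theta(x^{(i)})\right) \right] + \frac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2$$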
def regularized_gradient(theta, X, y, l=1):
    # pad with a leading zero so the intercept gradient is left unregularized
    theta_j1_to_n = theta[1:]
    regularized_theta = (l / len(X)) * theta_j1_to_n
    regularized_term = np.concatenate([np.array([0]), regularized_theta])
    return gradient(theta, X, y) + regularized_term
regularized_gradient(theta, X, y)
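Correspondingly, the regularized gradient leaves the intercept term untouched:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)} + \begin{cases} 0 & j = 0 \\ \dfrac{\lambda}{m} \theta_j & j \ge 1 \end{cases}$$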
print('init cost = {}'.format(regularized_cost(theta, X, y)))
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)
res
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
def draw_boundary(power, l):
    density = 1000
    threshold = 2 * 10**-3
    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshold)
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})
    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
    plt.show()
def feature_mapped_logistic_regression(power, l):
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    x1 = np.array(df.test1)
    x2 = np.array(df.test2)
    y = get_y(df)
    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])
    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=regularized_gradient)
    final_theta = res.x
    return final_theta
def find_decision_boundary(density, power, theta, threshold):
    # evaluate theta^T x on a dense grid and keep the points where it is nearly zero
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)
    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)
    inner_product = mapped_cord.values @ theta
    decision = mapped_cord[np.abs(inner_product) < threshold]
    return decision.f10, decision.f01
draw_boundary(power=6, l=1)
draw_boundary(power=6,l=0)
draw_boundary(power=6, l=100)
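Thresholding $|\theta^T x|$ works but needs a very dense grid to look smooth. A cleaner alternative (a minimal sketch, not part of the original exercise, reusing feature_mapping and feature_mapped_logistic_regression from above) is to let matplotlib trace the zero level set of $\theta^T x$ with plt.contour:

final_theta = feature_mapped_logistic_regression(power=6, l=1)
t = np.linspace(-1, 1.5, 250)
xx, yy = np.meshgrid(t, t)
# evaluate theta^T x over the grid, then draw its zero contour
z = feature_mapping(xx.ravel(), yy.ravel(), power=6, as_ndarray=True) @ final_theta
sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 50})
plt.contour(xx, yy, z.reshape(xx.shape), levels=[0], colors='grey')
plt.title('Decision boundary via contour (lambda=1)')
plt.show()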