$$
P(y \mid x;\theta) =
\begin{cases}
h_{\theta}(x), & y = 1 \\
1 - h_{\theta}(x), & y = 0
\end{cases}
$$
$$
P(y \mid x;\theta) = \left(h_{\theta}(x)\right)^{y}\left(1 - h_{\theta}(x)\right)^{1-y}
$$
$$
L(\theta) = \prod_{i=1}^{n} P\left(y^{(i)} \mid x^{(i)};\theta\right)
$$
$$
L(\theta) = \prod_{i=1}^{n} \left(h_{\theta}(x^{(i)})\right)^{y^{(i)}}\left(1 - h_{\theta}(x^{(i)})\right)^{1-y^{(i)}}
$$
$$
l(\theta) = \ln L(\theta) = \ln\left(\prod_{i=1}^{n} \left(h_{\theta}(x^{(i)})\right)^{y^{(i)}}\left(1 - h_{\theta}(x^{(i)})\right)^{1-y^{(i)}}\right)
$$
$$
l(\theta) = \ln L(\theta) = \sum_{i=1}^{n}\left(y^{(i)}\ln\left(h_{\theta}(x^{(i)})\right) + \left(1-y^{(i)}\right)\ln\left(1-h_{\theta}(x^{(i)})\right)\right)
$$
损失函数
$$
J(\theta) = -l(\theta) = -\sum_{i=1}^{n}\left[y^{(i)}\ln\left(h_{\theta}(x^{(i)})\right) + \left(1-y^{(i)}\right)\ln\left(1-h_{\theta}(x^{(i)})\right)\right]
$$
梯度下降
$$
\theta_j^{t+1} = \theta_j^{t} - \alpha\,\frac{\partial}{\partial\theta_j}J(\theta)
$$
$$
h_{\theta}(x) = g(\theta^{T}x) = g(z) = \frac{1}{1 + e^{-z}}
$$
$$
\begin{aligned}
g'(z) &= \frac{d}{dz}\,\frac{1}{1 + e^{-z}} \\
&= \frac{e^{-z}}{(1 + e^{-z})^{2}} \\
&= \frac{1}{(1 + e^{-z})^{2}} \cdot e^{-z} \\
&= \frac{1}{1 + e^{-z}} \cdot \left(1 - \frac{1}{1 + e^{-z}}\right) \\
&= g(z)\cdot\left(1 - g(z)\right)
\end{aligned}
$$
$$
J(\theta) = -\sum_{i=1}^{n}\left(y^{(i)}\ln\left(h_{\theta}(x^{(i)})\right) + \left(1-y^{(i)}\right)\ln\left(1-h_{\theta}(x^{(i)})\right)\right)
$$
$$
\begin{aligned}
\frac{\partial}{\partial\theta_j}J(\theta)
&= -\sum_{i=1}^{n}\left(y^{(i)}\frac{1}{h_{\theta}(x^{(i)})}\frac{\partial}{\partial\theta_j}h_{\theta}(x^{(i)}) + \left(1-y^{(i)}\right)\frac{1}{1-h_{\theta}(x^{(i)})}\frac{\partial}{\partial\theta_j}\left(1-h_{\theta}(x^{(i)})\right)\right) \\
&= -\sum_{i=1}^{n}\left(y^{(i)}\frac{1}{h_{\theta}(x^{(i)})}\frac{\partial}{\partial\theta_j}h_{\theta}(x^{(i)}) - \left(1-y^{(i)}\right)\frac{1}{1-h_{\theta}(x^{(i)})}\frac{\partial}{\partial\theta_j}h_{\theta}(x^{(i)})\right) \\
&= -\sum_{i=1}^{n}\left(y^{(i)}\frac{1}{h_{\theta}(x^{(i)})} - \left(1-y^{(i)}\right)\frac{1}{1-h_{\theta}(x^{(i)})}\right)\frac{\partial}{\partial\theta_j}h_{\theta}(x^{(i)}) \\
&= -\sum_{i=1}^{n}\left(y^{(i)}\frac{1}{h_{\theta}(x^{(i)})} - \left(1-y^{(i)}\right)\frac{1}{1-h_{\theta}(x^{(i)})}\right)h_{\theta}(x^{(i)})\left(1-h_{\theta}(x^{(i)})\right)\frac{\partial}{\partial\theta_j}\theta^{T}x^{(i)} \\
&= -\sum_{i=1}^{n}\left(y^{(i)}\left(1-h_{\theta}(x^{(i)})\right) - \left(1-y^{(i)}\right)h_{\theta}(x^{(i)})\right)\frac{\partial}{\partial\theta_j}\theta^{T}x^{(i)} \\
&= -\sum_{i=1}^{n}\left(y^{(i)} - h_{\theta}(x^{(i)})\right)\frac{\partial}{\partial\theta_j}\theta^{T}x^{(i)} \\
&= \sum_{i=1}^{n}\left(h_{\theta}(x^{(i)}) - y^{(i)}\right)x_j^{(i)}
\end{aligned}
$$
$$
\frac{\partial}{\partial\theta_j}J(\theta) = \sum_{i=1}^{n}\left(h_{\theta}(x^{(i)}) - y^{(i)}\right)x_j^{(i)}
$$
最终梯度更新表达式:
$$
\theta_j^{t+1} = \theta_j^{t} - \alpha \cdot \sum_{i=1}^{n}\left(h_{\theta}(x^{(i)}) - y^{(i)}\right)x_j^{(i)}
$$
|