見出し画像

PRML自習ノート - chapter 5 -

Exercise (5.1) - (5.10)

Exercise (5.1)

$$
\begin{align*}
\sigma(a)&=\frac{1}{2}\left\{1+\tanh\left(\frac{a}{2}\right)\right\}
\end{align*}
$$

の関係式を用いると,

$$
\begin{align*}
\sum_{j=1}^Mw_{kj}^{(2)}\sigma\left(\sum_{i=1}^Dw_{ji}^{(1)}x_i+w_{j0}^{(1)}\right)+w_{k0}^{(2)}&=\sum_{j=1}^M\frac{w_{kj}^{(2)}}{2}\left[1+\tanh\left(\sum_{i=1}^D\frac{w_{ji}^{(1)}}{2}x_i+\frac{w_{j0}^{(1)}}{2}\right)\right]+w_{k0}^{(2)}\\
&=\sum_{j=1}^M\frac{w_{kj}^{(2)}}{2}\tanh\left(\sum_{i=1}^D\frac{w_{ji}^{(1)}}{2}x_i+\frac{w_{j0}^{(1)}}{2}\right)+\left(\sum_{j=1}^M\frac{w_{kj}^{(2)}}{2}+w_{k0}^{(2)}\right)
\end{align*}
$$

が得られる。
以下の係数変換をすることにより,sigmoidとtanhで同じネットワークの表現が得られる。

$$
\begin{align*}
w_{j0}^{(1)} &\leftarrow \frac{w_{j0}^{(1)}}{2}\\
w_{ji}^{(1)} &\leftarrow \frac{w_{ji}^{(1)}}{2}\\
w_{k0}^{(2)} &\leftarrow \sum_{j=1}^M\frac{w_{kj}^{(2)}}{2}+w_{k0}^{(2)}\\
w_{kj}^{(2)} &\leftarrow \frac{w_{kj}^{(2)}}{2}
\end{align*}
$$



Exercise (5.2)

$$
\begin{align*}
\ln p(\{\mathbf{t}_n\}|\{\mathbf{x}_n\},\mathbf{w})&=\ln\prod_{n=1}^N p(\mathbf{t}_n|\mathbf{x}_n,\mathbf{w})\\
&=\sum_{n=1}^N\ln p(\mathbf{t}_n|\mathbf{x}_n,\mathbf{w})\\
&=\sum_{n=1}^N\ln \mathcal{N}(\mathbf{t}_n|\mathbf{y}(\mathbf{x}_n,\mathbf{w}), \beta^{-1}\mathbf{I})\\
&=\sum_{n=1}^N\ln \left\{\left(\frac{\beta}{2\pi}\right)^{K/2}\exp\left(-\frac{\beta}{2}\|\mathbf{y}(\mathbf{x}_n,\mathbf{w})-\mathbf{t}_n\|^2\right)\right\}\\
&=-\frac{\beta}{2}\sum_{n=1}^N\|\mathbf{y}(\mathbf{x}_n,\mathbf{w})-\mathbf{t}_n\|^2+\frac{KN}{2}\ln\left(\frac{\beta}{2\pi}\right)
\end{align*}
$$

以上より,式(5.16)に対する誤差関数は式(5.11)で与えられる。



Exercise (5.3)

$$
\begin{align*}
-\ln p(\{\mathbf{t}_n\}|\{\mathbf{x}_n\},\mathbf{w})&=-\ln\prod_{n=1}^N p(\mathbf{t}_n|\mathbf{x}_n,\mathbf{w})\\
&=-\sum_{n=1}^N\ln p(\mathbf{t}_n|\mathbf{x}_n,\mathbf{w})\\
&=-\sum_{n=1}^N\ln \mathcal{N}(\mathbf{t}_n|\mathbf{y}(\mathbf{x}_n,\mathbf{w}), \boldsymbol\Sigma)\\
&=-\sum_{n=1}^N\ln \left[\left(\frac{1}{2\pi}\right)^{K/2}\frac{1}{|\boldsymbol\Sigma|^{1/2}}\exp\left\{-\frac{1}{2}(\mathbf{y}(\mathbf{x}_n,\mathbf{w})-\mathbf{t}_n)^{\rm T}\boldsymbol\Sigma^{-1}(\mathbf{y}(\mathbf{x}_n,\mathbf{w})-\mathbf{t}_n)\right\}\right]\\
&=\frac{1}{2}\sum_{n=1}^N(\mathbf{y}(\mathbf{x}_n,\mathbf{w})-\mathbf{t}_n)^{\rm T}\boldsymbol\Sigma^{-1}(\mathbf{y}(\mathbf{x}_n,\mathbf{w})-\mathbf{t}_n)+\frac{N}{2}\ln|\boldsymbol\Sigma|+\frac{KN}{2}\ln\left(2\pi\right)
\end{align*}
$$

より,$${\boldsymbol\Sigma}$$が固定された条件における$${\mathbf{w}}$$の誤差関数は,

$$
\begin{align*}
E_{\mathbf{w}}(\mathbf{w})&=\frac{1}{2}\sum_{n=1}^N(\mathbf{y}(\mathbf{x}_n,\mathbf{w})-\mathbf{t}_n)^{\rm T}\boldsymbol\Sigma^{-1}(\mathbf{y}(\mathbf{x}_n,\mathbf{w})-\mathbf{t}_n)
\end{align*}
$$

とする。
一方,$${\mathbf{w}}$$が固定された条件における$${\boldsymbol\Sigma}$$の誤差関数を$${E_{\mathbf{w}}(\mathbf{w})}$$の最小値を与える$${\mathbf{w}_{\rm ML}}$$を用いて

$$
\begin{align*}
E_{\boldsymbol\Sigma}(\boldsymbol\Sigma)&=\frac{1}{2}\sum_{n=1}^N(\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n)^{\rm T}\boldsymbol\Sigma^{-1}(\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n)+\frac{N}{2}\ln|\boldsymbol\Sigma|
\end{align*}
$$

と定義する。
$${E_{\boldsymbol\Sigma}(\boldsymbol\Sigma)}$$を$${\boldsymbol\Sigma}$$で微分すると,

$$
\begin{align*}
\frac{\partial E_{\boldsymbol\Sigma}(\boldsymbol\Sigma)}{\partial\boldsymbol\Sigma}&=\frac{1}{2}\sum_{n=1}^N(\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n)^{\rm T}\frac{\partial \boldsymbol\Sigma^{-1}}{\partial\boldsymbol\Sigma}(\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n)+\frac{N}{2}\frac{\partial}{\partial\boldsymbol\Sigma}\ln|\boldsymbol\Sigma|\\
&=-\frac{1}{2}\sum_{n=1}^N\left[\boldsymbol\Sigma^{-1}\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}\right]^{\rm T}\frac{\partial \boldsymbol\Sigma}{\partial\boldsymbol\Sigma}\left[\boldsymbol\Sigma^{-1}\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}\right]+\frac{N}{2}\boldsymbol\Sigma^{-1}\\
&=-\frac{1}{2}\sum_{n=1}^N\left[\boldsymbol\Sigma^{-1}\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}\right]\left[\boldsymbol\Sigma^{-1}\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}\right]^{\rm T}+\frac{N}{2}\boldsymbol\Sigma^{-1}\\
&=-\frac{1}{2}\boldsymbol\Sigma^{-1}\sum_{n=1}^N\left[\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}^{\rm T}\boldsymbol\Sigma^{-1}-\mathbf{I}\right]
\end{align*}
$$

これより,

$$
\begin{align*}
\sum_{n=1}^N\left[\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}^{\rm T}\boldsymbol\Sigma_{\rm ML}^{-1}-\mathbf{I}\right]&=\mathbf{0}\\
N\mathbf{I}&=\sum_{n=1}^N\left[\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}^{\rm T}\boldsymbol\Sigma_{\rm ML}^{-1}\right]\\
\therefore \boldsymbol\Sigma_{\rm ML}&=\frac{1}{N}\sum_{n=1}^N\left[\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}\left\{\mathbf{y}(\mathbf{x}_n,\mathbf{w}_{\rm ML})-\mathbf{t}_n\right\}^{\rm T}\right] 
\end{align*}
$$



Exercise (5.4)

$${t=1}$$に対応するクラスを$${\mathcal{C}_1}$$,$${t=0}$$に対応するクラスを$${\mathcal{C}_2}$$とする。

$$
\begin{align*}
p(\mathcal{C}_1|\mathbf{x})&=(1-\epsilon)p(t=1|\mathbf{x})+\epsilon p(t=0|\mathbf{x})\\
&=(1-2\epsilon)p(t=1|\mathbf{x})+\epsilon\\
p(\mathcal{C}_2|\mathbf{x})&=\epsilon p(t=1|\mathbf{x})+(1-\epsilon)p(t=0|\mathbf{x})\\
&=(2\epsilon-1)p(t=1|\mathbf{x})+(1-\epsilon)
\end{align*}
$$

これより,$${\mathbf{x}}$$が与えられたときのクラス$${\mathcal{C}}$$の条件付確率$${p(\mathcal{C}|\mathbf{x})}$$は

$$
\begin{align*}
p(\mathcal{C}|\mathbf{x})&=\left\{p(\mathcal{C}_1|\mathbf{x})\right\}^t\left\{p(\mathcal{C}_2|\mathbf{x})\right\}^{1-t}\\
&=\left\{(1-2\epsilon)p(t=1|\mathbf{x})+\epsilon\right\}^t\left\{(2\epsilon-1)p(t=1|\mathbf{x})+(1-\epsilon)\right\}^{1-t}\\
&=\left\{(1-2\epsilon)y(\mathbf{x},\mathbf{w})+\epsilon\right\}^t\left\{(2\epsilon-1) y(\mathbf{x},\mathbf{w})+(1-\epsilon)\right\}^{1-t}
\end{align*}
$$

となる。
上式を用いて負の対数尤度関数(誤差関数)の表式を求めると,

$$
\begin{align*}
E(\mathbf{w})&:=-\ln\prod_{n=1}^N\left\{(1-2\epsilon)y(\mathbf{x}_n,\mathbf{w})+\epsilon\right\}^{t_n}\left\{(2\epsilon-1)y(\mathbf{x}_n,\mathbf{w})+(1-\epsilon)\right\}^{1-t_n}\\
&=-\sum_{n=1}^N\left[t_n\ln\left\{(1-2\epsilon)y(\mathbf{x}_n,\mathbf{w})+\epsilon\right\}+(1-t_n)\ln\left\{(2\epsilon-1)y(\mathbf{x}_n,\mathbf{w})+(1-\epsilon)\right\}\right]
\end{align*}
$$

$${\epsilon=0}$$のとき,式(5.21)に帰着する。



Exercise (5.5)

$$
\begin{align*}
p(\mathbf{t}|\mathbf{x})&=\prod_{k=1}^K\left\{p(t_k=1|\mathbf{x})\right\}^{t_k}\\
&=\prod_{k=1}^K\left\{y_k(\mathbf{x},\mathbf{w})\right\}^{t_k}
\end{align*}
$$

より,

$$
\begin{align*}
E(\mathbf{w})&:=-\ln\prod_{n=1}^Np(\mathbf{t}_n|\mathbf{x}_n)\\
&=-\ln\prod_{n=1}^N\prod_{k=1}^K\left\{y_k(\mathbf{x}_n,\mathbf{w})\right\}^{t_{nk}}\\
&=-\sum_{n=1}^N\sum_{k=1}^Kt_{nk}\ln\left\{y_k(\mathbf{x}_n,\mathbf{w})\right\}
\end{align*}
$$



Exercise (5.6)

$$
\begin{align*}
\frac{\partial E}{\partial a_k}&=\frac{\partial y_k}{\partial a_k}\frac{\partial E}{\partial y_k}\\
&=-y_k(1-y_k)\left(\frac{t_k}{y_k}-\frac{1-t_k}{1-y_k}\right)\\
&=y_k(1-t_k)-(1-y_k)t_k\\
&=y_k-t_k
\end{align*}
$$



Exercise (5.7)

$${a_{nk}:=a_k(\mathbf{x}_n,\mathbf{w})}$$とおくと,

$$
\begin{align*}
\frac{\partial E}{\partial a_{nk}}&=-\frac{\partial}{\partial a_{nk}}\left[\sum_{k'=1}^Kt_{nk'}\ln\left\{y_{k'}(\mathbf{x}_n,\mathbf{w})\right\}\right]\\
&=-\left[t_{nk}\frac{\partial \ln\left\{y_k(\mathbf{x}_n,\mathbf{w})\right\}}{\partial a_{nk}} +\sum_{j\neq k}^Kt_{nj}\frac{\partial \ln\left\{y_j(\mathbf{x}_n,\mathbf{w})\right\}}{\partial a_{nk}}\right]\\
&=-\left[t_{nk}(1-y_{nk}) -y_{nk}\sum_{j\neq k}^Kt_{nj}\right]\\
&=y_{nk}-t_{nk}
\end{align*}
$$



Exercise (5.8)

$$
\begin{align*}
\frac{\rm d}{{\rm d}a}\tanh(a)&=\frac{({\rm e}^{a}+{\rm e}^{-a})^2-({\rm e}^{a}-{\rm e}^{-a})^2}{({\rm e}^{a}+{\rm e}^{-a})^2}\\
&=1-\tanh^2(a)
\end{align*}
$$



Exercise (5.9)

$$
\begin{align*}
p(t|\mathbf{x})&:=\frac{1}{2}\left\{1+y(\mathbf{x},\mathbf{w})\right\}^{\frac{1+t}{2}}\left\{1-y(\mathbf{x},\mathbf{w})\right\}^{\frac{1-t}{2}}
\end{align*}
$$

と定義することにより,

$$
\begin{align*}
0\leq p(t=1|\mathbf{x})&=\frac{1}{2}\left\{1+y(\mathbf{x},\mathbf{w})\right\}\leq 1\\
0\leq p(t=-1|\mathbf{x})&=\frac{1}{2}\left\{1-y(\mathbf{x},\mathbf{w})\right\}\leq 1\\
p(t=1|\mathbf{x})&+p(t=-1|\mathbf{x})=1
\end{align*}
$$

を満たす確率を表現できる。
対応する誤差関数は,

$$
\begin{align*}
E(\mathbf{w})&=-\ln\prod_{n=1}^Np(t_n|\mathbf{x}_n)\\
&=-\sum_{n=1}^N\left[\left(\frac{1+t_n}{2}\right)\ln\left\{1+y(\mathbf{x}_n,\mathbf{w})\right\}+\left(\frac{1-t_n}{2}\right)\ln\left\{1-y(\mathbf{x}_n,\mathbf{w})\right\}\right]+N\ln 2
\end{align*}
$$

となる。
tanhが活性化関数の候補として挙げられる。



Exercise (5.10)

$${\mathbf{H}}$$の固有ベクトル$${\{\mathbf{u}_i\}}$$を用いて,$${\mathbf{u}_i}$$と同次元の任意のベクトル$${\mathbf{v}}$$を

$$
\begin{align*}
\mathbf{v}&=\sum_i(\mathbf{u}^{\rm T}_i\mathbf{v})\mathbf{u}_i\\
&=:\sum_ic_i\mathbf{u}_i
\end{align*}
$$

と表すとする。
このとき,

$$
\begin{align*}
\mathbf{v}^{\rm T}\mathbf{H}\mathbf{v}&=\sum_i\sum_jc_ic_j\mathbf{u}^{\rm T}_j\mathbf{H}\mathbf{u}_i\\
&=\sum_i\sum_jc_ic_j\lambda_i\mathbf{u}^{\rm T}_j\mathbf{u}_i\\
&=\sum_i\sum_jc_ic_j\lambda_i\delta_{ij}\\
&=\sum_ic_i^2\lambda_i
\end{align*}
$$

となる。
以上より,$${\mathbf{H}}$$の固有値が全て0より大きい場合,任意の$${\mathbf{v}\neq\mathbf{0}}$$に対して$${\mathbf{v}^{\rm T}\mathbf{H}\mathbf{v}>0}$$が成立するため,$${\mathbf{H}}$$は正定値行列となる。

Exercise (5.11) - (5.20)

Exercise (5.11)

$${E(\mathbf{w})={\rm const}=:E_0}$$において,

$$
\begin{align*}
E_0&=E(\mathbf{w}^*)+\frac{1}{2}\left(\mathbf{w}-\mathbf{w}^*\right)^{\rm T}\mathbf{H}\left(\mathbf{w}-\mathbf{w}^*\right)\\
2\left\{E_0-E(\mathbf{w}^*)\right\}&=\left(\mathbf{w}-\mathbf{w}^*\right)^{\rm T}\mathbf{H}\left(\mathbf{w}-\mathbf{w}^*\right)\\
&=\sum_i\lambda_i\left(\mathbf{w}-\mathbf{w}^*\right)^{\rm T}\mathbf{u}_i\mathbf{u}_i^{\rm T}\left(\mathbf{w}-\mathbf{w}^*\right)\\
&=\sum_i\lambda_i\left\{\mathbf{u}_i^{\rm T}\left(\mathbf{w}-\mathbf{w}^*\right)\right\}^2\\
\therefore \sum_i\frac{\left\{\mathbf{u}_i^{\rm T}\left(\mathbf{w}-\mathbf{w}^*\right)\right\}^2}{\left(\sqrt{2\left\{E_0-E(\mathbf{w}^*)\right\}\lambda_i^{-1}}\right)^2}&=1
\end{align*}
$$



Exercise (5.12)

$${\mathbf{w}^*}$$がlocal minimumであるとき,$${\mathbf{w}^*}$$近傍の任意の$${\mathbf{w}}$$に対して,

$$
\begin{align*}
E(\mathbf{w})-E(\mathbf{w}^*)&=\frac{1}{2}(\mathbf{w}-\mathbf{w}^*)^{\rm T}\mathbf{H}(\mathbf{w}-\mathbf{w}^*)\\
&>0
\end{align*}
$$

が成立する。
そのため,$${\mathbf{H}}$$は正定値行列となる。


$${\mathbf{H}}$$が正定値行列である場合,$${\mathbf{w}^*}$$近傍の任意の$${\mathbf{w}}$$に対して,

$$
\begin{align*}
\frac{1}{2}(\mathbf{w}-\mathbf{w}^*)^{\rm T}\mathbf{H}(\mathbf{w}-\mathbf{w}^*)&=E(\mathbf{w})-E(\mathbf{w}^*)\\
&>0\\
\therefore E(\mathbf{w})&>E(\mathbf{w}^*)
\end{align*}
$$

が成立する。
以上より,$${\mathbf{H}}$$が正定値行列であることは$${\mathbf{w}^*}$$がlocal minimumであるための必要十分条件である。



Exercise (5.13)

$${W\times W}$$対称行列は上三角部分と対角部分が独立成分となるため,独立成分数は$${W^2-\frac{1}{2}(W^2-W)=\frac{1}{2}W(W+1)}$$となる。
これにベクトルの独立成分数$${W}$$を追加すると,計$${\frac{1}{2}W(W+3)}$$となる。



Exercise (5.14)

$$
\begin{align*}
E_n(w_{ji}+\epsilon)&=E_n(w_{ji})+\frac{\partial E_n}{\partial w_{ji}}\epsilon+\frac{1}{2}\frac{\partial^2 E_n}{\partial w_{ji}^2}\epsilon^2+\mathcal{O}(\epsilon^3)\\
E_n(w_{ji}-\epsilon)&=E_n(w_{ji})-\frac{\partial E_n}{\partial w_{ji}}\epsilon+\frac{1}{2}\frac{\partial^2 E_n}{\partial w_{ji}^2}\epsilon^2+\mathcal{O}(\epsilon^3)
\end{align*}
$$

より,

$$
\begin{align*}
2\frac{\partial E_n}{\partial w_{ji}}\epsilon&=E_n(w_{ji}+\epsilon)-E_n(w_{ji}-\epsilon)+\mathcal{O}(\epsilon^3)\\
\therefore \frac{\partial E_n}{\partial w_{ji}}&=\frac{E_n(w_{ji}+\epsilon)-E_n(w_{ji}-\epsilon)}{2\epsilon}+\mathcal{O}(\epsilon^2)
\end{align*}
$$



Exercise (5.15)

$$
\begin{align*}
\frac{\partial y_k}{\partial x_i}&=\sum_jw_{ji}^{(1)}\frac{\partial y_k}{\partial a_j^{(1)}}\\
&=\sum_jw_{ji}^{(1)}\left\{\sum_l\frac{\partial y_k}{\partial a_l^{(2)}}\frac{\partial a_l^{(2)}}{\partial a_j^{(1)}}\right\}\\
&=\sum_jw_{ji}^{(1)}\left\{h'(a_j^{(1)})\sum_lw_{lj}^{(2)}\frac{\partial y_k}{\partial a_l^{(2)}}\right\}\\
&=\sum_l\frac{\partial y_k}{\partial a_l^{(2)}}\left\{\sum_jw_{lj}^{(2)}w_{ji}^{(1)}h'(a_j^{(1)})\right\}
\end{align*}
$$

より,$${h'(a_j^{(1)})\rightarrow \sum_jw_{lj}^{(2)}w_{ji}^{(1)}h'(a_j^{(1)})\rightarrow \frac{\partial y_k}{\partial a_l^{(2)}}\rightarrow\sum_l\frac{\partial y_k}{\partial a_l^{(2)}}\left\{\sum_jw_{lj}^{(2)}w_{ji}^{(1)}h'(a_j^{(1)})\right\}}$$の順番に計算することで$${\frac{\partial y_k}{\partial x_i}}$$を求めることができる。



Exercise (5.16)

$$
\begin{align*}
E&=\frac{1}{2}\sum_{n=1}^N\sum_{k=1}^K(y_{nk}-t_{nk})^2\\
\nabla\nabla E&=\sum_{n=1}^N\sum_{k=1}^K \nabla y_{nk}(\nabla y_{nk})^{\rm T}+\sum_{n=1}^N\sum_{k=1}^K(y_{nk}-t_{nk})\nabla\nabla y_{nk}\\
&\simeq \sum_{n=1}^N\sum_{k=1}^K\nabla y_{nk}(\nabla y_{nk})^{\rm T}\\
&=:\sum_{n=1}^N\sum_{k=1}^K\mathbf{b}_{nk}\mathbf{b}_{nk}^{\rm T} \ \ \ (\mathbf{b}_{nk}:=\nabla y_{nk})\\
&=:\sum_{n=1}^N\mathbf{B}_n\mathbf{B}_n^{\rm T}\ \ \ (\mathbf{B}_n:=(\mathbf{b}_{n1}\cdots\mathbf{b}_{nK}))
\end{align*}
$$



Exercise (5.17)

$$
\begin{align*}
\frac{\partial^2 E}{\partial w_r\partial w_s}&=\int{\rm d}\mathbf{x}\int{\rm d}t\frac{\partial y}{\partial w_r}\frac{\partial y}{\partial w_s}p(\mathbf{x},t)+\int{\rm d}\mathbf{x}\int{\rm d}t\frac{\partial^2y}{\partial w_r\partial w_s}(y-t)p(\mathbf{x},t)
\end{align*}
$$

$${y(\mathbf{x},\mathbf{w})=\mathbb{E}_t[t|\mathbf{x}]}$$のとき,右辺の第2項は

$$
\begin{align*}
\int{\rm d}\mathbf{x}\int{\rm d}t\frac{\partial^2y}{\partial w_r\partial w_s}(y-t)p(\mathbf{x},t)&=\int{\rm d}\mathbf{x}\frac{\partial^2y}{\partial w_r\partial w_s}\int{\rm d}t(\mathbb{E}_t[t|\mathbf{x}]-t)p(\mathbf{x},t)\\
&=\int{\rm d}\mathbf{x}\frac{\partial^2y}{\partial w_r\partial w_s}\left\{\mathbb{E}_t[t|\mathbf{x}]\int{\rm d}tp(\mathbf{x},t)-\int{\rm d}ttp(\mathbf{x},t)\right\}\\
&=\int{\rm d}\mathbf{x}\frac{\partial^2y}{\partial w_r\partial w_s}\left\{\mathbb{E}_t[t|\mathbf{x}]p(\mathbf{x})-p(\mathbf{x})\int{\rm d}ttp(t|\mathbf{x})\right\}\\
&=\int{\rm d}\mathbf{x}\frac{\partial^2y}{\partial w_r\partial w_s}\left\{\mathbb{E}_t[t|\mathbf{x}]p(\mathbf{x})-p(\mathbf{x})\mathbb{E}_t[t|\mathbf{x}]\right\}\\
&=0
\end{align*}
$$

となる。

$$
\begin{align*}
\therefore \frac{\partial^2 E}{\partial w_r\partial w_s}&=\int{\rm d}\mathbf{x}\int{\rm d}t\frac{\partial y}{\partial w_r}\frac{\partial y}{\partial w_s}p(\mathbf{x},t)\\
&=\int{\rm d}\mathbf{x}\frac{\partial y}{\partial w_r}\frac{\partial y}{\partial w_s}\int{\rm d}tp(\mathbf{x},t)\\
&=\int{\rm d}\mathbf{x}\frac{\partial y}{\partial w_r}\frac{\partial y}{\partial w_s}p(\mathbf{x})
\end{align*}
$$



Exercise (5.18)

inputとoutputをdirectにつなぐ重み係数を$${w_{ki}^{(0)}}$$と表すことにすると,$${y_k}$$は

$$
\begin{align*}
y_k&=\sum_{j=0}^Mw_{kj}^{(2)}z_j+\sum_{i=0}^Dw_{ki}^{(0)}x_i
\end{align*}
$$

に拡張される。
$${w_{ki}^{(0)}}$$に対する誤差関数$${E_n}$$の偏微分は,

$$
\begin{align*}
\frac{\partial E_n}{\partial w_{ki}^{(0)}}&=\delta_kx_i\ \ \ (\delta_k=y_k-t_k)
\end{align*}
$$

となる。



Exercise (5.19)

$$
\begin{align*}
\nabla y_n&=\nabla\sigma(a_n)\\
&=\sigma(1-\sigma)\nabla a_n\\
&=y_n(1-y_n)\nabla a_n
\end{align*}
$$

より,

$$
\begin{align*}
\nabla\nabla E&=\nabla\nabla\left[-\sum_{n=1}^N\left\{t_n\ln y_n+(1-t_n)\ln(1-y_n)\right\}\right]\\
&=\nabla\left[-\sum_{n=1}^N\left\{t_ny_n^{-1}-(1-t_n)(1-y_n)^{-1}\right\}\nabla y_n\right]\\
&=\nabla\left[-\sum_{n=1}^N\left\{t_n(1-y_n)-(1-t_n)y_n\right\}\nabla a_n\right]\\
&=\nabla\left[\sum_{n=1}^N(y_n-t_n)\nabla a_n\right]\\
&=\sum_{n=1}^N\left\{\nabla y_n (\nabla a_n)^{\rm T}+(y_n-t_n)\nabla\nabla a_n\right\}\\
&=\sum_{n=1}^N\left\{y_n(1-y_n)\nabla a_n (\nabla a_n)^{\rm T}+(y_n-t_n)\nabla\nabla a_n\right\}\\
&\simeq\sum_{n=1}^Ny_n(1-y_n)\mathbf{b}_n\mathbf{b}_n^{\rm T}\ \ \ (\mathbf{b}_n:=\nabla a_n)
\end{align*}
$$



Exercise (5.20)

$$
\begin{align*}
\nabla y_{nk}&=\sum_i\frac{\partial y_{nk}}{\partial a_{ni}}\nabla a_{ni}\\
&=\sum_i\frac{\partial }{\partial a_{ni}}\left\{\frac{{\rm e}^{a_{nk}}}{\sum_j{\rm e}^{a_{nj}}}\right\}\nabla a_{ni}\\
&=\sum_i\nabla a_{ni}\left\{-\frac{{\rm e}^{a_{nk}}{\rm e}^{a_{ni}}}{\left(\sum_j{\rm e}^{a_{nj}}\right)^2}+\frac{{\rm e}^{a_{nk}}\delta_{ki}}{\sum_j{\rm e}^{a_{nj}}}\right\}\\
&=y_{nk}\left(\nabla a_{nk}-\sum_iy_{ni}\nabla a_{ni}\right)
\end{align*}
$$

より,

$$
\begin{align*}
\nabla E&=\nabla\left(-\sum_{n=1}^N\sum_{k=1}^Kt_{nk}\ln y_{nk}\right)\\
&=-\sum_{n=1}^N\sum_{k=1}^Kt_{nk}y_{nk}^{-1}\nabla y_{nk}\\
&=-\sum_{n=1}^N\sum_{k=1}^Kt_{nk}\left(\nabla a_{nk}-\sum_{i=1}^Ky_{ni}\nabla a_{ni}\right)\\
&=-\sum_{n=1}^N\sum_{k=1}^Kt_{nk}\nabla a_{nk}+\sum_{n=1}^N\sum_{i=1}^Ky_{ni}\nabla a_{ni}\sum_{k=1}^Kt_{nk}\\
&=\sum_{n=1}^N\sum_{k=1}^K(y_{nk}-t_{nk})\nabla a_{nk}
\end{align*}
$$

$$
\begin{align*}
\nabla\nabla E&=\nabla\left\{\sum_{n=1}^N\sum_{k=1}^K(y_{nk}-t_{nk})\nabla a_{nk}\right\}\\
&=\sum_{n=1}^N\sum_{k=1}^K\nabla y_{nk}(\nabla a_{nk})^{\rm T}+\sum_{n=1}^N\sum_{k=1}^K(y_{nk}-t_{nk})\nabla\nabla a_{nk}\\
&=\sum_{n=1}^N\sum_{k=1}^Ky_{nk}\left(\nabla a_{nk}-\sum_{i=1}^Ky_{ni}\nabla a_{ni}\right)(\nabla a_{nk})^{\rm T}+\sum_{n=1}^N\sum_{k=1}^K(y_{nk}-t_{nk})\nabla\nabla a_{nk}\\
&\simeq\sum_{n=1}^N\mathbf{B}_n^{\rm T}\left\{{\rm diag}(y_{n1},y_{n2},\cdots,y_{nK})-\mathbf{y}_n\mathbf{y}_n^{\rm T}\right\}\mathbf{B}_n
\end{align*}
$$

ここで,$${\mathbf{B}_n=(\nabla a_{n1}\ \nabla a_{n2}\ \cdots \nabla a_{nK})^{\rm T}}$$である。

Exercise (5.21) - (5.30)

Exercise (5.21)

式(5.86)を$${K>2}$$に拡張すると,

$$
\begin{align*}
\mathbf{H}_N&=\sum_{n=1}^N\sum_{k=1}^K\mathbf{b}_{nk}\mathbf{b}_{nk}^{\rm T}\\
&=\sum_{n=1}^N\mathbf{B}_{n}^{\rm T}\mathbf{B}_{n}
\end{align*}
$$

となる。ここで,$${\mathbf{B}_{n}=\begin{pmatrix}\mathbf{b}_{n1}&\mathbf{b}_{n2}&\cdots&\mathbf{b}_{nK}\end{pmatrix}^{\rm T}}$$である。
このとき,

$$
\begin{align*}
\mathbf{H}_{L+1}&=\mathbf{H}_{L}+\mathbf{B}_{L+1}^{\rm T}\mathbf{B}_{L+1}
\end{align*}
$$

が成立する。
$${\mathbf{H}_{L+1}}$$の逆行列は

$$
\begin{align*}
\mathbf{H}_{L+1}^{-1}&=\left(\mathbf{H}_{L}+\mathbf{B}_{L+1}^{\rm T}\mathbf{B}_{L+1}\right)^{-1}\\
&=\left(\mathbf{H}_{L}+\mathbf{B}_{L+1}^{\rm T}\mathbf{I}\mathbf{B}_{L+1}\right)^{-1}\\
&=\mathbf{H}_{L}^{-1}-\mathbf{H}_{L}^{-1}\mathbf{B}_{L+1}^{\rm T}\left(\mathbf{I}+\mathbf{B}_{L+1}\mathbf{H}_{L}^{-1}\mathbf{B}_{L+1}^{\rm T}\right)^{-1}\mathbf{B}_{L+1}\mathbf{H}_{L}^{-1}
\end{align*}
$$

となる。



Exercise (5.22)

$$
\begin{align*}
\frac{\partial^2E_n }{\partial w_{kj}^{(2)}\partial w_{k'j'}^{(2)}}&=\frac{\partial }{\partial w_{k'j'}^{(2)}}\left\{\frac{\partial E_n }{\partial w_{kj}^{(2)}}\right\}\\
&=\frac{\partial }{\partial w_{k'j'}^{(2)}}\left\{\frac{\partial a_k}{\partial w_{kj}^{(2)}}\frac{\partial E_n }{\partial a_k}\right\}\\
&=z_j\frac{\partial }{\partial w_{k'j'}^{(2)}}\left\{\frac{\partial E_n }{\partial a_k}\right\}\\
&=z_j\frac{\partial a_{k'}}{\partial w_{k'j'}^{(2)}}\frac{\partial^2 E_n }{\partial a_k\partial a_{k'}}\\
&=z_jz_{j'}M_{kk'}
\end{align*}
$$


$$
\begin{align*}
\frac{\partial E_n }{\partial w_{j'i'}^{(1)}}&=\sum_k\frac{\partial E_n}{\partial a_k}\frac{\partial a_k}{\partial w_{j'i'}^{(1)}}\\
&=\frac{\partial z_{j'}}{\partial w_{j'i'}^{(1)}}\sum_k\frac{\partial E_n}{\partial a_k}w_{kj'}^{(2)}\\
&=x_{i'}h'(a_{j'})\sum_k\frac{\partial E_n}{\partial a_k}w_{kj'}^{(2)}\\
\frac{\partial^2 E_n }{\partial w_{ji}^{(1)}\partial w_{j'i'}^{(1)}}&=x_{i'}\left[\frac{\partial h'(a_{j'})}{\partial w_{ji}^{(1)}}\sum_k\frac{\partial E_n}{\partial a_k}w_{kj'}^{(2)}+h'(a_{j'})\frac{\partial }{\partial w_{ji}^{(1)}}\left\{\sum_k\frac{\partial E_n}{\partial a_k}w_{kj'}^{(2)}\right\}\right]\\
&=x_{i'}\left[x_{i}h''(a_{j'})I_{jj'}\sum_k\frac{\partial E_n}{\partial a_k}w_{kj'}^{(2)}+h'(a_{j'})\sum_k\sum_{k'}\frac{\partial^2 E_n}{\partial a_k\partial a_{k'}}\frac{\partial a_{k'}}{\partial w_{ji}^{(1)}}w_{kj'}^{(2)}\right]\\
&=x_{i'}\left[x_{i}h''(a_{j'})I_{jj'}\sum_k\frac{\partial E_n}{\partial a_k}w_{kj'}^{(2)}+h'(a_{j'})\frac{\partial z_j}{\partial w_{ji}^{(1)}}\sum_k\sum_{k'}\frac{\partial^2 E_n}{\partial a_k\partial a_{k'}}w_{kj'}^{(2)}w_{k'j}^{(2)}\right]\\
&=x_{i'}\left[x_{i}h''(a_{j'})I_{jj'}\sum_k\frac{\partial E_n}{\partial a_k}w_{kj'}^{(2)}+x_ih'(a_{j'})h'(a_{j})\sum_k\sum_{k'}\frac{\partial^2 E_n}{\partial a_k\partial a_{k'}}w_{kj'}^{(2)}w_{k'j}^{(2)}\right]\\
&=x_{i}x_{i'}h''(a_{j'})I_{jj'}\sum_kw_{kj'}^{(2)}\delta_k+x_{i}x_{i'}h'(a_{j'})h'(a_{j})\sum_k\sum_{k'}w_{kj'}^{(2)}w_{k'j}^{(2)}M_{kk'}
\end{align*}
$$


$$
\begin{align*}
\frac{\partial^2E_n }{\partial w_{ji}^{(1)}\partial w_{kj'}^{(2)}}&=\frac{\partial}{\partial w_{kj'}^{(2)}}\left\{\frac{\partial E_n }{\partial w_{ji}^{(1)}}\right\}\\
&=\frac{\partial}{\partial w_{kj'}^{(2)}}\left\{x_{i}h'(a_{j})\sum_{k'}\frac{\partial E_n}{\partial a_{k'}}w_{k'j}^{(2)}\right\}\\
&=x_{i}h'(a_{j})\sum_{k'}\left\{\frac{\partial}{\partial w_{kj'}^{(2)}}\left(\frac{\partial E_n}{\partial a_{k'}}\right)w_{k'j}^{(2)}+\frac{\partial E_n}{\partial a_{k'}}I_{jj'}I_{kk'}\right\}\\
&=x_{i}h'(a_{j})\left\{z_{j'}\sum_{k'}w_{k'j}^{(2)}M_{kk'}+\delta_kI_{jj'}\right\}
\end{align*}
$$



Exercise (5.23)

入力層と出力層を直接つなぐ重み係数を$${w_{ki}^{(0)}}$$と表すことにし,$${a_k}$$を

$$
\begin{align*}
a_k&=\sum_jw_{kj}^{(2)}z_j+\sum_iw_{ki}^{(0)}x_i
\end{align*}
$$

と拡張することを考える。
このとき,

$$
\begin{align*}
\frac{\partial^2 E_n}{\partial w_{ki}^{(0)} \partial w_{k'i'}^{(0)}}&=\frac{\partial}{\partial w_{k'i'}^{(0)}}\left\{\frac{\partial a_k}{\partial w_{ki}^{(0)}}\frac{\partial E_n}{\partial a_k}\right\}\\
&=x_i\frac{\partial a_{k'}}{\partial w_{k'i'}^{(0)}}\frac{\partial^2 E_n}{\partial a_k\partial a_{k'}}\\
&=x_ix_{i'}M_{kk'}\\
\frac{\partial^2 E_n}{\partial w_{ki}^{(0)} \partial w_{ji'}^{(1)}}&=\frac{\partial}{\partial w_{ji'}^{(1)}}\left(x_i\frac{\partial E_n}{\partial a_k}\right)\\
&=x_i\sum_{k'}\frac{\partial a_{k'}}{\partial w_{ji'}^{(1)}}\frac{\partial^2 E_n}{\partial a_k\partial a_{k'}}\\
&=x_i\frac{\partial z_j}{\partial w_{ji'}^{(1)}}\sum_{k'}w_{k'j}^{(2)}\frac{\partial^2 E_n}{\partial a_k\partial a_{k'}}\\
&=x_ix_{i'}h'(a_j)\sum_{k'}w_{k'j}^{(2)}M_{kk'}\\
\frac{\partial^2 E_n}{\partial w_{ki}^{(0)} \partial w_{k'j}^{(2)}}&=\frac{\partial}{\partial w_{k'j}^{(2)}}\left(x_i\frac{\partial E_n}{\partial a_k}\right)\\
&=x_i\frac{\partial a_{k'}}{\partial w_{k'j}^{(2)}}\frac{\partial^2 E_n}{\partial a_k\partial a_{k'}}\\
&=x_iz_{j}M_{kk'}
\end{align*}
$$



Exercise (5.24)

$$
\begin{align*}
\tilde{z}_j&=h\left(\sum_i\tilde{w}_{ji}\tilde{x}_i+\tilde{w}_{j0}\right)\\
&=h\left(\sum_i\tilde{w}_{ji}(ax_i+b)+\tilde{w}_{j0}\right)\\
&=h\left(\sum_ia\tilde{w}_{ji}x_i+\left(b\sum_i\tilde{w}_{ji}+\tilde{w}_{j0}\right)\right)
\end{align*}
$$

$${\tilde{z}_j=z_j}$$のとき,

$$
\begin{align*}
a\tilde{w}_{ji}&=w_{ji}\\
b\sum_i\tilde{w}_{ji}+\tilde{w}_{j0}&=w_{j0}\\
\therefore \tilde{w}_{ji}&=\frac{1}{a}w_{ji}\\
\tilde{w}_{j0}&=w_{j0}-b\sum_i\tilde{w}_{ji}\\
&=w_{j0}-\frac{b}{a}\sum_iw_{ji}
\end{align*}
$$


$$
\begin{align*}
\tilde{y}_k&=cy_k+d\\
&=\sum_jcw_{kj}z_j+(cw_{k0}+d)\\
&=\sum_j\tilde{w}_{kj}z_j+\tilde{w}_{k0}\\
\therefore \tilde{w}_{kj}&=cw_{kj}\\
\tilde{w}_{k0}&=cw_{k0}+d
\end{align*}
$$



Exercise (5.25)

$$
\begin{align*}
\nabla E^{(\tau)}&=\mathbf{H}\left\{\mathbf{w}^{(\tau)}-\mathbf{w}^*\right\}\\
&=\sum_i\eta_i\mathbf{u}_i\mathbf{u}_i^{\rm T}\left\{\mathbf{w}^{(\tau)}-\mathbf{w}^*\right\}\\
&=\sum_i\eta_i\left\{w_i^{(\tau)}-w_i^*\right\}\mathbf{u}_i\\
\mathbf{w}^{(\tau)}&=\mathbf{w}^{(\tau-1)}-\rho\nabla E^{(\tau-1)}\\
&=\mathbf{w}^{(\tau-1)}-\rho\sum_i\eta_i\left\{w_i^{(\tau-1)}-w_i^*\right\}\mathbf{u}_i\\
w_j^{(\tau)}&=w_j^{(\tau-1)}-\rho\sum_i\eta_i\left\{w_i^{(\tau-1)}-w_i^*\right\}\delta_{ij}\\
&=w_j^{(\tau-1)}-\rho\eta_j\left\{w_j^{(\tau-1)}-w_j^*\right\}
\end{align*}
$$

上式と帰納法を用いて,式(5.197)が成立することを証明することを考える。
$${\tau=1}$$のとき,

$$
\begin{align*}
w_j^{(1)}&=w_j^{(0)}-\rho\eta_j\left\{w_j^{(0)}-w_j^*\right\}\\
&=\rho\eta_jw_j^*\\
&=\left\{1-(1-\rho\eta_j)^1\right\}w_j^*
\end{align*}
$$

となり,式(5.197)が成立する。
$${\tau=k-1}$$のとき,式(5.197)が成立すると仮定する。
このとき,$${\tau=k}$$の場合を考えると,

$$
\begin{align*}
w_j^{(k)}&=w_j^{(k-1)}-\rho\eta_j\left\{w_j^{(k-1)}-w_j^*\right\}\\
&=(1-\rho\eta_j)w_j^{(k-1)}+\rho\eta_jw_j^*\\
&=(1-\rho\eta_j)\left\{1-(1-\rho\eta_j)^{k-1}\right\}w_j^*+\rho\eta_jw_j^*\\
&=\left\{1-(1-\rho\eta_j)^k\right\}w_j^*
\end{align*}
$$

となり,$${\tau=k}$$でも式(5.197)が成立する。
以上より,$${\tau\geq 1}$$で式(5.197)が成立する。


$${|1-\rho\eta_j|<1}$$のとき,

$$
\begin{align*}
\lim_{\tau\rightarrow\infty}w_j^{(\tau)}&=\lim_{\tau\rightarrow\infty}\left\{1-(1-\rho\eta_j)^{\tau}\right\}w_j^*\\
&=w_j^*
\end{align*}
$$


$${\tau\gg\frac{1}{\rho\eta_j}}$$のとき,$${0<\rho\eta_j<1}$$を考慮すると,$${\tau\gg1}$$となるため,

$$
\begin{align*}
(1-\rho\eta_j)^{\tau}&\simeq 0\\
\therefore w_j^{(\tau)}&\simeq w_j^*
\end{align*}
$$

一方,$${\tau\ll\frac{1}{\rho\eta_j}}$$のとき,$${1 \gg \frac{1}{\tau}\gg \rho\eta_j}$$となるため,

$$
\begin{align*}
(1-\rho\eta_j)^{\tau}&\simeq 1-\tau\rho\eta_j\\
w_j^{(\tau)}&\simeq \tau\rho\eta_jw_j^*\\
\frac{w_j^{(\tau)}}{w_j^*}&\simeq\tau\rho\eta_j\\
\therefore |w_j^{(\tau)}|&\ll|w_j^*|
\end{align*}
$$



Exercise (5.26)

$$
\begin{align*}
\Omega_n&=\frac{1}{2}\sum_k\left(\sum_{i=1}^DJ_{ki}\tau_i\right)^2\\
&=\frac{1}{2}\sum_k\left(\sum_{i=1}^D\tau_i\frac{\partial y_k}{\partial x_i}\right)^2\\
&=\frac{1}{2}\sum_k\left\{\left(\sum_{i=1}^D\tau_i\frac{\partial }{\partial x_i}\right)y_k\right\}^2\\
&=\frac{1}{2}\sum_k\left(\mathcal{G}y_k\right)^2
\end{align*}
$$


$${\alpha_j:=\mathcal{G}z_j,\ \beta_j:=\mathcal{G}a_j}$$と定義すると,

$$
\begin{align*}
\alpha_j&=\mathcal{G}z_j\\
&=\mathcal{G}h(a_j)\\
&=h'(a_j)\mathcal{G}a_j\\
&=h'(a_j)\beta_j\\
\beta_j&=\mathcal{G}a_j\\
&=\sum_iw_{ji}\mathcal{G}z_i\\
&=\sum_iw_{ji}\alpha_i
\end{align*}
$$

を満たす。


$${\mathcal{G}y_k:=\alpha_k}$$とおくと,

$$
\begin{align*}
\frac{\partial \Omega_n}{\partial w_{rs}}&=\sum_k\mathcal{G}y_k\mathcal{G}\frac{\partial y_k}{\partial w_{rs}}\\
&=\sum_k\alpha_k\mathcal{G}\sum_j\frac{\partial y_k}{\partial a_j}\frac{\partial a_j}{\partial w_{rs}}\\
&=\sum_k\alpha_k\mathcal{G}\left\{\sum_j\delta_{kj}\frac{\partial }{\partial w_{rs}}\sum_iw_{ji}z_i\right\}\\
&=\sum_k\alpha_k\mathcal{G}\left\{\sum_j\delta_{kj}I_{rj}\sum_iI_{si}z_i\right\}\\
&=\sum_k\alpha_k\mathcal{G}\left\{\sum_j\delta_{kj}I_{rj}z_s\right\}\\
&=\sum_k\alpha_k\mathcal{G}\left\{\delta_{kr}z_s\right\}\\
&=\sum_k\alpha_k\left\{\mathcal{G}\delta_{kr}z_s+\delta_{kr}\mathcal{G}z_s\right\}\\
&=\sum_k\alpha_k\left\{\phi_{kr}z_s+\delta_{kr}\alpha_s\right\}
\end{align*}
$$

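ここで,$${\delta_{kr}:=\frac{\partial y_k}{\partial a_r}}$$,$${\phi_{kr}:=\mathcal{G}\delta_{kr}}$$である。
$${w_{rs}}$$が入力層と隠れ層の間の重み係数,隠れ層と出力層の間の重み係数のどちらであっても同じ定式に帰着するが,$${\delta_{kr},\ \phi_{kr}}$$の計算内容は変わってくることに注意する。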


$${y_k=f(\{a_k\}),\ a_k=\sum_jw_{kj}z_j=\sum_jw_{kj}h(a_j)}$$とおく。
$${w_{rs}}$$が隠れ層と出力層の重み係数の場合の$${\delta_{kr},\ \phi_{kr}}$$をそれぞれ$${\delta_{kr}^{(2)},\ \phi_{kr}^{(2)}}$$とおくと,

$$
\begin{align*}
\delta_{kr}^{(2)}&=\frac{\partial y_k}{\partial a_r}\\
\phi_{kr}^{(2)}&=\mathcal{G}\frac{\partial y_k}{\partial a_r}\\
&=\frac{\partial }{\partial a_r}\mathcal{G}y_k\\
&=\frac{\partial \alpha_k}{\partial a_r}
\end{align*}
$$

から計算される。
$${w_{rs}}$$が入力層と隠れ層の間の重み係数の場合の$${\delta_{kr},\ \phi_{kr}}$$をそれぞれ$${\delta_{kr}^{(1)},\ \phi_{kr}^{(1)}}$$とおくと,

$$
\begin{align*}
\delta_{kr}^{(1)}&=\frac{\partial y_k}{\partial a_r}\\
&=\sum_{t}\delta_{kt}^{(2)}\frac{\partial a_t}{\partial a_r}\\
&=\sum_{t}\delta_{kt}^{(2)}\frac{\partial}{\partial a_r}\sum_jw_{tj}h(a_j)\\
&=\sum_{t}\delta_{kt}^{(2)}\sum_jw_{tj}h'(a_j)I_{rj}\\
&=h'(a_r)\sum_{t}w_{tr}\delta_{kt}^{(2)}\\
\phi_{kr}^{(1)}&=\mathcal{G}\left\{h'(a_r)\sum_{t}w_{tr}\delta_{kt}^{(2)}\right\}\\
&=h''(a_r)\beta_r\sum_{t}w_{tr}\delta_{kt}^{(2)}+h'(a_r)\sum_{t}w_{tr}\phi_{kt}^{(2)}
\end{align*}
$$

となるため,$${\delta_{kt}^{(2)}, \phi_{kt}^{(2)}}$$を各$${t}$$について計算→$${\delta_{kr}^{(1)}, \phi_{kr}^{(1)}}$$を計算という流れで計算できる。



Exercise (5.27)

$${\mathbf{s}=\mathbf{x}+\boldsymbol\xi}$$のとき,

$$
\begin{align*}
y(\mathbf{s})&=y(\mathbf{x}+\boldsymbol\xi)\\
&=y(\mathbf{x})+\boldsymbol\xi^{\rm T}\nabla y(\mathbf{x})+\frac{1}{2}\boldsymbol\xi^{\rm T}\nabla\nabla y(\mathbf{x})\boldsymbol\xi+\mathcal{O}(\|\boldsymbol\xi\|^3)
\end{align*}
$$

より,

$$
\begin{align*}
\left\{y(\mathbf{x}+\boldsymbol\xi)-t\right\}^2&=\left\{y(\mathbf{x})+\boldsymbol\xi^{\rm T}\nabla y(\mathbf{x})+\frac{1}{2}\boldsymbol\xi^{\rm T}\nabla\nabla y(\mathbf{x})\boldsymbol\xi+\mathcal{O}(\|\boldsymbol\xi\|^3)-t\right\}^2\\
&=\left\{y(\mathbf{x})-t\right\}^2+\left\{\nabla y(\mathbf{x})\right\}^{\rm T}\boldsymbol\xi\boldsymbol\xi^{\rm T}\nabla y(\mathbf{x})+2\left\{y(\mathbf{x})-t\right\}\boldsymbol\xi^{\rm T}\nabla y(\mathbf{x})+\left\{y(\mathbf{x})-t\right\}\boldsymbol\xi^{\rm T}\nabla\nabla y(\mathbf{x})\boldsymbol\xi+\mathcal{O}(\|\boldsymbol\xi\|^3)
\end{align*}
$$

$$
\begin{align*}
\widetilde{E}&=\frac{1}{2}\int{\rm d}\mathbf{x}\int{\rm d}t\int{\rm d}\boldsymbol\xi\left\{y(\mathbf{x}+\boldsymbol\xi)-t\right\}^2p(t|\mathbf{x})p(\mathbf{x})p(\boldsymbol\xi)\\
&=\frac{1}{2}\int{\rm d}\mathbf{x}\int{\rm d}t\int{\rm d}\boldsymbol\xi\left[\left\{y(\mathbf{x})-t\right\}^2+\left\{\nabla y(\mathbf{x})\right\}^{\rm T}\boldsymbol\xi\boldsymbol\xi^{\rm T}\nabla y(\mathbf{x})+2\left\{y(\mathbf{x})-t\right\}\boldsymbol\xi^{\rm T}\nabla y(\mathbf{x})+\left\{y(\mathbf{x})-t\right\}\boldsymbol\xi^{\rm T}\nabla\nabla y(\mathbf{x})\boldsymbol\xi+\mathcal{O}(\|\boldsymbol\xi\|^3)\right]p(t|\mathbf{x})p(\mathbf{x})p(\boldsymbol\xi)\\
&=\frac{1}{2}\int{\rm d}\mathbf{x}\int{\rm d}tp(t|\mathbf{x})p(\mathbf{x})\left\{y(\mathbf{x})-t\right\}^2\int{\rm d}\boldsymbol\xi p(\boldsymbol\xi)\\
&\ \ \ \ \ \ \ +\frac{1}{2}\int{\rm d}\mathbf{x}\int{\rm d}t p(t|\mathbf{x})p(\mathbf{x})\left\{\nabla y(\mathbf{x})\right\}^{\rm T}\left\{\int{\rm d}\boldsymbol\xi\boldsymbol\xi\boldsymbol\xi^{\rm T}p(\boldsymbol\xi)\right\}\nabla y(\mathbf{x})\\
&\ \ \ \ \ \ \ +\int{\rm d}\boldsymbol\xi\boldsymbol\xi^{\rm T}\int{\rm d}\mathbf{x}\int{\rm d}tp(t|\mathbf{x})p(\mathbf{x})\nabla y(\mathbf{x})\left\{y(\mathbf{x})-t\right\}\\
&\ \ \ \ \ \ \ +\frac{1}{2}\int{\rm d}\mathbf{x}\int{\rm d}tp(t|\mathbf{x})p(\mathbf{x})\int{\rm d}\boldsymbol\xi p(\boldsymbol\xi)\left\{y(\mathbf{x})-t\right\}\boldsymbol\xi^{\rm T}\nabla\nabla y(\mathbf{x})\boldsymbol\xi+\mathcal{O}(\|\boldsymbol\xi\|^3)\\
&=E+\frac{\lambda}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\|\nabla y(\mathbf{x})\|^2+\mathbb{E}\left[\boldsymbol\xi^{\rm T}\right]\int{\rm d}\mathbf{x}\int{\rm d}tp(t|\mathbf{x})p(\mathbf{x})\nabla y(\mathbf{x})\left\{y(\mathbf{x})-t\right\}+\frac{1}{2}\int{\rm d}\mathbf{x}\int{\rm d}tp(t|\mathbf{x})p(\mathbf{x})\int{\rm d}\boldsymbol\xi p(\boldsymbol\xi)\left\{y(\mathbf{x})-t\right\}\boldsymbol\xi^{\rm T}\nabla\nabla y(\mathbf{x})\boldsymbol\xi+\mathcal{O}(\|\boldsymbol\xi\|^3)\\
&=E+\frac{\lambda}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\|\nabla y(\mathbf{x})\|^2+\frac{1}{2}\int{\rm d}\mathbf{x}\int{\rm d}tp(t|\mathbf{x})p(\mathbf{x})\int{\rm d}\boldsymbol\xi p(\boldsymbol\xi)\left\{y(\mathbf{x})-t\right\}\boldsymbol\xi^{\rm T}\nabla\nabla y(\mathbf{x})\boldsymbol\xi+\mathcal{O}(\|\boldsymbol\xi\|^3)
\end{align*}
$$

上記の式変形において,$${\boldsymbol\xi\sim\mathcal{N}(\boldsymbol\xi|0, \lambda\mathbf{I})}$$であるため,

$$
\begin{align*}
\mathbb{E}\left[\boldsymbol\xi\right]&=\mathbf{0}\\
\int{\rm d}\boldsymbol\xi\boldsymbol\xi\boldsymbol\xi^{\rm T}p(\boldsymbol\xi)&=\mathbb{E}\left[\boldsymbol\xi\boldsymbol\xi^{\rm T}\right]-\mathbb{E}\left[\boldsymbol\xi\right]\mathbb{E}\left[\boldsymbol\xi^{\rm T}\right]\\
&=\lambda\mathbf{I}
\end{align*}
$$

が成立することを利用した。
$${y(\mathbf{x})=\mathbb{E}[t|\mathbf{x}]+\mathcal{O}(\|\boldsymbol\xi\|)}$$が成立すると仮定すると,

$$
\begin{align*}
\widetilde{E}&=E+\frac{\lambda}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\|\nabla y(\mathbf{x})\|^2+\mathcal{O}(\|\boldsymbol\xi\|^3)\\
&\simeq E+\frac{\lambda}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\|\nabla y(\mathbf{x})\|^2\\
\therefore \Omega&=\frac{1}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\|\nabla y(\mathbf{x})\|^2
\end{align*}
$$



Exercise (5.28)

$${j}$$番目のユニットに関する重み係数を共有するとき,$${w_{ji}}$$が$${i}$$によらず同じ値となるため,$${w_{j}}$$と表現できることを意味する。
このとき,

$$
\begin{align*}
\frac{\partial E_n}{\partial w_{kj}}&\rightarrow\frac{\partial E_n}{\partial w_{k}}\\
&=\frac{\partial E_n}{\partial a_{k}}\frac{\partial a_k}{\partial w_{k}}\\
&=\delta_k\sum_jz_j\\
\frac{\partial E_n}{\partial w_{ji}}&\rightarrow\frac{\partial E_n}{\partial w_{j}}\\
&=\frac{\partial E_n}{\partial a_{j}}\frac{\partial a_j}{\partial w_{j}}\\
&=\sum_k\frac{\partial E_n}{\partial a_{k}}\frac{\partial a_k}{\partial a_{j}}\sum_iz_i\\
&=h'(a_j)\sum_k w_k\delta_k\sum_iz_i
\end{align*}
$$



Exercise (5.29)

$$
\begin{align*}
\frac{\partial \Omega}{\partial w_i}&=-\frac{\partial }{\partial w_i}\left[\sum_{i'}\ln\left\{\sum_j\pi_j\mathcal{N}(w_{i'}|\mu_j,\sigma_j^2)\right\}\right]\\
&=-\frac{\partial }{\partial w_i}\left\{\sum_j\pi_j\mathcal{N}(w_i|\mu_j,\sigma_j^2)\right\}\frac{1}{\sum_k\pi_k\mathcal{N}(w_i|\mu_k,\sigma_k^2)}\\
&=\sum_j\frac{w_i-\mu_j}{\sigma_j^2}\left\{\frac{\pi_j\mathcal{N}(w_i|\mu_j,\sigma_j^2)}{\sum_k\pi_k\mathcal{N}(w_i|\mu_k,\sigma_k^2)}\right\}\\
&=\sum_j\gamma_j(w_i)\frac{w_i-\mu_j}{\sigma_j^2}\\
\therefore \frac{\partial \widetilde{E}}{\partial w_i}&=\frac{\partial E}{\partial w_i}+\lambda\sum_j\gamma_j(w_i)\frac{w_i-\mu_j}{\sigma_j^2}
\end{align*}
$$



Exercise (5.30)

$$
\begin{align*}
\frac{\partial \Omega}{\partial \mu_j}&=-\frac{\partial }{\partial \mu_j}\left[\sum_i\ln\left\{\sum_{j'}\pi_{j'}\mathcal{N}(w_i|\mu_{j'},\sigma_{j'}^2)\right\}\right]\\
&=-\sum_i\frac{\partial }{\partial \mu_j}\left\{\sum_{j'}\pi_{j'}\mathcal{N}(w_i|\mu_{j'},\sigma_{j'}^2)\right\}\frac{1}{\sum_k\pi_k\mathcal{N}(w_i|\mu_k,\sigma_k^2)}\\
&=\sum_i\sum_{j'}\delta_{jj'}\frac{\mu_{j'}-w_i}{\sigma_{j'}^2}\left\{\frac{\pi_{j'}\mathcal{N}(w_i|\mu_{j'},\sigma_{j'}^2)}{\sum_k\pi_k\mathcal{N}(w_i|\mu_k,\sigma_k^2)}\right\}\\
&=\sum_i\gamma_j(w_i)\frac{\mu_j-w_i}{\sigma_j^2}\\
\therefore \frac{\partial \widetilde{E}}{\partial \mu_j}&=\lambda\sum_i\gamma_j(w_i)\frac{\mu_j-w_i}{\sigma_j^2}
\end{align*}
$$

Exercise (5.31) - (5.41)

Exercise (5.31)

$$
\begin{align*}
\frac{\partial \Omega}{\partial \sigma_j}&=-\frac{\partial }{\partial \sigma_j}\left[\sum_i\ln\left\{\sum_{j'}\pi_{j'}\mathcal{N}(w_i|\mu_{j'},\sigma_{j'}^2)\right\}\right]\\
&=-\sum_i\frac{\partial }{\partial \sigma_j}\left\{\sum_{j'}\pi_{j'}\mathcal{N}(w_i|\mu_{j'},\sigma_{j'}^2)\right\}\frac{1}{\sum_k\pi_k\mathcal{N}(w_i|\mu_k,\sigma_k^2)}\\
&=\sum_i\sum_{j'}\delta_{jj'}\left\{\frac{1}{\sigma_{j'}}-\frac{(w_i-\mu_{j'})^2}{\sigma_{j'}^3}\right\}\left\{\frac{\pi_{j'}\mathcal{N}(w_i|\mu_{j'},\sigma_{j'}^2)}{\sum_k\pi_k\mathcal{N}(w_i|\mu_k,\sigma_k^2)}\right\}\\
&=\sum_i\gamma_j(w_i)\left\{\frac{1}{\sigma_j}-\frac{(w_i-\mu_j)^2}{\sigma_j^3}\right\}\\
\therefore \frac{\partial \widetilde{E}}{\partial \sigma_j}&=\lambda\sum_i\gamma_j(w_i)\left\{\frac{1}{\sigma_j}-\frac{(w_i-\mu_j)^2}{\sigma_j^3}\right\}
\end{align*}
$$



Exercise (5.32)

$$
\begin{align*}
\frac{\partial\pi_k}{\partial\eta_j}&=\frac{\partial}{\partial\eta_j}\left(\frac{{\rm e}^{\eta_k}}{\sum_{k'}{\rm e}^{\eta_{k'}}}\right)\\
&=\frac{\frac{\partial{\rm e}^{\eta_k}}{\partial\eta_j}\sum_{k'}{\rm e}^{\eta_{k'}}-{\rm e}^{\eta_k}\frac{\partial}{\partial\eta_j}\sum_{k'}{\rm e}^{\eta_{k'}}}{\left\{\sum_{k'}{\rm e}^{\eta_{k'}}\right\}^2}\\
&=\frac{\delta_{jk}{\rm e}^{\eta_k}\sum_{k'}{\rm e}^{\eta_{k'}}-{\rm e}^{\eta_k}{\rm e}^{\eta_j}}{\left\{\sum_{k'}{\rm e}^{\eta_{k'}}\right\}^2}\\
&=\delta_{jk}\pi_j-\pi_j\pi_k
\end{align*}
$$


$$
\begin{align*}
\frac{\partial \Omega}{\partial \eta_j}&=\sum_k\frac{\partial \Omega}{\partial \pi_k}\frac{\partial \pi_k}{\partial \eta_j}\\
&=\sum_k\left\{-\sum_i\frac{\mathcal{N}(w_i|\mu_k,\sigma_k^2)}{\sum_{k'}\pi_{k'}\mathcal{N}(w_i|\mu_{k'},\sigma_{k'}^2)}\right\}(\delta_{jk}\pi_j-\pi_j\pi_k)\\
&=\sum_i\frac{\pi_j\sum_k\pi_k\mathcal{N}(w_i|\mu_k,\sigma_k^2)}{\sum_{k'}\pi_{k'}\mathcal{N}(w_i|\mu_{k'},\sigma_{k'}^2)}-\sum_i\frac{\pi_j\mathcal{N}(w_i|\mu_j,\sigma_j^2)}{\sum_{k'}\pi_{k'}\mathcal{N}(w_i|\mu_{k'},\sigma_{k'}^2)}\\
&=\sum_i\left\{\pi_j-\gamma_j(w_i)\right\}\\
\therefore \frac{\partial \widetilde{E}}{\partial \eta_j}&=\lambda\sum_i\left\{\pi_j-\gamma_j(w_i)\right\}
\end{align*}
$$



Exercise (5.33)

$$
\begin{align*}
x_1&=-L_1\cos(\pi-\theta_1)+L_2\cos\left\{\theta_2-(\pi-\theta_1)\right\}\\
&=L_1\cos\theta_1-L_2\cos(\theta_1+\theta_2)\\
x_2&=L_1\sin(\pi-\theta_1)+L_2\sin\left\{\theta_2-(\pi-\theta_1)\right\}\\
&=L_1\sin\theta_1-L_2\sin(\theta_1+\theta_2)
\end{align*}
$$



Exercise (5.34)

$$
\begin{align*}
\frac{\partial E_n}{\partial a_{nk}^{\pi}}&=-\frac{\partial}{\partial a_{nk}^{\pi}}\left(\ln\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}\right)\\
&=-\frac{\sum_{l=1}^K\frac{\partial \pi_{nl}}{\partial a_{nk}^{\pi}}\mathcal{N}_{nl}}{\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}}\\
&=\frac{\sum_{l=1}^K(\pi_{nl}\pi_{nk}-\delta_{kl}\pi_{nl})\mathcal{N}_{nl}}{\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}}\\
&=\frac{\pi_{nk}\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}-\pi_{nk}\mathcal{N}_{nk}}{\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}}\\
&=\pi_{nk}-\gamma_{nk}
\end{align*}
$$



Exercise (5.35)

$$
\begin{align*}
\frac{\partial E_n}{\partial a_{nkl}^{\mu}}&=-\frac{\partial}{\partial a_{nkl}^{\mu}}\left(\ln\sum_{m=1}^K\pi_{nm}\mathcal{N}_{nm}\right)\\
&=-\frac{\sum_{m=1}^K\pi_{nm}\frac{\partial \mathcal{N}_{nm}}{\partial a_{nkl}^{\mu}}}{\sum_{m=1}^K\pi_{nm}\mathcal{N}_{nm}}\\
&=\frac{\sum_{m=1}^K\pi_{nm}\delta_{km}\frac{\mu_{nkl}-t_{nl}}{\sigma_{nk}^2}\mathcal{N}_{nm}}{\sum_{m=1}^K\pi_{nm}\mathcal{N}_{nm}}\\
&=\frac{\pi_{nk}\mathcal{N}_{nk}\frac{\mu_{nkl}-t_{nl}}{\sigma_{nk}^2}}{\sum_{m=1}^K\pi_{nm}\mathcal{N}_{nm}}\\
&=\gamma_{nk}\left(\frac{\mu_{nkl}-t_{nl}}{\sigma_{nk}^2}\right)
\end{align*}
$$



Exercise (5.36)

$$
\begin{align*}
\frac{\partial E_n}{\partial a_{nk}^{\sigma}}&=\frac{\partial E_n}{\partial \sigma_{nk}}\frac{\partial \sigma_{nk}}{\partial a_{nk}^{\sigma}}\\
&=\sigma_{nk}\frac{\partial E_n}{\partial \sigma_{nk}}\\
&=-\sigma_{nk}\frac{\partial}{\partial \sigma_{nk}}\left(\ln\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}\right)\\
&=-\sigma_{nk}\frac{\sum_{l=1}^K\pi_{nl}\frac{\partial \mathcal{N}_{nl}}{\partial \sigma_{nk}}}{\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}}\\
&=-\sigma_{nk}\frac{\sum_{l=1}^K\pi_{nl}\delta_{kl}\left(\frac{\|\mathbf{t}_n-\boldsymbol\mu_{nk}\|^2}{\sigma_{nk}^3}-\frac{L}{\sigma_{nk}}\right)\mathcal{N}_{nl}}{\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}}\\
&=-\frac{\pi_{nk}\mathcal{N}_{nk}\left(\frac{\|\mathbf{t}_n-\boldsymbol\mu_{nk}\|^2}{\sigma_{nk}^2}-L\right)}{\sum_{l=1}^K\pi_{nl}\mathcal{N}_{nl}}\\
&=\gamma_{nk}\left(L-\frac{\|\mathbf{t}_n-\boldsymbol\mu_{nk}\|^2}{\sigma_{nk}^2}\right)
\end{align*}
$$

ここで,$${L}$$は目標変数$${\mathbf{t}_n}$$の次元であり,$${\mathcal{N}_{nk}=(2\pi\sigma_{nk}^2)^{-L/2}\exp\left\{-\|\mathbf{t}_n-\boldsymbol\mu_{nk}\|^2/(2\sigma_{nk}^2)\right\}}$$を$${\sigma_{nk}}$$で微分した結果を用いた。



Exercise (5.37)

$$
\begin{align*}
s^2(\mathbf{x})&=\mathbb{E}\left[\left.\left\|\mathbf{t}-\mathbb{E}[\mathbf{t}|\mathbf{x}]\right\|^2\right|\mathbf{x}\right]\\
&=\int{\rm d}\mathbf{t}p(\mathbf{t}|\mathbf{x})\left\|\mathbf{t}-\mathbb{E}[\mathbf{t}|\mathbf{x}]\right\|^2\\
&=\sum_{k=1}^K\pi_k(\mathbf{x})\int{\rm d}\mathbf{t}\mathcal{N}\left(\mathbf{t}\left|\boldsymbol\mu_k(\mathbf{x}),\sigma_k^2(\mathbf{x})\right.\right)\left\|\mathbf{t}-\mathbb{E}[\mathbf{t}|\mathbf{x}]\right\|^2\\
&=\sum_{k=1}^K\pi_k(\mathbf{x})\int{\rm d}\mathbf{t}\mathcal{N}\left(\mathbf{t}\left|\boldsymbol\mu_k(\mathbf{x}),\sigma_k^2(\mathbf{x})\right.\right)\left\|\{\mathbf{t}-\boldsymbol\mu_k(\mathbf{x})\}+\left\{\boldsymbol\mu_k(\mathbf{x})-\mathbb{E}[\mathbf{t}|\mathbf{x}]\right\}\right\|^2\\
&=\sum_{k=1}^K\pi_k(\mathbf{x})\left[\int{\rm d}\mathbf{t}\mathcal{N}\left(\mathbf{t}\left|\boldsymbol\mu_k(\mathbf{x}),\sigma_k^2(\mathbf{x})\right.\right)\left\|\mathbf{t}-\boldsymbol\mu_k(\mathbf{x})\right\|^2+2\left\{\boldsymbol\mu_k(\mathbf{x})-\mathbb{E}[\mathbf{t}|\mathbf{x}]\right\}^{\rm T}\int{\rm d}\mathbf{t}\mathcal{N}\left(\mathbf{t}\left|\boldsymbol\mu_k(\mathbf{x}),\sigma_k^2(\mathbf{x})\right.\right)\left\{\mathbf{t}-\boldsymbol\mu_k(\mathbf{x})\right\}+\left\|\boldsymbol\mu_k(\mathbf{x})-\mathbb{E}[\mathbf{t}|\mathbf{x}]\right\|^2\int{\rm d}\mathbf{t}\mathcal{N}\left(\mathbf{t}\left|\boldsymbol\mu_k(\mathbf{x}),\sigma_k^2(\mathbf{x})\right.\right)\right]\\
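&=\sum_{k=1}^K\pi_k(\mathbf{x})\left\{L\sigma_k^2(\mathbf{x})+\left\|\boldsymbol\mu_k(\mathbf{x})-\mathbb{E}[\mathbf{t}|\mathbf{x}]\right\|^2\right\}\\
&=\sum_{k=1}^K\pi_k(\mathbf{x})\left\{L\sigma_k^2(\mathbf{x})+\left\|\boldsymbol\mu_k(\mathbf{x})-\sum_{l=1}^K\pi_l(\mathbf{x})\boldsymbol\mu_l(\mathbf{x})\right\|^2\right\}
\end{align*}
$$

ここで,$${L}$$は目標変数$${\mathbf{t}}$$の次元であり,等方的なガウス分布に対して$${\int{\rm d}\mathbf{t}\mathcal{N}\left(\mathbf{t}\left|\boldsymbol\mu_k,\sigma_k^2\mathbf{I}\right.\right)\left\|\mathbf{t}-\boldsymbol\mu_k\right\|^2=L\sigma_k^2}$$が成立することを用いた。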



Exercise (5.38)

$$
\begin{align*}
p(t|\mathbf{x},\mathcal{D},\alpha,\beta)&=\int{\rm d}\mathbf{w}p(t|\mathbf{x},\mathbf{w},\alpha,\beta)p(\mathbf{w}|\mathcal{D})\\
&\simeq \int{\rm d}\mathbf{w}p(t|\mathbf{x},\mathbf{w},\alpha,\beta)q(\mathbf{w}|\mathcal{D})\\
&=\int{\rm d}\mathbf{w}\mathcal{N}\left(t\left|y(\mathbf{x},\mathbf{w}_{\rm MAP})+\mathbf{g}^{\rm T}(\mathbf{w}-\mathbf{w}_{\rm MAP}),\beta^{-1}\right.\right)\mathcal{N}\left(\mathbf{w}\left|\mathbf{w}_{\rm MAP},\mathbf{A}^{-1}\right.\right)\\
&=\int{\rm d}\mathbf{w}\mathcal{N}\left(t\left|\mathbf{g}^{\rm T}\mathbf{w}+\left\{y(\mathbf{x},\mathbf{w}_{\rm MAP})-\mathbf{g}^{\rm T}\mathbf{w}_{\rm MAP}\right\},\beta^{-1}\right.\right)\mathcal{N}\left(\mathbf{w}\left|\mathbf{w}_{\rm MAP},\mathbf{A}^{-1}\right.\right)\\
&=\mathcal{N}\left(t\left|\mathbf{g}^{\rm T}\mathbf{w}_{\rm MAP}+y(\mathbf{x},\mathbf{w}_{\rm MAP})-\mathbf{g}^{\rm T}\mathbf{w}_{\rm MAP},\beta^{-1}+\mathbf{g}^{\rm T}\mathbf{A}^{-1}\mathbf{g}\right.\right)\\
&=\mathcal{N}\left(t\left|y(\mathbf{x},\mathbf{w}_{\rm MAP}),\beta^{-1}+\mathbf{g}^{\rm T}\mathbf{A}^{-1}\mathbf{g}\right.\right)
\end{align*}
$$



Exercise (5.39)

$$
\begin{align*}
p(\mathcal{D}|\alpha,\beta)&=\int{\rm d}\mathbf{w}p(\mathcal{D}|\mathbf{w},\beta)p(\mathbf{w}|\alpha)\\
&=\int{\rm d}\mathbf{w}\left\{\prod_{n=1}^N\mathcal{N}(t_n|y(\mathbf{x}_n,\mathbf{w}),\beta^{-1})\right\}\mathcal{N}(\mathbf{w}|\mathbf{0},\alpha^{-1}\mathbf{I})\\
&=\left(\frac{\beta}{2\pi}\right)^{N/2}\left(\frac{\alpha}{2\pi}\right)^{W/2}\int{\rm d}\mathbf{w}\exp\left[-\frac{\beta}{2}\sum_{n=1}^N\left\{y(\mathbf{x}_n,\mathbf{w})-t_n\right\}^2-\frac{\alpha}{2}\mathbf{w}^{\rm T}\mathbf{w}\right]\\
&\simeq\left(\frac{\beta}{2\pi}\right)^{N/2}\left(\frac{\alpha}{2\pi}\right)^{W/2}\exp\left[-\frac{\beta}{2}\sum_{n=1}^N\left\{y(\mathbf{x}_n,\mathbf{w}_{\rm MAP})-t_n\right\}^2-\frac{\alpha}{2}\mathbf{w}_{\rm MAP}^{\rm T}\mathbf{w}_{\rm MAP}\right]\frac{(2\pi)^{W/2}}{|\mathbf{A}|^{1/2}}\\
&=\left(\frac{\beta}{2\pi}\right)^{N/2}\alpha^{W/2}|\mathbf{A}|^{-1/2}\exp\left\{-E(\mathbf{w}_{\rm MAP})\right\}\\
\therefore \ln p(\mathcal{D}|\alpha,\beta)&\simeq -E(\mathbf{w}_{\rm MAP})-\frac{1}{2}\ln|\mathbf{A}|+\frac{W}{2}\ln\alpha+\frac{N}{2}\ln\beta-\frac{N}{2}\ln(2\pi)
\end{align*}
$$



Exercise (5.40)

$$
\begin{align*}
\ln p(\mathcal{D}|\mathbf{w})&=\sum_n\sum_kt_{nk}\ln y_{nk}
\end{align*}
$$



Exercise (5.41)

$$
\begin{align*}
p(\mathcal{D}|\alpha)&=\int{\rm d}\mathbf{w}p(\mathcal{D}|\mathbf{w})p(\mathbf{w}|\alpha)\\
&=\left(\frac{\alpha}{2\pi}\right)^{W/2}\int{\rm d}\mathbf{w}\exp\left[\sum_n\left\{t_n\ln y(\mathbf{x}_n,\mathbf{w})+(1-t_n)\ln(1-y(\mathbf{x}_n,\mathbf{w}))\right\}-\frac{\alpha}{2}\mathbf{w}^{\rm T}\mathbf{w}\right]\\
&=\left(\frac{\alpha}{2\pi}\right)^{W/2}\int{\rm d}\mathbf{w}\exp\left\{-E(\mathbf{w})\right\}\\
&\simeq \left(\frac{\alpha}{2\pi}\right)^{W/2}\exp\left\{-E(\mathbf{w}_{\rm MAP})\right\}\frac{(2\pi)^{W/2}}{|\mathbf{A}|^{1/2}}\\
&=\frac{\alpha^{W/2}}{|\mathbf{A}|^{1/2}}\exp\left\{-E(\mathbf{w}_{\rm MAP})\right\}\\
\therefore \ln p(\mathcal{D}|\alpha)&\simeq -E(\mathbf{w}_{\rm MAP})-\frac{1}{2}\ln|\mathbf{A}|+\frac{W}{2}\ln\alpha
\end{align*}
$$

参考文献

  1. Christopher Bishop, Pattern Recognition and Machine Learning


いいなと思ったら応援しよう!