PRML自習ノート - chapter 10 -
Exercise (10.1) - (10.10)
Exercise (10.1)
$$
\begin{align*}
\ln p(\mathbf{X})&=\ln p(\mathbf{X})\int{\rm d}\mathbf{Z}q(\mathbf{Z})\\
&=\int{\rm d}\mathbf{Z}q(\mathbf{Z})\ln p(\mathbf{X})\\
&=\int{\rm d}\mathbf{Z}q(\mathbf{Z})\ln \left\{\frac{p(\mathbf{X},\mathbf{Z})}{p(\mathbf{Z}|\mathbf{X})}\right\}\\
&=\int{\rm d}\mathbf{Z}q(\mathbf{Z})\ln \left\{\frac{p(\mathbf{X},\mathbf{Z})}{q(\mathbf{Z})}\frac{q(\mathbf{Z})}{p(\mathbf{Z}|\mathbf{X})}\right\}\\
&=\int{\rm d}\mathbf{Z}q(\mathbf{Z})\ln \left\{\frac{p(\mathbf{X},\mathbf{Z})}{q(\mathbf{Z})}\right\}-\int{\rm d}\mathbf{Z}q(\mathbf{Z})\ln \left\{\frac{p(\mathbf{Z}|\mathbf{X})}{q(\mathbf{Z})}\right\}\\
&=\mathcal{L}(q)+{\rm KL}(q||p)
\end{align*}
$$
Exercise (10.2)
式(10.13),(10.15)に$${\mathbb{E}[z_i]=m_i}$$を代入すると,
$$
\begin{align*}
m_1&=\mu_1-\Lambda_{11}^{-1}\Lambda_{12}(m_2-\mu_2)\\
m_2&=\mu_2-\Lambda_{22}^{-1}\Lambda_{21}(m_1-\mu_1)\\
\begin{pmatrix}m_1\\ m_2\end{pmatrix}&=\begin{pmatrix}\mu_1\\ \mu_2\end{pmatrix}-\begin{pmatrix}0&\Lambda_{11}^{-1}\Lambda_{12}\\ \Lambda_{22}^{-1}\Lambda_{21}&0\end{pmatrix}\left\{\begin{pmatrix}m_1\\ m_2\end{pmatrix}-\begin{pmatrix}\mu_1\\ \mu_2\end{pmatrix}\right\}\\\\
\therefore& \begin{pmatrix}1&\Lambda_{11}^{-1}\Lambda_{12}\\ \Lambda_{22}^{-1}\Lambda_{21}&1\end{pmatrix}\left\{\begin{pmatrix}m_1\\ m_2\end{pmatrix}-\begin{pmatrix}\mu_1\\ \mu_2\end{pmatrix}\right\}=\mathbf{0}\\
\end{align*}
$$
$$
\begin{align*}
\begin{vmatrix}1&\Lambda_{11}^{-1}\Lambda_{12}\\ \Lambda_{22}^{-1}\Lambda_{21}&1\end{vmatrix}&=1-\frac{\Lambda_{12}\Lambda_{21}}{\Lambda_{11}\Lambda_{22}}\\
&=\frac{\Lambda_{11}\Lambda_{22}-\Lambda_{12}\Lambda_{21}}{\Lambda_{11}\Lambda_{22}}\\
&=\frac{|\boldsymbol\Lambda|}{\Lambda_{11}\Lambda_{22}}\\
&\neq 0\\
\therefore \begin{pmatrix}m_1\\ m_2\end{pmatrix}&=\begin{pmatrix}\mu_1\\ \mu_2\end{pmatrix}
\end{align*}
$$
Exercise (10.3)
変分対象の$${q(\mathbf{Z})}$$を変関数にもつ汎関数$${\mathcal{L}[q]}$$を未定乗数$${\lambda}$$を用いて
$$
\begin{align*}
\mathcal{L}[q]&:=-\int{\rm d}\mathbf{Z}p(\mathbf{Z})\ln q(\mathbf{Z})+\lambda\left\{\int{\rm d}\mathbf{Z}q(\mathbf{Z})-1\right\}\\
&=-\sum_i\int{\rm d}\mathbf{Z}p(\mathbf{Z})\ln q_i(\mathbf{Z}_i)+\lambda\left\{\prod_i\int{\rm d}\mathbf{Z}_iq_i(\mathbf{Z}_i)-1\right\}
\end{align*}
$$
と定義する。
このとき,$${q_j(\mathbf{Z}_j)}$$に関する変分を考えると,
$$
\begin{align*}
\mathcal{L}[q+\delta q_j]-\mathcal{L}[q]&=-\int{\rm d}\mathbf{Z}p(\mathbf{Z})\ln\left\{1+\frac{\delta q_j(\mathbf{Z}_j)}{q_j(\mathbf{Z}_j)}\right\}+\lambda\int{\rm d}\mathbf{Z}\delta q_j(\mathbf{Z}_j)\prod_{i\neq j}q_i(\mathbf{Z}_i)\\
&=-\int{\rm d}\mathbf{Z}p(\mathbf{Z})\left\{\frac{\delta q_j(\mathbf{Z}_j)}{q_j(\mathbf{Z}_j)}+\mathcal{O}\left((\delta q_j)^2\right)\right\}+\lambda\int{\rm d}\mathbf{Z}\delta q_j(\mathbf{Z}_j)\prod_{i\neq j}q_i(\mathbf{Z}_i)\\
&=-\int{\rm d}\mathbf{Z}_j\delta q_j(\mathbf{Z}_j)\left\{\frac{\prod_{i\neq j}\int{\rm d}\mathbf{Z}_ip(\mathbf{Z})}{q_j(\mathbf{Z}_j)}-\lambda\prod_{i\neq j}\int{\rm d}\mathbf{Z}_iq_i(\mathbf{Z}_i)\right\}+\mathcal{O}\left((\delta q_j)^2\right)\\
\therefore q_j^*(\mathbf{Z}_j)&=\frac{\prod_{i\neq j}\int{\rm d}\mathbf{Z}_ip(\mathbf{Z})}{\lambda\prod_{i\neq j}\int{\rm d}\mathbf{Z}_iq_i(\mathbf{Z}_i)}
\end{align*}
$$
$${q_j(\mathbf{Z}_j)}$$の規格化条件より,
$$
\begin{align*}
\int{\rm d}\mathbf{Z}_jq_j^*(\mathbf{Z}_j)&=\frac{\int{\rm d}\mathbf{Z}p(\mathbf{Z})}{\lambda\prod_{i\neq j}\int{\rm d}\mathbf{Z}_iq_i(\mathbf{Z}_i)}\\
&=\frac{1}{\lambda\prod_{i\neq j}\int{\rm d}\mathbf{Z}_iq_i(\mathbf{Z}_i)}\\
&=1\\
\therefore q_j^*(\mathbf{Z}_j)&=\prod_{i\neq j}\int{\rm d}\mathbf{Z}_ip(\mathbf{Z})\\
&=p(\mathbf{Z}_j)
\end{align*}
$$
Exercise (10.4)
$$
\begin{align*}
{\rm KL}(p||q)&=-\int{\rm d}\mathbf{x}p(\mathbf{x})\ln\left\{\frac{q(\mathbf{x})}{p(\mathbf{x})}\right\}\\
&=-\int{\rm d}\mathbf{x}p(\mathbf{x})\ln q(\mathbf{x})+\int{\rm d}\mathbf{x}p(\mathbf{x})\ln p(\mathbf{x})\\
&=-\int{\rm d}\mathbf{x}p(\mathbf{x})\ln \mathcal{N}(\mathbf{x}|\boldsymbol\mu,\boldsymbol\Sigma)+\int{\rm d}\mathbf{x}p(\mathbf{x})\ln p(\mathbf{x})\\
&=\frac{1}{2}\ln|\boldsymbol\Sigma|+\frac{1}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\mathbf{x}^{\rm T}\boldsymbol\Sigma^{-1}\mathbf{x}-\boldsymbol\mu^{\rm T}\boldsymbol\Sigma^{-1}\int{\rm d}\mathbf{x}p(\mathbf{x})\mathbf{x}+\frac{1}{2}\boldsymbol\mu^{\rm T}\boldsymbol\Sigma^{-1}\boldsymbol\mu+\frac{D}{2}\ln(2\pi)+\int{\rm d}\mathbf{x}p(\mathbf{x})\ln p(\mathbf{x})
\end{align*}
$$
より,
$$
\begin{align*}
\frac{\partial}{\partial\boldsymbol\mu}{\rm KL}(p||q)&=-\boldsymbol\Sigma^{-1}\int{\rm d}\mathbf{x}p(\mathbf{x})\mathbf{x}+\boldsymbol\Sigma^{-1}\boldsymbol\mu\\
&=-\boldsymbol\Sigma^{-1}\left\{\int{\rm d}\mathbf{x}p(\mathbf{x})\mathbf{x}-\boldsymbol\mu\right\}\\
&=-\boldsymbol\Sigma^{-1}\left\{\mathbb{E}[\mathbf{x}]-\boldsymbol\mu\right\}\\
&=\mathbf{0}\\
\therefore \boldsymbol\mu^*&=\mathbb{E}[\mathbf{x}]
\end{align*}
$$
$${{\rm KL}(p||q)}$$に$${\boldsymbol\mu=\boldsymbol\mu^*=\mathbb{E}[\mathbf{x}]}$$を代入して$${\boldsymbol\Sigma}$$で微分すると,
$$
\begin{align*}
\frac{\partial}{\partial\boldsymbol\Sigma}{\rm KL}(p||q)&=\frac{1}{2}\frac{\partial}{\partial\boldsymbol\Sigma}\ln|\boldsymbol\Sigma|+\frac{1}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\mathbf{x}^{\rm T}\frac{\partial\boldsymbol\Sigma^{-1}}{\partial\boldsymbol\Sigma}\mathbf{x}-\frac{1}{2}\mathbb{E}[\mathbf{x}^{\rm T}]\frac{\partial\boldsymbol\Sigma^{-1}}{\partial\boldsymbol\Sigma}\mathbb{E}[\mathbf{x}]\\
&=\frac{1}{2}\boldsymbol\Sigma^{-1}-\frac{1}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\mathbf{x}^{\rm T}\boldsymbol\Sigma^{-1}\frac{\partial\boldsymbol\Sigma}{\partial\boldsymbol\Sigma}\boldsymbol\Sigma^{-1}\mathbf{x}+\frac{1}{2}\mathbb{E}[\mathbf{x}^{\rm T}]\boldsymbol\Sigma^{-1}\frac{\partial\boldsymbol\Sigma}{\partial\boldsymbol\Sigma}\boldsymbol\Sigma^{-1}\mathbb{E}[\mathbf{x}]\\
&=\frac{1}{2}\boldsymbol\Sigma^{-1}-\frac{1}{2}\int{\rm d}\mathbf{x}p(\mathbf{x})\boldsymbol\Sigma^{-1}\mathbf{x}\left(\boldsymbol\Sigma^{-1}\mathbf{x}\right)^{\rm T}+\frac{1}{2}\boldsymbol\Sigma^{-1}\mathbb{E}[\mathbf{x}]\left(\boldsymbol\Sigma^{-1}\mathbb{E}[\mathbf{x}]\right)^{\rm T}\\
&=\frac{1}{2}\boldsymbol\Sigma^{-1}\left[\mathbf{I}-\int{\rm d}\mathbf{x}p(\mathbf{x})\mathbf{x}\mathbf{x}^{\rm T}\boldsymbol\Sigma^{-1}+\mathbb{E}[\mathbf{x}]\mathbb{E}[\mathbf{x}^{\rm T}]\boldsymbol\Sigma^{-1}\right]\\
&=\frac{1}{2}\boldsymbol\Sigma^{-1}\left[\boldsymbol\Sigma-\int{\rm d}\mathbf{x}p(\mathbf{x})\mathbf{x}\mathbf{x}^{\rm T}+\mathbb{E}[\mathbf{x}]\mathbb{E}[\mathbf{x}^{\rm T}]\right]\boldsymbol\Sigma^{-1}\\
&=\frac{1}{2}\boldsymbol\Sigma^{-1}\left(\boldsymbol\Sigma-\mathbb{E}\left[\mathbf{x}\mathbf{x}^{\rm T}\right]+\mathbb{E}[\mathbf{x}]\mathbb{E}[\mathbf{x}^{\rm T}]\right)\boldsymbol\Sigma^{-1}\\
&=\frac{1}{2}\boldsymbol\Sigma^{-1}\left(\boldsymbol\Sigma-{\rm cov}[\mathbf{x}]\right)\boldsymbol\Sigma^{-1}\\
&=\mathbf{0}\\
\therefore \boldsymbol\Sigma^*&={\rm cov}[\mathbf{x}]
\end{align*}
$$
Exercise (10.5)
$${q_{\theta}(\boldsymbol\theta)\simeq\delta(\boldsymbol\theta-\boldsymbol\theta_0)}$$で与えられるとき,$${\mathbf{z}}$$に関しては
$$
\begin{align*}
\mathbb{E}_{\boldsymbol\theta}\left[\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta)\right]&=\int{\rm d}\boldsymbol\theta q_{\theta}(\boldsymbol\theta)\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta)\\
&=\int{\rm d}\boldsymbol\theta \delta(\boldsymbol\theta-\boldsymbol\theta_0)\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta)\\
&=\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta_0)\\
q_z^*(\mathbf{z})&=\frac{\exp\left(\mathbb{E}_{\boldsymbol\theta}\left[\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta)\right]\right)}{\int{\rm d}\mathbf{z}\exp\left(\mathbb{E}_{\boldsymbol\theta}\left[\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta)\right]\right)}\\
&=\frac{p(\mathbf{X},\mathbf{z},\boldsymbol\theta_0)}{\int{\rm d}\mathbf{z}p(\mathbf{X},\mathbf{z},\boldsymbol\theta_0)}\\
&=\frac{p(\mathbf{X},\mathbf{z},\boldsymbol\theta_0)}{p(\mathbf{X},\boldsymbol\theta_0)}\\
&=p(\mathbf{z}|\mathbf{X},\boldsymbol\theta_0)
\end{align*}
$$
となり,これはEMアルゴリズムにおけるE step($${q(\mathbf{Z})=p(\mathbf{Z}|\mathbf{X},\boldsymbol\theta^{\rm old})}$$)と等価である。
一方,$${\boldsymbol\theta}$$に関しては,$${q_{\theta}(\boldsymbol\theta)\simeq\delta(\boldsymbol\theta-\boldsymbol\theta_0)}$$と近似することを考慮すると,$${\mathbb{E}_{\mathbf{z}}\left[\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta)\right]}$$を最大化する$${\boldsymbol\theta}$$を探索することと等価である。
$$
\begin{align*}
\mathbb{E}_{\mathbf{z}}\left[\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta)\right]&=\int{\rm d}\mathbf{z} q_{z}(\mathbf{z})\ln p(\mathbf{X},\mathbf{z},\boldsymbol\theta)\\
&=\int{\rm d}\mathbf{z} q_{z}(\mathbf{z})\ln p(\mathbf{X},\mathbf{z}|\boldsymbol\theta)+\ln p(\boldsymbol\theta)\\
&=\int{\rm d}\mathbf{z} p(\mathbf{z}|\mathbf{X},\boldsymbol\theta_0)\ln p(\mathbf{X},\mathbf{z}|\boldsymbol\theta)+\ln p(\boldsymbol\theta)\\
&=\mathcal{Q}(\boldsymbol\theta,\boldsymbol\theta_0)+\ln p(\boldsymbol\theta)\\
\therefore \boldsymbol\theta^{\rm new}&=\argmax_{\boldsymbol\theta}\left\{\mathcal{Q}(\boldsymbol\theta,\boldsymbol\theta_0)+\ln p(\boldsymbol\theta)\right\}\\
\boldsymbol\theta^{\rm new}&\rightarrow \boldsymbol\theta_0
\end{align*}
$$
これはEMアルゴリズムにおけるM stepと等価である。
Exercise (10.6)
$$
\begin{align*}
{\rm D}_{\alpha}(p||q)&=\frac{4}{1-\alpha^2}\left[1-\int{\rm d}x\left\{p(x)\right\}^{\frac{1+\alpha}{2}}\left\{q(x)\right\}^{\frac{1-\alpha}{2}}\right]\\
&=\frac{4}{1-\alpha^2}\left[1-\int{\rm d}xp(x)\left\{\frac{q(x)}{p(x)}\right\}^{\frac{1-\alpha}{2}}\right]\\
&=\frac{4}{1-\alpha^2}\left[1-\int{\rm d}xp(x)\left\{1+\frac{1-\alpha}{2}\ln\left(\frac{q(x)}{p(x)}\right)+\mathcal{O}\left(\left(\frac{1-\alpha}{2}\right)^2\right)\right\}\right]\\
&=\frac{2}{1+\alpha}\left[-\int{\rm d}xp(x)\ln\left(\frac{q(x)}{p(x)}\right)+\mathcal{O}\left(\frac{1-\alpha}{2}\right)\right]
\end{align*}
$$
より,
$$
\begin{align*}
\lim_{\alpha\rightarrow 1}{\rm D}_{\alpha}(p||q)&=-\int{\rm d}xp(x)\ln\left(\frac{q(x)}{p(x)}\right)\\
&={\rm KL}(p||q)
\end{align*}
$$
$$
\begin{align*}
{\rm D}_{\alpha}(p||q)&=\frac{4}{1-\alpha^2}\left[1-\int{\rm d}x\left\{p(x)\right\}^{\frac{1+\alpha}{2}}\left\{q(x)\right\}^{\frac{1-\alpha}{2}}\right]\\
&=\frac{4}{1-\alpha^2}\left[1-\int{\rm d}xq(x)\left\{\frac{p(x)}{q(x)}\right\}^{\frac{1+\alpha}{2}}\right]\\
&=\frac{4}{1-\alpha^2}\left[1-\int{\rm d}xq(x)\left\{1+\frac{1+\alpha}{2}\ln\left(\frac{p(x)}{q(x)}\right)+\mathcal{O}\left(\left(\frac{1+\alpha}{2}\right)^2\right)\right\}\right]\\
&=\frac{2}{1-\alpha}\left[-\int{\rm d}xq(x)\ln\left(\frac{p(x)}{q(x)}\right)+\mathcal{O}\left(\frac{1+\alpha}{2}\right)\right]
\end{align*}
$$
より,
$$
\begin{align*}
\lim_{\alpha\rightarrow -1}{\rm D}_{\alpha}(p||q)&=-\int{\rm d}xq(x)\ln\left(\frac{p(x)}{q(x)}\right)\\
&={\rm KL}(q||p)
\end{align*}
$$
Exercise (10.7)
$$
\begin{align*}
\ln q_{\mu}^*(\mu)&=-\frac{\mathbb{E}[\tau]}{2}\left\{\lambda_0(\mu-\mu_0)^2+\sum_{n=1}^N(x_n-\mu)^2\right\}+{\rm const}\\
&=-\frac{\mathbb{E}[\tau]}{2}\left\{(\lambda_0+N)\mu^2-2(\lambda_0\mu_0+N\bar{x})\mu\right\}+{\rm const}\\
\therefore \lambda_N&=(\lambda_0+N)\mathbb{E}[\tau]\\
\mu_N&=\lambda_N^{-1}(\lambda_0\mu_0+N\bar{x})\mathbb{E}[\tau]\\
&=\frac{\lambda_0\mu_0+N\bar{x}}{\lambda_0+N}
\end{align*}
$$
$$
\begin{align*}
\ln q_{\tau}^*(\tau)&=(a_0-1)\ln\tau-b_0\tau+\frac{N}{2}\ln\tau-\frac{\tau}{2}\mathbb{E}_{\mu}\left[\sum_{n=1}^N(x_n-\mu)^2+\lambda_0(\mu-\mu_0)^2\right]+{\rm const}\\
&=\left\{\left(a_0+\frac{N}{2}\right)-1\right\}\ln\tau-\left\{b_0+\frac{1}{2}\mathbb{E}_{\mu}\left[\sum_{n=1}^N(x_n-\mu)^2+\lambda_0(\mu-\mu_0)^2\right]\right\}\tau+{\rm const}\\
\therefore a_N&=a_0+\frac{N}{2}\\
b_N&=b_0+\frac{1}{2}\mathbb{E}_{\mu}\left[\sum_{n=1}^N(x_n-\mu)^2+\lambda_0(\mu-\mu_0)^2\right]
\end{align*}
$$
Exercise (10.8)
$$
\begin{align*}
\mathbb{E}[\tau]&=\frac{a_N}{b_N}\\
&=\frac{a_0+\frac{N}{2}}{b_0+\frac{1}{2}\mathbb{E}_{\mu}\left[\sum_{n=1}^N(x_n-\mu)^2+\lambda_0(\mu-\mu_0)^2\right]}\\
&=\frac{\frac{a_0}{N}+\frac{1}{2}}{\frac{b_0}{N}+\frac{1}{2}\mathbb{E}_{\mu}\left[\frac{1}{N}\sum_{n=1}^N(x_n-\mu)^2+\frac{\lambda_0}{N}(\mu-\mu_0)^2\right]}\\
&=\frac{\frac{a_0}{N}+\frac{1}{2}}{\frac{b_0}{N}+\frac{1}{2}\mathbb{E}_{\mu}\left[\sigma_{\rm ML}^2(N)+\frac{\lambda_0}{N}(\mu-\mu_0)^2\right]}\\
\end{align*}
$$
より,
$$
\begin{align*}
\lim_{N\rightarrow\infty}\mathbb{E}[\tau]&=\frac{1}{\sigma_{\rm ML}^2(\infty)}\\
\end{align*}
$$
$$
\begin{align*}
{\rm var}[\tau]&=\frac{a_N}{b_N^2}\\
&=\frac{a_0+\frac{N}{2}}{\left(b_0+\frac{1}{2}\mathbb{E}_{\mu}\left[\sum_{n=1}^N(x_n-\mu)^2+\lambda_0(\mu-\mu_0)^2\right]\right)^2}\\
&=\frac{\frac{a_0}{N}+\frac{1}{2}}{N\left(\frac{b_0}{N}+\frac{1}{2}\mathbb{E}_{\mu}\left[\frac{1}{N}\sum_{n=1}^N(x_n-\mu)^2+\frac{\lambda_0}{N}(\mu-\mu_0)^2\right]\right)^2}\\
&=\frac{\frac{a_0}{N}+\frac{1}{2}}{N\left(\frac{b_0}{N}+\frac{1}{2}\mathbb{E}_{\mu}\left[\sigma_{\rm ML}^2(N)+\frac{\lambda_0}{N}(\mu-\mu_0)^2\right]\right)^2}\\
\end{align*}
$$
より,
$$
\begin{align*}
\lim_{N\rightarrow\infty}{\rm var}[\tau]&=0\\
\end{align*}
$$
Exercise (10.9)
$${\mu_0=a_0=b_0=\lambda_0=0}$$のとき,
$$
\begin{align*}
\frac{1}{\mathbb{E}[\tau]}&=\frac{b_N}{a_N}\\
&=\frac{\frac{1}{2}\mathbb{E}_{\mu}\left[\sum_{n=1}^N(x_n-\mu)^2\right]}{\frac{N}{2}}\\
&=\mathbb{E}_{\mu}\left[\frac{1}{N}\sum_{n=1}^N(x_n-\mu)^2\right]\\
&=\overline{x^2}-2\bar{x}\mathbb{E}_{\mu}[\mu]+\mathbb{E}_{\mu}\left[\mu^2\right]\\
&=\overline{x^2}-2\bar{x}\mu_N+\mu_N^2+\frac{1}{\lambda_N}\\
&=\overline{x^2}-2\bar{x}^2+\bar{x}^2+\frac{1}{N\mathbb{E}[\tau]}\\
&=\overline{x^2}-\bar{x}^2+\frac{1}{N\mathbb{E}[\tau]}\\
\frac{N-1}{N\mathbb{E}[\tau]}&=\overline{x^2}-\bar{x}^2\\
\therefore \frac{1}{\mathbb{E}[\tau]}&=\frac{N}{N-1}\left(\overline{x^2}-\bar{x}^2\right)\\
&=\frac{1}{N-1}\sum_{n=1}^N\left(x_n-\bar{x}\right)^2
\end{align*}
$$
Exercise (10.10)
$$
\begin{align*}
\ln p(\mathbf{X})&=\ln p(\mathbf{X})\sum_m\sum_{\mathbf{Z}}q(m,\mathbf{Z})\\
&=\ln p(\mathbf{X})\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\\
&=\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\ln p(\mathbf{X})\\
&=\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{p(\mathbf{Z},m|\mathbf{X})}\right\}\\
&=\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)q(m)}\frac{q(\mathbf{Z}|m)q(m)}{p(\mathbf{Z},m|\mathbf{X})}\right\}\\
&=\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)q(m)}\right\}-\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\ln\left\{ \frac{p(\mathbf{Z},m|\mathbf{X})}{q(\mathbf{Z}|m)q(m)}\right\}\\
&=\mathcal{L}-\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\ln\left\{ \frac{p(\mathbf{Z},m|\mathbf{X})}{q(\mathbf{Z}|m)q(m)}\right\}
\end{align*}
$$
Exercise (10.11) - (10.20)
Exercise (10.11)
$${q(m)}$$を変関数に持つ汎関数$${L'[q]}$$を
$$
\begin{align*}
L'[q]&:=\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)q(m)}\right\}+\lambda\left(\sum_mq(m)-1\right)
\end{align*}
$$
と定義する。
$${q(m)}$$に関する変分を考えると,
$$
\begin{align*}
L'[q+\delta q]-L'[q]&=\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)(q(m)+\delta q(m))\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)(q(m)+\delta q(m))}\right\}+\lambda\left(\sum_m(q(m)+\delta q(m))-1\right)-\sum_m\sum_{\mathbf{Z}}q(\mathbf{Z}|m)q(m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)q(m)}\right\}-\lambda\left(\sum_mq(m)-1\right)\\
&=\sum_m\delta q(m)\sum_{\mathbf{Z}}q(\mathbf{Z}|m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)(q(m)+\delta q(m))}\right\}+
\sum_mq(m)\ln\left\{ \frac{q(m)}{(q(m)+\delta q(m))}\right\}+\sum_m\lambda\delta q(m)\\
&=\sum_m\delta q(m)\sum_{\mathbf{Z}}q(\mathbf{Z}|m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)q(m)}\right\}+(\lambda-1)\sum_m\delta q(m)+\sum_m\mathcal{O}\left((\delta q(m))^2\right)\\
&=\sum_m\left(\sum_{\mathbf{Z}}q(\mathbf{Z}|m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)q(m)}\right\}+(\lambda-1)\right)\delta q(m)+\sum_m\mathcal{O}\left((\delta q(m))^2\right)\\
\therefore q^*(m)&=\exp\left(\sum_{\mathbf{Z}}q(\mathbf{Z}|m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)}\right\}+(\lambda-1)\right)\\
&\propto\exp\left(\sum_{\mathbf{Z}}q(\mathbf{Z}|m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X},m)}{q(\mathbf{Z}|m)}\right\}\right)\\
&\propto\exp\left(\sum_{\mathbf{Z}}q(\mathbf{Z}|m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X}|m)p(m)}{q(\mathbf{Z}|m)}\right\}\right)\\
&\propto p(m)\exp\left(\sum_{\mathbf{Z}}q(\mathbf{Z}|m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X}|m)}{q(\mathbf{Z}|m)}\right\}\right)\\
&\propto p(m)\exp(\mathcal{L}_m)\ \ \ \ \left(\mathcal{L}_m:=\sum_{\mathbf{Z}}q(\mathbf{Z}|m)\ln\left\{ \frac{p(\mathbf{Z},\mathbf{X}|m)}{q(\mathbf{Z}|m)}\right\}\right)
\end{align*}
$$
(※)参考文献の式(10.35)の$${\mathcal{L}_m}$$は誤植であることに注意(解答例でその点が言及されている)
Exercise (10.12)
$${p(\mathbf{X},\mathbf{Z},\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)=p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)p(\mathbf{Z}|\boldsymbol\pi)p(\boldsymbol\pi)p(\boldsymbol\mu|\boldsymbol\Lambda)p(\boldsymbol\Lambda)}$$のとき,
$$
\begin{align*}
\ln q^*(\mathbf{Z})&=\mathbb{E}_{\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda}\left[\ln p(\mathbf{X},\mathbf{Z},\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)\right]+{\rm const}\\
&=\mathbb{E}_{\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda}\left[\ln\left\{ p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)p(\mathbf{Z}|\boldsymbol\pi)p(\boldsymbol\pi)p(\boldsymbol\mu|\boldsymbol\Lambda)p(\boldsymbol\Lambda)\right\}\right]+{\rm const}\\
&=\mathbb{E}_{\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda}\left[\ln\left\{ p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)p(\mathbf{Z}|\boldsymbol\pi)\right\}\right]+{\rm const}\\
&=\mathbb{E}_{\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda}\left[\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)\right]+\mathbb{E}_{\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda}\left[\ln p(\mathbf{Z}|\boldsymbol\pi)\right]+{\rm const}\\
&=\mathbb{E}_{\boldsymbol\mu,\boldsymbol\Lambda}\left[\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)\right]+\mathbb{E}_{\boldsymbol\pi}\left[\ln p(\mathbf{Z}|\boldsymbol\pi)\right]+{\rm const}\\
&=\sum_{n=1}^N\sum_{k=1}^Kz_{nk}\left(\mathbb{E}_{\boldsymbol\pi}[\ln \pi_k]+\frac{1}{2}\mathbb{E}_{\boldsymbol\Lambda}\left[\ln|\boldsymbol\Lambda_k|\right]-\frac{1}{2}\mathbb{E}_{\boldsymbol\mu,\boldsymbol\Lambda}\left[(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right]\right)+{\rm const}\\
&=\sum_{n=1}^N\sum_{k=1}^Kz_{nk}\ln\rho_{nk}+{\rm const}\\
&=\ln\left(\prod_{n=1}^N\prod_{k=1}^K\rho_{nk}^{z_{nk}}\right)+{\rm const}\\
\therefore q^*(\mathbf{Z})&\propto \prod_{n=1}^N\prod_{k=1}^K\rho_{nk}^{z_{nk}}
\end{align*}
$$
が得られる。ここで,
$${\ln\rho_{nk}:=\mathbb{E}_{\boldsymbol\pi}[\ln \pi_k]+\frac{1}{2}\mathbb{E}_{\boldsymbol\Lambda}\left[\ln|\boldsymbol\Lambda_k|\right]-\frac{1}{2}\mathbb{E}_{\boldsymbol\mu,\boldsymbol\Lambda}\left[(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right]}$$
と定義した。
$${q^*(\mathbf{Z})}$$の規格化因子を計算すると,
$$
\begin{align*}
\sum_{\mathbf{Z}}\prod_{n=1}^N\prod_{k=1}^K\rho_{nk}^{z_{nk}}&=\prod_{n=1}^N\left(\sum_{\mathbf{z}_n}\prod_{k=1}^K\rho_{nk}^{z_{nk}}\right)\\
&=\prod_{n=1}^N\left(\sum_{j=1}^K\rho_{nj}\right)
\end{align*}
$$
となるため,
$$
\begin{align*}
q^*(\mathbf{Z})&=\frac{\prod_{n=1}^N\prod_{k=1}^K\rho_{nk}^{z_{nk}}}{\prod_{n=1}^N\left(\sum_{j=1}^K\rho_{nj}\right)}\\
&=\prod_{n=1}^N\left(\frac{\prod_{k=1}^K\rho_{nk}^{z_{nk}}}{\left(\sum_{j=1}^K\rho_{nj}\right)}\right)\\
&=\prod_{n=1}^N\prod_{k=1}^K\left(\frac{\rho_{nk}}{\sum_{j=1}^K\rho_{nj}}\right)^{z_{nk}}\\
&=\prod_{n=1}^N\prod_{k=1}^Kr_{nk}^{z_{nk}}\\
\end{align*}
$$
となる。
Exercise (10.13)
$$
\begin{align*}
\ln q^*(\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)&=\mathbb{E}_{\mathbf{Z}}\left[\ln p(\mathbf{X},\mathbf{Z},\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)\right]+{\rm const}\\
&=\mathbb{E}_{\mathbf{Z}}\left[\ln\left\{ p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)p(\mathbf{Z}|\boldsymbol\pi)p(\boldsymbol\pi)p(\boldsymbol\mu,\boldsymbol\Lambda)\right\}\right]+{\rm const}\\
&=\mathbb{E}_{\mathbf{Z}}\left[\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)\right]+\mathbb{E}_{\mathbf{Z}}\left[\ln p(\mathbf{Z}|\boldsymbol\pi)\right]+\ln p(\boldsymbol\pi)+\ln p(\boldsymbol\mu,\boldsymbol\Lambda)+{\rm const}\\
&=\sum_{n=1}^N\sum_{k=1}^K\mathbb{E}_{\mathbf{Z}}[z_{nk}]\ln\mathcal{N}\left(\mathbf{x}_n\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)+\mathbb{E}_{\mathbf{Z}}\left[\ln p(\mathbf{Z}|\boldsymbol\pi)\right]+\ln p(\boldsymbol\pi)+\sum_{k=1}^K\ln p(\boldsymbol\mu_k,\boldsymbol\Lambda_k)+{\rm const}
\end{align*}
$$
$${q^*(\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)=q^*(\boldsymbol\pi)\prod_{k=1}^Kq^*(\boldsymbol\mu_k,\boldsymbol\Lambda_k)}$$を代入すると,
$$
\begin{align*}
\ln q^*(\boldsymbol\pi)+\sum_{k=1}^K\ln q^*(\boldsymbol\mu_k,\boldsymbol\Lambda_k)&=\sum_{n=1}^N\sum_{k=1}^K\mathbb{E}_{\mathbf{Z}}[z_{nk}]\ln\mathcal{N}\left(\mathbf{x}_n\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)+\mathbb{E}_{\mathbf{Z}}\left[\ln p(\mathbf{Z}|\boldsymbol\pi)\right]+\ln p(\boldsymbol\pi)+\sum_{k=1}^K\ln p(\boldsymbol\mu_k,\boldsymbol\Lambda_k)+{\rm const}\\
&=\left(\ln p(\boldsymbol\pi)+\mathbb{E}_{\mathbf{Z}}\left[\ln p(\mathbf{Z}|\boldsymbol\pi)\right]\right)+\left(\sum_{k=1}^K\left(\sum_{n=1}^N\mathbb{E}_{\mathbf{Z}}[z_{nk}]\ln\mathcal{N}\left(\mathbf{x}_n\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)+\ln p(\boldsymbol\mu_k,\boldsymbol\Lambda_k)\right)\right)+{\rm const}
\end{align*}
$$
が得られる。
$${\boldsymbol\pi}$$を含む部分に着目すると,
$$
\begin{align*}
\ln q^*(\boldsymbol\pi)&=\ln p(\boldsymbol\pi)+\mathbb{E}_{\mathbf{Z}}\left[\ln p(\mathbf{Z}|\boldsymbol\pi)\right]+{\rm const}\\
&=(\alpha_0-1)\sum_{k=1}^K\ln\pi_k+\sum_{n=1}^N\sum_{k=1}^K\mathbb{E}_{\mathbf{Z}}[z_{nk}]\ln\pi_k+{\rm const}\\
&=\sum_{k=1}^K(\alpha_0+N_{k}-1)\ln\pi_k+{\rm const}\\
\therefore q^*(\boldsymbol\pi)&={\rm Dir}(\boldsymbol\pi|\boldsymbol\alpha)
\end{align*}
$$
ここで,$${\boldsymbol\alpha}$$は第$${k}$$成分が$${\alpha_0+N_k}$$であるベクトルである。
$${q^*(\boldsymbol\mu_k,\boldsymbol\Lambda_k)=q^*(\boldsymbol\mu_k|\boldsymbol\Lambda_k)q^*(\boldsymbol\Lambda_k)}$$として$${\boldsymbol\mu_k,\boldsymbol\Lambda_k}$$を含む部分に着目すると,
$$
\begin{align*}
\ln q^*(\boldsymbol\mu_k|\boldsymbol\Lambda_k)+\ln q^*(\boldsymbol\Lambda_k)&=\sum_{n=1}^Nr_{nk}\ln\mathcal{N}\left(\mathbf{x}_n\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)+\ln p(\boldsymbol\mu_k,\boldsymbol\Lambda_k)+{\rm const}\\
&=\sum_{n=1}^Nr_{nk}\ln\mathcal{N}\left(\mathbf{x}_n\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)+\ln\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_0,(\beta_0\boldsymbol\Lambda_k)^{-1}\right.\right)+\ln\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_0,\nu_0)+{\rm const}\\
&=\sum_{n=1}^Nr_{nk}\left(\frac{1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right)+\left(\frac{1}{2}\ln|\beta_0\boldsymbol\Lambda_k|-\frac{1}{2}(\boldsymbol\mu_k-\mathbf{m}_0)^{\rm T}\beta_0\boldsymbol\Lambda_k(\boldsymbol\mu_k-\mathbf{m}_0)\right)+\left(\frac{\nu_0-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k)\right)+{\rm const}\\
&=-\frac{1}{2}\boldsymbol\mu_k^{\rm T}\{(N_k+\beta_0)\boldsymbol\Lambda_k\}\boldsymbol\mu_k+\boldsymbol\mu_k^{\rm T}\{(N_k+\beta_0)\boldsymbol\Lambda_k\}\frac{N_k\bar{\mathbf{x}}_k+\beta_0\mathbf{m}_0}{N_k+\beta_0}-\frac{1}{2}\sum_{n=1}^Nr_{nk}\mathbf{x}_n^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n-\frac{1}{2}\beta_0\mathbf{m}_0^{\rm T}\boldsymbol\Lambda_k\mathbf{m}_0+\frac{\nu_0+N_k+1-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k)+{\rm const}\\
&=-\frac{1}{2}\boldsymbol\mu_k^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}\boldsymbol\mu_k+\boldsymbol\mu_k^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}\mathbf{m}_k-\frac{1}{2}\sum_{n=1}^Nr_{nk}\mathbf{x}_n^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n-\frac{1}{2}\beta_0\mathbf{m}_0^{\rm T}\boldsymbol\Lambda_k\mathbf{m}_0+\frac{\nu_0+N_k+1-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k)+{\rm const}\\
&=-\frac{1}{2}(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}(\boldsymbol\mu_k-\mathbf{m}_k)+\frac{\beta_k}{2}\mathbf{m}_k^{\rm T}\boldsymbol\Lambda_k\mathbf{m}_k-\frac{1}{2}\sum_{n=1}^Nr_{nk}\mathbf{x}_n^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n-\frac{1}{2}\beta_0\mathbf{m}_0^{\rm T}\boldsymbol\Lambda_k\mathbf{m}_0+\frac{\nu_0+N_k+1-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k)+{\rm const}\\
&=-\frac{1}{2}(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}(\boldsymbol\mu_k-\mathbf{m}_k)+\frac{\nu_0+N_k+1-D-1}{2}\ln|\boldsymbol\Lambda_k|+\frac{1}{2}{\rm Tr}\left(\beta_k\mathbf{m}_k^{\rm T}\boldsymbol\Lambda_k\mathbf{m}_k\right)-\frac{1}{2}{\rm Tr}\left(\sum_{n=1}^Nr_{nk}\mathbf{x}_n^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n\right)-\frac{1}{2}{\rm Tr}\left(\beta_0\mathbf{m}_0^{\rm T}\boldsymbol\Lambda_k\mathbf{m}_0\right)-\frac{1}{2}{\rm Tr}(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k)+{\rm const}\\
&=-\frac{1}{2}(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}(\boldsymbol\mu_k-\mathbf{m}_k)+\frac{\nu_0+N_k+1-D-1}{2}\ln|\boldsymbol\Lambda_k|+\frac{1}{2}{\rm Tr}\left(\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}\boldsymbol\Lambda_k\right)-\frac{1}{2}{\rm Tr}\left(\sum_{n=1}^Nr_{nk}\mathbf{x}_n\mathbf{x}_n^{\rm T}\boldsymbol\Lambda_k\right)-\frac{1}{2}{\rm Tr}\left(\beta_0\mathbf{m}_0\mathbf{m}_0^{\rm T}\boldsymbol\Lambda_k\right)-\frac{1}{2}{\rm Tr}(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k)+{\rm const}\\
&=-\frac{1}{2}(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}(\boldsymbol\mu_k-\mathbf{m}_k)+\frac{\nu_0+N_k+1-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}\left(\left(\mathbf{W}_0^{-1}+\sum_{n=1}^Nr_{nk}\mathbf{x}_n\mathbf{x}_n^{\rm T}+\beta_0\mathbf{m}_0\mathbf{m}_0^{\rm T}-\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}\right)\boldsymbol\Lambda_k\right)+{\rm const}\\
&=-\frac{1}{2}(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}(\boldsymbol\mu_k-\mathbf{m}_k)+\frac{\nu_0+N_k+1-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}\left(\left(\mathbf{W}_0^{-1}+N_k\mathbf{S}_k+N_k\bar{\mathbf{x}}_k\bar{\mathbf{x}}_k^{\rm T}+\beta_0\mathbf{m}_0\mathbf{m}_0^{\rm T}-\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}\right)\boldsymbol\Lambda_k\right)+{\rm const}\\
&=-\frac{1}{2}(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}(\boldsymbol\mu_k-\mathbf{m}_k)+\frac{\nu_0+N_k+1-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}\left(\left(\mathbf{W}_0^{-1}+N_k\mathbf{S}_k+\frac{\beta_0N_k}{\beta_0+N_k}(\bar{\mathbf{x}}_k-\mathbf{m}_0)(\bar{\mathbf{x}}_k-\mathbf{m}_0)^{\rm T}\right)\boldsymbol\Lambda_k\right)+{\rm const}\\
&=-\frac{1}{2}(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\{\beta_k\boldsymbol\Lambda_k\}(\boldsymbol\mu_k-\mathbf{m}_k)+\frac{1}{2}\ln|\beta_k\boldsymbol\Lambda_k|+\frac{\nu_k-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}\left(\mathbf{W}_k^{-1}\boldsymbol\Lambda_k\right)+{\rm const}\\
&=\ln\left(\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\right)
\end{align*}
$$
以上より,題意は示された。
Exercise (10.14)
$$
\begin{align*}
\mathbb{E}_{\boldsymbol\mu_k,\boldsymbol\Lambda_k}\left[(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right]&=\mathbf{x}_n^{\rm T}\mathbb{E}_{\boldsymbol\mu_k,\boldsymbol\Lambda_k}\left[\boldsymbol\Lambda_k\right]\mathbf{x}_n-2\mathbf{x}_n^{\rm T}\mathbb{E}_{\boldsymbol\mu_k,\boldsymbol\Lambda_k}\left[\boldsymbol\Lambda_k\boldsymbol\mu_k\right]+\mathbb{E}_{\boldsymbol\mu_k,\boldsymbol\Lambda_k}\left[\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k\right]\\
&=\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{x}_n-2\mathbf{x}_n^{\rm T}\mathbb{E}_{\boldsymbol\Lambda_k}\left[\boldsymbol\Lambda_k\right]\mathbf{m}_k+\mathbb{E}_{\boldsymbol\mu_k,\boldsymbol\Lambda_k}\left[{\rm Tr}\left(\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k\right)\right]\\
&=\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{x}_n-2\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{m}_k+\mathbb{E}_{\boldsymbol\mu_k,\boldsymbol\Lambda_k}\left[{\rm Tr}\left(\boldsymbol\mu_k\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\right)\right]\\
&=\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{x}_n-2\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{m}_k+\mathbb{E}_{\boldsymbol\Lambda_k}\left[{\rm Tr}\left(\left(\mathbf{m}_k\mathbf{m}_k^{\rm T}+\beta_k^{-1}\boldsymbol\Lambda_k^{-1}\right)\boldsymbol\Lambda_k\right)\right]\\
&=\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{x}_n-2\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{m}_k+\mathbf{m}_k^{\rm T}\mathbb{E}_{\boldsymbol\Lambda_k}\left[\boldsymbol\Lambda_k\right]\mathbf{m}_k+\beta_k^{-1}D\\
&=\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{x}_n-2\nu_k\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{m}_k+\nu_k\mathbf{m}_k^{\rm T}\mathbf{W}_k\mathbf{m}_k+\beta_k^{-1}D\\
&=\beta_k^{-1}D+\nu_k(\mathbf{x}_n-\mathbf{m}_k)^{\rm T}\mathbf{W}_k(\mathbf{x}_n-\mathbf{m}_k)
\end{align*}
$$
Exercise (10.15)
$$
\begin{align*}
\mathbb{E}[\pi_k]&=\frac{\alpha_k}{\hat{\alpha}}\\
&=\frac{\alpha_0+N_k}{\sum_{j=1}^K(\alpha_0+N_j)}\\
&=\frac{\alpha_0+N_k}{K\alpha_0+N}
\end{align*}
$$
Exercise (10.16)
$$
\begin{align*}
\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)&=\ln\left(\prod_{k=1}^{K}\prod_{n=1}^{N}\mathcal{N}\left(\mathbf{x}_n|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right)^{z_{nk}}\right)\\
&=\sum_{k=1}^{K}\sum_{n=1}^{N}z_{nk}\ln\mathcal{N}\left(\mathbf{x}_n|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right)\\
&=\frac{1}{2}\sum_{k=1}^{K}\sum_{n=1}^{N}z_{nk}\left(-D\ln(2\pi)+\ln|\boldsymbol\Lambda_k|-(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right)
\end{align*}
$$
より,
$$
\begin{align*}
\mathbb{E}\left[\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)\right]&=\frac{1}{2}\sum_{k=1}^{K}\sum_{n=1}^{N}\mathbb{E}\left[z_{nk}\right]\left(-D\ln(2\pi)+\mathbb{E}\left[\ln|\boldsymbol\Lambda_k|\right]-\mathbb{E}\left[(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right]\right)\\
&=\frac{1}{2}\sum_{k=1}^{K}\sum_{n=1}^{N}r_{nk}\left(-D\ln(2\pi)+\ln\widetilde{\boldsymbol\Lambda}_k-\mathbb{E}\left[(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right]\right)\\
&=\frac{1}{2}\sum_{k=1}^{K}\left(N_{k}\left(\ln\widetilde{\boldsymbol\Lambda}_k-D\ln(2\pi)-\mathbb{E}\left[\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k\right]+2\overline{\mathbf{x}}_k^{\rm T}\mathbb{E}\left[\boldsymbol\Lambda_k\boldsymbol\mu_k\right]\right)-\sum_{n=1}^{N}r_{nk}\mathbf{x}_n^{\rm T}\mathbb{E}\left[\boldsymbol\Lambda_k\right]\mathbf{x}_n\right)
\end{align*}
$$
が得られる。
右辺に含まれる期待値を計算すると,
$$
\begin{align*}
\mathbb{E}\left[\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k\right]&=\int{\rm d}\boldsymbol\mu_k\int{\rm d}\boldsymbol\Lambda_k\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k\\
&=\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k){\rm Tr}\left[\boldsymbol\Lambda_k\int{\rm d}\boldsymbol\mu_k\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)\boldsymbol\mu_k\boldsymbol\mu_k^{\rm T}\right]\\
&=\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k){\rm Tr}\left[\boldsymbol\Lambda_k\left(\mathbf{m}_k\mathbf{m}_k^{\rm T}+(\beta_k\boldsymbol\Lambda_k)^{-1}\right)\right]\\
&=\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\left(\mathbf{m}_k^{\rm T}\boldsymbol\Lambda_k\mathbf{m}_k+D\beta_k^{-1}\right)\\
&=\mathbf{m}_k^{\rm T}\left(\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\boldsymbol\Lambda_k\right)\mathbf{m}_k+D\beta_k^{-1}\\
&=\nu_k\mathbf{m}_k^{\rm T}\mathbf{W}_k\mathbf{m}_k+D\beta_k^{-1}
\end{align*}
$$
$$
\begin{align*}
\overline{\mathbf{x}}_k^{\rm T}\mathbb{E}\left[\boldsymbol\Lambda_k\boldsymbol\mu_k\right]&=\overline{\mathbf{x}}_k^{\rm T}\int{\rm d}\boldsymbol\mu_k\int{\rm d}\boldsymbol\Lambda_k\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\boldsymbol\Lambda_k\boldsymbol\mu_k\\
&=\overline{\mathbf{x}}_k^{\rm T}\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\boldsymbol\Lambda_k\int{\rm d}\boldsymbol\mu_k\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)\boldsymbol\mu_k\\
&=\overline{\mathbf{x}}_k^{\rm T}\left(\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\boldsymbol\Lambda_k\right)\mathbf{m}_k\\
&=\nu_k\overline{\mathbf{x}}_k^{\rm T}\mathbf{W}_k\mathbf{m}_k
\end{align*}
$$
$$
\begin{align*}
\sum_{n=1}^{N}r_{nk}\mathbf{x}_n^{\rm T}\mathbb{E}\left[\boldsymbol\Lambda_k\right]\mathbf{x}_n&=\nu_k\sum_{n=1}^{N}r_{nk}\mathbf{x}_n^{\rm T}\mathbf{W}_k\mathbf{x}_n\\
&=\nu_k{\rm Tr}\left[\mathbf{W}_k\sum_{n=1}^{N}r_{nk}\mathbf{x}_n\mathbf{x}_n^{\rm T}\right]\\
&=\nu_k{\rm Tr}\left[\mathbf{W}_kN_k\left(\mathbf{S}_k+\overline{\mathbf{x}}_k\overline{\mathbf{x}}_k^{\rm T}\right)\right]\\
&=N_k\nu_k\left({\rm Tr}\left[\mathbf{S}_k\mathbf{W}_k\right]+\overline{\mathbf{x}}_k^{\rm T}\mathbf{W}_k\overline{\mathbf{x}}_k\right)
\end{align*}
$$
以上より,
$$
\begin{align*}
\mathbb{E}\left[\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)\right]&=\frac{1}{2}\sum_{k=1}^{K}\left(N_{k}\left(\ln\widetilde{\boldsymbol\Lambda}_k-D\ln(2\pi)-\nu_k\mathbf{m}_k^{\rm T}\mathbf{W}_k\mathbf{m}_k-D\beta_k^{-1}+2\nu_k\overline{\mathbf{x}}_k^{\rm T}\mathbf{W}_k\mathbf{m}_k\right)-N_k\nu_k\left({\rm Tr}\left[\mathbf{S}_k\mathbf{W}_k\right]+\overline{\mathbf{x}}_k^{\rm T}\mathbf{W}_k\overline{\mathbf{x}}_k\right)\right)\\
&=\frac{1}{2}\sum_{k=1}^{K}N_{k}\left(\ln\widetilde{\boldsymbol\Lambda}_k-D\beta_k^{-1}-\nu_k{\rm Tr}\left[\mathbf{S}_k\mathbf{W}_k\right]-\nu_k(\overline{\mathbf{x}}_k-\mathbf{m}_k)^{\rm T}\mathbf{W}_k(\overline{\mathbf{x}}_k-\mathbf{m}_k)-D\ln(2\pi)\right)\\
\end{align*}
$$
$$
\begin{align*}
\mathbb{E}[\ln p(\mathbf{Z}|\boldsymbol\pi)]&=\sum_{n=1}^N\sum_{k=1}^K\mathbb{E}[z_{nk}\ln \pi_k]\\
&=\sum_{n=1}^N\sum_{k=1}^K\mathbb{E}[z_{nk}]\mathbb{E}[\ln \pi_k]\\
&=\sum_{n=1}^N\sum_{k=1}^Kr_{nk}\ln \widetilde{\pi}_k\\
\end{align*}
$$
Exercise (10.17)
$$
\begin{align*}
\ln p(\boldsymbol\pi)&=\ln\left\{C(\boldsymbol\alpha_0)\prod_{k=1}^K\pi_k^{\alpha_0-1}\right\}\\
&=\ln C(\boldsymbol\alpha_0)+(\alpha_0-1)\sum_{k=1}^K\ln \pi_k
\end{align*}
$$
より,
$$
\begin{align*}
\mathbb{E}[\ln p(\boldsymbol\pi)]&=\ln C(\boldsymbol\alpha_0)+(\alpha_0-1)\sum_{k=1}^K\mathbb{E}[\ln \pi_k]\\
&=\ln C(\boldsymbol\alpha_0)+(\alpha_0-1)\sum_{k=1}^K\ln \tilde{\pi}_k
\end{align*}
$$
$$
\begin{align*}
\ln p(\boldsymbol\mu,\boldsymbol\Lambda)&=\ln\left\{\prod_{k=1}^K\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_0,(\beta_0\boldsymbol\Lambda_k)^{-1}\right.\right)\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_0,\nu_0)\right\}\\
&=\sum_{k=1}^K\left\{\ln\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_0,(\beta_0\boldsymbol\Lambda_k)^{-1}\right.\right)+\ln\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_0,\nu_0) \right\}\\
&=\sum_{k=1}^K\left\{-\frac{D}{2}\ln(2\pi)+\frac{1}{2}\ln|\beta_0\boldsymbol\Lambda_k|-\frac{\beta_0}{2}(\boldsymbol\mu_k-\mathbf{m}_0)^{\rm T}\boldsymbol\Lambda_k(\boldsymbol\mu_k-\mathbf{m}_0)+\ln B(\mathbf{W}_0,\nu_0)+\frac{\nu_0-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}\left(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k\right) \right\}\\
&=\frac{1}{2}\sum_{k=1}^K\left\{D\ln\left(\frac{\beta_0}{2\pi}\right)+\ln|\boldsymbol\Lambda_k|-\beta_0(\boldsymbol\mu_k-\mathbf{m}_0)^{\rm T}\boldsymbol\Lambda_k(\boldsymbol\mu_k-\mathbf{m}_0) \right\}+K\ln B(\mathbf{W}_0,\nu_0)+\frac{\nu_0-D-1}{2}\sum_{k=1}^K\ln|\boldsymbol\Lambda_k|-\frac{1}{2}\sum_{k=1}^K{\rm Tr}\left(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k\right)\\
\end{align*}
$$
より,
$$
\begin{align*}
\mathbb{E}[\ln p(\boldsymbol\mu,\boldsymbol\Lambda)]&=\frac{1}{2}\sum_{k=1}^K\left\{D\ln\left(\frac{\beta_0}{2\pi}\right)+\mathbb{E}[\ln|\boldsymbol\Lambda_k|]-\beta_0\mathbb{E}[(\boldsymbol\mu_k-\mathbf{m}_0)^{\rm T}\boldsymbol\Lambda_k(\boldsymbol\mu_k-\mathbf{m}_0)] \right\}+K\ln B(\mathbf{W}_0,\nu_0)+\frac{\nu_0-D-1}{2}\sum_{k=1}^K\mathbb{E}[\ln|\boldsymbol\Lambda_k|]-\frac{1}{2}\sum_{k=1}^K\mathbb{E}[{\rm Tr}\left(\mathbf{W}_0^{-1}\boldsymbol\Lambda_k\right)]\\
&=\frac{1}{2}\sum_{k=1}^K\left\{D\ln\left(\frac{\beta_0}{2\pi}\right)+\ln\widetilde{\boldsymbol\Lambda}_k-\beta_0(\nu_k(\mathbf{m}_k-\mathbf{m}_0)^{\rm T}\mathbf{W}_k(\mathbf{m}_k-\mathbf{m}_0)+D\beta_k^{-1}) \right\}+K\ln B(\mathbf{W}_0,\nu_0)+\frac{\nu_0-D-1}{2}\sum_{k=1}^K\ln\widetilde{\boldsymbol\Lambda}_k-\frac{1}{2}\sum_{k=1}^K{\rm Tr}\left(\nu_k\mathbf{W}_0^{-1}\mathbf{W}_k\right)\\
&=\frac{1}{2}\sum_{k=1}^K\left\{D\ln\left(\frac{\beta_0}{2\pi}\right)+\ln\widetilde{\boldsymbol\Lambda}_k-\beta_0\nu_k(\mathbf{m}_k-\mathbf{m}_0)^{\rm T}\mathbf{W}_k(\mathbf{m}_k-\mathbf{m}_0)-D\frac{\beta_0}{\beta_k} \right\}+K\ln B(\mathbf{W}_0,\nu_0)+\frac{\nu_0-D-1}{2}\sum_{k=1}^K\ln\widetilde{\boldsymbol\Lambda}_k-\frac{1}{2}\sum_{k=1}^K\nu_k{\rm Tr}\left(\mathbf{W}_0^{-1}\mathbf{W}_k\right)\\
\end{align*}
$$
$$
\begin{align*}
\ln q(\mathbf{Z})&=\ln\left\{\prod_{n=1}^N\prod_{k=1}^Kr_{nk}^{z_{nk}}\right\}\\
&=\sum_{n=1}^N\sum_{k=1}^Kz_{nk}\ln r_{nk}
\end{align*}
$$
より,
$$
\begin{align*}
\mathbb{E}[\ln q(\mathbf{Z})]&=\sum_{n=1}^N\sum_{k=1}^K\mathbb{E}[z_{nk}]\ln r_{nk}\\
&=\sum_{n=1}^N\sum_{k=1}^Kr_{nk}\ln r_{nk}
\end{align*}
$$
$$
\begin{align*}
\ln q(\boldsymbol\pi)&=\ln\left\{C(\boldsymbol\alpha)\prod_{k=1}^K\pi_k^{\alpha_k-1}\right\}\\
&=\ln C(\boldsymbol\alpha)+\sum_{k=1}^K(\alpha_k-1)\ln \pi_k
\end{align*}
$$
より,
$$
\begin{align*}
\mathbb{E}[\ln q(\boldsymbol\pi)]&=\ln C(\boldsymbol\alpha)+\sum_{k=1}^K(\alpha_k-1)\mathbb{E}[\ln \pi_k]\\
&=\sum_{k=1}^K(\alpha_k-1)\ln \tilde{\pi}_k+\ln C(\boldsymbol\alpha)
\end{align*}
$$
$$
\begin{align*}
\ln q(\boldsymbol\mu,\boldsymbol\Lambda)&=\ln\left\{\prod_{k=1}^K\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\right\}\\
&=\sum_{k=1}^K\left\{\ln\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)+\ln\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k) \right\}\\
&=\sum_{k=1}^K\left\{-\frac{D}{2}\ln(2\pi)+\frac{1}{2}\ln|\beta_k\boldsymbol\Lambda_k|-\frac{\beta_k}{2}(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\boldsymbol\Lambda_k(\boldsymbol\mu_k-\mathbf{m}_k)+\ln B(\mathbf{W}_k,\nu_k)+\frac{\nu_k-D-1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}{\rm Tr}\left(\mathbf{W}_k^{-1}\boldsymbol\Lambda_k\right) \right\}\\
\end{align*}
$$
より,
$$
\begin{align*}
\mathbb{E}[\ln q(\boldsymbol\mu,\boldsymbol\Lambda)]&=\sum_{k=1}^K\left\{-\frac{D}{2}\ln(2\pi)+\frac{1}{2}\mathbb{E}[\ln|\beta_k\boldsymbol\Lambda_k|]-\frac{\beta_k}{2}\mathbb{E}[(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\boldsymbol\Lambda_k(\boldsymbol\mu_k-\mathbf{m}_k)]+\ln B(\mathbf{W}_k,\nu_k)+\frac{\nu_k-D-1}{2}\mathbb{E}[\ln|\boldsymbol\Lambda_k|]-\frac{1}{2}\mathbb{E}[{\rm Tr}\left(\mathbf{W}_k^{-1}\boldsymbol\Lambda_k\right)] \right\}\\
&=\sum_{k=1}^K\left\{\frac{D}{2}\ln\left(\frac{\beta_k}{2\pi}\right)+\frac{1}{2}\ln\widetilde{\Lambda}_k-\frac{D}{2}+\ln B(\mathbf{W}_k,\nu_k)+\frac{\nu_k-D-1}{2}\ln\widetilde{\Lambda}_k-\frac{D\nu_k}{2} \right\}\\
&=\sum_{k=1}^K\left\{\frac{1}{2}\ln\widetilde{\Lambda}_k+\frac{D}{2}\ln\left(\frac{\beta_k}{2\pi}\right)-\frac{D}{2}-H[q(\boldsymbol\Lambda_k)] \right\}\\
\end{align*}
$$
Exercise (10.18)
$$
\begin{align*}
\frac{\partial \mathcal{L}}{\partial \alpha_k}&=\frac{\partial }{\partial \alpha_k}\left\{\mathbb{E}[\ln p(\boldsymbol\pi)]+\mathbb{E}[\ln p(\mathbf{Z}|\boldsymbol\pi)]-\mathbb{E}[\ln q(\boldsymbol\pi)]\right\}\\
&=\frac{\partial }{\partial \alpha_k}\left\{\sum_{k'=1}^K(N_{k'}+\alpha_0-\alpha_{k'})\ln\widetilde{\pi}_{k'}+\ln C(\boldsymbol\alpha_0)-\ln C(\boldsymbol\alpha)\right\}\\
&=-\sum_{k'=1}^K\delta_{k,k'}\ln\widetilde{\pi}_{k'}+\sum_{k'=1}^K(N_{k'}+\alpha_0-\alpha_{k'})\frac{\partial }{\partial \alpha_k}\ln\widetilde{\pi}_{k'}-\frac{\partial }{\partial \alpha_k}\ln C(\boldsymbol\alpha)\\
&=-\ln\widetilde{\pi}_{k}+\sum_{k'=1}^K(N_{k'}+\alpha_0-\alpha_{k'})\frac{\partial }{\partial \alpha_k}\ln\widetilde{\pi}_{k'}+\ln\widetilde{\pi}_{k}\\
&=\sum_{k'=1}^K(N_{k'}+\alpha_0-\alpha_{k'})\frac{\partial }{\partial \alpha_k}\ln\widetilde{\pi}_{k'}\\
&=0\\
\therefore \alpha_k&=\alpha_0+N_k
\end{align*}
$$
$$
\begin{align*}
\frac{\partial \mathcal{L}}{\partial \beta_k}&=\frac{\partial }{\partial \beta_k}\left\{\mathbb{E}[\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)]+\mathbb{E}[\ln p(\boldsymbol\mu,\boldsymbol\Lambda)]-\mathbb{E}[\ln q(\boldsymbol\mu,\boldsymbol\Lambda)]\right\}\\
&=\frac{D}{2}N_k\beta_k^{-2}+\frac{D\beta_0}{2}\beta_k^{-2}-\frac{D}{2}\beta_k^{-1}\\
&=\frac{D}{2}\left(N_k+\beta_0-\beta_k\right)\beta_k^{-2}\\
&=0\\
\therefore \beta_k&=\beta_0+N_k
\end{align*}
$$
$$
\begin{align*}
\frac{\partial \mathcal{L}}{\partial \mathbf{m}_k}&=\frac{\partial }{\partial \mathbf{m}_k}\left\{\mathbb{E}[\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)]+\mathbb{E}[\ln p(\boldsymbol\mu,\boldsymbol\Lambda)]\right\}\\
&=N_k\nu_k\mathbf{W}_k(\overline{\mathbf{x}}_k-\mathbf{m}_k)+\beta_0\nu_k\mathbf{W}_k(\mathbf{m}_0-\mathbf{m}_k)\\
&=\nu_k\mathbf{W}_k\left\{N_k(\overline{\mathbf{x}}_k-\mathbf{m}_k)+\beta_0(\mathbf{m}_0-\mathbf{m}_k)\right\}\\
&=\nu_k\mathbf{W}_k\left\{(N_k\overline{\mathbf{x}}_k+\beta_0\mathbf{m}_0)-\beta_k\mathbf{m}_k\right\}\\
&=\mathbf{0}\\
\therefore \mathbf{m}_k&=\beta_k^{-1}(N_k\overline{\mathbf{x}}_k+\beta_0\mathbf{m}_0)
\end{align*}
$$
$$
\begin{align*}
\frac{\partial \mathcal{L}}{\partial \nu_k}&=\frac{\partial }{\partial \nu_k}\left\{\mathbb{E}[\ln p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)]+\mathbb{E}[\ln p(\boldsymbol\mu,\boldsymbol\Lambda)]-\mathbb{E}[\ln q(\boldsymbol\mu,\boldsymbol\Lambda)]\right\}\\
&=\frac{N_k}{2}\left\{\frac{\partial \ln\widetilde{\Lambda}_k}{\partial \nu_k}-{\rm Tr}[\mathbf{S}_k\mathbf{W}_k]-(\overline{\mathbf{x}}_k-\mathbf{m}_k)^{\rm T}\mathbf{W}_k(\overline{\mathbf{x}}_k-\mathbf{m}_k)\right\}+\frac{1}{2}\left\{(\nu_0-D)\frac{\partial \ln\widetilde{\Lambda}_k}{\partial \nu_k}-{\rm Tr}[\mathbf{W}_0^{-1}\mathbf{W}_k]-\beta_0(\mathbf{m}_k-\mathbf{m}_0)^{\rm T}\mathbf{W}_k(\mathbf{m}_k-\mathbf{m}_0)\right\}-\frac{1}{2}\left\{(\nu_k-D)\frac{\partial \ln\widetilde{\Lambda}_k}{\partial \nu_k}+\ln\widetilde{\Lambda}_k+2\frac{\partial \ln B(\mathbf{W}_k,\nu_k)}{\partial \nu_k}-D\right\}\\
&=\frac{N_k+\nu_0-\nu_k}{2}\frac{\partial \ln\widetilde{\Lambda}_k}{\partial \nu_k}-\frac{1}{2}{\rm Tr}[(\mathbf{W}_0^{-1}+N_k\mathbf{S}_k)\mathbf{W}_k]-\frac{N_k\beta_0}{2\beta_k}(\overline{\mathbf{x}}_k-\mathbf{m}_0)^{\rm T}\mathbf{W}_k(\overline{\mathbf{x}}_k-\mathbf{m}_0)+\frac{D}{2}\\
&=\frac{N_k+\nu_0-\nu_k}{2}\frac{\partial \ln\widetilde{\Lambda}_k}{\partial \nu_k}+\frac{1}{2}\left\{D-{\rm Tr}\left[\left(\mathbf{W}_0^{-1}+N_k\mathbf{S}_k+\frac{N_k\beta_0}{\beta_k}(\overline{\mathbf{x}}_k-\mathbf{m}_0)(\overline{\mathbf{x}}_k-\mathbf{m}_0)^{\rm T}\right)\mathbf{W}_k\right]\right\}\\
&=0\\
\therefore \nu_k&=\nu_0+N_k\\
\mathbf{W}_k^{-1}&=\mathbf{W}_0^{-1}+N_k\mathbf{S}_k+\frac{N_k\beta_0}{\beta_k}(\overline{\mathbf{x}}_k-\mathbf{m}_0)(\overline{\mathbf{x}}_k-\mathbf{m}_0)^{\rm T}
\end{align*}
$$
Exercise (10.19)
$$
\begin{align*}
p(\widehat{\mathbf{x}}|\mathbf{X})&=\sum_{k=1}^K\int{\rm d}\boldsymbol\pi\int{\rm d}\boldsymbol\mu_k\int{\rm d}\boldsymbol\Lambda_k\pi_k\mathcal{N}\left(\widehat{\mathbf{x}}\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)q(\boldsymbol\pi)q(\boldsymbol\mu_k,\boldsymbol\Lambda_k)\\
&=\sum_{k=1}^K\int{\rm d}\boldsymbol\pi {\rm Dir}(\boldsymbol\pi|\boldsymbol\alpha)\pi_k\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\int{\rm d}\boldsymbol\mu_k\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)\mathcal{N}\left(\widehat{\mathbf{x}}\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\int{\rm d}\boldsymbol\mu_k\mathcal{N}\left(\boldsymbol\mu_k\left|\mathbf{m}_k,(\beta_k\boldsymbol\Lambda_k)^{-1}\right.\right)\mathcal{N}\left(\widehat{\mathbf{x}}\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\left(\frac{\beta_k}{2\pi(1+\beta_k)}\right)^{D/2}\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)|\boldsymbol\Lambda_k|^{1/2}\exp\left(-\frac{\beta_k}{2(1+\beta_k)}(\widehat{\mathbf{x}}-\mathbf{m}_k)^{\rm T}\boldsymbol\Lambda_k(\widehat{\mathbf{x}}-\mathbf{m}_k)\right)\int{\rm d}\boldsymbol\mu_k\mathcal{N}\left(\boldsymbol\mu_k\left|\frac{\widehat{\mathbf{x}}+\beta_k\mathbf{m}_k}{1+\beta_k},((1+\beta_k)\boldsymbol\Lambda_k)^{-1}\right.\right)\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\left(\frac{\beta_k}{2\pi(1+\beta_k)}\right)^{D/2}\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)|\boldsymbol\Lambda_k|^{1/2}\exp\left(-\frac{\beta_k}{2(1+\beta_k)}(\widehat{\mathbf{x}}-\mathbf{m}_k)^{\rm T}\boldsymbol\Lambda_k(\widehat{\mathbf{x}}-\mathbf{m}_k)\right)\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\left(\frac{\beta_k}{2\pi(1+\beta_k)}\right)^{D/2}\frac{B(\mathbf{W}_k,\nu_k)}{B(\mathbf{W}'_k,\nu_k+1)}\int{\rm d}\boldsymbol\Lambda_k\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}'_k,\nu_k+1)\ \ \ \ \left(\mathbf{W}_k^{'-1}:=\mathbf{W}_k^{-1}+\frac{\beta_k}{1+\beta_k}(\widehat{\mathbf{x}}-\mathbf{m}_k)(\widehat{\mathbf{x}}-\mathbf{m}_k)^{\rm T}\right)\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\left(\frac{\beta_k}{2\pi(1+\beta_k)}\right)^{D/2}\frac{B(\mathbf{W}_k,\nu_k)}{B(\mathbf{W}'_k,\nu_k+1)}\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\left(\frac{\beta_k}{2\pi(1+\beta_k)}\right)^{D/2}\frac{|\mathbf{W}_k'|^{\frac{\nu_k+1}{2}}2^{\frac{(\nu_k+1)D}{2}}\prod_{i=1}^D\Gamma\left(\frac{\nu_k+2-i}{2}\right)}{|\mathbf{W}_k|^{\frac{\nu_k}{2}}2^{\frac{\nu_kD}{2}}\prod_{i=1}^D\Gamma\left(\frac{\nu_k+1-i}{2}\right)}\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\left(\frac{\beta_k}{\pi(1+\beta_k)}\right)^{D/2}|\mathbf{W}_k|^{\frac{1}{2}}\frac{\left|\mathbf{I}+\frac{\beta_k}{1+\beta_k}\mathbf{W}_k(\widehat{\mathbf{x}}-\mathbf{m}_k)(\widehat{\mathbf{x}}-\mathbf{m}_k)^{\rm T}\right|^{-\frac{\nu_k+1}{2}}\Gamma\left(\frac{\nu_k+1}{2}\right)}{\Gamma\left(\frac{\nu_k-D+1}{2}\right)}\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\left(\frac{\beta_k}{\pi(1+\beta_k)}\right)^{D/2}|\mathbf{W}_k|^{\frac{1}{2}}\frac{\left(1+\frac{\beta_k}{1+\beta_k}(\widehat{\mathbf{x}}-\mathbf{m}_k)^{\rm T}\mathbf{W}_k(\widehat{\mathbf{x}}-\mathbf{m}_k)\right)^{-\frac{\nu_k+1}{2}}\Gamma\left(\frac{\nu_k+1}{2}\right)}{\Gamma\left(\frac{\nu_k-D+1}{2}\right)}\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k\left(\frac{1}{\pi(\nu_k+1-D)}\right)^{D/2}\left|\frac{\beta_k(\nu_k+1-D)}{1+\beta_k}\mathbf{W}_k\right|^{\frac{1}{2}}\frac{\left(1+\frac{1}{\nu_k+1-D}(\widehat{\mathbf{x}}-\mathbf{m}_k)^{\rm T}\frac{\beta_k(\nu_k+1-D)}{1+\beta_k}\mathbf{W}_k(\widehat{\mathbf{x}}-\mathbf{m}_k)\right)^{-\frac{\nu_k-D+1}{2}-\frac{D}{2}}\Gamma\left(\frac{\nu_k-D+1}{2}+\frac{D}{2}\right)}{\Gamma\left(\frac{\nu_k-D+1}{2}\right)}\\
&=\frac{1}{\widehat{\alpha}}\sum_{k=1}^K\alpha_k{\rm St}\left(\widehat{\mathbf{x}}\left|\mathbf{m}_k,\frac{\beta_k(\nu_k+1-D)}{1+\beta_k}\mathbf{W}_k,\nu_k+1-D\right.\right)
\end{align*}
$$
Exercise (10.20)
$${N\gg 1}$$の場合において,全ての$${k(=1,\cdots,K)}$$に対して$${\pi_k}$$が有限の値を取る($${N_k\gg 1}$$)と仮定する。
また,最初のE stepにおいて$${r_{nk}=\gamma(z_{nk})}$$と選ぶことにして,以下では
$$
\begin{align*}
\overline{\mathbf{x}}_k&=\boldsymbol\mu_k^{(\rm ML)}\\
\mathbf{S}_k&=\boldsymbol\Sigma_k^{(\rm ML)}
\end{align*}
$$
を代入した式変形を考えることにする。
このとき,
$$
\begin{align*}
\mathbb{E}[\boldsymbol\Lambda_k]&=\nu_k\mathbf{W}_k\\
&=(\nu_0+N_k)\left\{\mathbf{W}_0^{-1}+N_k\mathbf{S}_k+\frac{N_k\beta_0}{\beta_0+N_k}(\overline{\mathbf{x}}_k-\mathbf{m}_0)(\overline{\mathbf{x}}_k-\mathbf{m}_0)^{\rm T}\right\}^{-1}\\
&=\left(\frac{\nu_0}{N_k}+1\right)\left\{\frac{1}{N_k}\mathbf{W}_0^{-1}+\mathbf{S}_k+\frac{\beta_0}{\beta_0+N_k}(\overline{\mathbf{x}}_k-\mathbf{m}_0)(\overline{\mathbf{x}}_k-\mathbf{m}_0)^{\rm T}\right\}^{-1}\\
&\simeq \mathbf{S}_k^{-1}=\left(\boldsymbol\Sigma_k^{(\rm ML)}\right)^{-1}\\
{\rm cov}[\boldsymbol\Lambda_{k}]&=2\nu_k\mathbf{W}_k\otimes \mathbf{W}_k\\
&\simeq\frac{2(\nu_0/N_k+1)}{N_k}\mathbf{S}_k^{-1}\otimes\mathbf{S}_k^{-1}\\
&\simeq\mathbf{0}\\
\therefore q^*(\boldsymbol\Lambda_k)&\simeq \delta\left(\boldsymbol\Lambda_k-\left(\boldsymbol\Sigma_k^{(\rm ML)}\right)^{-1}\right)
\end{align*}
$$
$$
\begin{align*}
\mathbb{E}[\boldsymbol\mu_k]&=\mathbf{m}_k\\
&=\frac{1}{\beta_0+N_k}(\beta_0\mathbf{m}_0+N_k\overline{\mathbf{x}}_k)\\
&=\frac{1}{\beta_0/N_k+1}(\beta_0\mathbf{m}_0/N_k+\overline{\mathbf{x}}_k)\\
&\simeq \overline{\mathbf{x}}_k=\boldsymbol\mu_k^{(\rm ML)}\\
{\rm cov}[\boldsymbol\mu_{k}]&=\left(\beta_k\boldsymbol\Lambda_k\right)^{-1}\\
&=\frac{\boldsymbol\Lambda_k^{-1}}{\beta_0+N_k}\\
&\simeq\mathbf{0}\\
\therefore q^*(\boldsymbol\mu_k|\boldsymbol\Lambda_k)&\simeq \delta\left(\boldsymbol\mu_k-\boldsymbol\mu_k^{(\rm ML)}\right)
\end{align*}
$$
$$
\begin{align*}
\mathbb{E}[\pi_k]&=\frac{\alpha_k}{\widehat{\alpha}}\\
&=\frac{\alpha_0+N_k}{K\alpha_0+N}\\
&\simeq \frac{N_k}{N}=\pi_k^{(\rm ML)}\\
{\rm var}[\pi_k]&=\frac{(\alpha_0+N_k)(K\alpha_0+N-\alpha_0-N_k)}{(N+K\alpha_0)^2(N+K\alpha_0+1)}\\
&=\frac{1}{N}\frac{(\alpha_0/N+N_k/N)((K-1)\alpha_0/N+1-N_k/N)}{(1+K\alpha_0/N)^2(1+(K\alpha_0+1)/N)}\\
&\simeq 0\\
\therefore q^*(\boldsymbol\pi)&\simeq \prod_{k=1}^K\delta\left(\pi_k-\frac{N_k}{N}\right)=\delta\left(\boldsymbol\pi-\boldsymbol\pi^{(\rm ML)}\right)
\end{align*}
$$
$$
\begin{align*}
\ln\widetilde{\pi}_k&=\psi(\alpha_k)-\psi(\widehat{\alpha})\\
&\simeq\ln\left(\frac{\alpha_0+N_k}{K\alpha_0+N}\right)\\
&\simeq\ln\left(\frac{N_k}{N}\right)=\ln\left(\pi_k^{(\rm ML)}\right)\\
\ln\widetilde{\Lambda}_k&=\sum_{i=1}^D\psi\left(\frac{\nu_k+1-i}{2}\right)+D\ln 2+\ln|\mathbf{W}_k|\\
&\simeq\sum_{i=1}^D\ln(\nu_k+1-i)+\ln\left|(\nu_k\mathbf{S}_k)^{-1}\right|\\
&\simeq\ln\left|\mathbf{S}_k^{-1}\right|=\ln\left|\left(\boldsymbol\Sigma_k^{(\rm ML)}\right)^{-1}\right|
\end{align*}
$$
より,
$$
\begin{align*}
r_{nk}&\propto \widetilde{\pi}_k\widetilde{\Lambda}_k^{1/2}\exp\left\{-\frac{D}{2\beta_k}-\frac{\nu_k}{2}(\mathbf{x}_n-\mathbf{m}_k)^{\rm T}\mathbf{W}_k(\mathbf{x}_n-\mathbf{m}_k)\right\}\\
&\simeq \pi_k^{(\rm ML)}\left|\boldsymbol\Lambda_k^{(\rm ML)}\right|^{1/2}\exp\left\{-\frac{1}{2}(\mathbf{x}_n-\mathbf{m}_k)^{\rm T}\boldsymbol\Lambda_k^{(\rm ML)}(\mathbf{x}_n-\mathbf{m}_k)\right\}\propto r_{nk}^{(\rm EM)}
\end{align*}
$$
となり,update後もEMアルゴリズムのresponsibilitiesと一致することになる。
以上より,$${N\gg 1}$$において$${p(\widehat{\mathbf{x}}|\mathbf{X})}$$は
$$
\begin{align*}
p(\widehat{\mathbf{x}}|\mathbf{X})&=\sum_{k=1}^K\int{\rm d}\boldsymbol\pi \int{\rm d}\boldsymbol\mu_k\int{\rm d}\boldsymbol\Lambda_k\pi_k\mathcal{N}\left(\widehat{\mathbf{x}}\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)q^*(\boldsymbol\pi)q^*(\boldsymbol\mu_k|\boldsymbol\Lambda_k)q^*(\boldsymbol\Lambda_k)\\
&\simeq\sum_{k=1}^K\int{\rm d}\boldsymbol\pi \int{\rm d}\boldsymbol\mu_k\int{\rm d}\boldsymbol\Lambda_k\pi_k\mathcal{N}\left(\widehat{\mathbf{x}}\left|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right.\right)\delta\left(\boldsymbol\pi-\boldsymbol\pi^{(\rm ML)}\right)\delta\left(\boldsymbol\mu_k-\boldsymbol\mu_k^{(\rm ML)}\right)\delta\left(\boldsymbol\Lambda_k-\left(\boldsymbol\Sigma_k^{(\rm ML)}\right)^{-1}\right)\\
&=\sum_{k=1}^K\int{\rm d}\boldsymbol\pi \int{\rm d}\boldsymbol\mu_k\pi_k\mathcal{N}\left(\widehat{\mathbf{x}}\left|\boldsymbol\mu_k,\boldsymbol\Sigma_k^{(\rm ML)}\right.\right)\delta\left(\boldsymbol\pi-\boldsymbol\pi^{(\rm ML)}\right)\delta\left(\boldsymbol\mu_k-\boldsymbol\mu_k^{(\rm ML)}\right)\\
&=\sum_{k=1}^K\int{\rm d}\boldsymbol\pi \pi_k\mathcal{N}\left(\widehat{\mathbf{x}}\left|\boldsymbol\mu_k^{(\rm ML)},\boldsymbol\Sigma_k^{(\rm ML)}\right.\right)\delta\left(\boldsymbol\pi-\boldsymbol\pi^{(\rm ML)}\right)\\
&=\sum_{k=1}^K \pi_k^{(\rm ML)}\mathcal{N}\left(\widehat{\mathbf{x}}\left|\boldsymbol\mu_k^{(\rm ML)},\boldsymbol\Sigma_k^{(\rm ML)}\right.\right)
\end{align*}
$$
となり,EMアルゴリズムで得られた混合ガウス分布のように振る舞う。
Exercise (10.21) - (10.30)
Exercise (10.21)
例えば,$${(\boldsymbol\mu_i,\boldsymbol\Sigma_i)}$$と$${(\boldsymbol\mu_j,\boldsymbol\Sigma_j)}$$の$${i}$$と$${j}$$のラベルを交換しても分布としては同一である。
このようにK個のラベルの並び方の総数分だけ等価な混合モデルが存在し,その数は$${K!}$$である。
Exercise (10.22)
$${K!}$$回のvariational inference algorithmで得られた単峰性の近似分布$${q_k(\mathbf{Z})\ (k=1,2,\cdots,K!)}$$を用いて$${p(\mathbf{Z}|\mathbf{X})}$$を
$$
\begin{align*}
p(\mathbf{Z}|\mathbf{X})&=\frac{1}{K!}\sum_{k=1}^{K!}q_k(\mathbf{Z})
\end{align*}
$$
と近似することを考える。
各$${q_k}$$の分布の重なりが無視できる前提の下,特定の$${q_j}$$に対して$${\ln p(\mathbf{X})}$$を計算すると,
$$
\begin{align*}
\ln p(\mathbf{X})&=\mathcal{L}[q_j]+{\rm KL}(q_j||p)\\
&=\mathcal{L}[q_j]-\int{\rm d}\mathbf{Z}q_j(\mathbf{Z})\ln\left\{\frac{p(\mathbf{Z}|\mathbf{X})}{q_j(\mathbf{Z})}\right\}\\
&=\mathcal{L}[q_j]-\int{\rm d}\mathbf{Z}q_j(\mathbf{Z})\ln\left\{\frac{\frac{1}{K!}\sum_{k=1}^{K!}q_k(\mathbf{Z})}{q_j(\mathbf{Z})}\right\}\\
&=\mathcal{L}[q_j]+\ln K!-\int{\rm d}\mathbf{Z}q_j(\mathbf{Z})\ln\left\{\frac{\sum_{k=1}^{K!}q_k(\mathbf{Z})}{q_j(\mathbf{Z})}\right\}\\
&=\mathcal{L}[q_j]+\ln K!-\int_{q_j(\mathbf{Z})\neq 0}{\rm d}\mathbf{Z}q_j(\mathbf{Z})\ln\left\{\frac{\sum_{k=1}^{K!}q_k(\mathbf{Z})}{q_j(\mathbf{Z})}\right\}\\
&=\mathcal{L}[q_j]+\ln K!-\int_{q_j(\mathbf{Z})\neq 0}{\rm d}\mathbf{Z}q_j(\mathbf{Z})\ln\left\{\frac{q_j(\mathbf{Z})}{q_j(\mathbf{Z})}\right\}\\
&=\mathcal{L}[q_j]+\ln K!
\end{align*}
$$
となる。
以上より,題意は示された。
Exercise (10.23)
$${\boldsymbol\pi}$$がparameterのとき,variational lower boundの$${\boldsymbol\pi}$$に依存する項は$${\sum_{n=1}^N\sum_{k=1}^Kr_{nk}\ln\pi_k}$$となる。
Lagrangeの未定乗数法を用いて
$$
\begin{align*}
\mathcal{L}(\boldsymbol\pi)&:=\sum_{n=1}^N\sum_{k=1}^Kr_{nk}\ln\pi_k+\lambda\left(\sum_{k=1}^K\pi_k-1\right)
\end{align*}
$$
を最大化する$${\boldsymbol\pi}$$を求める。
$${\mathcal{L}(\boldsymbol\pi)}$$を$${\pi_k}$$で微分すると,
$$
\begin{align*}
\frac{\partial}{\partial \pi_k}\mathcal{L}(\boldsymbol\pi)&=\sum_{n=1}^Nr_{nk}\pi_k^{-1}+\lambda\\
&=0\\
\therefore\pi_k&=-\frac{1}{\lambda}\sum_{n=1}^Nr_{nk}
\end{align*}
$$
$${\boldsymbol\pi}$$の規格化条件を用いて$${\lambda}$$を求めると,
$$
\begin{align*}
\sum_{k=1}^K\pi_k&=-\frac{1}{\lambda}\sum_{n=1}^N\sum_{k=1}^Kr_{nk}\\
&=-\frac{1}{\lambda}\sum_{k=1}^KN_k\\
&=-\frac{N}{\lambda}\\
&=1\\
\therefore \lambda&=-N
\end{align*}
$$
となる。
以上より,
$$
\begin{align*}
\pi_k&=\frac{1}{N}\sum_{n=1}^Nr_{nk}
\end{align*}
$$
が得られる。
Exercise (10.24)
$${p(\boldsymbol\mu,\boldsymbol\Lambda)=\prod_{k=1}^K\mathcal{N}(\boldsymbol\mu_k|\mathbf{m}_k,\boldsymbol\Lambda_k^{-1})\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)}$$として,式(9.14)の尤度関数を事後確率の場合に拡張すると,
$$
\begin{align*}
\ln\left\{p(\mathbf{X}|\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)p(\boldsymbol\mu,\boldsymbol\Lambda)\right\}&=\sum_{n=1}^{N}\ln\left\{\sum_{k=1}^K\pi_k\mathcal{N}(\mathbf{x}_n|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1})\right\}+\sum_{k=1}^K\ln\mathcal{N}(\boldsymbol\mu_k|\mathbf{m}_k,\boldsymbol\Lambda_k^{-1})+\sum_{k=1}^K\ln\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_k,\nu_k)\\
&=\sum_{n=1}^{N}\ln\left\{\sum_{k=1}^K\pi_k\mathcal{N}(\mathbf{x}_n|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1})\right\}+\frac{1}{2}\sum_{k=1}^K(\nu_k-D)\ln|\boldsymbol\Lambda_k|-\frac{1}{2}\sum_{k=1}^K(\boldsymbol\mu_k-\mathbf{m}_k)^{\rm T}\boldsymbol\Lambda_k(\boldsymbol\mu_k-\mathbf{m}_k)-\frac{1}{2}\sum_{k=1}^K{\rm Tr}\left(\mathbf{W}_k^{-1}\boldsymbol\Lambda_k\right)+({\rm others})
\end{align*}
$$
となる。
たとえ$${\boldsymbol\mu_j=\mathbf{x}_n}$$と選んだとしても,$${-\frac{1}{2}{\rm Tr}\left(\mathbf{W}_j^{-1}\boldsymbol\Lambda_j\right)}$$の項があるおかげで$${|\boldsymbol\Lambda_j|\rightarrow\infty}$$となるような解に陥ることはない。
Exercise (10.25)
事後分布の分散が過小評価されてしまう場合,空間全体を網羅するために必要となる関数の数を多くしないといけなくなるため,混合分布の最適な混合数を過大評価する結果につながる。
Exercise (10.26)
$${p(\textsf{\textbf{t}},\mathbf{w},\alpha,\beta)=p(\textsf{\textbf{t}}|\mathbf{w},\beta)p(\mathbf{w}|\alpha)p(\alpha)p(\beta)}$$に対する近似分布$${q(\mathbf{w},\alpha,\beta)=q(\mathbf{w})q(\alpha)q(\beta)}$$を変分原理を用いて求めることを考える。
$${q^*(\alpha)}$$については,式(10.93)-(10.95)と同じになる。
$$
\begin{align*}
q^*(\alpha)&={\rm Gam}(\alpha|a_N,b_N)\\
a_N&=a_0+\frac{M}{2}\\
b_N&=b_0+\frac{1}{2}\mathbb{E}_{\mathbf{w}}\left[\mathbf{w}^{\rm T}\mathbf{w}\right]
\end{align*}
$$
$${q^*(\mathbf{w})}$$については,式(10.99)-(10.101)の$${\beta}$$を$${\mathbb{E}_{\beta}[\beta]}$$に置き換えればよい。
$$
\begin{align*}
q^*(\mathbf{w})&=\mathcal{N}(\mathbf{w}|\mathbf{m}_N,\mathbf{S}_N)\\
\mathbf{m}_N&=\mathbb{E}_{\beta}[\beta]\mathbf{S}_N\boldsymbol\Phi^{\rm T}\textsf{\textbf{t}}\\
\mathbf{S}_N&=\left(\mathbb{E}_{\alpha}[\alpha]\mathbf{I}+\mathbb{E}_{\beta}[\beta]\boldsymbol\Phi^{\rm T}\boldsymbol\Phi\right)^{-1}
\end{align*}
$$
$${q^*(\beta)}$$については,$${\ln q^*(\beta)}$$を計算すると
$$
\begin{align*}
\ln q^*(\beta)&=\mathbb{E}_{\mathbf{w}}[\ln p(\textsf{\textbf{t}}|\mathbf{w},\beta)]+\ln p(\beta|c_0,d_0)+({\rm const})\\
&=\frac{N}{2}\ln \beta-\frac{\beta}{2}\sum_{n=1}^N\mathbb{E}_{\mathbf{w}}\left[(t_n-\mathbf{w}^{\rm T}\boldsymbol\phi_n)^2\right]+(c_0-1)\ln\beta-d_0\beta\\
&=\left\{\left(c_0+\frac{N}{2}\right)-1\right\}\ln \beta-\left\{d_0+\frac{1}{2}\sum_{n=1}^N\mathbb{E}_{\mathbf{w}}\left[(t_n-\mathbf{w}^{\rm T}\boldsymbol\phi_n)^2\right]\right\}\beta
\end{align*}
$$
となるため,
$$
\begin{align*}
q^*(\beta)&={\rm Gam}(\beta|c_N,d_N)\\
c_N&=c_0+\frac{N}{2}\\
d_N&=d_0+\frac{1}{2}\sum_{n=1}^N\mathbb{E}_{\mathbf{w}}\left[(t_n-\mathbf{w}^{\rm T}\boldsymbol\phi_n)^2\right]\\
&=d_0+\frac{1}{2}\textsf{\textbf{t}}^{\rm T}\textsf{\textbf{t}}-\textsf{\textbf{t}}^{\rm T}\boldsymbol\Phi\mathbb{E}_{\mathbf{w}}[\mathbf{w}]+\frac{1}{2}{\rm Tr}\left(\boldsymbol\Phi\mathbb{E}_{\mathbf{w}}[\mathbf{w}\mathbf{w}^{\rm T}]\boldsymbol\Phi^{\rm T}\right)
\end{align*}
$$
となることが分かる。
期待値の部分を式変形すると以下のようになる。
$$
\begin{align*}
\mathbb{E}_{\alpha}\left[\alpha\right]&=\frac{a_N}{b_N}\\
&=\frac{a_0+\frac{M}{2}}{b_0+\frac{1}{2}\mathbb{E}_{\mathbf{w}}\left[\mathbf{w}^{\rm T}\mathbf{w}\right]}\\
&=\frac{a_0+\frac{M}{2}}{b_0+\frac{1}{2}\left(\mathbf{m}_N^{\rm T}\mathbf{m}_N+{\rm Tr}(\mathbf{S}_N)\right)}\\
\mathbb{E}_{\beta}\left[\beta\right]&=\frac{c_N}{d_N}\\
&=\frac{c_0+\frac{N}{2}}{d_0+\frac{1}{2}\textsf{\textbf{t}}^{\rm T}\textsf{\textbf{t}}-\textsf{\textbf{t}}^{\rm T}\boldsymbol\Phi\mathbb{E}_{\mathbf{w}}[\mathbf{w}]+\frac{1}{2}{\rm Tr}\left(\boldsymbol\Phi\mathbb{E}_{\mathbf{w}}[\mathbf{w}\mathbf{w}^{\rm T}]\boldsymbol\Phi^{\rm T}\right)}\\
&=\frac{c_0+\frac{N}{2}}{d_0+\frac{1}{2}\textsf{\textbf{t}}^{\rm T}\textsf{\textbf{t}}-\textsf{\textbf{t}}^{\rm T}\boldsymbol\Phi\mathbf{m}_N+\frac{1}{2}{\rm Tr}\left(\boldsymbol\Phi(\mathbf{m}_N\mathbf{m}_N^{\rm T}+\mathbf{S}_N)\boldsymbol\Phi^{\rm T}\right)}\\
\end{align*}
$$
また,
$$
\begin{align*}
\mathbf{m}_N&=\mathbb{E}_{\beta}[\beta]\mathbf{S}_N\boldsymbol\Phi^{\rm T}\textsf{\textbf{t}}\\
&=\frac{c_N}{d_N}\mathbf{S}_N\boldsymbol\Phi^{\rm T}\textsf{\textbf{t}}\\
\mathbf{S}_N&=\left(\mathbb{E}_{\alpha}[\alpha]\mathbf{I}+\mathbb{E}_{\beta}[\beta]\boldsymbol\Phi^{\rm T}\boldsymbol\Phi\right)^{-1}\\
&=\left(\frac{a_N}{b_N}\mathbf{I}+\frac{c_N}{d_N}\boldsymbol\Phi^{\rm T}\boldsymbol\Phi\right)^{-1}
\end{align*}
$$
となるため,以下の手順によって$${a_N, b_N,d_N,\mathbf{m}_N,\mathbf{S}_N}$$を数値的に求めることができる。
1. $${a_N, b_N,c_N,d_N}$$の初期値を用意
2. $${\mathbf{S}_N}$$を計算
3. $${\mathbf{m}_N}$$を計算
4. $${a_N, b_N,d_N}$$の値を更新
5. 手順2-4を$${a_N, b_N,d_N}$$が収束するまで繰り返す
Lower bound$${\mathcal{L}}$$は以下のように展開できる。
$$
\begin{align*}
\mathcal{L}&=\mathbb{E}_{\mathbf{w},\alpha,\beta}\left[\ln p(\textsf{\textbf{t}},\mathbf{w},\alpha,\beta)\right]-\mathbb{E}_{\mathbf{w},\alpha,\beta}\left[\ln q(\mathbf{w},\alpha,\beta)\right]\\
&=\mathbb{E}_{\mathbf{w},\alpha,\beta}\left[\ln \left\{p(\textsf{\textbf{t}}|\mathbf{w},\beta)p(\mathbf{w}|\alpha)p(\alpha)p(\beta)\right\}\right]-\mathbb{E}_{\mathbf{w},\alpha,\beta}\left[\ln \left\{q(\mathbf{w})q(\alpha)q(\beta)\right\}\right]\\
&=\mathbb{E}_{\mathbf{w},\beta}\left[\ln p(\textsf{\textbf{t}}|\mathbf{w},\beta)\right]+\mathbb{E}_{\mathbf{w},\alpha}\left[\ln p(\mathbf{w}|\alpha)\right]+\mathbb{E}_{\alpha}\left[\ln p(\alpha)\right]+\mathbb{E}_{\beta}\left[\ln p(\beta)\right]-\mathbb{E}_{\mathbf{w}}\left[\ln q(\mathbf{w})\right]-\mathbb{E}_{\alpha}\left[\ln q(\alpha)\right]-\mathbb{E}_{\beta}\left[\ln q(\beta)\right]\\
\end{align*}
$$
各項を具体的に計算すると,以下の表式が得られる。
$$
\begin{align*}
\mathbb{E}_{\mathbf{w},\beta}\left[\ln p(\textsf{\textbf{t}}|\mathbf{w},\beta)\right]&=\frac{N}{2}\left(\psi(c_N)-\ln(2\pi d_N)\right)-\frac{c_N}{2d_N}\left(\textsf{\textbf{t}}^{\rm T}\textsf{\textbf{t}}-2\textsf{\textbf{t}}^{\rm T}\boldsymbol\Phi\mathbf{m}_N+{\rm Tr}\left(\boldsymbol\Phi\left(\mathbf{m}_N\mathbf{m}_N^{\rm T}+\mathbf{S}_N\right)\boldsymbol\Phi^{\rm T}\right)\right)\\
\mathbb{E}_{\mathbf{w},\alpha}\left[\ln p(\mathbf{w}|\alpha)\right]&=\frac{M}{2}\left(\psi(a_N)-\ln(2\pi b_N)\right)-\frac{a_N}{2b_N}\left(\mathbf{m}_N^{\rm T}\mathbf{m}_N+{\rm Tr}(\mathbf{S}_N)\right)\\
\mathbb{E}_{\alpha}\left[\ln p(\alpha)\right]&=-\ln\Gamma(a_0)+a_0\ln b_0+(a_0-1)\left(\psi(a_N)-\ln b_N\right)-\frac{b_0a_N}{b_N}\\
\mathbb{E}_{\beta}\left[\ln p(\beta)\right]&=-\ln\Gamma(c_0)+c_0\ln d_0+(c_0-1)\left(\psi(c_N)-\ln d_N\right)-\frac{d_0c_N}{d_N}\\
\mathbb{E}_{\mathbf{w}}\left[\ln q(\mathbf{w})\right]&=-\frac{M}{2}(1+\ln 2\pi)-\frac{1}{2}\ln|\mathbf{S}_N|\\
\mathbb{E}_{\alpha}\left[\ln q(\alpha)\right]&=-\ln\Gamma(a_N)+(a_N-1)\psi(a_N)+\ln b_N-a_N\\
\mathbb{E}_{\beta}\left[\ln q(\beta)\right]&=-\ln\Gamma(c_N)+(c_N-1)\psi(c_N)+\ln d_N-c_N\\
\end{align*}
$$
予測分布$${p(t|\mathbf{x},\textsf{\textbf{t}})}$$を$${q(\mathbf{w}),q(\beta)}$$を用いて近似的に求めることを考えると,
$$
\begin{align*}
p(t|\mathbf{x},\textsf{\textbf{t}})&=\int{\rm d}\mathbf{w}\int{\rm d}\beta p(t,\mathbf{w},\beta|\mathbf{x},\textsf{\textbf{t}})\\
&=\int{\rm d}\mathbf{w}\int{\rm d}\beta p(t|\mathbf{w},\beta,\mathbf{x})p(\mathbf{w},\beta|\textsf{\textbf{t}})\\
&\simeq\int{\rm d}\mathbf{w}\int{\rm d}\beta p(t|\mathbf{w},\beta,\mathbf{x})q(\mathbf{w})q(\beta)\\
&=\int{\rm d}\mathbf{w}\left\{\int{\rm d}\beta{\rm Gam}(\beta|c_N,d_N)\mathcal{N}\left(t\left|\mathbf{w}^{\rm T}\boldsymbol\phi(\mathbf{x}),\beta^{-1}\right.\right)\right\}\mathcal{N}\left(\mathbf{w}\left|\mathbf{m}_N,\mathbf{S}_N\right.\right)\\
&=\int{\rm d}\mathbf{w}{\rm St}\left(t\left|\mathbf{w}^{\rm T}\boldsymbol\phi(\mathbf{x}),\frac{c_N}{d_N},2c_N\right.\right)\mathcal{N}\left(\mathbf{w}\left|\mathbf{m}_N,\mathbf{S}_N\right.\right)\\
&\simeq\int{\rm d}\mathbf{w}\mathcal{N}\left(t\left|\mathbf{w}^{\rm T}\boldsymbol\phi(\mathbf{x}),\frac{d_N}{c_N}\right.\right)\mathcal{N}\left(\mathbf{w}\left|\mathbf{m}_N,\mathbf{S}_N\right.\right)\ \ \ \ \ (c_N\gg 1と仮定)\\
&=\mathcal{N}\left(t\left|\mathbf{m}_N^{\rm T}\boldsymbol\phi(\mathbf{x}),\frac{d_N}{c_N}+\boldsymbol\phi(\mathbf{x})^{\rm T}\mathbf{S}_N\boldsymbol\phi(\mathbf{x})\right.\right)
\end{align*}
$$
が得られる。上式は式(10.106)に含まれる$${\beta}$$を$${\mathbb{E}_{\beta}[\beta]}$$に置き換えたことに相当する。
Exercise (10.27)
Exercise (10.26)で求めたLower boundの項に含まれる$${\mathbb{E}_{\beta}[\beta]=\frac{c_N}{d_N}}$$を$${\beta}$$に置き換えればよい。
Exercise (10.28)
$$
\begin{align*}
p(\mathbf{X},\mathbf{Z}|\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)&=p(\mathbf{X}|\mathbf{Z},\boldsymbol\mu,\boldsymbol\Lambda)p(\mathbf{Z}|\boldsymbol\pi)\\
&=\prod_{n=1}^N\prod_{k=1}^K\left\{\pi_k\mathcal{N}\left(\mathbf{x}_n|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right)\right\}^{z_{nk}}\\
p(\mathbf{x}_n,\mathbf{z}_n|\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)&:=\prod_{k=1}^K\left\{\pi_k\mathcal{N}\left(\mathbf{x}_n|\boldsymbol\mu_k,\boldsymbol\Lambda_k^{-1}\right)\right\}^{z_{nk}}\\
&=\left(\frac{1}{2\pi}\right)^{D/2}\prod_{k=1}^K\left\{\pi_k|\boldsymbol\Lambda_k|^{1/2}\exp\left(-\frac{1}{2}(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right)\right\}^{z_{nk}}\\
&=\left(\frac{1}{2\pi}\right)^{D/2}\exp\left(\sum_{k=1}^Kz_{nk}\left(\ln\pi_k+\frac{1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)\right)\right)\\
&=\left(\frac{1}{2\pi}\right)^{D/2}\exp\left(\sum_{k=1}^Kz_{nk}\left(\ln\pi_k+\frac{1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k+\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n-\frac{1}{2}\mathbf{x}_n^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n\right)\right)\\
&=\left(\frac{1}{2\pi}\right)^{D/2}\exp\left(\sum_{k=1}^Kz_{nk}\left(\ln\pi_k+\frac{1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k+\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n-\frac{1}{2}{\rm Tr}(\boldsymbol\Lambda_k\mathbf{x}_n\mathbf{x}_n^{\rm T})\right)\right)\\
\end{align*}
$$
ここで,$${\boldsymbol\Lambda_k=\begin{pmatrix}\boldsymbol\lambda_{k1}&\boldsymbol\lambda_{k2}&\cdots&\boldsymbol\lambda_{kD}\end{pmatrix}^{\rm T}}$$,$${\mathbf{x}_n\mathbf{x}_n^{\rm T}=\begin{pmatrix}\textsf{\textbf{x}}_{n1}&\textsf{\textbf{x}}_{n2}&\cdots&\textsf{\textbf{x}}_{nD}\end{pmatrix}}$$とおくと,
$$
\begin{align*}
({\rm r.h.s.})&=\left(\frac{1}{2\pi}\right)^{D/2}\exp\left(\sum_{k=1}^Kz_{nk}\left(\ln\pi_k+\frac{1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k+\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n-\frac{1}{2}\sum_{i=1}^D\boldsymbol\lambda_{ki}^{\rm T}\textsf{\textbf{x}}_{ni}\right)\right)\\
\end{align*}
$$
指数をベクトル表記でまとめると,
$$
\begin{align*}
\sum_{k=1}^Kz_{nk}\left(\ln\pi_k+\frac{1}{2}\ln|\boldsymbol\Lambda_k|-\frac{1}{2}\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k+\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\mathbf{x}_n-\frac{1}{2}\sum_{i=1}^D\boldsymbol\lambda_{ki}^{\rm T}\textsf{\textbf{x}}_{ni}\right)&=\sum_{k=1}^Kz_{nk}\begin{pmatrix}\ln\pi_k&\ln|\boldsymbol\Lambda_k|&\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k&\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k&\frac{1}{2}\boldsymbol\lambda_{k1}^{\rm T}&\cdots&\frac{1}{2}\boldsymbol\lambda_{kD}^{\rm T}\end{pmatrix}\begin{pmatrix}1\\ \frac{1}{2}\\ -\frac{1}{2}\\ \mathbf{x}_n\\ -\textsf{\textbf{x}}_{n1}\\ \vdots\\ -\textsf{\textbf{x}}_{nD}\end{pmatrix}\\
&=:\sum_{k=1}^Kz_{nk}\boldsymbol\eta_k^{\rm T}\mathbf{u}(\mathbf{x}_n)\\
&=\begin{pmatrix}\boldsymbol\eta_1^{\rm T}&\cdots&\boldsymbol\eta_K^{\rm T}\end{pmatrix}\begin{pmatrix} z_{n1}\mathbf{u}(\mathbf{x}_n)\\ \vdots\\ z_{nK}\mathbf{u}(\mathbf{x}_n)\end{pmatrix}\\
&=:\boldsymbol\eta^{\rm T}\mathbf{u}(\mathbf{x}_n,\mathbf{z}_n)
\end{align*}
$$
以上より,$${p(\mathbf{X},\mathbf{Z}|\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)}$$は
$$
\begin{align*}
p(\mathbf{X},\mathbf{Z}|\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)&=\prod_{n=1}^Nh(\mathbf{x}_n,\mathbf{z}_n)g(\boldsymbol\eta)\exp\left\{\boldsymbol\eta^{\rm T}\mathbf{u}(\mathbf{x}_n,\mathbf{z}_n)\right\}\\
h(\mathbf{x}_n,\mathbf{z}_n)&=\left(\frac{1}{2\pi}\right)^{D/2}\\
g(\boldsymbol\eta)&=1\\
\boldsymbol\eta&=\begin{pmatrix}\boldsymbol\eta_1^{\rm T}&\cdots&\boldsymbol\eta_K^{\rm T}\end{pmatrix}^{\rm T}\\
\boldsymbol\eta_k&=\begin{pmatrix}\ln\pi_k&\ln|\boldsymbol\Lambda_k|&\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k&\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k&\frac{1}{2}\boldsymbol\lambda_{k1}^{\rm T}&\cdots&\frac{1}{2}\boldsymbol\lambda_{kD}^{\rm T}\end{pmatrix}^{\rm T}\\
\mathbf{u}(\mathbf{x}_n,\mathbf{z}_n)&=\begin{pmatrix}z_{n1}\mathbf{u}(\mathbf{x}_n)^{\rm T}&\cdots&z_{nK}\mathbf{u}(\mathbf{x}_n)^{\rm T}\end{pmatrix}^{\rm T}\\
\mathbf{u}(\mathbf{x}_n)&=\begin{pmatrix}1& \frac{1}{2}& -\frac{1}{2}& \mathbf{x}_n^{\rm T}& -\textsf{\textbf{x}}_{n1}^{\rm T}& \cdots& -\textsf{\textbf{x}}_{nD}^{\rm T}\end{pmatrix}^{\rm T}
\end{align*}
$$
と表すことができる。
これらを式(10.115)に代入すると,
$$
\begin{align*}
\ln q^*(\mathbf{Z})&=\sum_{n=1}^N\left\{\ln h(\mathbf{x}_n,\mathbf{z}_n)+\mathbb{E}[\boldsymbol\eta^{\rm T}]\mathbf{u}(\mathbf{x}_n,\mathbf{z}_n)\right\}+{\rm const}\\
&=\sum_{n=1}^N\left\{-\frac{D}{2}\ln(2\pi)+\mathbb{E}[\boldsymbol\eta^{\rm T}]\mathbf{u}(\mathbf{x}_n,\mathbf{z}_n)\right\}+{\rm const}\\
&=\sum_{n=1}^N\left\{-\frac{D}{2}\ln(2\pi)+\sum_{k=1}^Kz_{nk}\mathbb{E}[\boldsymbol\eta_k^{\rm T}]\mathbf{u}(\mathbf{x}_n)\right\}+{\rm const}\\
&=\sum_{n=1}^N\left\{-\frac{D}{2}\ln(2\pi)+\sum_{k=1}^Kz_{nk}\left(\mathbb{E}[\ln\pi_k]+\frac{1}{2}\mathbb{E}[\ln|\boldsymbol\Lambda_k|]-\frac{1}{2}\mathbb{E}[(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)]\right)\right\}+{\rm const}\\
&=\sum_{n=1}^N\sum_{k=1}^Kz_{nk}\left(\mathbb{E}[\ln\pi_k]+\frac{1}{2}\mathbb{E}[\ln|\boldsymbol\Lambda_k|]-\frac{D}{2}\ln(2\pi)-\frac{1}{2}\mathbb{E}[(\mathbf{x}_n-\boldsymbol\mu_k)^{\rm T}\boldsymbol\Lambda_k(\mathbf{x}_n-\boldsymbol\mu_k)]\right)+{\rm const}\\
&=\sum_{n=1}^N\sum_{k=1}^Kz_{nk}\ln\rho_{nk}+{\rm const}
\end{align*}
$$
となり,式(10.45)が得られる。
$${\beta_0\mathbf{m}_0\mathbf{m}_0^{\rm T}+\mathbf{W}_0^{-1}=:\begin{pmatrix}\mathbf{a}_1&\cdots&\mathbf{a}_D\end{pmatrix}}$$とおくと,
$$
\begin{align*}
p(\boldsymbol\pi,\boldsymbol\mu,\boldsymbol\Lambda)&=p(\boldsymbol\pi)p(\boldsymbol\mu,\boldsymbol\Lambda)\\
&=p(\boldsymbol\pi)p(\boldsymbol\mu|\boldsymbol\Lambda)p(\boldsymbol\Lambda)\\
&={\rm Dir}(\boldsymbol\pi|\boldsymbol\alpha_0)\prod_{k=1}^K\mathcal{N}(\boldsymbol\mu_k|\mathbf{m}_0,(\beta_0\boldsymbol\Lambda_k)^{-1})\mathcal{W}(\boldsymbol\Lambda_k|\mathbf{W}_0,\nu_0)\\
&=C(\boldsymbol\alpha_0)\left(\frac{\beta_0}{2\pi}\right)^{DK/2}\left\{B(\mathbf{W}_0,\nu_0)\right\}^K\exp\left(\sum_{k=1}^K\left\{(\alpha_0-1)\ln\pi_k+\frac{\nu_0-D}{2}\ln|\boldsymbol\Lambda_k|-\frac{\beta_0}{2}\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\boldsymbol\mu_k+\beta_0\boldsymbol\mu_k^{\rm T}\boldsymbol\Lambda_k\mathbf{m}_0-\frac{1}{2}{\rm Tr}\left(\boldsymbol\Lambda_k\left(\beta_0\mathbf{m}_0\mathbf{m}_0^{\rm T}+\mathbf{W}_0^{-1}\right)\right)\right\}\right)\\
&=C(\boldsymbol\alpha_0)\left(\frac{\beta_0}{2\pi}\right)^{DK/2}\left\{B(\mathbf{W}_0,\nu_0)\right\}^K\exp\left(\sum_{k=1}^K\boldsymbol\eta_k^{\rm T}\begin{pmatrix}\alpha_0-1\\ \frac{\nu_0-D}{2}\\ -\frac{\beta_0}{2} \\ \beta_0\mathbf{m}_0 \\ -\mathbf{a}_1 \\ \vdots \\ -\mathbf{a}_D \end{pmatrix}\right)\\
&=:C(\boldsymbol\alpha_0)\left(\frac{\beta_0}{2\pi}\right)^{DK/2}\left\{B(\mathbf{W}_0,\nu_0)\right\}^K\exp\left(\sum_{k=1}^K\boldsymbol\eta_k^{\rm T}\boldsymbol\chi_k\right)\\
&=:C(\boldsymbol\alpha_0)\left(\frac{\beta_0}{2\pi}\right)^{DK/2}\left\{B(\mathbf{W}_0,\nu_0)\right\}^K\exp\left(\boldsymbol\eta^{\rm T}\boldsymbol\chi_0\right)\\
\boldsymbol\chi_0&:=\begin{pmatrix}\boldsymbol\chi_1^{\rm T}&\cdots&\boldsymbol\chi_K^{\rm T}\end{pmatrix}^{\rm T}\\
\boldsymbol\chi_1&=\boldsymbol\chi_2=\cdots=\boldsymbol\chi_K\\
&=\begin{pmatrix}\alpha_0-1& \frac{\nu_0-D}{2}& -\frac{\beta_0}{2} & \beta_0\mathbf{m}_0^{\rm T} & -\mathbf{a}_1^{\rm T} & \cdots & -\mathbf{a}_D^{\rm T} \end{pmatrix}^{\rm T}
\end{align*}
$$
と表すことができる。
式(10.121)に従って$${\boldsymbol\chi_N}$$を計算すると,
$$
\begin{align*}
\boldsymbol\chi_N&=\boldsymbol\chi_0+\sum_{n=1}^N\mathbb{E}_{\mathbf{z}_n}[\mathbf{u}(\mathbf{x}_n,\mathbf{z}_n)]\\
&=\begin{pmatrix}\boldsymbol\chi_1\\ \vdots\\ \boldsymbol\chi_K\end{pmatrix}+\sum_{n=1}^N\begin{pmatrix}r_{n1}\mathbf{u}(\mathbf{x}_n)\\ \vdots\\ r_{nK}\mathbf{u}(\mathbf{x}_n)\end{pmatrix}\\
\boldsymbol\chi_k+ \sum_{n=1}^Nr_{nk}\mathbf{u}(\mathbf{x}_n)&=\begin{pmatrix}\alpha_0-1\\ \frac{\nu_0-D}{2}\\ -\frac{\beta_0}{2} \\ \beta_0\mathbf{m}_0 \\ -\mathbf{a}_1 \\ \vdots \\ -\mathbf{a}_D \end{pmatrix}+\begin{pmatrix}N_k\\ \frac{N_k}{2}\\ -\frac{N_k}{2} \\ N_k\overline{\mathbf{x}}_k \\ -\sum_{n=1}^Nr_{nk}\textsf{\textbf{x}}_{n1} \\ \vdots \\ -\sum_{n=1}^Nr_{nk}\textsf{\textbf{x}}_{nD} \end{pmatrix}\\
&=\begin{pmatrix}\alpha_0+N_k-1\\ \frac{\nu_0+N_k-D}{2}\\ -\frac{\beta_0+N_k}{2} \\ \beta_0\mathbf{m}_0+N_k\overline{\mathbf{x}}_k \\ -\left(\mathbf{a}_1+\sum_{n=1}^Nr_{nk}\textsf{\textbf{x}}_{n1}\right) \\ \vdots \\ -\left(\mathbf{a}_D+\sum_{n=1}^Nr_{nk}\textsf{\textbf{x}}_{nD}\right) \end{pmatrix}\\
&=\begin{pmatrix}\alpha_k-1\\ \frac{\nu_k-D}{2}\\ -\frac{\beta_k}{2} \\ \beta_k\mathbf{m}_k \\ -\left(\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}+\mathbf{W}_k^{-1}\right)_{:1} \\ \vdots \\ -\left(\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}+\mathbf{W}_k^{-1}\right)_{:D} \end{pmatrix}
\end{align*}
$$
ここで,$${\left(\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}+\mathbf{W}_k^{-1}\right)_{:i}}$$は$${\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}+\mathbf{W}_k^{-1}}$$の第$${i}$$列ベクトルを表す。
上式より,
$$
\begin{align*}
\alpha_k&=\alpha_0+N_k\\
\nu_k&=\nu_0+N_k\\
\beta_k&=\beta_0+N_k\\
\mathbf{m}_k&=\frac{1}{\beta_k}(\beta_0\mathbf{m}_0+N_k\overline{\mathbf{x}}_k)\\
\mathbf{W}_k^{-1}&=\mathbf{W}_0^{-1}+\sum_{n=1}^Nr_{nk}\mathbf{x}_n\mathbf{x}_n^{\rm T}+\beta_0\mathbf{m}_0\mathbf{m}_0^{\rm T}-\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}\\
&=\mathbf{W}_0^{-1}+N_k\mathbf{S}_k+N_k\overline{\mathbf{x}}_k\overline{\mathbf{x}}_k^{\rm T}+\beta_0\mathbf{m}_0\mathbf{m}_0^{\rm T}-\beta_k\mathbf{m}_k\mathbf{m}_k^{\rm T}\\
&=\mathbf{W}_0^{-1}+N_k\mathbf{S}_k+\frac{\beta_0 N_k}{\beta_k}(\overline{\mathbf{x}}_k-\mathbf{m}_0)(\overline{\mathbf{x}}_k-\mathbf{m}_0)^{\rm T}
\end{align*}
$$
となることが示される。
Exercise (10.29)
$$
\begin{align*}
\frac{{\rm d}^2}{{\rm d}x^2}\ln x&=\frac{{\rm d}}{{\rm d}x}\frac{1}{x}\\
&=-\frac{1}{x^2}\\
&<0\ \ \ ({\rm for}\ 0<x<\infty)
\end{align*}
$$
より,$${f(x)=\ln x}$$は凹関数である。
$$
\begin{align*}
y(x)&:=f(\xi)+f^{(1)}(\xi)(x-\xi)\\
&=\ln(\xi)+\xi^{-1}(x-\xi)
\end{align*}
$$
は$${y(x)\geq f(x)}$$を満たし,等式は$${x=\xi}$$で成立する。
$${\lambda=\xi^{-1}}$$とおくと,
$$
\begin{align*}
y(x,\lambda)&=-\ln\lambda+\lambda(x-\lambda^{-1})\\
&=\lambda x-(\ln\lambda+1)\\
f(x)&={\rm min}_{\lambda}\left\{\lambda x-(\ln\lambda+1)\right\}\\
&={\rm min}_{\lambda}\left\{\lambda x-g(\lambda)\right\}\\
\therefore g(\lambda)&=\ln\lambda+1
\end{align*}
$$
$${x\lambda -g(\lambda)}$$は$${\lambda=x^{-1}}$$のときに最小値を取るため,
$$
\begin{align*}
f(x)&={\rm min}_{\lambda}\left\{x\lambda -(\ln\lambda+1)\right\}\\
&=xx^{-1}-\ln x^{-1}-1\\
&=\ln x
\end{align*}
$$
となり,$${f(x)=\ln x}$$が再現される。
Exercise (10.30)
$$
\begin{align*}
\frac{{\rm d}^2}{{\rm d}x^2}(-\ln(1+{\rm e}^{-x}))&=\frac{{\rm d}}{{\rm d}x}\frac{{\rm e}^{-x}}{1+{\rm e}^{-x}}\\
&=-\frac{{\rm e}^{-x}}{(1+{\rm e}^{-x})^2}\\
&<0\ \ \ ({\rm for}\ -\infty <x<\infty)
\end{align*}
$$
より,$${f(x)=-\ln(1+{\rm e}^{-x})}$$は凹関数である。
$$
\begin{align*}
y(x)&:=f(\xi)+f^{(1)}(\xi)(x-\xi)\\
&=-\ln(1+{\rm e}^{-\xi})+\frac{{\rm e}^{-\xi}}{1+{\rm e}^{-\xi}}(x-\xi)
\end{align*}
$$
は$${y(x)\geq f(x)}$$を満たし,等式は$${x=\xi}$$で成立する。
$${\lambda={\rm e}^{-\xi}(1+{\rm e}^{-\xi})^{-1}}$$とおくと,
$$
\begin{align*}
y(x,\lambda)&=\ln(1-\lambda)+\lambda\left(x+\ln\left(\frac{\lambda}{1-\lambda}\right)\right)\\
&=\lambda x-\left\{-\lambda\ln\lambda-(1-\lambda)\ln(1-\lambda)\right\}\\
\ln\sigma(x)&=f(x)={\rm min}_{\lambda}\left\{\lambda x-\left\{-\lambda\ln\lambda-(1-\lambda)\ln(1-\lambda)\right\}\right\}\\
&={\rm min}_{\lambda}\left\{\lambda x-g(\lambda)\right\}\\
\therefore g(\lambda)&=-\lambda\ln\lambda-(1-\lambda)\ln(1-\lambda)\\
\sigma(x)&\leq\exp\left(\lambda x-g(\lambda)\right)
\end{align*}
$$
Exercise (10.31) - (10.39)
Exercise (10.31)
$$
\begin{align*}
\frac{{\rm d}^2}{{\rm d}x^2}(-\ln({\rm e}^{x/2}+{\rm e}^{-x/2}))&=-\frac{1}{2}\frac{{\rm d}}{{\rm d}x}\tanh(x/2)\\
&=-\frac{1}{4\cosh^2(x/2)}\\
&<0\ \ \ ({\rm for}\ -\infty <x<\infty)
\end{align*}
$$
より,$${f(x)=-\ln({\rm e}^{x/2}+{\rm e}^{-x/2})}$$は凹関数である。
一方,$${x^2=z}$$に対して二階微分すると,
$$
\begin{align*}
\frac{{\rm d}^2}{{\rm d}z^2}(-\ln({\rm e}^{\sqrt{z}/2}+{\rm e}^{-\sqrt{z}/2}))&=-\frac{1}{4}\frac{{\rm d}}{{\rm d}z}(z^{-1/2}\tanh(\sqrt{z}/2))\\
&=-\frac{1}{4}\left(-\frac{1}{2}z^{-3/2}\tanh(\sqrt{z}/2)+\frac{z^{-1}}{4\cosh^2(\sqrt{z}/2)}\right)\\
&=\frac{z^{-3/2}}{4({\rm e}^{\sqrt{z}/2}+{\rm e}^{-\sqrt{z}/2})^2}\left(\sinh(\sqrt{z})-\sqrt{z}\right)\\
&> 0\ \ \ ({\rm for}\ 0 < z<\infty)
\end{align*}
$$
より,$${f(x)=-\ln({\rm e}^{x/2}+{\rm e}^{-x/2})}$$は$${x^2}$$に対しては凸関数である。
グラフ形状を以下に示す。
$$
\begin{align*}
y(x^2)&:=f(\sqrt{\xi^2})+f^{(1)}(\sqrt{\xi^2})(x^2-\xi^2)\\
&=-\ln({\rm e}^{\xi/2}+{\rm e}^{-\xi/2})-\frac{1}{4\xi}\tanh(\xi/2)(x^2-\xi^2)
\end{align*}
$$
は$${y(x^2)\leq f(\sqrt{x^2})}$$を満たし,等式は$${x^2=\xi^2}$$で成立する。
$${\lambda(\xi)=-\frac{1}{4\xi}\tanh(\xi/2)}$$とおくと,
$$
\begin{align*}
y(x^2,\lambda)&=\lambda(\xi)x^2-\left(\lambda(\xi)\xi^2+\ln({\rm e}^{\xi/2}+{\rm e}^{-\xi/2})\right)\\
&=\lambda(\xi)\left(x^2-\xi^2\right)-\ln\left\{{\rm e}^{\xi/2}(1+{\rm e}^{-\xi})\right\}\\
&=\lambda(\xi)\left(x^2-\xi^2\right)-\xi/2+\ln\sigma(\xi)\\
&\leq f(x)=-x/2+\ln\sigma(x)\\
\therefore \sigma(x)&\geq\sigma(\xi)\exp\left\{(x-\xi)/2+\lambda(\xi)(x^2-\xi^2)\right\}
\end{align*}
$$
Exercise (10.32)
$$
\begin{align*}
\mathbf{S}_N^{-1}&=\mathbf{S}_0^{-1}+2\sum_{n=1}^N\lambda(\xi_n)\boldsymbol\phi_n\boldsymbol\phi_n^{\rm T}\\
&=\left\{\mathbf{S}_0^{-1}+2\lambda(\xi_1)\boldsymbol\phi_1\boldsymbol\phi_1^{\rm T}\right\}+2\sum_{n=2}^N\lambda(\xi_n)\boldsymbol\phi_n\boldsymbol\phi_n^{\rm T}\\
&=\mathbf{S}_1^{-1}+2\sum_{n=2}^N\lambda(\xi_n)\boldsymbol\phi_n\boldsymbol\phi_n^{\rm T}\\
&=\mathbf{S}_2^{-1}+2\sum_{n=3}^N\lambda(\xi_n)\boldsymbol\phi_n\boldsymbol\phi_n^{\rm T}\\
&\vdots\\
\mathbf{S}_N^{-1}\mathbf{m}_N&=\mathbf{S}_0^{-1}\mathbf{m}_0+\sum_{n=1}^N(t_n-1/2)\boldsymbol\phi_n\\
&=\left\{\mathbf{S}_0^{-1}\mathbf{m}_0+(t_1-1/2)\boldsymbol\phi_1\right\}+\sum_{n=2}^N(t_n-1/2)\boldsymbol\phi_n\\
&=\mathbf{S}_1^{-1}\mathbf{m}_1+\sum_{n=2}^N(t_n-1/2)\boldsymbol\phi_n\\
&=\mathbf{S}_2^{-1}\mathbf{m}_2+\sum_{n=3}^N(t_n-1/2)\boldsymbol\phi_n\\
&\vdots\\
\end{align*}
$$
となり,1データのみを用いて段階的に更新することができる。
Exercise (10.33)
$$
\begin{align*}
\frac{\partial\mathcal{Q}(\boldsymbol\xi,\boldsymbol\xi^{\rm old})}{\partial\xi_n}&=\frac{1}{\sigma(\xi_n)}\frac{{\rm d}\sigma(\xi_n)}{{\rm d}\xi_n}-\frac{1}{2}-\frac{{\rm d}\lambda(\xi_n)}{{\rm d}\xi_n}\left(\boldsymbol\phi_n^{\rm T}\mathbb{E}\left[\mathbf{w}\mathbf{w}^{\rm T}\right]\boldsymbol\phi_n-\xi_n^2\right)+2\lambda(\xi_n)\xi_n\\
&=\sigma(-\xi_n)-\frac{1}{2}-\frac{{\rm d}\lambda(\xi_n)}{{\rm d}\xi_n}\left(\boldsymbol\phi_n^{\rm T}\mathbb{E}\left[\mathbf{w}\mathbf{w}^{\rm T}\right]\boldsymbol\phi_n-\xi_n^2\right)+\sigma(\xi_n)-\frac{1}{2}\\
&=-\frac{{\rm d}\lambda(\xi_n)}{{\rm d}\xi_n}\left(\boldsymbol\phi_n^{\rm T}\mathbb{E}\left[\mathbf{w}\mathbf{w}^{\rm T}\right]\boldsymbol\phi_n-\xi_n^2\right)\\
&=0\\
\therefore \left(\xi_n^{\rm new}\right)^2&=\boldsymbol\phi_n^{\rm T}\mathbb{E}\left[\mathbf{w}\mathbf{w}^{\rm T}\right]\boldsymbol\phi_n\\
&=\boldsymbol\phi_n^{\rm T}\left(\mathbf{S}_N+\mathbf{m}_N\mathbf{m}_N^{\rm T}\right)\boldsymbol\phi_n
\end{align*}
$$
Exercise (10.34)
$$
\begin{align*}
\frac{\partial}{\partial\xi_n}\ln|\mathbf{S}_N|&=-\frac{\partial}{\partial\xi_n}\ln|\mathbf{S}_N^{-1}|\\&=-{\rm Tr}\left(\mathbf{S}_N\frac{\partial\mathbf{S}_N^{-1}}{\partial\xi_n}\right)\\
&=-{\rm Tr}\left(\mathbf{S}_N2\lambda^{(1)}(\xi_n)\boldsymbol\phi_n\boldsymbol\phi_n^{\rm T}\right)\\
&=-2\lambda^{(1)}(\xi_n)\boldsymbol\phi_n^{\rm T}\mathbf{S}_N\boldsymbol\phi_n\\
\frac{\partial}{\partial\xi_n}\mathbf{m}_N^{\rm T}\mathbf{S}_N^{-1}\mathbf{m}_N&=2\frac{\partial \mathbf{m}_N^{\rm T}}{\partial\xi_n}\mathbf{S}_N^{-1}\mathbf{m}_N+\mathbf{m}_N^{\rm T}\frac{\partial\mathbf{S}_N^{-1}}{\partial\xi_n}\mathbf{m}_N\\
&=2\mathbf{m}_N^{\rm T}\mathbf{S}_N^{-1}\frac{\partial\mathbf{S}_N}{\partial\xi_n}\mathbf{S}_N^{-1}\mathbf{m}_N+\mathbf{m}_N^{\rm T}\frac{\partial\mathbf{S}_N^{-1}}{\partial\xi_n}\mathbf{m}_N\\
&=-2\mathbf{m}_N^{\rm T}\frac{\partial\mathbf{S}_N^{-1}}{\partial\xi_n}\mathbf{m}_N+\mathbf{m}_N^{\rm T}\frac{\partial\mathbf{S}_N^{-1}}{\partial\xi_n}\mathbf{m}_N\\
&=-\mathbf{m}_N^{\rm T}\frac{\partial\mathbf{S}_N^{-1}}{\partial\xi_n}\mathbf{m}_N\\
&=-\mathbf{m}_N^{\rm T}\left(2\lambda^{(1)}(\xi_n)\boldsymbol\phi_n\boldsymbol\phi_n^{\rm T}\right)\mathbf{m}_N\\
&=-2\lambda^{(1)}(\xi_n)\boldsymbol\phi_n^{\rm T}\mathbf{m}_N\mathbf{m}_N^{\rm T}\boldsymbol\phi_n\\
\frac{\partial}{\partial\xi_n}\sum_{n=1}^N\left\{\ln\sigma(\xi_n)-\frac{1}{2}\xi_n+\lambda(\xi_n)\xi_n^2\right\}&=\frac{\sigma^{(1)}(\xi_n)}{\sigma(\xi_n)}-\frac{1}{2}+\lambda^{(1)}(\xi_n)\xi_n^2+2\xi_n\lambda(\xi_n)\\
&=1-\sigma(\xi_n)-\frac{1}{2}+\lambda^{(1)}(\xi_n)\xi_n^2+\sigma(\xi_n)-\frac{1}{2}\\
&=\lambda^{(1)}(\xi_n)\xi_n^2
\end{align*}
$$
より,
$$
\begin{align*}
\frac{\partial\mathcal{L}(\boldsymbol\xi)}{\partial\xi_n}&=\frac{1}{2}\frac{\partial}{\partial\xi_n}\ln|\mathbf{S}_N|+\frac{1}{2}\frac{\partial}{\partial\xi_n}\mathbf{m}_N^{\rm T}\mathbf{S}_N^{-1}\mathbf{m}_N+\frac{\partial}{\partial\xi_n}\sum_{n=1}^N\left\{\ln\sigma(\xi_n)-\frac{1}{2}\xi_n+\lambda(\xi_n)\xi_n^2\right\}\\
&=-\lambda^{(1)}(\xi_n)\boldsymbol\phi_n^{\rm T}\mathbf{S}_N\boldsymbol\phi_n-\lambda^{(1)}(\xi_n)\boldsymbol\phi_n^{\rm T}\mathbf{m}_N\mathbf{m}_N^{\rm T}\boldsymbol\phi_n+\lambda^{(1)}(\xi_n)\xi_n^2\\
&=\lambda^{(1)}(\xi_n)\left(\xi_n^2-\boldsymbol\phi_n^{\rm T}(\mathbf{S}_N+\mathbf{m}_N\mathbf{m}_N^{\rm T})\boldsymbol\phi_n\right)\\
&=0\\
\therefore (\xi_n^{\rm new})^2&=\boldsymbol\phi_n^{\rm T}(\mathbf{S}_N+\mathbf{m}_N\mathbf{m}_N^{\rm T})\boldsymbol\phi_n
\end{align*}
$$
Exercise (10.35)
$$
\begin{align*}
h(\mathbf{w},\boldsymbol\xi)&=\prod_{n=1}^N\sigma(\xi_n)\exp\left(\mathbf{w}^{\rm T}\boldsymbol\phi_n t_n-\frac{1}{2}(\mathbf{w}^{\rm T}\boldsymbol\phi_n+\xi_n)-\lambda(\xi_n)(\mathbf{w}^{\rm T}\boldsymbol\phi_n\boldsymbol\phi_n^{\rm T}\mathbf{w}-\xi_n^2)\right)\\
&=\left\{\prod_{n=1}^N\sigma(\xi_n)\exp\left(-\frac{1}{2}\xi_n+\lambda(\xi_n)\xi_n^2\right)\right\}\exp\left(-\frac{1}{2}\mathbf{w}^{\rm T}\left(2\sum_{n=1}^N\lambda(\xi_n)\boldsymbol\phi_n\boldsymbol\phi_n^{\rm T}\right)\mathbf{w}+\mathbf{w}^{\rm T}\sum_{n=1}^N\left(t_n-\frac{1}{2}\right)\boldsymbol\phi_n\right)\\
p(\mathbf{w})&=\mathcal{N}(\mathbf{w}|\mathbf{m}_0,\mathbf{S}_0)\\
&=\left(\frac{1}{2\pi}\right)^{M/2}|\mathbf{S}_0|^{-1/2}\exp\left(-\frac{1}{2}(\mathbf{w}-\mathbf{m}_0)^{\rm T}\mathbf{S}_0^{-1}(\mathbf{w}-\mathbf{m}_0)\right)\\
&=\left\{\left(\frac{1}{2\pi}\right)^{M/2}|\mathbf{S}_0|^{-1/2}\exp\left(-\frac{1}{2}\mathbf{m}_0^{\rm T}\mathbf{S}_0^{-1}\mathbf{m}_0\right)\right\}\exp\left(-\frac{1}{2}\mathbf{w}^{\rm T}\mathbf{S}_0^{-1}\mathbf{w}+\mathbf{w}^{\rm T}\mathbf{S}_0^{-1}\mathbf{m}_0\right)
\end{align*}
$$
より,
$$
\begin{align*}
\int{\rm d}\mathbf{w}h(\mathbf{w},\boldsymbol\xi)p(\mathbf{w})&=\left\{\prod_{n=1}^N\sigma(\xi_n)\exp\left(-\frac{1}{2}\xi_n+\lambda(\xi_n)\xi_n^2\right)\left(\frac{1}{2\pi}\right)^{M/2}|\mathbf{S}_0|^{-1/2}\exp\left(-\frac{1}{2}\mathbf{m}_0^{\rm T}\mathbf{S}_0^{-1}\mathbf{m}_0\right)\right\}\int{\rm d}\mathbf{w}\exp\left(-\frac{1}{2}\mathbf{w}^{\rm T}\mathbf{S}_N^{-1}\mathbf{w}+\mathbf{w}^{\rm T}\mathbf{S}_N^{-1}\mathbf{m}_N\right)\\
\int{\rm d}\mathbf{w}h(\mathbf{w},\boldsymbol\xi)p(\mathbf{w})&=\left\{\prod_{n=1}^N\sigma(\xi_n)\exp\left(-\frac{1}{2}\xi_n+\lambda(\xi_n)\xi_n^2\right)\left(\frac{1}{2\pi}\right)^{M/2}|\mathbf{S}_0|^{-1/2}\exp\left(\frac{1}{2}\mathbf{m}_N^{\rm T}\mathbf{S}_N^{-1}\mathbf{m}_N-\frac{1}{2}\mathbf{m}_0^{\rm T}\mathbf{S}_0^{-1}\mathbf{m}_0\right)\right\}\int{\rm d}\mathbf{w}\exp\left(-\frac{1}{2}(\mathbf{w}-\mathbf{m}_N)^{\rm T}\mathbf{S}_N^{-1}(\mathbf{w}-\mathbf{m}_N)\right)\\
&=\prod_{n=1}^N\sigma(\xi_n)\exp\left(-\frac{1}{2}\xi_n+\lambda(\xi_n)\xi_n^2\right)|\mathbf{S}_N|^{1/2}|\mathbf{S}_0|^{-1/2}\exp\left(\frac{1}{2}\mathbf{m}_N^{\rm T}\mathbf{S}_N^{-1}\mathbf{m}_N-\frac{1}{2}\mathbf{m}_0^{\rm T}\mathbf{S}_0^{-1}\mathbf{m}_0\right)
\end{align*}
$$
$$
\begin{align*}
\therefore \mathcal{L}(\boldsymbol\xi)&=\ln\int{\rm d}\mathbf{w}h(\mathbf{w},\boldsymbol\xi)p(\mathbf{w})\\
&=\frac{1}{2}\ln\frac{|\mathbf{S}_N|}{|\mathbf{S}_0|}+\frac{1}{2}\mathbf{m}_N^{\rm T}\mathbf{S}_N^{-1}\mathbf{m}_N-\frac{1}{2}\mathbf{m}_0^{\rm T}\mathbf{S}_0^{-1}\mathbf{m}_0+\sum_{n=1}^N\left\{\ln\sigma(\xi_n)-\frac{1}{2}\xi_n+\lambda(\xi_n)\xi_n^2\right\}
\end{align*}
$$
Exercise (10.36)
$${p_j(\mathcal{D},\boldsymbol\theta)=p_j(\mathcal{D})p_j(\boldsymbol\theta|\mathcal{D}):=\prod_{i=0}^{j}f_i(\boldsymbol\theta)}$$とおくと,
$$
\begin{align*}
p_N(\mathcal{D})&=\int{\rm d}\boldsymbol\theta p_N(\mathcal{D},\boldsymbol\theta)\\
&=\int{\rm d}\boldsymbol\theta f_N(\boldsymbol\theta)\prod_{i=0}^{N-1}f_i(\boldsymbol\theta)\\
&=p_{N-1}(\mathcal{D})\int{\rm d}\boldsymbol\theta f_N(\boldsymbol\theta)p_{N-1}(\boldsymbol\theta|\mathcal{D})\\
&\simeq p_{N-1}(\mathcal{D})\int{\rm d}\boldsymbol\theta f_N(\boldsymbol\theta)q^{\backslash N}(\boldsymbol\theta)\\
&=Z_Np_{N-1}(\mathcal{D})\\
&\simeq Z_NZ_{N-1}p_{N-2}(\mathcal{D})\\
&\vdots\\
&\simeq \prod_j Z_j
\end{align*}
$$
Exercise (10.37)
$${\tilde{f}_0^{\rm init}(\boldsymbol\theta)=f_0(\boldsymbol\theta)}$$に選ぶと,$${f_0(\boldsymbol\theta)q^{\backslash 0}(\boldsymbol\theta)/Z_0}$$により近い$${q^{\rm new}(\boldsymbol\theta)}$$を探すことになる。$${f_0(\boldsymbol\theta)}$$が$${q(\boldsymbol\theta)}$$と同じ指数関数族である場合,モーメントのみならず分布そのものを一致させることができるため,$${\widetilde{f}_0^{\rm new}(\boldsymbol\theta)=f_0(\boldsymbol\theta)=\tilde{f}_0^{\rm init}(\boldsymbol\theta)}$$となる。
Exercise (10.38)
$$
\begin{align*}
q^{\backslash n}(\boldsymbol\theta)&=\frac{\mathcal{N}(\boldsymbol\theta|\mathbf{m},v\mathbf{I})}{s_n\mathcal{N}(\boldsymbol\theta|\mathbf{m}_n,v_n\mathbf{I})}\\
&=\frac{1}{s_n}\left(\frac{v_n}{v}\right)^{D/2}\exp\left(-\frac{1}{2v}(\boldsymbol\theta-\mathbf{m})^{\rm T}(\boldsymbol\theta-\mathbf{m})+\frac{1}{2v_n}(\boldsymbol\theta-\mathbf{m}_n)^{\rm T}(\boldsymbol\theta-\mathbf{m}_n)\right)\\
&=\frac{1}{s_n}\left(\frac{v_n}{v}\right)^{D/2}\exp\left(-\frac{1}{2}(v^{-1}-v_n^{-1})\left\{\frac{v^{-1}}{v^{-1}-v_n^{-1}}(\boldsymbol\theta-\mathbf{m})^{\rm T}(\boldsymbol\theta-\mathbf{m})-\frac{v_n^{-1}}{v^{-1}-v_n^{-1}}(\boldsymbol\theta-\mathbf{m}_n)^{\rm T}(\boldsymbol\theta-\mathbf{m}_n)\right\}\right)\\
&=\frac{1}{s_n}\left(\frac{v_n}{v}\right)^{D/2}\exp\left(-\frac{1}{2}(v^{-1}-v_n^{-1})\left\{\boldsymbol\theta^{\rm T}\boldsymbol\theta-2\left(\mathbf{m}+\frac{v_n^{-1}}{v^{-1}-v_n^{-1}}(\mathbf{m}-\mathbf{m}_n)\right)^{\rm T}\boldsymbol\theta+\left(\mathbf{m}^{\rm T}\mathbf{m}+\frac{v_n^{-1}}{v^{-1}-v_n^{-1}}(\mathbf{m}^{\rm T}\mathbf{m}-\mathbf{m}_n^{\rm T}\mathbf{m}_n)\right)\right\}\right)\\
&\propto\mathcal{N}\left(\boldsymbol\theta\left|\mathbf{m}+\frac{v_n^{-1}}{v^{-1}-v_n^{-1}}(\mathbf{m}-\mathbf{m}_n),\left(v^{-1}-v_n^{-1}\right)^{-1}\mathbf{I}\right.\right)\\
\therefore (v^{\backslash n})^{-1}&=v^{-1}-v_n^{-1}\\
\mathbf{m}^{\backslash n}&=\mathbf{m}+\frac{v_n^{-1}}{v^{-1}-v_n^{-1}}(\mathbf{m}-\mathbf{m}_n)\\
&=\mathbf{m}+v^{\backslash n}v_n^{-1}(\mathbf{m}-\mathbf{m}_n)
\end{align*}
$$
$$
\begin{align*}
Z_n&=\int{\rm d}\boldsymbol\theta q^{\backslash n}(\boldsymbol\theta)f_n(\boldsymbol\theta)\\
&=\int{\rm d}\boldsymbol\theta q^{\backslash n}(\boldsymbol\theta)p(\mathbf{x}_n|\boldsymbol\theta)\\
&=(1-w)\int{\rm d}\boldsymbol\theta\mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})\mathcal{N}(\mathbf{x}_n|\boldsymbol\theta,\mathbf{I})+w\mathcal{N}(\mathbf{x}_n|\mathbf{0},a\mathbf{I})\int{\rm d}\boldsymbol\theta\mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})\\
&=(1-w)\mathcal{N}(\mathbf{x}_n|\mathbf{m}^{\backslash n},(v^{\backslash n}+1)\mathbf{I})+w\mathcal{N}(\mathbf{x}_n|\mathbf{0},a\mathbf{I})
\end{align*}
$$
Exercise (10.39)
$$
\begin{align*}
\nabla_{\mathbf{m}^{\backslash n}}\ln Z_n&=\frac{1}{Z_n}\nabla_{\mathbf{m}^{\backslash n}}Z_n\\
&=\frac{1}{Z_n}\int{\rm d}\boldsymbol\theta\left\{\nabla_{\mathbf{m}^{\backslash n}} q^{\backslash n}(\boldsymbol\theta)\right\}f_n(\boldsymbol\theta)\\
&=\frac{1}{Z_n}\int{\rm d}\boldsymbol\theta\left\{\nabla_{\mathbf{m}^{\backslash n}} \mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})\right\}f_n(\boldsymbol\theta)\\
&=\frac{1}{Z_n}\int{\rm d}\boldsymbol\theta\left\{\frac{\boldsymbol\theta-\mathbf{m}^{\backslash n}}{v^{\backslash n}} \mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})\right\}f_n(\boldsymbol\theta)\\
&=\frac{1}{v^{\backslash n}}\left(\int{\rm d}\boldsymbol\theta\boldsymbol\theta\frac{\mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})f_n(\boldsymbol\theta)}{Z_n}-\mathbf{m}^{\backslash n}\right)\\
&=\frac{1}{v^{\backslash n}}\left(\int{\rm d}\boldsymbol\theta\boldsymbol\theta q^{\rm new}(\boldsymbol\theta)-\mathbf{m}^{\backslash n}\right)\\
&=\frac{1}{v^{\backslash n}}\left(\mathbb{E}_{q^{\rm new}(\boldsymbol\theta)}[\boldsymbol\theta]-\mathbf{m}^{\backslash n}\right)\\
\therefore \mathbb{E}_{q^{\rm new}(\boldsymbol\theta)}[\boldsymbol\theta]&=\mathbf{m}^{\backslash n}+v^{\backslash n}\nabla_{\mathbf{m}^{\backslash n}}\ln Z_n
\end{align*}
$$
$${\nabla_{\mathbf{m}^{\backslash n}}\ln Z_n}$$を具体的に計算すると,
$$
\begin{align*}
\nabla_{\mathbf{m}^{\backslash n}}\ln Z_n&=\frac{1}{Z_n}\nabla_{\mathbf{m}^{\backslash n}}Z_n\\
&=\frac{1}{Z_n}\nabla_{\mathbf{m}^{\backslash n}}\left((1-w)\mathcal{N}(\mathbf{x}_n|\mathbf{m}^{\backslash n},(v^{\backslash n}+1)\mathbf{I})+w\mathcal{N}(\mathbf{x}_n|\mathbf{0},a\mathbf{I})\right)\\
&=\frac{(1-w)\mathcal{N}(\mathbf{x}_n|\mathbf{m}^{\backslash n},(v^{\backslash n}+1)\mathbf{I})}{Z_n}\frac{1}{v^{\backslash n}+1}(\mathbf{x}_n-\mathbf{m}^{\backslash n})\\
&=\left(1-\frac{w}{Z_n}\mathcal{N}(\mathbf{x}_n|\mathbf{0},a\mathbf{I})\right)\frac{1}{v^{\backslash n}+1}(\mathbf{x}_n-\mathbf{m}^{\backslash n})\\
&=\rho_n\frac{1}{v^{\backslash n}+1}(\mathbf{x}_n-\mathbf{m}^{\backslash n})
\end{align*}
$$
となるため,
$$
\begin{align*}
\mathbf{m}^{\rm new}&=\mathbb{E}_{q^{\rm new}(\boldsymbol\theta)}[\boldsymbol\theta]\\
&=\mathbf{m}^{\backslash n}+v^{\backslash n}\nabla_{\mathbf{m}^{\backslash n}}\ln Z_n\\
&=\mathbf{m}^{\backslash n}+\rho_n\frac{v^{\backslash n}}{v^{\backslash n}+1}(\mathbf{x}_n-\mathbf{m}^{\backslash n})
\end{align*}
$$
が得られる。
$$
\begin{align*}
\nabla_{v^{\backslash n}}\ln Z_n&=\frac{1}{Z_n}\nabla_{v^{\backslash n}}Z_n\\
&=\frac{1}{Z_n}\int{\rm d}\boldsymbol\theta\left\{\nabla_{v^{\backslash n}} q^{\backslash n}(\boldsymbol\theta)\right\}f_n(\boldsymbol\theta)\\
&=\frac{1}{Z_n}\int{\rm d}\boldsymbol\theta\left\{\nabla_{v^{\backslash n}} \mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})\right\}f_n(\boldsymbol\theta)\\
&=\frac{1}{Z_n}\int{\rm d}\boldsymbol\theta\left\{\left(-\frac{D}{2v^{\backslash n}}+\frac{(\boldsymbol\theta-\mathbf{m}^{\backslash n})^{\rm T}(\boldsymbol\theta-\mathbf{m}^{\backslash n})}{2(v^{\backslash n})^2}\right) \mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})\right\}f_n(\boldsymbol\theta)\\
&=\frac{1}{2(v^{\backslash n})^2}\left(\int{\rm d}\boldsymbol\theta\boldsymbol\theta^{\rm T}\boldsymbol\theta\frac{\mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})f_n(\boldsymbol\theta)}{Z_n}-2\left(\int{\rm d}\boldsymbol\theta\boldsymbol\theta^{\rm T}\frac{\mathcal{N}(\boldsymbol\theta|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I})f_n(\boldsymbol\theta)}{Z_n}\right)\mathbf{m}^{\backslash n}+\left\|\mathbf{m}^{\backslash n}\right\|^2-Dv^{\backslash n}\right)\\
&=\frac{1}{2(v^{\backslash n})^2}\left(\int{\rm d}\boldsymbol\theta\boldsymbol\theta^{\rm T}\boldsymbol\theta q^{\rm new}(\boldsymbol\theta)-2\left(\int{\rm d}\boldsymbol\theta\boldsymbol\theta^{\rm T}q^{\rm new}(\boldsymbol\theta)\right)\mathbf{m}^{\backslash n}+\left\|\mathbf{m}^{\backslash n}\right\|^2-Dv^{\backslash n}\right)\\
&=\frac{1}{2(v^{\backslash n})^2}\left(\mathbb{E}_{q^{\rm new}(\boldsymbol\theta)}[\boldsymbol\theta^{\rm T}\boldsymbol\theta]-2\mathbb{E}_{q^{\rm new}(\boldsymbol\theta)}[\boldsymbol\theta^{\rm T}]\mathbf{m}^{\backslash n}+\left\|\mathbf{m}^{\backslash n}\right\|^2-Dv^{\backslash n}\right)\\
\therefore \mathbb{E}_{q^{\rm new}(\boldsymbol\theta)}[\boldsymbol\theta^{\rm T}\boldsymbol\theta]&=2(v^{\backslash n})^2\nabla_{v^{\backslash n}}\ln Z_n+2\mathbb{E}_{q^{\rm new}(\boldsymbol\theta)}[\boldsymbol\theta^{\rm T}]\mathbf{m}^{\backslash n}-\left\|\mathbf{m}^{\backslash n}\right\|^2+Dv^{\backslash n}
\end{align*}
$$
$${\nabla_{v^{\backslash n}}\ln Z_n}$$を具体的に計算すると,
$$
\begin{align*}
\nabla_{v^{\backslash n}}\ln Z_n&=\frac{1}{Z_n}\nabla_{v^{\backslash n}}Z_n\\
&=\frac{1}{Z_n}\nabla_{v^{\backslash n}}\left((1-w)\mathcal{N}(\mathbf{x}_n|\mathbf{m}^{\backslash n},(v^{\backslash n}+1)\mathbf{I})+w\mathcal{N}(\mathbf{x}_n|\mathbf{0},a\mathbf{I})\right)\\
&=\frac{(1-w)\mathcal{N}(\mathbf{x}_n|\mathbf{m}^{\backslash n},(v^{\backslash n}+1)\mathbf{I})}{Z_n}\frac{\left\|\mathbf{x}_n-\mathbf{m}^{\backslash n}\right\|^2-D(v^{\backslash n}+1)}{2(v^{\backslash n}+1)^2}\\
&=\rho_n\frac{\left\|\mathbf{x}_n-\mathbf{m}^{\backslash n}\right\|^2-D(v^{\backslash n}+1)}{2(v^{\backslash n}+1)^2}
\end{align*}
$$
となるため,
$$
\begin{align*}
v^{\rm new}&=\frac{1}{D}\left(\mathbb{E}_{q^{\rm new}(\boldsymbol\theta)}[\boldsymbol\theta^{\rm T}\boldsymbol\theta]-(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\rm new}\right)\\
&=\frac{1}{D}\left(2(v^{\backslash n})^2\nabla_{v^{\backslash n}}\ln Z_n+2(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\backslash n}-\left\|\mathbf{m}^{\backslash n}\right\|^2+Dv^{\backslash n}-(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\rm new}\right)\\
&=v^{\backslash n}+\frac{1}{D}\left(2(v^{\backslash n})^2\rho_n\frac{\left\|\mathbf{x}_n-\mathbf{m}^{\backslash n}\right\|^2-D(v^{\backslash n}+1)}{2(v^{\backslash n}+1)^2}+2(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\backslash n}-\left\|\mathbf{m}^{\backslash n}\right\|^2-(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\rm new}\right)\\
&=v^{\backslash n}-\rho_n\frac{(v^{\backslash n})^2}{v^{\backslash n}+1}+\frac{1}{D}\left((v^{\backslash n})^2\rho_n\frac{\left\|\mathbf{x}_n-\mathbf{m}^{\backslash n}\right\|^2}{(v^{\backslash n}+1)^2}-\left\|\mathbf{m}^{\rm new}-\mathbf{m}^{\backslash n}\right\|^2\right)\\
&=v^{\backslash n}-\rho_n\frac{(v^{\backslash n})^2}{v^{\backslash n}+1}+\rho_n(1-\rho_n)\frac{(v^{\backslash n})^2\left\|\mathbf{x}_n-\mathbf{m}^{\backslash n}\right\|^2}{D(v^{\backslash n}+1)^2}
\end{align*}
$$
が得られる。
$$
\begin{align*}
\tilde{f}_n^{\rm new}(\boldsymbol\theta)&=Z_n\frac{q^{\rm new}(\boldsymbol\theta)}{q^{\backslash n}(\boldsymbol\theta)}\\
&=Z_n\frac{\mathcal{N}\left(\boldsymbol\theta\left|\mathbf{m}^{\rm new},v^{\rm new}\mathbf{I}\right.\right)}{\mathcal{N}\left(\boldsymbol\theta\left|\mathbf{m}^{\backslash n},v^{\backslash n}\mathbf{I}\right.\right)}\\
&=Z_n\left(\frac{v^{\backslash n}}{v^{\rm new}}\right)^{D/2}\exp\left(-\frac{1}{2v^{\rm new}}\left\|\boldsymbol\theta-\mathbf{m}^{\rm new}\right\|^2+\frac{1}{2v^{\backslash n}}\left\|\boldsymbol\theta-\mathbf{m}^{\backslash n}\right\|^2\right)\\
&=Z_n\left(\frac{v^{\backslash n}}{v^{\rm new}}\right)^{D/2}\exp\left(-\frac{1}{2}\left\{(v^{\rm new})^{-1}-(v^{\backslash n})^{-1}\right\}\boldsymbol\theta^{\rm T}\boldsymbol\theta+\left\{(v^{\rm new})^{-1}-(v^{\backslash n})^{-1}\right\}\boldsymbol\theta^{\rm T}\left(\mathbf{m}^{\backslash n}+\frac{(v^{\rm new})^{-1}}{(v^{\rm new})^{-1}-(v^{\backslash n})^{-1}}(\mathbf{m}^{\rm new}-\mathbf{m}^{\backslash n})\right)-\frac{1}{2}\left\{(v^{\rm new})^{-1}(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\rm new}-(v^{\backslash n})^{-1}(\mathbf{m}^{\backslash n})^{\rm T}\mathbf{m}^{\backslash n}\right\}\right)\\
&=:Z_n\left(\frac{v^{\backslash n}}{v^{\rm new}}\right)^{D/2}\exp\left(-\frac{(v_n^{\rm new})^{-1}}{2}\boldsymbol\theta^{\rm T}\boldsymbol\theta+(v_n^{\rm new})^{-1}\boldsymbol\theta^{\rm T}\mathbf{m}_n^{\rm new}-\frac{1}{2}\left\{(v^{\rm new})^{-1}(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\rm new}-(v^{\backslash n})^{-1}(\mathbf{m}^{\backslash n})^{\rm T}\mathbf{m}^{\backslash n}\right\}\right)\\
&=Z_n\left(\frac{v^{\backslash n}}{v^{\rm new}}\right)^{D/2}\exp\left(-\frac{1}{2v_n^{\rm new}}\|\boldsymbol\theta-\mathbf{m}_n^{\rm new}\|^2\right)\exp\left(\frac{1}{2}\left\{(v_n^{\rm new})^{-1}(\mathbf{m}_n^{\rm new})^{\rm T}\mathbf{m}_n^{\rm new}-(v^{\rm new})^{-1}(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\rm new}+(v^{\backslash n})^{-1}(\mathbf{m}^{\backslash n})^{\rm T}\mathbf{m}^{\backslash n}\right\}\right)\\
&=Z_n\left(\frac{2\pi v_n^{\rm new}v^{\backslash n}}{v^{\rm new}}\right)^{D/2}\exp\left(\frac{1}{2}\left\{(v_n^{\rm new})^{-1}(\mathbf{m}_n^{\rm new})^{\rm T}\mathbf{m}_n^{\rm new}-(v^{\rm new})^{-1}(\mathbf{m}^{\rm new})^{\rm T}\mathbf{m}^{\rm new}+(v^{\backslash n})^{-1}(\mathbf{m}^{\backslash n})^{\rm T}\mathbf{m}^{\backslash n}\right\}\right)\mathcal{N}(\boldsymbol\theta|\mathbf{m}_n^{\rm new},v_n^{\rm new}\mathbf{I})\\
&=Z_n\left(\frac{2\pi v_n^{\rm new}v^{\backslash n}}{v^{\rm new}}\right)^{D/2}\exp\left(\frac{1}{2(v_n^{\rm new}+v^{\backslash n})}\left\|\mathbf{m}_n^{\rm new}-\mathbf{m}^{\backslash n}\right\|^2\right)\mathcal{N}(\boldsymbol\theta|\mathbf{m}_n^{\rm new},v_n^{\rm new}\mathbf{I})\\
&=Z_n\left(\frac{2\pi v_n^{\rm new}v^{\backslash n}}{v^{\rm new}}\right)^{D/2}\frac{1}{(2\pi(v_n^{\rm new}+v^{\backslash n}))^{D/2}\mathcal{N}(\mathbf{m}_n^{\rm new}|\mathbf{m}^{\backslash n},(v_n^{\rm new}+v^{\backslash n})\mathbf{I})}\mathcal{N}(\boldsymbol\theta|\mathbf{m}_n^{\rm new},v_n^{\rm new}\mathbf{I})\\
&=\frac{Z_n}{\mathcal{N}(\mathbf{m}_n^{\rm new}|\mathbf{m}^{\backslash n},(v_n^{\rm new}+v^{\backslash n})\mathbf{I})}\mathcal{N}(\boldsymbol\theta|\mathbf{m}_n^{\rm new},v_n^{\rm new}\mathbf{I})\\
&=s_n\mathcal{N}(\boldsymbol\theta|\mathbf{m}_n^{\rm new},v_n^{\rm new}\mathbf{I})\\
\therefore (v_n^{\rm new})^{-1}&=(v^{\rm new})^{-1}-(v^{\backslash n})^{-1}\\
\mathbf{m}_n^{\rm new}&=\mathbf{m}^{\backslash n}+\frac{v_n^{\rm new}+v^{\backslash n}}{v^{\backslash n}}(\mathbf{m}^{\rm new}-\mathbf{m}^{\backslash n})\\
s_n&=\frac{Z_n}{\mathcal{N}(\mathbf{m}_n^{\rm new}|\mathbf{m}^{\backslash n},(v_n^{\rm new}+v^{\backslash n})\mathbf{I})}
\end{align*}
$$
上式の$${v_n^{\rm new},\mathbf{m}_n^{\rm new}}$$の表記を$${v_n,\mathbf{m}_n}$$に変更して$${p(\mathcal{D})}$$を求めると,
$$
\begin{align*}
p(\mathcal{D})&=\int{\rm d}\boldsymbol\theta p(\boldsymbol\theta)\prod_{n=1}^Nf_n(\boldsymbol\theta)\\
&\simeq \int{\rm d}\boldsymbol\theta p(\boldsymbol\theta)\prod_{n=1}^N\tilde{f}_n(\boldsymbol\theta)\\
&=(2\pi b)^{-D/2}\prod_{n=1}^{N}\left\{s_n(2\pi v_n)^{-D/2}\right\}\int{\rm d}\boldsymbol\theta\exp\left(-\frac{1}{2b}\|\boldsymbol\theta\|^2-\frac{1}{2}\sum_{n=1}^N\frac{\|\boldsymbol\theta-\mathbf{m}_n\|^2}{v_n}\right)\\
&=(2\pi b)^{-D/2}\prod_{n=1}^{N}\left\{s_n(2\pi v_n)^{-D/2}\right\}\exp\left(\frac{1}{2}\left\{\left(b^{-1}+\sum_{n=1}^Nv_n^{-1}\right)^{-1}\left\|\sum_{n=1}^Nv_n^{-1}\mathbf{m}_n\right\|^2-\sum_{n=1}^Nv_n^{-1}\|\mathbf{m}_n\|^2\right\}\right)\int{\rm d}\boldsymbol\theta\exp\left(-\frac{1}{2}\left(b^{-1}+\sum_{n=1}^Nv_n^{-1}\right)\left\|\boldsymbol\theta-\left(b^{-1}+\sum_{n=1}^Nv_n^{-1}\right)^{-1}\sum_{n=1}^Nv_n^{-1}\mathbf{m}_n\right\|^2\right)\\
&=\left(1+b\sum_{n=1}^Nv_n^{-1}\right)^{-D/2}\exp\left(\frac{1}{2}\left\{\left(b^{-1}+\sum_{n=1}^Nv_n^{-1}\right)^{-1}\left\|\sum_{n=1}^Nv_n^{-1}\mathbf{m}_n\right\|^2-\sum_{n=1}^Nv_n^{-1}\|\mathbf{m}_n\|^2\right\}\right)\prod_{n=1}^{N}\left\{s_n(2\pi v_n)^{-D/2}\right\}
\end{align*}
$$
$${q(\boldsymbol\theta)\propto\prod_iq_i(\boldsymbol\theta)}$$より,
$$
\begin{align*}
v^{-1}&=b^{-1}+\sum_{n=1}^Nv_n^{-1}\\
\mathbf{m}&=v\sum_{n=1}^Nv_n^{-1}\mathbf{m}_n
\end{align*}
$$
が成立するため,
$$
\begin{align*}
p(\mathcal{D})&\simeq \left(\frac{v}{b}\right)^{D/2}\exp\left(\frac{1}{2}\left\{v^{-1}\|\mathbf{m}\|^2-\sum_{n=1}^Nv_n^{-1}\|\mathbf{m}_n\|^2\right\}\right)\prod_{n=1}^{N}\left\{s_n(2\pi v_n)^{-D/2}\right\}
\end{align*}
$$
が得られる。