Appendix: Complete derivative of BG-RBM
Energy function:
$$E\left( \textbf{v},\textbf{h}\right)_{BG} = \sum\limits_{i = 1}^{n_{v}}\frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}}-\sum\limits_{j = 1}^{n_{h}}b_{j}h_{j} -\sum\limits_{i = 1}^{n_{v}}\sum\limits_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j} $$
1.1 Definition of conditional probability P(h|v)
$$\begin{array}{@{}rcl@{}} P(\textbf{h}|\textbf{v}) &=& \frac{P(\textbf{v},\textbf{h})}{P(\textbf{v})} = \frac{\frac{1}{Z}e^{-E(\textbf{v},\textbf{h})}}{\frac{1}{Z}{\sum}_{\textbf{h}} e^{-E(\textbf{v},\textbf{h})}} = \frac{e^{-E(\textbf{v},\textbf{h})}}{{\sum}_{\textbf{h}}e^{-E(\textbf{v},\textbf{h})}} \\ & =& \frac{e^{-\left( {\sum}_{i = 1}^{n_{v}}\frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}}-{\sum}_{j = 1}^{n_{h}}b_{j}h_{j} -{\sum}_{i = 1}^{n_{v}}{\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) }}{{\sum}_{h} e^{-\left( {\sum}_{i = 1}^{n_{v}}\frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}}-{\sum}_{j = 1}^{n_{h}}b_{j}h_{j} -{\sum}_{i = 1}^{n_{v}}{\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) }} \\ & =& \frac{e^{-{\sum}_{i = 1}^{n_{v}} \left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\sum}_{h} e^{-{\sum}_{i = 1}^{n_{v}} \left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}} \end{array} $$
Rewriting the above equation in terms of a product of experts model:
$$\begin{array}{@{}rcl@{}} P(\textbf{h}|\textbf{v}) & =& \frac{{\prod}_{j} e^{-{\sum}_{i = 1}^{n_{v}} \left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\prod}_{j} {\sum}_{h} e^{-{\sum}_{i = 1}^{n_{v}} \left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}} \\ & =& \prod\limits_{j} \frac{e^{\left( {\sum}_{i = 1}^{n_{v}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}+b_{j}h_{j} \right) }}{{\sum}_{h_{j}} e^{\left( {\sum}_{i = 1}^{n_{v}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}+b_{j}h_{j} \right) } } = \prod\limits_{j} P(h_{j}|\textbf{v}) \end{array} $$
For binary hj ∈{0, 1}, P(hj = 1|v) equals:
$$P(h_{j} = 1|\textbf{v}) = \frac{e^{\left( {\sum}_{i = 1}^{n_{v}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}+b_{j} \right) }}{e^{\left( {\sum}_{i = 1}^{n_{v}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}+b_{j} \right) } + e^{0}} = sig\left( {\sum}_{i = 1}^{n_{v}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}+b_{j}\right) $$
1.2 Definition of conditional probability P(v|h)
$$\begin{array}{@{}rcl@{}} P(\textbf{v}|\textbf{h}) & =& \frac{P(\textbf{v},\textbf{h})}{P(\textbf{v})} = \frac{\frac{1}{Z}e^{-E(\textbf{v},\textbf{h})}}{\frac{1}{Z}{\int}_{\textbf{v}} e^{-E(\textbf{v},\textbf{h})}dv} = \frac{e^{-E(\textbf{v},\textbf{h})}}{{\int}_{\textbf{v}}e^{-E(\textbf{v},\textbf{h})}dv} \\ & =& \frac{e^{-{\sum}_{i = 1}^{n_{v}} \left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\int}_{\textbf{v}} e^{-{\sum}_{i = 1}^{n_{v}} \left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}dv} \end{array} $$
Rewriting the above equation in terms of a product of experts model:
$$P(\textbf{v}|\textbf{h}) = \frac{{\prod}_{i} e^{-\left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\prod}_{i} {\int}_{\textbf{v}} e^{- \left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}dv} $$
Simplifying the denominator:
$$\begin{array}{@{}rcl@{}} P(\textbf{v}|\textbf{h}) & = & \frac{{\prod}_{i} e^{\!-\left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\prod}_{i} {\int}_{\textbf{v}} e^{-\left( \frac{1}{2{\sigma_{i}^{2}}}({v_{i}^{2}} - 2v_{i}a_{i} + {a_{i}^{2}})-{\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}} W_{ij}h_{j} \right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j} } dv } \\ & = & \frac{{\prod}_{i} e^{-\left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\prod}_{i} e^{\left( \frac{-{a_{i}^{2}}}{2{\sigma_{i}^{2}}}\right)+{\sum}_{j = 1}^{n_{h}}b_{j}h_{j}} {\int}_{\textbf{v}}e^{\frac{1}{2{\sigma_{i}^{2}}}\left( -{v_{i}^{2}} + 2v_{i}a_{i} \right)+{\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}} dv} \\ & = & \frac{{\prod}_{i} e^{-\left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\prod}_{i} e^{\left( \frac{-{a_{i}^{2}}}{2{\sigma_{i}^{2}}}\right)+{\sum}_{j = 1}^{n_{h}}b_{j}h_{j}} {\int}_{\textbf{v}}e^{\frac{1}{2{\sigma_{i}^{2}}}\left( -{v_{i}^{2}}\right)} e^{v_{i} \left( \frac{a_{i}}{{\sigma_{i}^{2}}} + {\sum}_{j = 1}^{n_{h}}\frac{W_{ij}}{{\sigma_{i}^{2}}} h_{j} \right) } dv} \end{array} $$
Integrating the denominator w.r.t v:
$$\begin{array}{@{}rcl@{}} & =& \frac{{\prod}_{i} e^{-\left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\prod}_{i} e^{\left( \frac{-{a_{i}^{2}}}{2{\sigma_{i}^{2}}}\right)+{\sum}_{j = 1}^{n_{h}}b_{j}h_{j}} e^{\frac{{\sigma_{i}^{2}} \left( \frac{a_{i}}{{\sigma_{i}^{2}}} + {\sum}_{j = 1}^{n_{h}}\frac{W_{ij}}{{\sigma_{i}^{2}}}h_{j}\right)^{2} }{2}} \left( \sqrt{2{\sigma_{i}^{2}}\pi}\right)} \\ & =& \frac{{\prod}_{i} e^{-\left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}}}{{\prod}_{i} \left( \sigma_{i}\sqrt{2\pi}\right) e^{\frac{1}{2{\sigma_{i}^{2}}} \left( {\sum}_{j = 1}^{n_{h}}W_{ij}h_{j} \right)^{2}+{\sum}_{j = 1}^{n_{h}}b_{j}h_{j}+\frac{a_{i}}{{\sigma_{i}^{2}}}{\sum}_{j = 1}^{n_{h}}W_{ij}h_{j}} } \end{array} $$
Simplifying the above equation:
$$\begin{array}{@{}rcl@{}} & =& \prod\limits_{i} \frac{1}{\sigma_{i}\sqrt{2\pi}} e^{-\left( \frac{(v_{i}-a_{i})^{2}}{2{\sigma_{i}^{2}}} - {\sum}_{j = 1}^{n_{h}}\frac{v_{i}}{{\sigma_{i}^{2}}}W_{ij}h_{j}\right) + {\sum}_{j = 1}^{n_{h}}b_{j}h_{j}-\left( \frac{1}{2{\sigma_{i}^{2}}} \left( {\sum}_{j = 1}^{n_{h}}W_{ij}h_{j} \right)^{2}+{\sum}_{j = 1}^{n_{h}}b_{j}h_{j}+\frac{a_{i}}{{\sigma_{i}^{2}}}{\sum}_{j = 1}^{n_{h}}W_{ij}h_{j}\right) } \\ & =& \prod\limits_{i} \frac{1}{\sigma_{i}\sqrt{2\pi}}e^{-\frac{1}{2{\sigma_{i}^{2}}} \left( v_{i} - \left( a_{i}+{\sum}_{j = 1}^{n_{h}}W_{ij}h_{j}\right) \right)^{2}} \end{array} $$
The above equation is the probability density function of a Gaussian distribution with mean \(a_{i}+{\sum }_{j = 1}^{n_{h}}W_{ij}h_{j}\) and variance \({\sigma _{i}^{2}}\).
1.3 Derivative of log-likelihood function
$$\begin{array}{@{}rcl@{}} \frac{\partial\ln P(\textbf{v})}{\partial{W_{ij}}} & =& \frac{\partial\ln{\sum}_{\textbf{h}}e^{-E(\textbf{v},\textbf{h})} }{\partial{W_{ij}}}-\frac{\partial\ln{\sum}_{\textbf{h}}{\sum}_{\textbf{v}}e^{-E(\textbf{v},\textbf{h})} }{\partial{W_{ij}}} \\ & =& \frac{1}{{\sum}_{\textbf{h}}e^{-E(\textbf{v},\textbf{h})}} \left( {\sum}_{\textbf{h}}e^{-E(\textbf{v},\textbf{h})}\cdot\left( -\frac{\partial E(\textbf{v},\textbf{h})}{\partial{W_{ij}}}\right) \right) \\&&- \frac{1}{{\sum}_{\textbf{v},\textbf{h}}e^{-E(\textbf{v},\textbf{h})}} \left( {\sum}_{\textbf{v},\textbf{h}}e^{-E(\textbf{v},\textbf{h})}\cdot\left( -\frac{\partial E(\textbf{v},\textbf{h})}{\partial{W_{ij}}}\right) \right) \\ & =& - {\sum}_{\textbf{h}}\frac{e^{-E(\textbf{v},\textbf{h})}}{{\sum}_{\textbf{h}} e^{-E(\textbf{v},\textbf{h})}}\cdot\left( \frac{\partial E(\textbf{v},\textbf{h})}{\partial{W_{ij}}} \right) \\&&+ {\sum}_{\textbf{v},\textbf{h}}\frac{e^{-E(\textbf{v},\textbf{h})}}{{\sum}_{\textbf{v},\textbf{h}} e^{-E(\textbf{v},\textbf{h})}}\cdot\left( \frac{\partial E(\textbf{v},\textbf{h})}{\partial{W_{ij}}} \right) \\ & =& -{\sum}_{\textbf{h}} P(\textbf{h}|\textbf{v})\cdot\frac{\partial E(\textbf{v},\textbf{h})}{\partial{W_{ij}}} + {\sum}_{\textbf{v},\textbf{h}} P(\textbf{h},\textbf{v})\cdot\frac{\partial E(\textbf{v},\textbf{h})}{\partial{W_{ij}}} \end{array} $$
1.4 Derivative of log-likelihood approximation
$$\begin{array}{@{}rcl@{}} \frac{\partial\ln P(\textbf{v})}{\partial{W_{ij}}} & \simeq& -\sum\limits_{\textbf{h}} P(\textbf{h}|\textbf{v}) \frac{\partial E(\textbf{v},\textbf{h})}{\partial{W_{ij}}} + {\sum}_{\tilde{\textbf{v}}} P(\tilde{\textbf{v}}) {\sum}_{\tilde{\textbf{h}}} P(\tilde{\textbf{h}}|\tilde{\textbf{v}}) \frac{\partial E(\tilde{\textbf{v}},\tilde{\textbf{h}})}{\partial{W_{ij}}} \\ & \simeq& \sum\limits_{\textbf{h}} P(\textbf{h}|\textbf{v})\frac{v_{i}h_{j}}{{\sigma_{i}^{2}}} - {\sum}_{\tilde{\textbf{v}}}P(\tilde{\textbf{v}}) {\sum}_{\tilde{\textbf{h}}} P(\tilde{\textbf{h}}|\tilde{\textbf{v}})\frac{\tilde{v}_{i}\tilde{h}_{j}}{{\sigma_{i}^{2}}} \\ & \simeq& P(h_{j} = 1|\textbf{v})\frac{v_{i}}{{\sigma_{i}^{2}}}-{\sum}_{\tilde{\textbf{v}}}P(\tilde{\textbf{v}})P(\tilde{h}_{j} = 1|\tilde{\textbf{v}})\frac{\tilde{v}_{i}}{{\sigma_{i}^{2}}} \end{array} $$
By applying the same procedure to the biases \(a_{i}\) and \(b_{j}\), we obtain the gradients as follows:
$$\begin{array}{@{}rcl@{}} \frac{\partial\ln P(\textbf{v})}{\partial{W_{ij}}} & \simeq& \frac{v_{i} h_{j} - \tilde{v}_{i} \tilde{h}_{j}}{{\sigma_{i}^{2}}} \end{array} $$
$$\begin{array}{@{}rcl@{}} \frac{\partial\ln P(\textbf{v})}{\partial{a_{i}}} & \simeq& \frac{v_{i} - \tilde{v}_{i} }{{\sigma_{i}^{2}}} \\ \frac{\partial\ln P(\textbf{v})}{\partial{b_{j}}} & \simeq& h_{j} - \tilde{h}_{j} \end{array} $$