深度逆向强化学习公式推导

符号标识

\[\begin{matrix} \xi = \{ s_{1},a_{1},\ldots,s_{t},a_{t} \}\tag{1} \\ \end{matrix}\]

\[\begin{matrix} R_{\lambda} = \sum_{t}^{}{r_{\lambda}(s_{t},a_{t})}\tag{2} \\ \end{matrix}\]

最优化公式

\[\begin{matrix} \max\ \sum_{Path\xi_{i}}^{}{- p\left( \xi_{i} \right)\log p\left( \xi_{i} \right)}\tag{3} \\ \end{matrix}\]

\[\begin{matrix} s.t.\ \sum_{Path\xi_{i}}^{}{p\left( \xi_{i} \right)f_{\xi_{i}}} = \widetilde{f}\tag{4} \\ \end{matrix}\]

\[\begin{matrix} \sum_{Path\xi_{i}}^{}{p\left( \xi_{i} \right) = 1}\tag{5} \\ \end{matrix}\]

利用拉格朗日乘子法,该优化问题可以转化为:

\[\begin{matrix} \min L = \sum_{\xi_{i}}^{}{p\log p} - \sum_{\xi_{i} = 1}^{n}{\lambda_{\xi_{i}}\left( pf_{\xi_{i}} - \widetilde{f} \right)} - \lambda_{0}\left( \sum_{\xi_{i}}^{}p - 1 \right)\tag{6} \\ \end{matrix}\]

对概率p进行微分,并令导数为0,可以得到:

\[\begin{matrix} \frac{\partial L}{\partial p} = \log p + 1 - \sum_{\xi_{i} = 1}^{n}{\lambda_{\xi_{i}}f_{\xi_{i}}} - \lambda_{0} = 0\tag{7} \\ \end{matrix}\]

\[\begin{matrix} p = \frac{\exp\left( \sum_{\xi_{i} = 1}^{n}{\lambda_{\xi_{i}}f_{\xi_{i}}} \right)}{\exp\left( 1 - \lambda_{0} \right)}\tag{8} \\ \end{matrix}\]

轨迹的加和同样等同于状态的加和,即\(\sum_{\xi_{i} = 1}^{n}{\lambda_{\xi_{i}}f_{\xi_{i} } } = \sum_{j = 1}^{n}{\lambda_{j}f_{j} }\),因此式(8)可改写为

\[\begin{matrix} p = \frac{\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)}{\exp\left( 1 - \lambda_{0} \right)} = \frac{1}{Z}\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)\tag{9} \\ \end{matrix}\]

将(9)代入(5)可得

\[\begin{matrix} \sum_{Path\xi_{i}}^{}{\frac{1}{Z}\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)} = 1\tag{10} \\ \end{matrix}\]

因此:

\[\begin{matrix} Z = \sum_{Path\xi_{i}}^{}{\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)}\tag{11} \\ \end{matrix}\]

将(11)代入(9)可得:

\[\begin{matrix} P_{\lambda} = P\left( \xi_{i}|\lambda \right) = \frac{\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)}{\sum_{Path\xi_{i}}^{}{\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)}}\tag{12} \\ \end{matrix}\]

最大似然

利用最大似然函数对式(12)进行求解

\[\begin{matrix} \lambda = \underset{\lambda}{\arg\max}\sum_{\xi \in \Xi}^{}{\log p\left( \xi \middle| \lambda \right)}\tag{13} \\ \end{matrix}\]

\(L(\lambda) = \sum_{\xi \in \Xi}^{}{\log p\left( \xi \middle| \lambda \right)}\)

\[\begin{aligned}
L(\lambda) &= \sum_{\xi \in \Xi}^{}{\log p\left( \xi \middle| \lambda \right)} \\
&= \sum_{\xi \in \Xi}^{}{\sum_{j = 1}^{n}{\lambda_{j}f_{j}}} - \sum_{\xi \in \Xi}^{}{\log Z_{\lambda}} \\
&= \sum_{\xi \in \Xi}^{}{R_{\lambda}(\xi)} - \sum_{\xi \in \Xi}^{}{\log Z_{\lambda}}
\end{aligned}\tag{14}\]

因此,对式(14)求导可得

\[\begin{aligned}
\nabla L(\lambda) &= \sum_{\xi \in \Xi}^{}\frac{dR_{\lambda}(\xi)}{d\lambda} - \sum_{\xi \in \Xi}^{}\frac{1}{\sum_{\Xi}^{}{\exp\left( R_{\lambda}(\xi) \right)}}\sum_{\xi \in \Xi}^{}{\exp\left( R_{\lambda}(\xi) \right)}\frac{dR_{\lambda}(\xi)}{d\lambda} \\
&= \sum_{\xi \in \Xi}^{}\frac{dR_{\lambda}(\xi)}{d\lambda} - \sum_{\xi \in \Xi}^{}{\sum_{\xi \in \Xi}^{}{p\left( \xi \middle| \lambda \right)}\frac{dR_{\lambda}(\xi)}{d\lambda}} \\
&= \frac{dR_{\lambda}(\xi)}{d\lambda} - \sum_{\xi \in \Xi}^{}{p\left( \xi \middle| \lambda \right)\frac{dR_{\lambda}(\xi)}{d\lambda}} \\
&= \widetilde{f} - \sum_{\xi \in \Xi}^{}{p\left( \xi \middle| \lambda \right)f_{\xi}}
\end{aligned}\tag{15}\]

轨迹的加和同样等同于状态的加和,因此式(15)最终简化为

\[\begin{matrix} \nabla L(\lambda) = \widetilde{f} - \sum_{s_{i}}^{}{D_{s_{i}}f_{s_{i}}}\tag{16} \\ \end{matrix}\]

此外,式子(15)还可以看作\(\frac{\partial\mathcal{L}_{\mathcal{D}}}{\partial R}\frac{\partial R}{\partial\lambda}\)的形式,则式(15)可以改写为:

\[\begin{aligned}
\nabla L(\lambda) &= \sum_{\xi \in \Xi}^{}\frac{dR_{\lambda}(\xi)}{d\lambda} - \sum_{\xi \in \Xi}^{}{p\left( \xi \middle| \lambda \right)}\frac{dR_{\lambda}(\xi)}{d\lambda} \\
&= \sum_{\xi \in \Xi}^{}{\left( 1 - p\left( \xi \middle| \lambda \right) \right)\frac{dR_{\lambda}(\xi)}{d\lambda}} \\
&= \left( \mu_{D} - E\lbrack\mu\rbrack \right)\frac{\partial g(f,\lambda)}{\partial\lambda}
\end{aligned}\tag{17}\]

最大后验概率

\[p(\lambda|\xi) = \frac{p(\xi|\lambda)p(\lambda)}{p(\xi)}\]

利用最大后验概率对式(12)进行求解

\[\lambda = \underset{\lambda}{\arg\max}\sum_{\xi \in \Xi}^{}{\log\left\lbrack p\left( \xi \middle| \lambda \right)p(\lambda) \right\rbrack}\]

\[\begin{aligned}
L(\lambda) &= \sum_{\xi \in \Xi}^{}{\log\left\lbrack p\left( \xi \middle| \lambda \right)p(\lambda) \right\rbrack} \\
&= \sum_{\xi \in \Xi}^{}\left( \log p\left( \xi \middle| \lambda \right) + \log p(\lambda) \right) \\
&= \sum_{\xi \in \Xi}^{}{(\mathcal{L}_{\mathcal{D}} + \mathcal{L}_{\lambda})}
\end{aligned}\]

因此

\[\begin{aligned}
\nabla L(\lambda) &= \sum_{\xi \in \Xi}^{}\left( \frac{\partial\mathcal{L}_{\mathcal{D}}}{\partial R}\frac{\partial R}{\partial\lambda} + \frac{\partial\mathcal{L}_{\lambda}}{\partial\lambda} \right) \\
&= \sum_{\xi \in \Xi}^{}{\left( 1 - p\left( \xi \middle| \lambda \right) \right)\frac{\partial g(f,\lambda)}{\partial\lambda}} + \frac{\partial\mathcal{L}_{\lambda}}{\partial\lambda} \\
&= \left( \mu_{D} - E\lbrack\mu\rbrack \right)\frac{\partial g(f,\lambda)}{\partial\lambda} + \frac{\partial\mathcal{L}_{\lambda}}{\partial\lambda}
\end{aligned}\]

其中\(\frac{\partial\mathcal{L}_{\lambda}}{\partial\lambda}\)为关于参数\(\lambda\)的正则化项