深度逆向强化学习公式推导
符号标识
\[\begin{matrix} \xi = \{ s_{1},a_{1},\ldots,s_{t},a_{t} \}\tag{1} \\ \end{matrix}\]
\[\begin{matrix} R_{\lambda}(\xi) = \sum_{t}^{}{r_{\lambda}(s_{t},a_{t})}\ \tag{2} \\ \end{matrix}\]
最优化公式
\[\begin{matrix} \max\ \sum_{Path\xi_{i}}^{}{- p\left( \xi_{i} \right)\log p\left( \xi_{i} \right)}\ \ \tag{3} \\ \end{matrix}\]
\[\begin{matrix} s.t.\ \sum_{Path\xi_{i}}^{}{p\left( \xi_{i} \right)f_{\xi_{i}}} = \widetilde{f}\tag{4} \\ \end{matrix}\]
\[\begin{matrix} \sum_{Path\xi_{i}}^{}{p\left( \xi_{i} \right) = 1}\tag{5} \\ \end{matrix}\]
利用拉格朗日乘子法,该优化问题可以转化为:
\[\begin{matrix} \min L = \sum_{\xi_{i}}^{}{p\log p} - \sum_{\xi_{i} = 1}^{n}{\lambda_{\xi_{i}}\left( pf_{\xi_{i}} - \widetilde{f} \right)} - \lambda_{0}\left( \sum_{\xi_{i}}^{}p - 1 \right)\ \ \tag{6} \\ \end{matrix}\]
对概率p进行微分,并令导数为0,可以得到:
\[\begin{matrix} \frac{\partial L}{\partial p} = \log p + 1 - \sum_{\xi_{i} = 1}^{n}{\lambda_{\xi_{i}}f_{\xi_{i}}} - \lambda_{0} = 0\tag{7} \\ \end{matrix}\]
\[\begin{matrix} p = \frac{\exp\left( \sum_{\xi_{i} = 1}^{n}{\lambda_{\xi_{i}}f_{\xi_{i}}} \right)}{\exp\left( 1 - \lambda_{0} \right)}\tag{8} \\ \end{matrix}\]
轨迹的加和同样等同于状态的加和,即\(\sum_{\xi_{i} = 1}^{n}{\lambda_{\xi_{i}}f_{\xi_{i} } } = \sum_{j = 1}^{n}{\lambda_{j}f_{j} }\),因此式(8)可改写为
\[\begin{matrix} p = \frac{\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)}{\exp\left( 1 - \lambda_{0} \right)} = \frac{1}{Z}\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)\tag{9} \\ \end{matrix}\]
将式(9)代入式(5)可得
\[\begin{matrix} \sum_{Path\xi_{i}}^{}{\frac{1}{Z}\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)} = 1\tag{10} \\ \end{matrix}\]
因此:
\[\begin{matrix} Z = \sum_{Path\xi_{i}}^{}{\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)}\tag{11} \\ \end{matrix}\]
将式(11)代入式(9)可得:
\[\begin{matrix} P_{\lambda} = P\left( \xi_{i}|\lambda \right) = \frac{\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)}{\sum_{Path\xi_{i}}^{}{\exp\left( \sum_{j = 1}^{n}{\lambda_{j}f_{j}} \right)}}\tag{12} \\ \end{matrix}\]
最大似然
利用最大似然函数对式(12)进行求解
\[\begin{matrix} \lambda = \underset{\lambda}{\arg\max}\sum_{\xi \in \Xi}^{}{\log p\left( \xi \middle| \lambda \right)}\tag{13} \\ \end{matrix}\]
令\(L(\lambda) = \sum_{\xi \in \Xi}^{}{\log p\left( \xi \middle| \lambda \right)}\),则
\[\begin{aligned} L(\lambda) &= \sum_{\xi \in \Xi}^{}{\log p\left( \xi \middle| \lambda \right)} \\ &= \sum_{\xi \in \Xi}^{}{\sum_{j = 1}^{n}{\lambda_{j}f_{j}}} - \sum_{\xi \in \Xi}^{}{\log Z_{\lambda}} \\ &= \sum_{\xi \in \Xi}^{}{R_{\lambda}(\xi)} - \sum_{\xi \in \Xi}^{}{\log Z_{\lambda}} \end{aligned}\tag{14}\]
因此,对式(14)求导可得
\[\begin{aligned} \nabla L(\lambda) &= \sum_{\xi \in \Xi}^{}\frac{dR_{\lambda}(\xi)}{d\lambda} - \sum_{\xi \in \Xi}^{}\frac{1}{\sum_{\Xi}^{}{\exp\left( R_{\lambda}(\xi) \right)}}\sum_{\xi \in \Xi}^{}{\exp\left( R_{\lambda}(\xi) \right)}\frac{dR_{\lambda}(\xi)}{d\lambda} \\ &= \sum_{\xi \in \Xi}^{}\frac{dR_{\lambda}(\xi)}{d\lambda} - \sum_{\xi \in \Xi}^{}{\sum_{\xi \in \Xi}^{}{p\left( \xi \middle| \lambda \right)}\frac{dR_{\lambda}(\xi)}{d\lambda}} \\ &= \frac{dR_{\lambda}(\xi)}{d\lambda} - \sum_{\xi \in \Xi}^{}{p\left( \xi \middle| \lambda \right)\frac{dR_{\lambda}(\xi)}{d\lambda}} \\ &= \widetilde{f} - \sum_{\xi \in \Xi}^{}{p\left( \xi \middle| \lambda \right)f_{\xi}} \end{aligned}\tag{15}\]
轨迹的加和同样等同于状态的加和,记\(D_{s_{i}}\)为状态\(s_{i}\)的期望访问频率,则式(15)最终简化为
\[\begin{matrix} \nabla L(\lambda) = \widetilde{f} - \sum_{s_{i}}^{}{D_{s_{i}}f_{s_{i}}}\tag{16} \\ \end{matrix}\]
此外,式子(15)还可以看作\(\frac{\partial\mathcal{L}_{\mathcal{D}}}{\partial R}\frac{\partial R}{\partial\lambda}\)的形式,则式(15)可以改写为:
\[\begin{aligned} \nabla L(\lambda) &= \sum_{\xi \in \Xi}^{}\frac{dR_{\lambda}(\xi)}{d\lambda} - \sum_{\xi \in \Xi}^{}{p\left( \xi \middle| \lambda \right)}\frac{dR_{\lambda}(\xi)}{d\lambda} \\ &= \sum_{\xi \in \Xi}^{}{\left( 1 - p\left( \xi \middle| \lambda \right) \right)\frac{dR_{\lambda}(\xi)}{d\lambda}} \\ &= \left( \mu_{D} - E\lbrack\mu\rbrack \right)\frac{\partial g(f,\lambda)}{\partial\lambda} \end{aligned}\tag{17}\]
最大后验概率
\[p(\lambda|\xi) = \frac{p(\xi|\lambda)p(\lambda)}{p(\xi)}\]
利用最大后验概率对式(12)进行求解
\[\lambda = \underset{\lambda}{\arg\max}\sum_{\xi \in \Xi}^{}{\log\left( p\left( \xi \middle| \lambda \right)p(\lambda) \right)}\]
\[\begin{aligned} L(\lambda) &= \sum_{\xi \in \Xi}^{}{\log\left( p\left( \xi \middle| \lambda \right)p(\lambda) \right)} \\ &= \sum_{\xi \in \Xi}^{}\left( \log p\left( \xi \middle| \lambda \right) + \log p(\lambda) \right) \\ &= \sum_{\xi \in \Xi}^{}{(\mathcal{L}_{\mathcal{D}} + \mathcal{L}_{\lambda})} \end{aligned}\]
因此
\[\begin{aligned} \nabla L(\lambda) &= \sum_{\xi \in \Xi}^{}\left( \frac{\partial\mathcal{L}_{\mathcal{D}}}{\partial R}\frac{\partial R}{\partial\lambda} + \frac{\partial\mathcal{L}_{\lambda}}{\partial\lambda} \right) \\ &= \sum_{\xi \in \Xi}^{}{\left( 1 - p\left( \xi \middle| \lambda \right) \right)\frac{\partial g(f,\lambda)}{\partial\lambda}} + \frac{\partial\mathcal{L}_{\lambda}}{\partial\lambda} \\ &= \left( \mu_{D} - E\lbrack\mu\rbrack \right)\frac{\partial g(f,\lambda)}{\partial\lambda} + \frac{\partial\mathcal{L}_{\lambda}}{\partial\lambda} \end{aligned}\]
其中\(\frac{\partial\mathcal{L}_{\lambda}}{\partial\lambda}\)为正则化项\(\mathcal{L}_{\lambda}\)关于参数\(\lambda\)的梯度。