import torch


class myOptimizer:
    def __init__(self, lr, mu, mu_square, adaptation_rate, transaction_cost):
        self.lr = lr                            # learning rate applied to the final update
        self.mu = mu                            # running mean of rewards
        self.mu_square = mu_square              # running mean of squared rewards
        self.adaptation_rate = adaptation_rate  # EMA rate for the running moments
        self.last_gradient = 0.0                # position gradient from the previous step
        self.transaction_cost = transaction_cost

    def step(self, grad_n, reward, last_observation):
        # Sensitivity of a Sharpe-ratio-style objective to the latest reward,
        # built from the running first and second moments.
        numerator = self.mu_square - (self.mu * reward)
        denominator = torch.sqrt((self.mu_square - (self.mu ** 2)) ** 3)
        gradient = numerator / (denominator + 1e-8)  # epsilon guards against division by zero

        # Update the running moments with an exponential moving average.
        self.mu = self.mu + self.adaptation_rate * (reward - self.mu)
        self.mu_square = self.mu_square + self.adaptation_rate * ((reward ** 2) - self.mu_square)

        # Chain rule through the reward: the current position gradient is penalised by the
        # transaction cost; the previous one is scaled by the last observation plus that cost.
        current_grad = -1.0 * self.transaction_cost * grad_n
        previous_grad = (last_observation + self.transaction_cost) * self.last_gradient
        gradient = gradient * (current_grad + previous_grad)

        self.last_gradient = grad_n
        return self.lr * gradient
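

# --- Hypothetical usage sketch (not part of the original file) ---
# A minimal example of how this optimizer might be driven in a trading loop.
# The initial moments, reward stream, observations, and grad_n values below are
# made-up placeholders; mu and mu_square are passed as torch scalars because
# torch.sqrt in step() expects a tensor input.
if __name__ == "__main__":
    optimizer = myOptimizer(
        lr=0.01,
        mu=torch.tensor(0.0),          # assumed initial mean reward
        mu_square=torch.tensor(0.0),   # assumed initial mean squared reward
        adaptation_rate=0.1,
        transaction_cost=0.001,
    )
    weight = torch.tensor(0.5)              # hypothetical policy parameter
    rewards = [0.02, -0.01, 0.015]          # placeholder per-step returns
    observations = [0.01, 0.02, -0.01]      # placeholder previous-step returns
    for reward, last_obs in zip(rewards, observations):
        grad_n = torch.tensor(1.0)          # placeholder d(position)/d(weight)
        update = optimizer.step(grad_n, reward, last_obs)
        weight = weight + update            # gradient-style parameter update
    print(weight)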