# Code Reference: https://github.com/OptMLGroup/DeepBeerInventory-RL.
import argparse

import numpy as np


# Defines the Agent class for the BeerGame.
class Agent(object):
    # initializes the agent with its initial values for IL, AO, and AS, and saves self.agentNum for identifying the agent.
    def __init__(
        self, agentNum: int, IL: int, AO: int, AS: int, c_h: float, c_p: float, eta: int, compuType: str,
        config: argparse.Namespace
    ) -> None:
        self.agentNum = agentNum
        self.IL = IL  # inventory level of the agent - changes during the game
        self.OO = 0  # open order of the agent - changes during the game
        self.ASInitial = AS  # the initial arriving shipment
        self.ILInitial = IL  # the inventory level at the start of each game
        self.AOInitial = AO  # the arriving order at the start of each game
        self.config = config  # an instance of config is stored inside the class
        self.curState = []  # the current state of the game
        self.nextState = []
        self.curReward = 0  # the reward observed at the current step
        self.cumReward = 0  # cumulative reward; reset at the beginning of each episode
        self.totRew = 0  # total reward of all players, obtained for the current player
        self.c_h = c_h  # holding cost
        self.c_p = c_p  # backorder cost
        self.eta = eta  # the total-cost regularizer
        self.AS = np.zeros((1, 1))  # arrived shipment
        self.AO = np.zeros((1, 1))  # arrived order
        self.action = 0  # the action at time t
        self.compType = compuType
        # self.compTypeTrain = compuType  # rnd -> random / srdqn -> srdqn / Strm -> formula-Rong2008 / bs -> optimal policy if exists
        # self.compTypeTest = compuType  # rnd -> random / srdqn -> srdqn / Strm -> formula-Rong2008 / bs -> optimal policy if exists
        self.alpha_b = self.config.alpha_b[self.agentNum]  # parameter of the formula (Strm) policy
        self.betta_b = self.config.betta_b[self.agentNum]  # parameter of the formula (Strm) policy
        if self.config.demandDistribution == 0:
            self.a_b = np.mean((self.config.demandUp, self.config.demandLow))  # parameter of the formula (Strm) policy
            self.b_b = np.mean((self.config.demandUp, self.config.demandLow)) * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            )  # parameter of the formula (Strm) policy
        elif self.config.demandDistribution in (1, 3, 4):
            self.a_b = self.config.demandMu
            self.b_b = self.config.demandMu * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            )
        elif self.config.demandDistribution == 2:
            self.a_b = 8
            self.b_b = (3 / 4.) * 8 * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            )
        elif self.config.demandDistribution == 3:
            # note: this branch is unreachable, since demandDistribution == 3 is already handled above
            self.a_b = 10
            self.b_b = 7 * (
                np.mean((self.config.leadRecItemLow[self.agentNum], self.config.leadRecItemUp[self.agentNum])) +
                np.mean((self.config.leadRecOrderLow[self.agentNum], self.config.leadRecOrderUp[self.agentNum]))
            )
        else:
            raise Exception('The demand distribution is not defined or it is not a valid type.')
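        # Illustration (not part of the original code): with uniform demand on
        # [demandLow, demandUp] = [0, 2] and item/order lead-time bounds of (2, 2) for this
        # agent, the demandDistribution == 0 branch gives a_b = mean(0, 2) = 1.0 and
        # b_b = 1.0 * (2 + 2) = 4.0. These numbers are assumed purely for illustration.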
        self.hist = []  # used for plotting - keeps the history of a single game
        self.hist2 = []  # used for animation
        self.srdqnBaseStock = []  # holds the base-stock levels that srdqn has come up with; added on Nov 8, 2017
        self.T = 0
        self.bsBaseStock = 0
        self.init_bsBaseStock = 0
        self.nextObservation = []
        if self.compType == 'srdqn':
            # sets the initial input of the network
            self.currentState = np.stack(
                [self.curState for _ in range(self.config.multPerdInpt)], axis=0
            )  # multPerdInpt observations stacked; each row is one observation
    # reset player information
    def resetPlayer(self, T: int):
        self.IL = self.ILInitial
        self.OO = 0
        self.AS = np.squeeze(
            np.zeros((1, T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10))
        )  # arrived shipment
        self.AO = np.squeeze(
            np.zeros((1, T + max(self.config.leadRecItemUp) + max(self.config.leadRecOrderUp) + 10))
        )  # arrived order
        if self.agentNum != 0:
            for i in range(self.config.leadRecOrderUp_aux[self.agentNum - 1]):
                self.AO[i] = self.AOInitial[self.agentNum - 1]
        for i in range(self.config.leadRecItemUp[self.agentNum]):
            self.AS[i] = self.ASInitial
        self.curReward = 0  # the reward observed at the current step
        self.cumReward = 0  # cumulative reward; reset at the beginning of each episode
        self.action = []
        self.hist = []
        self.hist2 = []
        self.srdqnBaseStock = []  # holds the base-stock levels that srdqn has come up with; added on Nov 8, 2017
        self.T = T
        self.curObservation = self.getCurState(1)  # gets the current state of the game
        self.nextObservation = []
        if self.compType == 'srdqn':
            self.currentState = np.stack([self.curObservation for _ in range(self.config.multPerdInpt)], axis=0)
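    # Illustration (not part of the original code): for a horizon of T = 100 with
    # max(leadRecItemUp) = 2 and max(leadRecOrderUp) = 2, AS and AO become 1-D arrays of
    # length 100 + 2 + 2 + 10 = 114, presumably to leave room for shipments and orders
    # that arrive after the horizon. The lead-time values are assumed here only for
    # illustration.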
    # updates the IL and OO at time t, after receiving "rec" number of items
    def recieveItems(self, time: int) -> None:
        self.IL = self.IL + self.AS[time]  # inventory-level update
        self.OO = self.OO - self.AS[time]  # in-transit (open-order) inventory update
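    # Illustration (not part of the original code): if IL = 2, OO = 5, and AS[time] = 3,
    # then after recieveItems(time) the agent holds IL = 5 units on hand and OO = 2 units
    # remain in transit. These numbers are hypothetical.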
    # returns the order quantity associated with the selected action (argmax of self.action)
    def actionValue(self, curTime: int) -> int:
        if self.config.fixedAction:
            a = self.config.actionList[np.argmax(self.action)]
        else:
            # "d + x" rule: order the incoming order AO[curTime] plus the offset x chosen by the agent
            if self.compType == 'srdqn':
                a = max(0, self.config.actionList[np.argmax(self.action)] * self.config.action_step + self.AO[curTime])
            elif self.compType == 'rnd':
                a = max(0, self.config.actionList[np.argmax(self.action)] + self.AO[curTime])
            else:
                a = max(0, self.config.actionListOpt[np.argmax(self.action)])
        return a
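    # Illustration (not part of the original code): under the "d + x" rule with
    # actionList = [-2, -1, 0, 1, 2], action_step = 1, an incoming order AO[curTime] = 4,
    # and argmax(self.action) = 4 (i.e. x = +2), an srdqn agent orders
    # max(0, 2 * 1 + 4) = 6 units. The action list and demand value are assumed here
    # purely for illustration.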
    # getReward computes the reward at the current state
    def getReward(self) -> None:
        # cost (holding + backorder) for one time unit
        self.curReward = (self.c_p * max(0, -self.IL) + self.c_h * max(0, self.IL)) / 200.  # self.config.Ttest #
        self.curReward = -self.curReward  # make the reward negative, because it is a cost
        # discounted cumulative reward of this agent
        self.cumReward = self.config.gamma * self.cumReward + self.curReward
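    # Illustration (not part of the original code): with c_p = 2, c_h = 1, and IL = -3
    # (3 units backordered), the per-period cost is (2 * 3 + 1 * 0) / 200 = 0.03, so
    # curReward = -0.03; with IL = +4 it would be -(1 * 4) / 200 = -0.02. The cost
    # coefficients are hypothetical.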
    # This function returns a np.array of the current state of the agent
    def getCurState(self, t: int) -> np.ndarray:
        if self.config.ifUseASAO:
            if self.config.if_use_AS_t_plus_1:
                curState = np.array(
                    [-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO, self.AS[t], self.AO[t]]
                )
            else:
                curState = np.array(
                    [-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO, self.AS[t - 1], self.AO[t]]
                )
        else:
            curState = np.array([-1 * (self.IL < 0) * self.IL, 1 * (self.IL > 0) * self.IL, self.OO])
        if self.config.ifUseActionInD:
            a = self.config.actionList[np.argmax(self.action)]
            curState = np.concatenate((curState, np.array([a])))
        return curState
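

# Minimal usage sketch (not part of the original code). The config fields below are the
# ones referenced by the Agent class; their values are placeholders chosen only so that
# the snippet runs, not values from the original project.
if __name__ == '__main__':
    config = argparse.Namespace(
        alpha_b=[-0.5], betta_b=[-0.2], demandDistribution=0, demandLow=0, demandUp=2,
        leadRecItemLow=[2], leadRecItemUp=[2], leadRecOrderLow=[2], leadRecOrderUp=[2],
        leadRecOrderUp_aux=[2], multPerdInpt=10, gamma=0.99, fixedAction=False,
        actionList=[-2, -1, 0, 1, 2], actionListOpt=[0], action_step=1,
        ifUseASAO=True, if_use_AS_t_plus_1=False, ifUseActionInD=False,
    )
    agent = Agent(agentNum=0, IL=10, AO=4, AS=4, c_h=1.0, c_p=2.0, eta=1, compuType='rnd', config=config)
    agent.resetPlayer(T=100)
    print('initial state:', agent.getCurState(t=1))
    agent.action = np.array([0, 0, 0, 0, 1])  # select x = +2 under the "d + x" rule
    print('order quantity:', agent.actionValue(curTime=1))
    agent.recieveItems(time=1)
    agent.getReward()
    print('current reward:', agent.curReward)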