data(Maze)
# solve the MDP with no discounting
sol <- solve_MDP(Maze, discount = 1)
sol
# U in the policy is an estimate of the utility of being in a state when following the optimal policy.
policy(sol)
# display the optimal action for each state arranged in the 3x4 maze layout
# (rows flipped so that row 1 is at the bottom)
matrix(policy(sol)[[1]]$action, nrow = 3, dimnames = list(1:3, 1:4))[3:1, ]
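# The same reshaping can be used to show the estimated U values in the maze layout
# (a sketch assuming policy(sol)[[1]] contains the U column shown above).
matrix(policy(sol)[[1]]$U, nrow = 3, dimnames = list(1:3, 1:4))[3:1, ]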
## Example 1: simulate 100 trajectories with a horizon of 10 following the policy;
# the individual visited states are not returned
sim <- simulate_MDP(sol, n = 100, horizon = 10, verbose = TRUE)
sim
# Note that all simulations start in state s_1 and that the simulated avg. reward
# is therefore an estimate of the U value for the start state s_1.
policy(sol)[[1]][1,]
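# The simulated average reward should be close to this U estimate
# (up to the truncation at horizon 10 and sampling error).
sim$avg_reward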
# Calculate proportion of actions taken in the simulation
round_stochastic(sim$action_cnt / sum(sim$action_cnt), 2)
# reward distribution
hist(sim$reward)
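# A quick numeric summary of the same reward distribution.
summary(sim$reward)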
## Example 2: simulate trajectories with start states drawn from a uniform
# distribution over all states and return all visited states
sim <- simulate_MDP(sol, n = 100, start = "uniform", horizon = 10, return_states = TRUE)
sim$avg_reward
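# As a rough sanity check (a sketch, ignoring the truncation at horizon 10):
# with a uniform start distribution, the average reward can be compared to the
# mean of the U values over all states.
mean(policy(sol)[[1]]$U)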
# how often was each state visited?
table(sim$states)
# the same visit counts arranged in the 3x4 maze layout
matrix(table(sim$states), nrow = 3, dimnames = list(1:3, 1:4))[3:1, ]
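# The visit counts can also be shown as proportions of all visited states.
round(prop.table(table(sim$states)), 2)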