USING OPENAI WITH ROS: Unit 2: Problem with calling the SARSA algorithm

Hi,

I am trying to complete exercise 2.2, where start_training.py needs to call sarsa.py. I am confused about how to obtain action2 within start_training.py.

Thanks & Regards,
Soumik Basu

Hi @BlueCap ,

Could you please share your rosject with me?
That is the best way for us to reproduce the error.

I have a start_training.py that used to call qlearn.py; now I need it to call sarsa.py instead. In sarsa.py, the learn(self, state1, action1, reward, state2, action2) method needs one extra parameter, action2. How can I compute that action2 inside start_training.py?

start_training.py ==>

#!/usr/bin/env python

import gym
import time
import numpy
import random
import qlearn
from gym import wrappers

import rospy
import rospkg
from functools import reduce

from openai_ros.task_envs.cartpole_stay_up import stay_up

if __name__ == '__main__':

    rospy.init_node('cartpole_gym', anonymous=True, log_level=rospy.WARN)

    # Create the Gym environment
    env = gym.make('CartPoleStayUp-v0')
    rospy.loginfo("Gym environment done")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('cartpole_v0_training')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = numpy.ndarray(0)

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
    # They are loaded at runtime by the launch file
    Alpha = rospy.get_param("/cartpole_v0/alpha")
    Epsilon = rospy.get_param("/cartpole_v0/epsilon")
    Gamma = rospy.get_param("/cartpole_v0/gamma")
    epsilon_discount = rospy.get_param("/cartpole_v0/epsilon_discount")
    nepisodes = rospy.get_param("/cartpole_v0/nepisodes")
    nsteps = rospy.get_param("/cartpole_v0/nsteps")
    running_step = rospy.get_param("/cartpole_v0/running_step")

    # Initialises the algorithm that we are going to use for learning
    qlearn = qlearn.QLearn(actions=range(env.action_space.n),
                           alpha=Alpha, gamma=Gamma, epsilon=Epsilon)
    initial_epsilon = qlearn.epsilon

    start_time = time.time()
    highest_reward = 0

    # Starts the main training loop: the one about the episodes to do
    for x in range(nepisodes):
        rospy.logdebug("############### START EPISODE => " + str(x))

        cumulated_reward = 0
        done = False
        if qlearn.epsilon > 0.05:
            qlearn.epsilon *= epsilon_discount

        # Initialize the environment and get first state of the robot
        observation = env.reset()
        state = ''.join(map(str, observation))

        # Show on screen the actual situation of the robot
        # for each episode, we test the robot for nsteps
        for i in range(nsteps):

            rospy.loginfo("############### Start Step => " + str(i))
            # Pick an action based on the current state
            action = qlearn.chooseAction(state)
            rospy.loginfo("Next action is: %d", action)
            # Execute the action in the environment and get feedback
            observation, reward, done, info = env.step(action)
            rospy.loginfo(str(observation) + " " + str(reward))
            cumulated_reward += reward
            if highest_reward < cumulated_reward:
                highest_reward = cumulated_reward

            nextState = ''.join(map(str, observation))

            # Make the algorithm learn based on the results
            #rospy.logwarn("############### State we were => " + str(state))
            #rospy.logwarn("############### Action that we took => " + str(action))
            #rospy.logwarn("############### Reward that action gave => " + str(reward))
            #rospy.logwarn("############### State in which we will start next step => " + str(nextState))
            qlearn.learn(state, action, reward, nextState)

            if not done:
                state = nextState
            else:
                rospy.loginfo("DONE")
                last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
                break
            rospy.loginfo("############### END Step =>" + str(i))
            #raw_input("Next Step...PRESS KEY")
            #rospy.sleep(2.0)
        m, s = divmod(int(time.time() - start_time), 60)
        h, m = divmod(m, 60)
        rospy.logwarn(("EP: " + str(x + 1) + " - [alpha: " + str(round(qlearn.alpha, 2)) + " - gamma: " + str(round(qlearn.gamma, 2)) + " - epsilon: " + str(round(qlearn.epsilon, 2)) + "] - Reward: " + str(cumulated_reward) + "     Time: %d:%02d:%02d" % (h, m, s)))

    rospy.loginfo(("\n|" + str(nepisodes) + "|" + str(qlearn.alpha) + "|" + str(qlearn.gamma) + "|" + str(initial_epsilon) + "*" + str(epsilon_discount) + "|" + str(highest_reward) + "| PICTURE |"))

    l = last_time_steps.tolist()
    l.sort()

    #print("Parameters: a="+str)
    rospy.loginfo("Overall score: {:0.2f}".format(last_time_steps.mean()))
    rospy.loginfo("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))

    env.close()

sarsa.py ==>

import random

class Sarsa:
    def __init__(self, actions, epsilon=0.1, alpha=0.2, gamma=0.9):
        self.q = {}

        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.actions = actions

    def getQ(self, state, action):
        return self.q.get((state, action), 0.0)

    def learnQ(self, state, action, reward, value):
        oldv = self.q.get((state, action), None)
        if oldv is None:
            self.q[(state, action)] = reward
        else:
            self.q[(state, action)] = oldv + self.alpha * (value - oldv)

    def chooseAction(self, state):
        if random.random() < self.epsilon:
            action = random.choice(self.actions)
        else:
            q = [self.getQ(state, a) for a in self.actions]
            maxQ = max(q)
            count = q.count(maxQ)
            if count > 1:
                best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                i = random.choice(best)
            else:
                i = q.index(maxQ)

            action = self.actions[i]
        return action

    def learn(self, state1, action1, reward, state2, action2):
        qnext = self.getQ(state2, action2)
        self.learnQ(state1, action1, reward, reward + self.gamma * qnext)
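
For reference, here is a minimal sketch of how the step loop in start_training.py would typically change for SARSA, assuming the Sarsa class above. It is a sketch rather than the official exercise solution, and names such as sarsa_agent and nextAction are placeholders: the idea is that action2 is simply the action chosen for nextState, picked before calling learn() and then reused as the action executed on the following step.

# Sketch only (assumes the Sarsa class above); not the official exercise solution.
# SARSA is on-policy: the action for the next state is chosen *before* learning,
# and that same action is then executed on the next step.
import sarsa

sarsa_agent = sarsa.Sarsa(actions=range(env.action_space.n),
                          alpha=Alpha, gamma=Gamma, epsilon=Epsilon)

for x in range(nepisodes):
    observation = env.reset()
    state = ''.join(map(str, observation))
    # Pick the first action once, before entering the step loop
    action = sarsa_agent.chooseAction(state)

    for i in range(nsteps):
        # Execute the current action and observe the outcome
        observation, reward, done, info = env.step(action)
        nextState = ''.join(map(str, observation))

        # This is action2: the action chosen for the next state
        nextAction = sarsa_agent.chooseAction(nextState)

        # learn(state1, action1, reward, state2, action2)
        sarsa_agent.learn(state, action, reward, nextState, nextAction)

        if done:
            break
        # Carry both the state and the already-chosen action forward
        state = nextState
        action = nextAction

Compared with the Q-learning loop, the only real change is that chooseAction() is called on nextState before learn(), and that result is reused as the next step's action instead of choosing a fresh action at the top of the loop. Epsilon decay and logging would stay as in the original script.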