Implementing advanced algorithms on local environments (Drone implementation)

Hi,

I’m currently working on a project to teach a drone how to hover.
I’m simply using [openai_ros] and for the training algorithm leveraging algorithms from openai baselines

After taking the courses, I have successfully implemented algorithms like q-learning and deepq using discrete action space. But faced few problems when I tried to used more advanced algorithms like a2c ppo on continous action space.

I think i have set the task env and training env right because the simulation starts to run when I launch the launch file for the training. But for a2c and ppo, the simulations does not reset. I can see from the terminal that the episode has ended for some reason (e.g drone got flipped, went outside of the bounded area etc.) but it just does not do the reset.

I’m assuming that this might be because of the different env settings for these algorithms since they sometimes allow multiple environment to run at the same time. Is there something that I should take care of before running these two algorithms?

This is the code for my training
Any sort of help would be appreciated thank you :smiley:

#!/usr/bin/env python

import os
import gym
from baselines.a2c import a2c
import rospy
import rospkg

# import our task environment
import hummingbird_hover_task_env_a2c

def main():
    rospy.init_node('hummingbird_train_a2c', anonymous=True, log_level=rospy.WARN)

    # Set the path where learned model will be saved
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('hummingbird_pkg')
    models_dir_path = os.path.join(pkg_path, "models_saved")
    if not os.path.exists(models_dir_path):
        os.makedirs(models_dir_path)

    out_model_file_path = os.path.join(models_dir_path, "hummgingbird_hover_model_a2c.pkl")

    nsteps = rospy.get_param("/hummingbird/nsteps")
    total_timesteps= rospy.get_param("/hummingbird/total_timesteps")
    vf_coef = rospy.get_param("/hummingbird/vf_coef")
    ent_coef = rospy.get_param("/hummingbird/ent_coef")
    max_grad_norm = rospy.get_param("/hummingbird/max_grad_norm")
    lrschedule = rospy.get_param("/hummingbird/lrschedule")
    epsilon = rospy.get_param("/hummingbird/epsilon")
    alpha = rospy.get_param("/hummingbird/alpha")
    gamma = rospy.get_param("/hummingbird/gamma")
    log_interval = rospy.get_param("/hummingbird/log_interval")

    # max_timesteps = rospy.get_param("/hummingbird/max_timesteps")
    # buffer_size = rospy.get_param("/hummingbird/buffer_size")
    # We convert to float becase if we are using Ye-X notation, it sometimes treats it like a string.
    lr = float(rospy.get_param("/hummingbird/lr"))

    # exploration_fraction = rospy.get_param("/hummingbird/exploration_fraction")
    # exploration_final_eps = rospy.get_param("/hummingbird/exploration_final_eps")
    # print_freq = rospy.get_param("/hummingbird/print_freq")

    # reward_task_learned = rospy.get_param("/hummingbird/reward_task_learned")


    # def callback(lcl, _glb):
    #     # stop training if reward exceeds 199
    #     aux1 = lcl['t'] > 100
    #     aux2 = sum(lcl['episode_rewards'][-101:-1]) / 100
    #     is_solved = aux1 and aux2 >= reward_task_learned
    #
    #     rospy.logdebug("aux1="+str(aux1))
    #     rospy.logdebug("aux2="+str(aux2))
    #     rospy.logdebug("reward_task_learned="+str(reward_task_learned))
    #     rospy.logdebug("IS SOLVED?="+str(is_solved))
    #
    #     return is_solved

    env = gym.make("HummingbirdHoverTaskEnvA2C-v0")
    network = 'mlp'
    act = a2c.learn(
        network,
        env,
        seed=None,
        nsteps=5,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100,
        load_path=out_model_file_path
    )

    env.close()


if __name__ == '__main__':
    main()

    # for x in range(nepisodes):
    #     rospy.logdebug("############### START EPISODE => " + str(x))
    #
    #     cumulated_reward = 0
    #     done = False
    #     if qlearn.epsilon > 0.05:
    #         qlearn.epsilon *= epsilon_discount
    #
    #     # Initialize the environment and get first state of the robot
    #     observation = env.reset()
    #     state = ''.join(map(str, observation))
    #
    #     # Show on screen the actual situation of the robot
    #     # for each episode, we test the robot for nsteps
    #     for i in range(nsteps):
    #
    #         rospy.loginfo("############### Start Step => "+str(i))
    #         # Pick an action based on the current state
    #         action = qlearn.chooseAction(state)
    #         rospy.loginfo ("Next action is: %d", action)
    #         # Execute the action in the environment and get feedback
    #         observation, reward, done, info = env.step(action)
    #         rospy.loginfo(str(observation) + " " + str(reward))
    #         cumulated_reward += reward
    #         if highest_reward < cumulated_reward:
    #             highest_reward = cumulated_reward
    #
    #         nextState = ''.join(map(str, observation))
    #
    #         # Make the algorithm learn based on the results
    #         #rospy.logwarn("############### State we were => " + str(state))
    #         #rospy.logwarn("############### Action that we took => " + str(action))
    #         #rospy.logwarn("############### Reward that action gave => " + str(reward))
    #         #rospy.logwarn("############### State in which we will start next step => " + str(nextState))
    #         qlearn.learn(state, action, reward, nextState)
    #
    #         if not(done):
    #             state = nextState
    #         else:
    #             rospy.loginfo ("DONE")
    #             last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
    #             break
    #         rospy.loginfo("############### END Step =>" + str(i))
    #         #raw_input("Next Step...PRESS KEY")
    #         #rospy.sleep(2.0)
    #     m, s = divmod(int(time.time() - start_time), 60)
    #     h, m = divmod(m, 60)
    #     rospy.logwarn ( ("EP: "+str(x+1)+" - [alpha: "+str(round(qlearn.alpha,2))+" - gamma: "+str(round(qlearn.gamma,2))+" - epsilon: "+str(round(qlearn.epsilon,2))+"] - Reward: "+str(cumulated_reward)+"     Time: %d:%02d:%02d" % (h, m, s)))
    #
    #
    # rospy.loginfo ( ("\n|"+str(nepisodes)+"|"+str(qlearn.alpha)+"|"+str(qlearn.gamma)+"|"+str(initial_epsilon)+"*"+str(epsilon_discount)+"|"+str(highest_reward)+"| PICTURE |"))
    #
    # l = last_time_steps.tolist()
    # l.sort()
    #
    # #print("Parameters: a="+str)
    # rospy.loginfo("Overall score: {:0.2f}".format(last_time_steps.mean()))
    # rospy.loginfo("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))

Hi Kim,
the code that you are posting is just the part of the training script (according to the OpenAI-ROS architecture, see image below). It looks like it is correct.

Assuming that you have used the same RobotEnvironment for the drone that OpenAI-ROS provides off-the-shelf (I mean, you did not modify it), I think that the problem may be in your TaskEnvironment.

Can you share your rosject with me so I can have a look at it?

Dear rtellez,

Hope you are doing well.
Sorry for the late response. I had some personal issues and couldn’t spend any time on this project for couple of months. :frowning: However, now everyhing is resolved and I’m back!

Back to my question. Basically what I am trying to do right now is to as an extension to the course I took from here, I’m working on a personal project to teach a drone to hover using OpenaiRos on gazebo environment, with RotorS simulator: https://github.com/ethz-asl/rotors_simulator (This simulator offers multiple drone models, and also has actuators for the motor control as well – I’m also thinking of doing a test on a real drone after this project for completeness).

For this, I also somehow customized the RobotEnvironment a bit Like this

#! /usr/bin/env python

import numpy
import rospy
import time
from openai_ros import robot_gazebo_env
from std_msgs.msg import Float64
from sensor_msgs.msg import JointState
from sensor_msgs.msg import Imu
from mav_msgs.msg import Actuators
from nav_msgs.msg import Odometry



class HummingbirdHoverEnv(robot_gazebo_env.RobotGazeboEnv):
    """Superclass for all HummingbirdHoverEnv environments.
    """

    def __init__(self):
        """
        Initializes a new HummingbirdHoverEnv environment.
        The Sensors: The sensors accessible are the ones considered useful for AI learning.

        Sensor Topic List:
        * /hummingbird/ground_truth/imu: IMU of the drone giving acceleration and orientation relative to world.
        * /hummingbird/ground_truth/odometry: ODOM
        * /hummingbird/motor_speed: Sensing the motor speed

        Actuators Topic List:
        * /hummingbird/command/motor_speed: Command the motor speed

        Args:
        """
        rospy.logdebug("Start HummingbirdHoverEnv INIT...")

        # Variables that we give through the constructor.
        # None in this case

        # Internal Vars
        # Doesnt have any accessible
        self.controllers_list = []

        self.robot_name_space = ""

        # We launch the init function of the Parent Class robot_gazebo_env.RobotGazeboEnv
        super(HummingbirdHoverEnv, self).__init__(controllers_list=self.controllers_list,
                                                  robot_name_space=self.robot_name_space,
                                                  reset_controls=False,
                                                  start_init_physics_parameters=False,
                                                  reset_world_or_sim="WORLD")

        self.gazebo.unpauseSim()

        # self.controllers_object.reset_controllers()
        self._check_all_sensors_ready()

        # We Start all the ROS related Subscribers and publishers
        rospy.Subscriber("/hummingbird/ground_truth/imu", Imu, self._imu_callback)
        rospy.Subscriber("/hummingbird/ground_truth/odometry", Odometry, self._odom_callback)
        rospy.Subscriber("/hummingbird/motor_speed", Actuators, self._actuators_callback)

        self._cmd_motor_speed_pub = rospy.Publisher('/hummingbird/command/motor_speed', Actuators, queue_size=1)
        # self._hummingbird_position_pub = rospy.Publisher('/hummingbird/command/motor_speed', JointState, queue_size=1)

        self._check_publishers_connection()
        self.gazebo.pauseSim()

        rospy.logdebug("Finished HummingbirdHoverEnv INIT...")

    # Methods needed by the RobotGazeboEnv
    # ----------------------------

    def _check_all_systems_ready(self):
        """
        Checks that all the sensors, publishers and other simulation systems are
        operational.
        """
        self._check_all_sensors_ready()
        return True


    # HummingbirdHoverEnv virtual methods
    # ----------------------------

    def _check_all_sensors_ready(self):
        self._check_imu_ready()
        self._check_odom_ready()
        self._check_actuators_ready()
        rospy.logdebug("ALL SENSORS READY")

    def _check_imu_ready(self):
        self.imu = None
        rospy.logdebug("Waiting for //hummingbird/ground_truth/imu to be READY...")
        while self.imu is None and not rospy.is_shutdown():
            try:
                self.imu = rospy.wait_for_message(
                    "/hummingbird/ground_truth/imu", Imu, timeout=5.0)
                rospy.logdebug("Current /hummingbird/ground_truth/imu READY=>")

            except:
                rospy.logerr(
                    "Current /hummingbird/ground_truth/imu not ready yet, retrying for getting imu")

        return self.imu

    def _check_odom_ready(self):
        self.odom = None
        while self.odom is None and not rospy.is_shutdown():
            try:
                self.odom = rospy.wait_for_message("/hummingbird/ground_truth/odometry", Odometry, timeout=1.0)
                rospy.logdebug("Current /hummingbird/ground_truth/odometry READY=>" + str(self.odom))

            except:
                rospy.logerr("Current /hummingbird/ground_truth/odometry not ready yet, retrying for getting odom")

        return self.odom

    def _check_actuators_ready(self):
        self.actuators = None
        while self.actuators is None and not rospy.is_shutdown():
            try:
                self.actuators = rospy.wait_for_message("/hummingbird/motor_speed", Actuators, timeout=1.0)
                rospy.logdebug("Current /hummingbird/motor_speed READY=>" + str(self.actuators))

            except:
                rospy.logerr("Current /hummingbird/motor_speed not ready yet, retrying for getting odom")

        return self.actuators

    def _imu_callback(self, data):
        self.imu = data

    def _odom_callback(self, data):
        self.odom = data

    def _actuators_callback(self, data):
        self.actuators = data

    def _check_publishers_connection(self):
        """
        Checks that all the publishers are working
        :return:
        """
        rate = rospy.Rate(10)  # 10hz
        while self._cmd_motor_speed_pub.get_num_connections() == 0 and not rospy.is_shutdown():
            rospy.logdebug("No subscribers to _cmd_motor_speed_pub yet so we wait and try again")
            try:
                rate.sleep()
            except rospy.ROSInterruptException:
                # This is to avoid error when world is rested, time when backwards.
                pass
        rospy.logdebug("_cmd_motor_speed_pub Publisher Connected")
        rospy.logdebug("All Publishers READY")

    # Methods that the TaskEnvironment will need to define here as virtual
    # because they will be used in RobotGazeboEnv GrandParentClass and defined in the
    # TaskEnvironment.
    # ----------------------------
    def _set_init_pose(self):
        """Sets the Robot in its init pose
        """
        raise NotImplementedError()

    def _init_env_variables(self):
        """Inits variables needed to be initialised each time we reset at the start
        of an episode.
        """
        raise NotImplementedError()

    def _compute_reward(self, observations, done):
        """Calculates the reward to give based on the observations given.
        """
        raise NotImplementedError()

    def _set_action(self, action):
        """Applies the given action to the simulation.
        """
        raise NotImplementedError()

    def _get_obs(self):
        raise NotImplementedError()

    def _is_done(self, observations):
        """Checks if episode done based on observations given.
        """
        raise NotImplementedError()

    # Methods that the TaskEnvironment will need.
    # ----------------------------
    def move_motor(self, motor_speed):
        motor_speed_value = Actuators()
        motor_speed_value.angular_velocities = motor_speed
        rospy.logdebug("Motor Velocity>>" + str(motor_speed_value))
        self._cmd_motor_speed_pub.publish(motor_speed_value)
        # self.wait_until_motor_is_in_vel(motor_speed_value.angular_velocities)

    # def init_hummingbird_pose(self, init_pose):
    #     init_pose_value = JointState()
    #     init_pose_value.position = init_pose
    #     rospy.logdebug("Init pose>>" + str(init_pose_value))
    #     self._hummingbird_position_pub.publish(init_pose_value)

    def get_imu(self):
        return self.imu

    def get_odom(self):
        return self.odom

    def get_actuators(self):
        return self.actuators

    def reinit_sensors(self):
        """
        This method is for the tasks so that when reseting the episode
        the sensors values are forced to be updated with the real data and

        """

And my Task environment goes like this (My initial goal was to make this all work, so I did not change parameters, or did reward shaping yet)

#! /usr/bin/env python

import rospy
import numpy
from gym import spaces
import hummingbird_hover_env
from gym.envs.registration import register
from geometry_msgs.msg import Point
from geometry_msgs.msg import Vector3
from std_msgs.msg import Float64
from tf.transformations import euler_from_quaternion
from openai_ros.task_envs.task_commons import LoadYamlFileParamsTest
max_episode_steps = 10000 # Can be any Value

register(
    id='HummingbirdHoverTaskEnvA2C-v0',
    entry_point='hummingbird_hover_task_env_a2c:HummingbirdHoverTaskEnv',
    max_episode_steps=max_episode_steps,
)   ## CHECK NAMING HERE

class HummingbirdHoverTaskEnv(hummingbird_hover_env.HummingbirdHoverEnv):
    def __init__(self):
        """
        Make hummingbird learn how to hover (Get to a point)
        """

        # Load Params from the desired Yaml file
        LoadYamlFileParamsTest(rospackage_name="hummingbird_pkg",
                               rel_path_from_package_to_file="config",
                               yaml_file_name="hummingbird_a2c_params.yaml")    ### Need change

        # Only variable needed to be set here
        self.num_envs = rospy.get_param('/hummingbird/num_envs')

        self.n_actions = rospy.get_param('/hummingbird/n_actions')

        self.rpm_max = rospy.get_param("/hummingbird/action/rpm_max")
        self.rpm_min = rospy.get_param("/hummingbird/action/rpm_min")

        a_high = numpy.array([self.rpm_max, #r1
                              self.rpm_max, #r2
                              self.rpm_max, #r3
                              self.rpm_max, #r4
                             ])

        a_low = numpy.array([self.rpm_min, #r1
                             self.rpm_min, #r2
                             self.rpm_min, #r3
                             self.rpm_min, #r4
                            ])

        self.action_space = spaces.Box(a_low, a_high)
        # self.action_space = spaces.Discrete(self.n_actions)
    # We set the reward range, which is not compulsory but here we do it.
        self.reward_range = (-numpy.inf, numpy.inf)

        # Actions and Observations
        self.init_motor_speed = rospy.get_param(
            '/hummingbird/init_motor_speed')
        # self.step_motor_speed = rospy.get_param(
        #     '/hummingbird/step_motor_speed')
        # self.init_linear_speed_vector = Vector3()
        # self.init_linear_speed_vector.x = rospy.get_param(
        #     '/hummingbird/init_linear_speed_vector/x')
        # self.init_linear_speed_vector.y = rospy.get_param(
        #     '/hummingbird/init_linear_speed_vector/y')
        # self.init_linear_speed_vector.z = rospy.get_param(
        #     '/hummingbird/init_linear_speed_vector/z')
        #
        # self.init_angular_turn_speed = rospy.get_param(
        #     '/hummingbird/init_angular_turn_speed')
        self.running_step = rospy.get_param(
            '/hummingbird/running_step')

        #self.init_point = Float64()
        #self.init_point.data = rospy.get_param("/hummingbird/init_position")

        # Get WorkSpace Cube Dimensions
        self.work_space_x_max = rospy.get_param("/hummingbird/work_space/x_max")
        self.work_space_x_min = rospy.get_param("/hummingbird/work_space/x_min")
        self.work_space_y_max = rospy.get_param("/hummingbird/work_space/y_max")
        self.work_space_y_min = rospy.get_param("/hummingbird/work_space/y_min")
        self.work_space_z_max = rospy.get_param("/hummingbird/work_space/z_max")
        self.work_space_z_min = rospy.get_param("/hummingbird/work_space/z_min")

        # Maximum RPY values
        self.max_roll = rospy.get_param("/hummingbird/max_roll")
        self.max_pitch = rospy.get_param("/hummingbird/max_pitch")
        self.max_yaw = rospy.get_param("/hummingbird/max_yaw")

        # Maximum linear v,a and angular w and a -- just define range

        # Get Desired Point to Get
        self.desired_point = Point()
        self.desired_point.x = rospy.get_param("/hummingbird/desired_pose/x")
        self.desired_point.y = rospy.get_param("/hummingbird/desired_pose/y")
        self.desired_point.z = rospy.get_param("/hummingbird/desired_pose/z")

        self.desired_point_epsilon = rospy.get_param("/hummingbird/desired_point_epsilon")

        # We place the Maximum and minimum values of the X,Y,Z,R,P,Yof the pose

        high = numpy.array([self.work_space_x_max,      #pos.x
                            self.work_space_y_max,      #pos.y
                            self.work_space_z_max,      #pos.z
                            numpy.inf,                  #vel.x
                            numpy.inf,                  #vel.y
                            numpy.inf,                  #vel.z
                            numpy.inf,                  #acc.x
                            numpy.inf,                  #acc.y
                            numpy.inf,                  #acc.z
                            self.max_roll,              #att.x
                            self.max_pitch,             #att.y
                            self.max_yaw,               #att.z
                            numpy.inf,                  #ang_vel.x
                            numpy.inf,                  #ang_vel.y
                            numpy.inf                   #ang_vel.z
                            ])
        # numpy.inf,                #ang_acc.x
        # numpy.inf,                #ang_acc.y
        # numpy.inf,                #ang_acc.z

        low = numpy.array([self.work_space_x_min,       #pos.x
                           self.work_space_y_min,       #pos.y
                           self.work_space_z_min,       #pos.z
                           -numpy.inf,                  #vel.x
                           -numpy.inf,                  #vel.y
                           -numpy.inf,                  #vel.z
                           -numpy.inf,                  #acc.x
                           -numpy.inf,                  #acc.y
                           -numpy.inf,                  #acc.z
                           -1*self.max_roll,            #att.x
                           -1*self.max_pitch,           #att.y
                           -1*self.max_yaw,             #att.z
                           -numpy.inf,                  #ang_vel.x
                           -numpy.inf,                  #ang_vel.y
                           -numpy.inf                   #ang_vel.z
                           ])
        # -numpy.inf,                  #ang_acc.x
        # -numpy.inf,                  #ang_acc.y
        # -numpy.inf,                  #ang_acc.z

        self.observation_space = spaces.Box(low, high)

        rospy.logdebug("ACTION SPACES TYPE===>" + str(self.action_space))
        rospy.logdebug("OBSERVATION SPACES TYPE===>" + str(self.observation_space))

        # Rewards (needs shaping)
        self.closer_to_point_reward = rospy.get_param("/hummingbird/closer_to_point_reward")
        self.not_ending_point_reward = rospy.get_param("/hummingbird/not_ending_point_reward")
        self.end_episode_points = rospy.get_param("/hummingbird/end_episode_points")

        self.cumulated_steps = 0.0

        # Here we will add any init functions prior to starting the MyRobotEnv
        super(HummingbirdHoverTaskEnv, self).__init__() #ros_ws_abspath

    def _set_init_pose(self):
        """
        Sets the Robot in its init motor speed
        and lands the robot. Its preparing it to be reseted in the world.
        """
        #raw_input("INIT SPEED PRESS")
        init_motor_input = [self.init_motor_speed, self.init_motor_speed, self.init_motor_speed, self.init_motor_speed]
        self.move_motor(init_motor_input)
        # rospy.sleep(1.0) # wait for some time
        # We Issue the landing command to be sure it starts landing
        # raw_input("LAND PRESS")
        # self.land()

        return True

    def _init_env_variables(self):
        """
        Inits variables needed to be initialised each time we reset at the start
        of an episode.
        :return:
        """
        #raw_input("TakeOFF PRESS")
        # We TakeOff before sending any movement commands
        # self.takeoff()
        self.motor_speed = self.init_motor_speed
        # For Info Purposes
        self.cumulated_reward = 0.0
        # We get the initial pose to measure the distance from the desired point.
        gt_pose = self.get_base_pose()
        self.previous_distance_from_des_point = self.get_distance_from_desired_point(gt_pose.position)

    def _set_action(self, action):
        """
        This set action will Set the motor speed of the hummingbird
        :param action: The action that set s what movement to do next.
        """

        rospy.logdebug("Start Set Action ==>"+str(action))
        # We convert the actions to speed movements to send to the parent class of Hummingbird

        # if action == 0:  # Speed up
        #     self.motor_speed += self.step_motor_speed
        #     # self.last_action = "FORWARDS"
        # elif action == 1:  # Speed down
        #     self.motor_speed -= self.step_motor_speed
        #     # self.last_action = "BACKWARDS"

        # We tell hummingbird the motor speed to set to execute
        # print(action[0])
        motor_input = 500 * action[0]
        self.move_motor(motor_input)
        rospy.sleep(self.running_step)  # wait for some time
        rospy.logdebug("END Set Action ==>"+str(action))

    def _get_obs(self):
        """
        Here we define what sensor data defines our robots observations
        To know which Variables we have access to, we need to read the
        droneEnv API DOCS
        :return:
        """
        rospy.logdebug("Start Get Observation ==>")
        # We get the laser scan data
        gt_pose = self.get_base_pose()
        gt_linear_vel = self.get_base_linear_vel()
        gt_angular_vel = self.get_base_angular_vel()
        gt_linear_acc = self.get_base_linear_acc()
        #gt_angular_acc = self.get_base_angular_acc()

        # We get the orientation of the cube in RPY
        roll, pitch, yaw = self.get_orientation_euler(gt_pose.orientation)

        """
        observations = [    round(gt_pose.position.x, 1),
                            round(gt_pose.position.y, 1),
                            round(gt_pose.position.z, 1),
                            round(roll, 1),
                            round(pitch, 1),
                            round(yaw, 1),
                            round(sonar_value,1)]
        """
        # We simplify a bit the spatial grid to make learning faster
        observations = [round(gt_pose.position.x, 1),
                        round(gt_pose.position.y, 1),
                        round(gt_pose.position.z, 1),
                        round(gt_linear_vel.x, 1),
                        round(gt_linear_vel.y, 1),
                        round(gt_linear_vel.z, 1),
                        round(gt_linear_acc.x, 1),
                        round(gt_linear_acc.y, 1),
                        round(gt_linear_acc.z, 1),
                        round(roll, 1),
                        round(pitch, 1),
                        round(yaw, 1),
                        round(gt_angular_vel.x, 1),
                        round(gt_angular_vel.y, 1),
                        round(gt_angular_vel.z, 1),
                        # round(gt_angular_acc.x, 1),
                        # round(gt_angular_acc.y, 1),
                        # round(gt_angular_acc.z, 1),
                        ]

        rospy.logdebug("Observations==>"+str(observations))
        rospy.logdebug("END Get Observation ==>")
        # print(observations)
        return observations

    def _is_done(self, observations):
        """
        The done can be done due to three reasons:
        1) It went outside the workspace
        2) It flipped due to a crash or something
        3) It has reached the desired point
        """

        episode_done = False

        current_position = Point()
        current_position.x = observations[0]
        current_position.y = observations[1]
        current_position.z = observations[2]

        current_orientation = Point()
        current_orientation.x = observations[9]
        current_orientation.y = observations[10]
        current_orientation.z = observations[11]

        is_inside_workspace_now = self.is_inside_workspace(current_position)
        # sonar_detected_something_too_close_now = self.sonar_detected_something_too_close(
        #     sonar_value)
        drone_flipped = self.drone_has_flipped(current_orientation)
        has_reached_des_point = self.is_in_desired_position(
            current_position, self.desired_point_epsilon)

        rospy.logwarn(">>>>>> DONE RESULTS <<<<<")
        if not is_inside_workspace_now:
            rospy.logerr("is_inside_workspace_now=" +
                         str(is_inside_workspace_now))
        else:
            rospy.logwarn("is_inside_workspace_now=" +
                          str(is_inside_workspace_now))

        if drone_flipped:
            rospy.logerr("drone_flipped="+str(drone_flipped))
        else:
            rospy.logwarn("drone_flipped="+str(drone_flipped))

        if has_reached_des_point:
            rospy.logerr("has_reached_des_point="+str(has_reached_des_point))
        else:
            rospy.logwarn("has_reached_des_point="+str(has_reached_des_point))

        # We see if we are outside the Learning Space
        episode_done = not(
            is_inside_workspace_now) or drone_flipped or has_reached_des_point

        if episode_done:
            rospy.logerr("episode_done====>"+str(episode_done))
        else:
            rospy.logwarn("episode_done====>"+str(episode_done))

        return episode_done

    def _compute_reward(self, observations, done):

        current_position = Point()
        current_position.x = observations[0]
        current_position.y = observations[1]
        current_position.z = observations[2]

        distance_from_des_point = self.get_distance_from_desired_point(
            current_position)
        distance_difference = distance_from_des_point - \
                              self.previous_distance_from_des_point

        if not done:

            # If there has been a decrease in the distance to the desired point, we reward it
            if distance_difference < 0.0:
                rospy.logwarn("DECREASE IN DISTANCE GOOD")
                reward = self.closer_to_point_reward
            else:
                rospy.logerr("INCREASE IN DISTANCE BAD")
                reward = 0

        else:

            if self.is_in_desired_position(current_position, epsilon=0.5):
                reward = self.end_episode_points
            else:
                reward = -1*self.end_episode_points

        self.previous_distance_from_des_point = distance_from_des_point

        rospy.logdebug("reward=" + str(reward))
        self.cumulated_reward += reward
        rospy.logdebug("Cumulated_reward=" + str(self.cumulated_reward))
        self.cumulated_steps += 1
        rospy.logdebug("Cumulated_steps=" + str(self.cumulated_steps))

        return reward

    # Internal TaskEnv Methods

    def is_in_desired_position(self, current_position, epsilon=0.05):
        """
        It return True if the current position is similar to the desired poistion
        """

        is_in_desired_pos = False

        x_pos_plus = self.desired_point.x + epsilon
        x_pos_minus = self.desired_point.x - epsilon
        y_pos_plus = self.desired_point.y + epsilon
        y_pos_minus = self.desired_point.y - epsilon
        z_pos_plus = self.desired_point.z + epsilon
        z_pos_minus = self.desired_point.z - epsilon

        x_current = current_position.x
        y_current = current_position.y
        z_current = current_position.z

        x_pos_are_close = (x_current <= x_pos_plus) and (
                x_current > x_pos_minus)
        y_pos_are_close = (y_current <= y_pos_plus) and (
                y_current > y_pos_minus)
        z_pos_are_close = (z_current <= z_pos_plus) and (
                z_current > z_pos_minus)

        is_in_desired_pos = x_pos_are_close and y_pos_are_close and z_pos_are_close

        rospy.logwarn("###### IS DESIRED POS ? ######")
        rospy.logwarn("current_position"+str(current_position))
        rospy.logwarn("x_pos_plus"+str(x_pos_plus) +
                      ",x_pos_minus="+str(x_pos_minus))
        rospy.logwarn("y_pos_plus"+str(y_pos_plus) +
                      ",y_pos_minus="+str(y_pos_minus))
        rospy.logwarn("z_pos_plus"+str(z_pos_plus) +
                      ",z_pos_minus="+str(z_pos_minus))
        rospy.logwarn("x_pos_are_close"+str(x_pos_are_close))
        rospy.logwarn("y_pos_are_close"+str(y_pos_are_close))
        rospy.logwarn("z_pos_are_close"+str(z_pos_are_close))
        rospy.logwarn("is_in_desired_pos"+str(is_in_desired_pos))
        rospy.logwarn("############")

        return is_in_desired_pos

    def is_inside_workspace(self, current_position):
        """
        Check if the Drone is inside the Workspace defined
        """
        is_inside = False

        rospy.logwarn("##### INSIDE WORK SPACE? #######")
        rospy.logwarn("XYZ current_position"+str(current_position))
        rospy.logwarn("work_space_x_max"+str(self.work_space_x_max) +
                      ",work_space_x_min="+str(self.work_space_x_min))
        rospy.logwarn("work_space_y_max"+str(self.work_space_y_max) +
                      ",work_space_y_min="+str(self.work_space_y_min))
        rospy.logwarn("work_space_z_max"+str(self.work_space_z_max) +
                      ",work_space_z_min="+str(self.work_space_z_min))
        rospy.logwarn("############")

        if current_position.x > self.work_space_x_min and current_position.x <= self.work_space_x_max:
            if current_position.y > self.work_space_y_min and current_position.y <= self.work_space_y_max:
                if current_position.z > self.work_space_z_min and current_position.z <= self.work_space_z_max:
                    is_inside = True

        return is_inside

    def drone_has_flipped(self, current_orientation):
        """
        Based on the orientation RPY given states if the drone has flipped
        """
        has_flipped = True

        self.max_roll = rospy.get_param("/hummingbird/max_roll")
        self.max_pitch = rospy.get_param("/hummingbird/max_pitch")

        rospy.logwarn("#### HAS FLIPPED? ########")
        rospy.logwarn("RPY current_orientation"+str(current_orientation))
        rospy.logwarn("max_roll"+str(self.max_roll) +
                      ",min_roll="+str(-1*self.max_roll))
        rospy.logwarn("max_pitch"+str(self.max_pitch) +
                      ",min_pitch="+str(-1*self.max_pitch))
        rospy.logwarn("############")

        if current_orientation.x > -1*self.max_roll and current_orientation.x <= self.max_roll:
            if current_orientation.y > -1*self.max_pitch and current_orientation.y <= self.max_pitch:
                has_flipped = False

        return has_flipped

    def get_base_pose(self):

        odom = self.get_odom()
        base_pose = odom.pose.pose

        return base_pose

    def get_base_linear_vel(self):

        odom = self.get_odom()
        base_linear_vel = odom.twist.twist.linear

        return base_linear_vel

    def get_base_angular_vel(self):

        odom = self.get_odom()
        base_angular_vel = odom.twist.twist.angular

        return base_angular_vel

    def get_base_rpy(self):

        imu = self.get_imu()
        base_orientation = imu.orientation

        euler_rpy = Vector3()
        euler = euler_from_quaternion([ base_orientation.x,
                                        base_orientation.y,
                                        base_orientation.z,
                                        base_orientation.w]
                                      )
        euler_rpy.x = euler[0]
        euler_rpy.y = euler[1]
        euler_rpy.z = euler[2]

        return euler_rpy

    def get_base_linear_acc(self):

        imu = self.get_imu()
        base_linear_acc = imu.linear_acceleration

        return base_linear_acc

    def get_distance_from_desired_point(self, current_position):
        """
        Calculates the distance from the current position to the desired point
        :param start_point:
        :return:
        """
        distance = self.get_distance_from_point(current_position,
                                                self.desired_point)

        return distance

    def get_distance_from_point(self, pstart, p_end):
        """
        Given a Vector3 Object, get distance from current position
        :param p_end:
        :return:
        """
        a = numpy.array((pstart.x, pstart.y, pstart.z))
        b = numpy.array((p_end.x, p_end.y, p_end.z))

        distance = numpy.linalg.norm(a - b)

        return distance

    def get_orientation_euler(self, quaternion_vector):
        # We convert from quaternions to euler
        orientation_list = [quaternion_vector.x,
                            quaternion_vector.y,
                            quaternion_vector.z,
                            quaternion_vector.w]

        roll, pitch, yaw = euler_from_quaternion(orientation_list)
        return roll, pitch, yaw

And the learning part is basically the same as mentioned above, which just calls the learning scripts from the baselines and basically leveraging it.

Strange thing here is that when I tried the learning with TRPO, it worked (meaning that the it automatically resets the env after the episode, but i have no idea why this is not the case for algorithms like PPO and A2C.

For the learning part, I’m using tf2.

I’m sorry that I cannot share the whole rosject with you, my goal was to try to run everything on my local machine, but if you need more information i will try to share the whole thing by pushing this to the git or sth.

Thank you very much for your time, and looking forward to your answer.

Sincerely,
Taehyoung Kim