Browse Source

!26518 tag environment implement

Merge pull request !26518 from chenweifeng/tag-env-implement
tags/v1.6.0
i-robot Gitee 4 years ago
parent
commit
0d11abc7c2
5 changed files with 702 additions and 0 deletions
  1. +339
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/rl/tag_env_impl.cu
  2. +59
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/rl/tag_env_impl.cuh
  3. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/rl/environment_factory.h
  4. +226
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/rl/tag_environment.cc
  5. +77
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/rl/tag_environment.h

+ 339
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/rl/tag_env_impl.cu View File

@@ -0,0 +1,339 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/gpu/cuda_impl/rl/tag_env_impl.cuh"
#include <assert.h>
#include <algorithm>

// Seed one curand state per (env, agent) pair using a grid-stride loop, so any
// launch configuration covers all env_num * agent_num agents.
__global__ void InitKernel(const int env_num, const int agent_num, const GameSetting *setting,
                           AgentState *agent_state) {
  const size_t total = env_num * agent_num;
  const size_t stride = gridDim.x * blockDim.x;
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < total) {
    // Same seed, distinct subsequence per agent -> independent streams.
    curand_init(setting->seed, idx, 0, &agent_state->rand_state[idx]);
    idx += stride;
  }
}

// Host-side launcher: seed the per-agent curand states on the given stream.
void InitEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *state,
             cudaStream_t stream) {
  constexpr int kThreadsPerBlock = 256;
  const int num_blocks = (env_num * agent_num + kThreadsPerBlock - 1) / kThreadsPerBlock;  // ceil-div
  InitKernel<<<num_blocks, kThreadsPerBlock, 0, stream>>>(env_num, agent_num, setting, state);
}

// Reset every agent of every environment to its starting position and rebuild
// the full observation tensor.
// Observation layout per environment: agent_num rows (one per observer), each
// row holding agent_num * kFeatureNum features plus one trailing time-step slot.
__global__ void ResetKernel(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state,
                            float *state) {
  // Reset the agent state
  for (size_t gaid = blockIdx.x * blockDim.x + threadIdx.x; gaid < env_num * agent_num;
       gaid += gridDim.x * blockDim.x) {
    const int eid = gaid / agent_num;  // environment index
    const int aid = gaid % agent_num;  // agent index within the environment

    // Static reset: preys start at (0, 0), predators at the map center.
    bool is_prey = (aid >= setting->predator_num);
    int x = is_prey ? 0 : 0.5 * setting->map_width;
    int y = is_prey ? 0 : 0.5 * setting->map_length;

    // Random reset (alternative, currently disabled).
    // auto local_state = agent_state->rand_state[gaid];
    // // curand_uniform result [0, 1), cache x y to registers.
    // int x = static_cast<int>(curand_uniform(&local_state) * setting->map_width);
    // int y = static_cast<int>(curand_uniform(&local_state) * setting->map_length);
    // agent_state->rand_state[gaid] = local_state;

    agent_state->loc_x[gaid] = x;
    agent_state->loc_y[gaid] = y;
    agent_state->still_in_game[gaid] = true;
    // Per-environment scalars; every agent thread of the env writes the same value.
    agent_state->time_step[eid] = 0;
    agent_state->prey_left[eid] = setting->prey_num;

    // Broadcast this agent's features (x, y, is_prey, is_self) into every
    // observer's row of the observation.
    const int state_size_per_agent = agent_num * kFeatureNum + 1;
    const size_t base = eid * agent_num * state_size_per_agent + aid * kFeatureNum;
    for (int id = 0; id < agent_num; id++) {
      size_t offset = base + id * state_size_per_agent;
      state[offset] = static_cast<float>(x) / setting->map_width;      // normalized x
      state[offset + 1] = static_cast<float>(y) / setting->map_length; // normalized y
      state[offset + 2] = aid >= setting->predator_num;                // team flag
      state[offset + 3] = aid == id;                                   // identity flag
    }
    // Trailing slot of this agent's own row: normalized time step (0 after reset).
    state[eid * agent_num * state_size_per_agent + aid * state_size_per_agent + agent_num * kFeatureNum] = 0;
  }
}

// Re-initialize only the environments whose `done` flag is set, mirroring the
// static-reset logic of ResetKernel. Runs after a step so finished episodes
// restart automatically.
__global__ void ResetIfDoneKernel(const int env_num, const int agent_num, const GameSetting *setting,
                                  AgentState *agent_state, float *state, const bool *done) {
  for (size_t gaid = blockIdx.x * blockDim.x + threadIdx.x; gaid < env_num * agent_num;
       gaid += gridDim.x * blockDim.x) {
    const int eid = gaid / agent_num;  // environment index
    const int aid = gaid % agent_num;  // agent index within the environment

    if (done[eid]) {
      // Static reset: preys to (0, 0), predators to the map center.
      bool is_prey = (aid >= setting->predator_num);
      int x = is_prey ? 0 : 0.5 * setting->map_width;
      int y = is_prey ? 0 : 0.5 * setting->map_length;

      agent_state->loc_x[gaid] = x;
      agent_state->loc_y[gaid] = y;
      agent_state->still_in_game[gaid] = true;
      agent_state->time_step[eid] = 0;
      agent_state->prey_left[eid] = setting->prey_num;

      // Rewrite this agent's features into every observer's observation row.
      const int state_size_per_agent = agent_num * kFeatureNum + 1;
      const size_t base = eid * agent_num * state_size_per_agent + aid * kFeatureNum;
      for (int id = 0; id < agent_num; id++) {
        size_t offset = base + id * state_size_per_agent;
        state[offset] = static_cast<float>(x) / setting->map_width;
        state[offset + 1] = static_cast<float>(y) / setting->map_length;
        state[offset + 2] = aid >= setting->predator_num;
        state[offset + 3] = aid == id;
      }
      // Trailing time-step slot of this agent's own row, reset to 0.
      state[eid * agent_num * state_size_per_agent + aid * state_size_per_agent + agent_num * kFeatureNum] = 0;
    }
  }
}

// Host-side launcher: reset every environment and rebuild the observation.
void ResetEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state, float *state,
              cudaStream_t stream) {
  constexpr int kBlockSize = 256;
  const int grid_size = (env_num * agent_num + kBlockSize - 1) / kBlockSize;  // ceil-div
  ResetKernel<<<grid_size, kBlockSize, 0, stream>>>(env_num, agent_num, setting, agent_state, state);
}

// One environment step, "bind block" variant: one CUDA block per environment.
// Agent (x, y) locations are staged in dynamic shared memory (two int arrays of
// length agent_num) so prey threads can test collisions against all predators
// without re-reading global memory.
// Expected launch: <<<env_num, block_dim, 2 * agent_num * sizeof(int), stream>>>.
__global__ void StepBindBlockKernel(const int env_num, const int agent_num, const GameSetting *setting,
                                    AgentState *agent_state, const int *action, float *state, float *reward,
                                    bool *done) {
  __shared__ int team_reward;         // preys caught this step in this environment
  extern __shared__ int agent_loc[];  // [0, agent_num): x; [agent_num, 2*agent_num): y
  int *loc_x = agent_loc;
  int *loc_y = agent_loc + agent_num;

  int eid = blockIdx.x;
  // NOTE(review): __syncthreads() is called inside this loop. If agent_num is
  // smaller than blockDim.x, or not an exact multiple of it, threads reach the
  // barrier an unequal number of times, which is undefined behavior. Likewise,
  // team_reward is reset only on the first pass (aid == 0). Confirm the launch
  // always satisfies agent_num >= blockDim.x with agent_num % blockDim.x == 0,
  // or restructure the loop.
  for (int aid = threadIdx.x; aid < agent_num; aid += blockDim.x) {
    int gaid = eid * agent_num + aid;  // global agent id
    float agent_reward = 0.0;

    // Parse discrete action: index -> (dx, dy) lookup, 5 actions -> offsets 0..8.
    int action_offset = action[gaid] * 2;
    assert(action_offset <= 8);
    int action_x = setting->index_to_action[action_offset];
    int action_y = setting->index_to_action[action_offset + 1];

    // Update agent location.
    int x = agent_state->loc_x[gaid] + action_x;
    int y = agent_state->loc_y[gaid] + action_y;

    int map_width = setting->map_width;
    int map_length = setting->map_length;
    if (x < 0 || y < 0 || x > map_width || y > map_length) {
      // Clamp back onto the map and penalize the wall hit.
      x = min(max(0, x), setting->map_width);
      y = min(max(0, y), setting->map_length);
      agent_reward -= setting->wall_hit_penalty;
    }

    // Publish the new location to shared memory (for collision tests) and to
    // the global agent state.
    loc_x[aid] = x;
    loc_y[aid] = y;
    agent_state->loc_x[gaid] = x;
    agent_state->loc_y[gaid] = y;

    // Update time step (one thread per environment).
    if (aid == 0) {
      team_reward = 0;
      agent_state->time_step[eid]++;
    }
    __syncthreads();  // locations and team_reward visible to all threads

    // Calculate team reward: each still-alive prey checks for a predator on its cell.
    bool is_prey = aid >= setting->predator_num;
    if (is_prey && agent_state->still_in_game[gaid]) {
      for (int tid = 0; tid < setting->predator_num; tid++) {
        // Every prey only caught by one predator.
        if (x == loc_x[tid] && y == loc_y[tid]) {
          agent_state->still_in_game[gaid] = false;
          agent_state->prey_left[eid]--;
          atomicAdd(&team_reward, 1);
          break;
        }
      }
    }
    __syncthreads();  // team_reward final before it is read below

    // Construct observation: broadcast this agent's features to every observer row.
    const int state_size_per_agent = agent_num * kFeatureNum + 1;
    const size_t base = eid * agent_num * state_size_per_agent + aid * kFeatureNum;
    for (int id = 0; id < agent_num; id++) {
      size_t offset = base + id * state_size_per_agent;
      state[offset] = static_cast<float>(x) / map_width;
      state[offset + 1] = static_cast<float>(y) / map_length;
      state[offset + 2] = is_prey;
      state[offset + 3] = (aid == id);
    }
    // Trailing slot of this agent's own row: normalized time step.
    state[eid * agent_num * state_size_per_agent + aid * state_size_per_agent + agent_num * kFeatureNum] =
      static_cast<float>(agent_state->time_step[eid]) / setting->max_timestep;

    // Team reward is shared: predators gain, preys lose, scaled by catches.
    if (team_reward > 0) {
      agent_reward += is_prey ? -setting->caught_penalty * team_reward : setting->catch_reward * team_reward;
    } else {
      agent_reward += is_prey ? setting->step_cost : -setting->step_cost;
    }

    // Agents already out of the game receive zero reward.
    reward[gaid] = agent_reward * agent_state->still_in_game[gaid];

    // Episode ends on timeout or when all preys are caught.
    if (aid == 0) {
      done[eid] = (agent_state->time_step[eid] >= setting->max_timestep) || (agent_state->prey_left[eid] == 0);
    }
  }
}

// One environment step, bind-block variant, then auto-reset finished episodes.
// Launches one block per environment; the kernel stages its own agents'
// locations in dynamic shared memory (two int arrays of length agent_num).
void StepBindBlock(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state,
                   const int *action, float *state, float *reward, bool *done, cudaStream_t stream) {
  // BUGFIX: dynamic shared memory is allocated PER BLOCK. The previous size,
  // env_num * agent_num * sizeof(float) * 2, over-allocated by a factor of
  // env_num and could exceed the per-block shared-memory limit for large
  // env_num, silently failing the launch. A block only needs room for its own
  // environment's agents: 2 * agent_num ints (loc_x + loc_y).
  size_t shm_size = 2 * static_cast<size_t>(agent_num) * sizeof(int);
  StepBindBlockKernel<<<env_num, 256, shm_size, stream>>>(env_num, agent_num, setting, agent_state, action, state,
                                                          reward, done);

  // Re-initialize any environment whose episode just finished.
  int block_dim = 256;
  int grid_dim = (env_num * agent_num + block_dim - 1) / block_dim;
  ResetIfDoneKernel<<<grid_dim, block_dim, 0, stream>>>(env_num, agent_num, setting, agent_state, state, done);
}

// Cross-block step, phase 1: move every agent according to its discrete action,
// clamp to the map border (applying the wall-hit penalty), initialize the
// per-agent reward, and advance the per-environment time step.
// Grid-stride over all env_num * agent_num agents.
__global__ void UpdateAgentLoc(const int env_num, const int agent_num, const GameSetting *setting,
                               AgentState *agent_state, const int *action, float *state, float *reward) {
  int total_agent = env_num * agent_num;
  for (size_t gaid = blockIdx.x * blockDim.x + threadIdx.x; gaid < total_agent; gaid += gridDim.x * blockDim.x) {
    const int eid = gaid / agent_num;  // environment index
    const int aid = gaid % agent_num;  // agent index within the environment

    // Parse discrete action: index -> (dx, dy) lookup, 5 actions -> offsets 0..8.
    int action_offset = action[gaid] * 2;
    assert(action_offset <= 8);
    int action_x = setting->index_to_action[action_offset];
    int action_y = setting->index_to_action[action_offset + 1];

    // Update agent location.
    int x = agent_state->loc_x[gaid] + action_x;
    int y = agent_state->loc_y[gaid] + action_y;

    int map_width = setting->map_width;
    int map_length = setting->map_length;
    // Float literal (was the double literal 0.0; avoids a double->float
    // conversion in device code).
    reward[gaid] = 0.0f;
    if (x < 0 || y < 0 || x > map_width || y > map_length) {
      // Clamp back onto the map and penalize the wall hit.
      x = min(max(0, x), setting->map_width);
      y = min(max(0, y), setting->map_length);
      reward[gaid] -= setting->wall_hit_penalty;
    }
    agent_state->loc_x[gaid] = x;
    agent_state->loc_y[gaid] = y;

    // One thread per environment advances the step counter.
    if (aid == 0) {
      agent_state->time_step[eid]++;
    }
  }
}

// Cross-block step, phase 2: for every prey still in game, check whether a
// predator shares its cell. A caught prey leaves the game, decrements the
// environment's prey counter, and adds one unit to the environment's team
// reward (accumulated atomically across blocks).
// One thread per prey, grid-stride over env_num * prey_num preys.
__global__ void CalcTeamReward(const int env_num, const int agent_num, const GameSetting *setting,
                               AgentState *agent_state, float *team_reward) {
  const int prey_num_per_env = setting->prey_num;
  const int total_prey_num = env_num * prey_num_per_env;
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total_prey_num; i += gridDim.x * blockDim.x) {
    const int eid = i / prey_num_per_env;
    // Preys are stored after the predators inside each environment.
    const int rid = eid * agent_num + i % prey_num_per_env + setting->predator_num;

    if (agent_state->still_in_game[rid]) {
      int x = agent_state->loc_x[rid];
      int y = agent_state->loc_y[rid];
      for (int j = 0; j < setting->predator_num; j++) {
        int tid = eid * agent_num + j;
        if (x == agent_state->loc_x[tid] && y == agent_state->loc_y[tid]) {
          agent_state->still_in_game[rid] = false;
          agent_state->prey_left[eid]--;
          atomicAdd(&team_reward[eid], 1);
          // BUGFIX: a prey can only be caught once. Without this break, a prey
          // sharing a cell with several predators was counted once per
          // predator, over-decrementing prey_left and inflating team_reward.
          // This also matches StepBindBlockKernel, which breaks here.
          break;
        }
      }
    }
  }
}

// Cross-block step, phase 3: combine the per-environment team reward with each
// agent's individual reward, rebuild the observation, and set the done flag.
// Must run after UpdateAgentLoc (which initializes reward[gaid]) and
// CalcTeamReward (which fills team_reward).
__global__ void CalcAgentReward(const int env_num, const int agent_num, const GameSetting *setting,
                                AgentState *agent_state, float *state, float *reward, bool *done, float *team_reward) {
  int total_agent_num = env_num * agent_num;
  for (size_t gaid = blockIdx.x * blockDim.x + threadIdx.x; gaid < total_agent_num; gaid += gridDim.x * blockDim.x) {
    int eid = gaid / agent_num;  // environment index
    int aid = gaid % agent_num;  // agent index within the environment
    bool is_prey = aid >= setting->predator_num;
    // Shared team outcome: predators gain, preys lose, scaled by the number of catches.
    // NOTE(review): unlike StepBindBlockKernel, the reward here is NOT masked by
    // still_in_game, so already-caught preys keep accumulating rewards — confirm
    // whether that difference is intended.
    if (team_reward[eid] > 0) {
      reward[gaid] += is_prey ? -team_reward[eid] * setting->caught_penalty : team_reward[eid] * setting->catch_reward;
    } else {
      reward[gaid] += is_prey ? setting->step_cost : -setting->step_cost;
    }

    // Construct observation: broadcast this agent's features to every observer row.
    int x = agent_state->loc_x[gaid];
    int y = agent_state->loc_y[gaid];
    int map_width = setting->map_width;
    int map_length = setting->map_length;

    const int state_size_per_agent = agent_num * kFeatureNum + 1;
    const size_t base = eid * agent_num * state_size_per_agent + aid * kFeatureNum;
    for (int id = 0; id < agent_num; id++) {
      size_t offset = base + id * state_size_per_agent;
      state[offset] = static_cast<float>(x) / map_width;       // normalized x
      state[offset + 1] = static_cast<float>(y) / map_length;  // normalized y
      state[offset + 2] = is_prey;                             // team flag
      state[offset + 3] = (aid == id);                         // identity flag
    }
    // Trailing slot of this agent's own row: normalized time step.
    state[eid * agent_num * state_size_per_agent + aid * state_size_per_agent + agent_num * kFeatureNum] =
      static_cast<float>(agent_state->time_step[eid]) / setting->max_timestep;

    // Episode ends on timeout or when all preys are caught.
    if (aid == 0) {
      done[eid] = (agent_state->time_step[eid] >= setting->max_timestep) || (agent_state->prey_left[eid] == 0);
    }
  }
}

// One environment step, "cross block" variant: the work is split across several
// kernels that communicate through the global-memory buffer team_reward
// (shape (env_num,)), so agent_num is not bounded by the block size.
void StepCrossBlock(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state,
                    const int *action, float *state, float *reward, bool *done, float *team_reward,
                    cudaStream_t stream) {
  // Update agent location, construct observation, done.
  int block_dim = 256;
  int grid_dim = (env_num * agent_num + block_dim - 1) / block_dim;
  UpdateAgentLoc<<<grid_dim, block_dim, 0, stream>>>(env_num, agent_num, setting, agent_state, action, state, reward);

  // Calculate team reward; zero the per-environment accumulator first.
  // NOTE(review): the cudaMemsetAsync return code is ignored — consider checking it.
  cudaMemsetAsync(team_reward, 0, sizeof(float) * env_num, stream);
  CalcTeamReward<<<grid_dim, block_dim, 0, stream>>>(env_num, agent_num, setting, agent_state, team_reward);

  // Calculate agent reward (also writes the observation and done flags).
  CalcAgentReward<<<grid_dim, block_dim, 0, stream>>>(env_num, agent_num, setting, agent_state, state, reward, done,
                                                      team_reward);

  // Reset the ended environment.
  ResetIfDoneKernel<<<grid_dim, block_dim, 0, stream>>>(env_num, agent_num, setting, agent_state, state, done);
}

// Deep-copy the per-agent arrays (and the per-environment counters) of one
// AgentState SOA into another. Grid-stride over all env_num * agent_num agents.
__global__ void AgentStateCopyKernel(const int env_num, const int agent_num, AgentState *dst, AgentState *src) {
  int total_agent_num = env_num * agent_num;
  for (size_t gaid = blockIdx.x * blockDim.x + threadIdx.x; gaid < total_agent_num; gaid += gridDim.x * blockDim.x) {
    // BUGFIX: eid/aid must be derived with agent_num, not total_agent_num.
    // The old code (gaid / total_agent_num, gaid % total_agent_num) made eid
    // always 0 and aid == gaid, so time_step/prey_left were only copied for
    // environment 0 and every other environment kept stale counters.
    int eid = gaid / agent_num;
    int aid = gaid % agent_num;
    dst->loc_x[gaid] = src->loc_x[gaid];
    dst->loc_y[gaid] = src->loc_y[gaid];
    dst->rand_state[gaid] = src->rand_state[gaid];
    dst->still_in_game[gaid] = src->still_in_game[gaid];

    // One thread per environment copies the per-environment scalars.
    if (aid == 0) {
      dst->time_step[eid] = src->time_step[eid];
      dst->prey_left[eid] = src->prey_left[eid];
    }
  }
}

// Host-side launcher: device-to-device deep copy of an AgentState SOA.
void AgentStateCopy(const int env_num, const int agent_num, AgentState *dst, AgentState *src, cudaStream_t stream) {
  constexpr int kThreadsPerBlock = 256;
  const int num_blocks = (env_num * agent_num + kThreadsPerBlock - 1) / kThreadsPerBlock;  // ceil-div
  AgentStateCopyKernel<<<num_blocks, kThreadsPerBlock, 0, stream>>>(env_num, agent_num, dst, src);
}

+ 59
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/rl/tag_env_impl.cuh View File

@@ -0,0 +1,59 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_TAG_ENV_IMPL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_TAG_ENV_IMPL_H_

#include <curand_kernel.h>

// Number of observation features contributed per observed agent:
// (normalized x, normalized y, is_prey flag, is_self flag).
constexpr int kFeatureNum = 4;

// Static configuration of one Tag game, shared by every environment instance.
// Uploaded once to device memory; kernels read it through a const pointer.
struct GameSetting {
  int seed;          // curand seed used when initializing per-agent RNG states
  int predator_num;  // predators per environment (stored first in each env)
  int prey_num;      // preys per environment (stored after the predators)
  int max_timestep;  // episode length limit
  int map_length;    // map extent along y
  int map_width;     // map extent along x
  float wall_hit_penalty;  // subtracted when an agent steps off the map border
  float catch_reward;      // predator reward per catch
  float caught_penalty;    // prey penalty per catch
  float step_cost;         // per-step cost term (sign depends on team)
  // Lookup table: action index -> (dx, dy) pairs: still, +x, -x, +y, -y.
  int index_to_action[10] = {0, 0, 1, 0, -1, 0, 0, 1, 0, -1};
};

// Structure of array (short for SOA) for parallel.
// member shape: [env_num, agent_num]
// (time_step and prey_left are per-environment, shape [env_num]).
// All pointers are device memory, allocated/released by TagEnvironment.
struct AgentState {
  int *loc_x;               // agent x coordinate
  int *loc_y;               // agent y coordinate
  curandState *rand_state;  // per-agent curand RNG state
  bool *still_in_game;      // false once a prey has been caught
  int *time_step;           // per-environment step counter
  int *prey_left;           // per-environment count of uncaught preys
};

// Seed the per-agent curand states. 1-D launch over env_num * agent_num agents.
void InitEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *state,
             cudaStream_t stream);
// Re-initialize every environment and write the initial observation into `state`.
void ResetEnv(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state, float *state,
              cudaStream_t stream);
// One environment step using one CUDA block per environment (shared-memory variant).
void StepBindBlock(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state,
                   const int *action, float *state, float *reward, bool *done, cudaStream_t stream);
// One environment step spread across blocks; `team_reward` is an (env_num,)
// float scratch buffer used to accumulate catches between kernels.
void StepCrossBlock(const int env_num, const int agent_num, const GameSetting *setting, AgentState *agent_state,
                    const int *action, float *state, float *reward, bool *done, float *team_reward,
                    cudaStream_t stream);
// Device-to-device deep copy of the AgentState arrays.
void AgentStateCopy(const int env_num, const int agent_num, AgentState *dst, AgentState *src, cudaStream_t stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_TAG_ENV_IMPL_H_

+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/rl/environment_factory.h View File

@@ -46,6 +46,7 @@ class Environment {
virtual size_t StateSizeInBytes() = 0;
virtual size_t RewardSizeInBytes() = 0;
virtual size_t DoneSizeInBytes() = 0;
virtual size_t WorkspaceSizeInBytes() { return 0; }
};

constexpr int64_t kInvalidHandle = -1;


+ 226
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/rl/tag_environment.cc View File

@@ -0,0 +1,226 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/rl/tag_environment.h"

#include <memory>
#include <string>
#include <vector>
#include <tuple>
#include <map>
#include <utility>
#include "runtime/device/gpu/cuda_driver.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr auto kSeedAttr = "seed";
constexpr auto kPredatorNumAttr = "predator_num";
constexpr auto kPreyNumrAttr = "prey_num";
constexpr auto kMaxTimestepAttr = "max_timestep";
constexpr auto kMapLengthAttr = "map_length";
constexpr auto kMapWidthAttr = "map_width";
constexpr auto kWallHitPenaltyAttr = "wall_hit_penalty";
constexpr auto kCatchRewardAttr = "catch_reward";
constexpr auto kCaughtPenaltyAttr = "catched_penalty";
constexpr auto kStepCostAttr = "step_cost";
constexpr auto kEnvNumAttr = "env_num";
} // namespace

// Release the host-tracked agent-state buffers and the two device-side
// descriptor blocks allocated by Init().
TagEnvironment::~TagEnvironment() {
  FinalizeAgentState(agent_state_host_);
  auto &mem_pool = device::gpu::GPUMemoryAllocator::GetInstance();
  mem_pool.FreeTensorMem(game_setting_device_);
  mem_pool.FreeTensorMem(agent_state_device_);
}

bool TagEnvironment::InitGameSetting(const CNodePtr &cnode, GameSetting *setting_host) {
MS_LOG(EXCEPTION) << "The `game_setting` should not be nullprt";

setting_host->seed = AnfAlgo::GetNodeAttr<int64_t>(cnode, kSeedAttr);
setting_host->predator_num = AnfAlgo::GetNodeAttr<int64_t>(cnode, kPredatorNumAttr);
setting_host->prey_num = AnfAlgo::GetNodeAttr<int64_t>(cnode, kPreyNumrAttr);
setting_host->max_timestep = AnfAlgo::GetNodeAttr<int64_t>(cnode, kMaxTimestepAttr);
setting_host->map_length = AnfAlgo::GetNodeAttr<int64_t>(cnode, kMapLengthAttr);
setting_host->map_width = AnfAlgo::GetNodeAttr<int64_t>(cnode, kMapWidthAttr);
setting_host->wall_hit_penalty = AnfAlgo::GetNodeAttr<float>(cnode, kWallHitPenaltyAttr);
setting_host->catch_reward = AnfAlgo::GetNodeAttr<float>(cnode, kCatchRewardAttr);
setting_host->caught_penalty = AnfAlgo::GetNodeAttr<float>(cnode, kCaughtPenaltyAttr);
setting_host->step_cost = AnfAlgo::GetNodeAttr<float>(cnode, kStepCostAttr);

env_num_ = AnfAlgo::GetNodeAttr<int64_t>(cnode, kEnvNumAttr);
agent_num_ = setting_host->predator_num + setting_host->prey_num;
return true;
}

bool TagEnvironment::InitAgentState(int predator_num, int prey_num, AgentState *agent_state) {
MS_LOG(EXCEPTION) << "The `state` should not be nullptr";

int total_agents_num = env_num_ * agent_num_;
auto &allocator = device::gpu::GPUMemoryAllocator::GetInstance();
agent_state->loc_x = static_cast<int *>(allocator.AllocTensorMem(sizeof(int) * total_agents_num));
agent_state->loc_y = static_cast<int *>(allocator.AllocTensorMem(sizeof(int) * total_agents_num));
agent_state->still_in_game = static_cast<bool *>(allocator.AllocTensorMem(sizeof(bool) * total_agents_num));
agent_state->rand_state =
static_cast<curandState *>(allocator.AllocTensorMem(sizeof(curandState) * total_agents_num));
agent_state->time_step = static_cast<int *>(allocator.AllocTensorMem(sizeof(int) * env_num_));
agent_state->prey_left = static_cast<int *>(allocator.AllocTensorMem(sizeof(int) * env_num_));
return true;
}

// Release every device buffer owned by an AgentState (counterpart of
// InitAgentState). Returns true on success.
bool TagEnvironment::FinalizeAgentState(const AgentState &agent_setting) {
  // BUGFIX: removed the unconditional MS_LOG(EXCEPTION) that used to sit here —
  // it threw on every call (including from the destructor), and the parameter
  // is a reference that can never be null anyway.
  auto &allocator = device::gpu::GPUMemoryAllocator::GetInstance();
  allocator.FreeTensorMem(agent_setting.prey_left);
  allocator.FreeTensorMem(agent_setting.time_step);
  allocator.FreeTensorMem(agent_setting.still_in_game);
  allocator.FreeTensorMem(agent_setting.rand_state);
  allocator.FreeTensorMem(agent_setting.loc_x);
  allocator.FreeTensorMem(agent_setting.loc_y);
  return true;
}

// Initialize the environment: parse attributes from the kernel node, allocate
// device copies of the game setting and the agent-state descriptor, and seed
// the per-agent curand states on the given stream.
bool TagEnvironment::Init(const CNodePtr &cnode, void *stream_ptr) {
  InitGameSetting(cnode, &game_setting_host_);
  InitAgentState(game_setting_host_.predator_num, game_setting_host_.prey_num, &agent_state_host_);

  // Move the game setting to device.
  // NOTE(review): the async copies read from class members (which outlive the
  // copy), but the host memory is not pinned — confirm this behaves as intended
  // with cudaMemcpyAsync.
  auto &allocator = device::gpu::GPUMemoryAllocator::GetInstance();
  game_setting_device_ = static_cast<GameSetting *>(allocator.AllocTensorMem(sizeof(GameSetting)));
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
    cudaMemcpyAsync(game_setting_device_, &game_setting_host_, sizeof(GameSetting), cudaMemcpyHostToDevice,
                    reinterpret_cast<cudaStream_t>(stream_ptr)),
    "cudaMemcpy failed.");

  // Move the agent state (the SOA of device pointers) to device.
  agent_state_device_ = static_cast<AgentState *>(allocator.AllocTensorMem(sizeof(AgentState)));
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
    cudaMemcpyAsync(agent_state_device_, &agent_state_host_, sizeof(AgentState), cudaMemcpyHostToDevice,
                    reinterpret_cast<cudaStream_t>(stream_ptr)),
    "cudaMemcpy failed.");

  // Seed the per-agent curand states.
  InitEnv(env_num_, agent_num_, game_setting_device_, agent_state_device_, reinterpret_cast<cudaStream_t>(stream_ptr));
  return true;
}

// Reset all environment instances and write the initial observation to outputs[0].
bool TagEnvironment::Reset(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                           const std::vector<AddressPtr> &outputs, void *stream_ptr) {
  auto *observation = reinterpret_cast<float *>(outputs[0]->addr);
  auto cuda_stream = reinterpret_cast<cudaStream_t>(stream_ptr);
  ResetEnv(env_num_, agent_num_, game_setting_device_, agent_state_device_, observation, cuda_stream);
  return true;
}

// Execute one time step with whichever step kernel profiling selected.
// inputs[0]: action; outputs: state, reward, done; workspace[0]: team reward
// scratch buffer of shape (env_num,).
bool TagEnvironment::Step(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                          const std::vector<AddressPtr> &outputs, void *stream_ptr) {
  auto action = reinterpret_cast<int *>(inputs[0]->addr);
  auto state = reinterpret_cast<float *>(outputs[0]->addr);
  auto reward = reinterpret_cast<float *>(outputs[1]->addr);
  auto done = reinterpret_cast<bool *>(outputs[2]->addr);
  auto team_reward = reinterpret_cast<float *>(workspace[0]->addr);
  auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
  // One-shot benchmark that picks the faster kernel; no-op afterwards.
  StepKernelProfiling(action, state, reward, done, team_reward, stream);
  if (optimal_kernel_ == kBindBlock) {
    StepBindBlock(env_num_, agent_num_, game_setting_device_, agent_state_device_, action, state, reward, done, stream);
  } else {
    // BUGFIX: pass the workspace buffer `team_reward`, not the member
    // `team_reward_`, which is never allocated (nullptr) and would fault inside
    // StepCrossBlock's cudaMemsetAsync.
    StepCrossBlock(env_num_, agent_num_, game_setting_device_, agent_state_device_, action, state, reward, done,
                   team_reward, stream);
  }
  return true;
}

size_t TagEnvironment::ActionSizeInBytes() {
  // One discrete movement per agent: shape (env_num, agent_num).
  const size_t total_agents = static_cast<size_t>(env_num_) * agent_num_;
  return total_agents * sizeof(int);
}

size_t TagEnvironment::StateSizeInBytes() {
  // Per observer: agent_num_ * kFeatureNum features plus one time-step slot.
  // Full shape: (env_num, agent_num, agent_num * kFeatureNum + 1).
  const size_t row_len = agent_num_ * kFeatureNum + 1;
  return static_cast<size_t>(env_num_) * agent_num_ * row_len * sizeof(float);
}

size_t TagEnvironment::RewardSizeInBytes() {
  // One scalar reward per agent: shape (env_num, agent_num).
  return sizeof(float) * static_cast<size_t>(env_num_) * agent_num_;
}

size_t TagEnvironment::WorkspaceSizeInBytes() {
  // Team-reward scratch buffer: one float per environment, shape (env_num,).
  return static_cast<size_t>(env_num_) * sizeof(float);
}

// One done flag per environment: shape (env_num,).
size_t TagEnvironment::DoneSizeInBytes() {
  return static_cast<size_t>(env_num_) * sizeof(bool);
}

// One-shot benchmark: time both step kernels on a scratch copy of the agent
// state and cache the faster one in optimal_kernel_. Subsequent calls return
// immediately. `team_reward` is the (env_num,) workspace buffer used by the
// cross-block kernel.
void TagEnvironment::StepKernelProfiling(const int *action, float *state, float *reward, bool *done, float *team_reward,
                                         cudaStream_t stream) {
  if (!enable_profiling_) {
    return;
  }

  MS_LOG(INFO) << "Start Tag environment profiling.";

  // Prepare a scratch agent state so profiling does not disturb the real episode.
  AgentState agent_state;
  InitAgentState(game_setting_host_.predator_num, game_setting_host_.prey_num, &agent_state);
  auto &allocator = device::gpu::GPUMemoryAllocator::GetInstance();
  AgentState *agent_state_device = static_cast<AgentState *>(allocator.AllocTensorMem(sizeof(AgentState)));
  CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(
    cudaMemcpyAsync(agent_state_device, &agent_state, sizeof(AgentState), cudaMemcpyHostToDevice, stream),
    "cudaMemcpy failed.");
  AgentStateCopy(env_num_, agent_num_, agent_state_device, agent_state_device_, stream);

  // Warmup.
  // BUGFIX: use the workspace buffer `team_reward` passed in by the caller; the
  // member `team_reward_` is never allocated (nullptr) and would fault inside
  // StepCrossBlock.
  StepBindBlock(env_num_, agent_num_, game_setting_device_, agent_state_device, action, state, reward, done, stream);
  StepCrossBlock(env_num_, agent_num_, game_setting_device_, agent_state_device, action, state, reward, done,
                 team_reward, stream);

  // Collect profiling info with CUDA events.
  device::gpu::CudaDeviceStream start = nullptr;
  device::gpu::CudaDeviceStream end = nullptr;
  float bind_cost = 0;
  float cross_cost = 0;
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::CreateEvent(&start), "Failed to create event.");
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::CreateEvent(&end), "Failed to create event.");

  // Time the bind-block variant.
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::RecordEvent(start, stream), "Failed to record event to stream.");
  StepBindBlock(env_num_, agent_num_, game_setting_device_, agent_state_device, action, state, reward, done, stream);
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::RecordEvent(end, stream), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::SyncEvent(start), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::SyncEvent(end), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::ElapsedTime(&bind_cost, start, end), "Record time failed.");

  // Time the cross-block variant (same BUGFIX: workspace buffer, not member).
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::RecordEvent(start, stream), "Failed to record event to stream.");
  StepCrossBlock(env_num_, agent_num_, game_setting_device_, agent_state_device, action, state, reward, done,
                 team_reward, stream);
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::RecordEvent(end, stream), "Failed to record event to stream.");
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::SyncEvent(start), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::SyncEvent(end), "Failed to sync event.");
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::ElapsedTime(&cross_cost, start, end), "Record time failed.");

  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::DestroyEvent(start), "Failed to destroy event.");
  CHECK_OP_RET_WITH_EXCEPT(device::gpu::CudaDriver::DestroyEvent(end), "Failed to destroy event.");

  // Select optimal kernel.
  optimal_kernel_ = bind_cost < cross_cost ? kBindBlock : kCrossBlock;

  // Free tmp agent state.
  allocator.FreeTensorMem(agent_state_device);
  FinalizeAgentState(agent_state);

  MS_LOG(INFO) << "Tag environment profiling finish. Bind cost: " << bind_cost << ", cross cost: " << cross_cost;
  enable_profiling_ = false;
}
} // namespace kernel
} // namespace mindspore

+ 77
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/rl/tag_environment.h View File

@@ -0,0 +1,77 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TAG_ENV_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TAG_ENV_KERNEL_H_

#include <vector>
#include "backend/kernel_compiler/gpu/rl/environment_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/rl/tag_env_impl.cuh"

namespace mindspore {
namespace kernel {
// Class for Tag environment.
// Tag is a multi-agent reinforcement learning environment.
// It is supposed that the predators learn a cooperative strategy (for example surround) to catch the prey.
// The predators try to catch the prey. All of the predators get the same reward when they catch the prey.
// Tag environment uses a discrete action space (still, left, right, up, down), and the resulting observations
// contain agent location information. The tag environment supports multiple instances to speed up sample
// collection. It also supports automatic performance profiling and cuda-kernel selection.
class TagEnvironment : public Environment {
 public:
  TagEnvironment() = default;
  ~TagEnvironment();

  // Init environment. Parse environment setting, create device memory for environment setting and agent state etc.
  bool Init(const CNodePtr &cnode, void *stream_ptr) override;
  // Reset environment state, including agent location and time step.
  bool Reset(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
             const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
  // Execute one time step.
  bool Step(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
            const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
  size_t ActionSizeInBytes() override;
  size_t StateSizeInBytes() override;
  size_t RewardSizeInBytes() override;
  size_t DoneSizeInBytes() override;
  size_t WorkspaceSizeInBytes() override;

 private:
  // The GameSetting and AgentState are used in a C-like compiling environment, so use C style resource management.
  bool InitGameSetting(const CNodePtr &cnode, GameSetting *setting_host);
  bool InitAgentState(int predator_num, int prey_num, AgentState *agent_state);
  bool FinalizeAgentState(const AgentState &agent_state);

  int env_num_ = 0;
  int agent_num_ = 0;
  GameSetting game_setting_host_;
  GameSetting *game_setting_device_ = nullptr;
  AgentState agent_state_host_;
  // BUGFIX: initialize to nullptr (was left uninitialized, so the destructor
  // could free a garbage pointer if Init() never ran).
  AgentState *agent_state_device_ = nullptr;
  float *team_reward_ = nullptr;

  enum StepKernelType { kBindBlock = 0, kCrossBlock };
  // Benchmark both step kernels once and cache the faster one in optimal_kernel_.
  void StepKernelProfiling(const int *action, float *state, float *reward, bool *done, float *team_reward,
                           cudaStream_t stream);
  // BUGFIX: this is a boolean flag; declare it bool (was int).
  bool enable_profiling_ = true;
  StepKernelType optimal_kernel_ = kBindBlock;
};

MS_REG_GPU_ENV(Tag, TagEnvironment)
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TAG_ENV_KERNEL_H_

Loading…
Cancel
Save