/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_SCHEDULER_H_
#define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_SCHEDULER_H_

#include <vector>
#include <string>
#include <memory>
#include <utility>
#include <unordered_map>
#include <algorithm>
#include <fstream>
#include "runtime/framework/actor/data_source_actor.h"
#include "runtime/framework/actor/loop_count_actor.h"
#include "runtime/framework/actor/kernel_actor.h"
#include "runtime/hardware/device_context.h"
#include "backend/session/kernel_graph.h"

namespace mindspore {
namespace runtime {
// Bring the device context and the (kernel node, index) pair type into this namespace, so the declarations below can
// use the short names.
using mindspore::device::DeviceContext;
using mindspore::session::KernelWithIndex;
// Lookup table from a string key to a kernel actor.
// NOTE(review): the key is presumably the kernel (node) name -- confirm against the code that fills the map.
using KernelMapActor = std::unordered_map<std::string, KernelActorPtr>;

// Selects what triggers the running of an actor.
enum class GraphExecutionStrategy {
  // Pipeline mode: the running of an actor is triggered by data only.
  kPipeline,
  // Step mode: in addition to data, the running of an actor needs to be triggered by control.
  kStep
};

// The actor set generated by the graph transformer is the execution unit of the actor runtime.
// It consists of data source actors, kernel actors and a loop count actor:
// - The data source actors obtain data, process them into device tensors and send these to the kernel actors.
// - The kernel actors receive the device tensors and launch the kernels.
// - The no-input kernel actors have no input device tensor, so they need to be triggered externally.
// - The loop count actor receives the control of the tail kernel actors, which marks the end of one step, and decides
//   by the loop count whether to execute another loop.
struct ActorSet {
  // The actors that feed device tensors into the kernel actors.
  std::vector<DataSourceActorPtr> data_source_actors_;
  // The actors that launch the kernels.
  std::vector<KernelActorPtr> kernel_actors_;
  // No input kernel actors need to be triggered specifically.
  std::vector<KernelActorPtr> no_input_kernel_actors_;
  // The actor that marks the end of a step; null until it is assigned.
  LoopCountActorPtr loop_count_actor_{nullptr};
};
// Shared-ownership handle of an actor set.
using ActorSetPtr = std::shared_ptr<ActorSet>;

// The scheduler of the actor runtime: it transforms kernel graphs into actor sets, schedules the actors and is the
// entry of their running. The constructor is private and DISABLE_COPY_AND_ASSIGN is applied, so the only way to
// obtain a scheduler is GetInstance().
class GraphScheduler {
 public:
  // Return the unique scheduler instance. The instance is a function-local static, so it is created by the first
  // call and the same object is returned by every later call.
  static GraphScheduler &GetInstance() {
    static GraphScheduler instance;
    return instance;
  }

  // 1. Thread pool creating.
  // 2. The memory manager creating and scheduling.
  void Initialize();

  // Transform graphs to an actor DAG, which contains the build and the link of actors.
  // graphs:          the kernel graphs to transform.
  // device_contexts: the device contexts used to build the actors.
  //                  NOTE(review): presumably one context per graph, in the same order -- confirm in the definition.
  // input_tensors:   optional input tensors, may be null (the default).
  // control_nodes:   optional control nodes, may be null (the default).
  // strategy:        the execution strategy used when linking actors, kPipeline by default.
  // Returns the resulting actor set.
  // NOTE(review): the returned raw pointer is presumably owned by graph_to_actors_ -- confirm before storing it.
  ActorSet *Transform(const std::vector<KernelGraphPtr> &graphs, const std::vector<DeviceContext *> &device_contexts,
                      const std::vector<TensorPtr> *input_tensors = nullptr,
                      const std::vector<AnfNodePtr> *control_nodes = nullptr,
                      GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);

  // Schedule actors in the actor runtime. Single machine scheduling is supported currently, and distributed scheduling
  // will be supported in the future.
  void Schedule(const ActorSet *actor_set);

  // The prepare processing before run:
  // 1. Prepare the data of device tensor store (such as weights and value nodes of graph).
  // 2. Prepare the data of host tensor queue (such as non weighted parameters of graph).
  // 3. Prepare the output tensor of graph.
  // 4. Prepare the continuous memory for communication kernel.
  void PrepareRun(const KernelGraphPtr &graph, const std::vector<TensorPtr> *input_tensors, VectorRef *const &outputs);

  // The processing entry of actors running. The strategy defaults to kPipeline.
  // NOTE(review): the bool result presumably reports whether the run succeeded -- confirm in the definition.
  bool Run(const ActorSet *actor_set, GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);

  // Fetch the actor set by kernel graph.
  ActorSet *Fetch(const KernelGraphPtr &graph) const;

 private:
  GraphScheduler() = default;
  ~GraphScheduler() = default;
  DISABLE_COPY_AND_ASSIGN(GraphScheduler);

  // Transform the nodes of graph to actors.
  ActorSetPtr Build(const KernelGraphPtr &graph, const DeviceContext *device_context);
  // Link actors to DAG through the edge connection of graph and graph execution strategy.
  void Link(ActorSet *actor_set, const KernelGraphPtr &graph, GraphExecutionStrategy strategy);

  // The processing of actors build: one builder per kind of actor held by ActorSet.
  std::vector<DataSourceActorPtr> BuildDataSourceActor(const KernelGraphPtr &graph,
                                                       const DeviceContext *device_context);
  std::vector<KernelActorPtr> BuildKernelActor(const KernelGraphPtr &graph, const DeviceContext *device_context);
  std::vector<KernelActorPtr> BuildNoInputKernelActor(const KernelGraphPtr &graph);
  LoopCountActorPtr BuildLoopCountActor(const KernelGraphPtr &graph);

  // The processing of actors link. The data arrow linkers share one parameter layout: the sending actor, the
  // receiving actor, the sending kernel with its output index and the receiving kernel with its input index.
  void LinkDataArrowForDeviceDSActor(DeviceQueueDataSourceActor *from_actor, KernelActor *to_actor,
                                     KernelWithIndex from_kernel_with_output_idx,
                                     KernelWithIndex to_kernel_with_input_idx);
  void LinkDataArrowForHostDSActor(HostQueueDataSourceActor *from_actor, KernelActor *to_actor,
                                   KernelWithIndex from_kernel_with_output_idx,
                                   KernelWithIndex to_kernel_with_input_idx);
  void LinkDataArrowForKernelActor(KernelActor *from_actor, KernelActor *to_actor,
                                   KernelWithIndex from_kernel_with_output_idx,
                                   KernelWithIndex to_kernel_with_input_idx);
  void LinkControlArrowForKernelActor(KernelActor *from_actor, LoopCountActor *to_actor, const KernelGraphPtr &graph,
                                      GraphExecutionStrategy strategy);
  void LinkControlArrowForLoopCountActor(LoopCountActor *loop_count_actor, const KernelGraphPtr &graph);
  void LinkControlArrowByAutoMonad(KernelActor *to_actor, const AnfNodePtr &from_node,
                                   const KernelMapActor &kernel_actors_map);

  // Check whether the actor set is valid.
  bool CheckActorValid(const ActorSet *actor_set) const;

  // Persist device tensors of some nodes of the graph (such as weights and value nodes).
  void PersistDeviceTensor(const KernelGraphPtr &graph);

  // Fetch the host tensor queue by kernel graph.
  HostTensorQueue *FetchHostQueue(const KernelGraphPtr &graph) const;

  // Display the actor information of corresponding kernel graph. The per-actor helpers write to the given file
  // stream.
  void DumpActor(const KernelGraphPtr &graph) const;
  void DumpDSActor(const DataSourceActor *actor, std::ofstream &ofs) const;
  void DumpLoopCountActor(const LoopCountActor *actor, std::ofstream &ofs) const;
  void DumpKernelActor(const KernelActor *actor, std::ofstream &ofs) const;

  // The actor set and the host tensor queue that belong to each kernel graph.
  // NOTE(review): presumably these back Fetch() and FetchHostQueue() -- confirm in the definitions.
  std::unordered_map<KernelGraphPtr, ActorSetPtr> graph_to_actors_;
  std::unordered_map<KernelGraphPtr, HostTensorQueuePtr> graph_to_host_queue_;

  // The second element of pair represents the output index of kernel actor corresponding to the device tensor.
  std::unordered_map<DeviceTensorPtr, std::pair<KernelActorPtr, int>> device_address_to_actor_;

  // The id of memory manager actor.
  AID memory_manager_aid_;

  // NOTE(review): presumably set once Initialize() has completed, to make repeated calls harmless -- confirm.
  bool init_{false};
};
}  // namespace runtime
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_GRAPH_SCHEDULER_H_