Browse Source

!15560 Optimize cache_admin tool to allow multiple session destroy

From: @lixiachen
Reviewed-by: @robingrosman,@nsyca
Signed-off-by: @robingrosman
pull/15560/MERGE
mindspore-ci-bot Gitee 4 years ago
parent
commit
83cdb8bb38
5 changed files with 85 additions and 25 deletions
  1. +55
    -11
      mindspore/ccsrc/minddata/dataset/engine/cache/cache_admin_arg.cc
  2. +5
    -1
      mindspore/ccsrc/minddata/dataset/engine/cache/cache_admin_arg.h
  3. +4
    -4
      model_zoo/official/cv/resnet/README.md
  4. +8
    -8
      model_zoo/official/cv/resnet/src/dataset.py
  5. +13
    -1
      tests/ut/python/cachetests/cachetest_args.sh

+ 55
- 11
mindspore/ccsrc/minddata/dataset/engine/cache/cache_admin_arg.cc View File

@@ -17,6 +17,7 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <algorithm>
#include <cerrno>
#include <iomanip>
#include <iostream>
@@ -39,7 +40,6 @@ const char CacheAdminArgHandler::kServerBinary[] = "cache_server";

CacheAdminArgHandler::CacheAdminArgHandler()
: port_(kCfgDefaultCachePort),
session_id_(0),
num_workers_(kDefaultNumWorkers),
shm_mem_sz_(kDefaultSharedMemorySizeInGB),
log_level_(kDefaultLogLevel),
@@ -102,6 +102,52 @@ CacheAdminArgHandler::CacheAdminArgHandler()

// Default destructor: no owned raw resources; members release themselves (RAII).
CacheAdminArgHandler::~CacheAdminArgHandler() = default;

Status CacheAdminArgHandler::AssignArg(std::string option, std::vector<uint32_t> *out_arg,
                                       std::stringstream *arg_stream, CommandId command_id) {
  // Parse one or more unsigned numeric values for <option> from <arg_stream> and
  // append them to <out_arg>. Used by options that accept a list of values, for
  // example "--destroy_session 1234 5678".
  // Returns a syntax-error Status on duplicate option, duplicate command,
  // non-numeric value, or a missing value.

  // Detect if the user tried to provide this argument more than once
  ArgValue selected_arg = arg_map_[option];
  if (used_args_[selected_arg]) {
    std::string err_msg = "The " + option + " argument was given more than once.";
    return Status(StatusCode::kMDSyntaxError, err_msg);
  }

  // Flag that this arg is used now
  used_args_[selected_arg] = true;

  // Some options are just arguments, for example "--port 50052" is not a command, it's just an argument.
  // Other options are actual commands, for example "--destroy_session 1234". This executes the destroy session.
  // If this option is also a command, make sure there has not been multiple commands given before assigning it.
  if (command_id != CommandId::kCmdUnknown) {
    if (command_id_ != CommandId::kCmdUnknown) {
      std::string err_msg = "Only one command at a time is allowed. Invalid command: " + option;
      return Status(StatusCode::kMDSyntaxError, err_msg);
    } else {
      command_id_ = command_id;
    }
  }

  // Consume every remaining token in the stream as a value for this option.
  uint32_t value_as_uint;
  while (arg_stream->rdbuf()->in_avail() != 0) {
    std::string value_as_string;
    *arg_stream >> value_as_string;
    // Validate the token is strictly decimal digits before converting. Extracting
    // directly into an unsigned type would silently wrap negative input (e.g. "-5")
    // into a huge value instead of reporting an error; out-of-range values still
    // fail via the converter's failbit.
    std::stringstream converter(value_as_string);
    if (value_as_string.empty() || value_as_string.find_first_not_of("0123456789") != std::string::npos ||
        !(converter >> value_as_uint)) {
      std::string err_msg = "Invalid numeric value: " + value_as_string;
      return Status(StatusCode::kMDSyntaxError, err_msg);
    }
    out_arg->push_back(value_as_uint);
  }

  if (out_arg->empty()) {
    std::string err_msg = option + " option requires an argument field. Syntax: " + option + " <field>";
    return Status(StatusCode::kMDSyntaxError, err_msg);
  }

  return Status::OK();
}

Status CacheAdminArgHandler::AssignArg(std::string option, int32_t *out_arg, std::stringstream *arg_stream,
CommandId command_id) {
// Detect if the user tried to provide this argument more than once
@@ -269,11 +315,7 @@ Status CacheAdminArgHandler::ParseArgStream(std::stringstream *arg_stream) {
break;
}
case ArgValue::kArgDestroySession: {
// session_id is an unsigned type. We may need to template the AssignArg function so that
// it can handle different flavours of integers instead of just int32_t.
int32_t session_int;
RETURN_IF_NOT_OK(AssignArg(tok, &session_int, arg_stream, CommandId::kCmdDestroySession));
session_id_ = session_int;
RETURN_IF_NOT_OK(AssignArg(tok, &session_ids_, arg_stream, CommandId::kCmdDestroySession));
break;
}
case ArgValue::kArgNumWorkers: {
@@ -376,11 +418,13 @@ Status CacheAdminArgHandler::RunCommand() {
CacheClientGreeter comm(hostname_, port_, 1);
RETURN_IF_NOT_OK(comm.ServiceStart());
CacheClientInfo cinfo;
cinfo.set_session_id(session_id_);
auto rq = std::make_shared<DropSessionRequest>(cinfo);
RETURN_IF_NOT_OK(comm.HandleRequest(rq));
RETURN_IF_NOT_OK(rq->Wait());
std::cout << "Drop session successfully for server on port " << std::to_string(port_) << std::endl;
for (session_id_type id : session_ids_) {
cinfo.set_session_id(id);
auto rq = std::make_shared<DropSessionRequest>(cinfo);
RETURN_IF_NOT_OK(comm.HandleRequest(rq));
RETURN_IF_NOT_OK(rq->Wait());
std::cout << "Drop session " << id << " successfully for server on port " << std::to_string(port_) << std::endl;
}
break;
}
case CommandId::kCmdListSessions: {


+ 5
- 1
mindspore/ccsrc/minddata/dataset/engine/cache/cache_admin_arg.h View File

@@ -22,6 +22,7 @@
#include <string>
#include <sstream>
#include <thread>
#include <vector>
#include "minddata/dataset/util/status.h"
#include "minddata/dataset/engine/cache/cache_client.h"

@@ -94,6 +95,9 @@ class CacheAdminArgHandler {
Status AssignArg(std::string option, float *out_arg, std::stringstream *arg_stream,
CommandId command_id = CommandId::kCmdUnknown);

Status AssignArg(std::string option, std::vector<uint32_t> *out_arg, std::stringstream *arg_stream,
CommandId command_id = CommandId::kCmdUnknown);

Status Validate();

CommandId command_id_;
@@ -102,7 +106,7 @@ class CacheAdminArgHandler {
int32_t shm_mem_sz_;
int32_t log_level_;
float memory_cap_ratio_;
session_id_type session_id_;
std::vector<session_id_type> session_ids_;
std::string hostname_;
std::string spill_dir_;
std::string trailing_args_;


+ 4
- 4
model_zoo/official/cv/resnet/README.md View File

@@ -332,16 +332,16 @@ bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet201
#### Evaluation while training

```bash
# evaluation while distributed training Ascend example:
# evaluation with distributed training Ascend example:
bash run_distribute_train.sh [resnet18|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# evaluation while standalone training Ascend example:
# evaluation with standalone training Ascend example:
bash run_standalone_train.sh [resnet18|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# evaluation while distributed training GPU example:
# evaluation with distributed training GPU example:
bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)

# evaluation while standalone training GPU example:
# evaluation with standalone training GPU example:
bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional)
```



+ 8
- 8
model_zoo/official/cv/resnet/src/dataset.py View File

@@ -34,8 +34,8 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
batch_size(int): the batch size of dataset. Default: 32
target(str): the device target. Default: Ascend
distribute(bool): data for distribute or not. Default: False
enable_cache(bool): whether tensor caching service is used for eval.
cache_session_id(int): If enable_cache, cache session_id need to be provided.
enable_cache(bool): whether tensor caching service is used for eval. Default: False
cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

Returns:
dataset
@@ -104,8 +104,8 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
batch_size(int): the batch size of dataset. Default: 32
target(str): the device target. Default: Ascend
distribute(bool): data for distribute or not. Default: False
enable_cache(bool): whether tensor caching service is used for eval.
cache_session_id(int): If enable_cache, cache session_id need to be provided.
enable_cache(bool): whether tensor caching service is used for eval. Default: False
cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

Returns:
dataset
@@ -182,8 +182,8 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
batch_size(int): the batch size of dataset. Default: 32
target(str): the device target. Default: Ascend
distribute(bool): data for distribute or not. Default: False
enable_cache(bool): whether tensor caching service is used for eval.
cache_session_id(int): If enable_cache, cache session_id need to be provided.
enable_cache(bool): whether tensor caching service is used for eval. Default: False
cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

Returns:
dataset
@@ -259,8 +259,8 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
batch_size(int): the batch size of dataset. Default: 32
target(str): the device target. Default: Ascend
distribute(bool): data for distribute or not. Default: False
enable_cache(bool): whether tensor caching service is used for eval.
cache_session_id(int): If enable_cache, cache session_id need to be provided.
enable_cache(bool): whether tensor caching service is used for eval. Default: False
cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None

Returns:
dataset


+ 13
- 1
tests/ut/python/cachetests/cachetest_args.sh View File

@@ -16,7 +16,7 @@

# source the globals and functions for use with cache testing
export SKIP_ADMIN_COUNTER=false
declare failed_tests
declare session_id failed_tests
. cachetest_lib.sh
echo

@@ -160,6 +160,18 @@ cmd="${CACHE_ADMIN} -d 99999"
CacheAdminCmd "${cmd}" 1
HandleRcExit $? 0 0

# generate two new sessions to test multi-destroy
GetSession
HandleRcExit $? 0 0
session_id1=$session_id
GetSession
HandleRcExit $? 0 0
session_id2=$session_id
# test multi-session destroy
cmd="${CACHE_ADMIN} -d ${session_id1} ${session_id2}"
CacheAdminCmd "${cmd}" 0
HandleRcExit $? 0 0

# stop cache server at this point
StopServer
HandleRcExit $? 1 1


Loading…
Cancel
Save