From: @lixiachen Reviewed-by: @robingrosman,@nsyca Signed-off-by: @robingrosmanpull/15560/MERGE
| @@ -17,6 +17,7 @@ | |||
| #include <sys/types.h> | |||
| #include <sys/wait.h> | |||
| #include <unistd.h> | |||
| #include <algorithm> | |||
| #include <cerrno> | |||
| #include <iomanip> | |||
| #include <iostream> | |||
| @@ -39,7 +40,6 @@ const char CacheAdminArgHandler::kServerBinary[] = "cache_server"; | |||
| CacheAdminArgHandler::CacheAdminArgHandler() | |||
| : port_(kCfgDefaultCachePort), | |||
| session_id_(0), | |||
| num_workers_(kDefaultNumWorkers), | |||
| shm_mem_sz_(kDefaultSharedMemorySizeInGB), | |||
| log_level_(kDefaultLogLevel), | |||
| @@ -102,6 +102,52 @@ CacheAdminArgHandler::CacheAdminArgHandler() | |||
| CacheAdminArgHandler::~CacheAdminArgHandler() = default; | |||
| Status CacheAdminArgHandler::AssignArg(std::string option, std::vector<uint32_t> *out_arg, | |||
| std::stringstream *arg_stream, CommandId command_id) { | |||
| // Detect if the user tried to provide this argument more than once | |||
| ArgValue selected_arg = arg_map_[option]; | |||
| if (used_args_[selected_arg]) { | |||
| std::string err_msg = "The " + option + " argument was given more than once."; | |||
| return Status(StatusCode::kMDSyntaxError, err_msg); | |||
| } | |||
| // Flag that this arg is used now | |||
| used_args_[selected_arg] = true; | |||
| // Some options are just arguments, for example "--port 50052" is not a command, it's just a argument. | |||
| // Other options are actual commands, for example "--destroy_session 1234". This executes the destroy session. | |||
| // If this option is also a command, make sure there has not been multiple commands given before assigning it. | |||
| if (command_id != CommandId::kCmdUnknown) { | |||
| if (command_id_ != CommandId::kCmdUnknown) { | |||
| std::string err_msg = "Only one command at a time is allowed. Invalid command: " + option; | |||
| return Status(StatusCode::kMDSyntaxError, err_msg); | |||
| } else { | |||
| command_id_ = command_id; | |||
| } | |||
| } | |||
| uint32_t value_as_uint; | |||
| while (arg_stream->rdbuf()->in_avail() != 0) { | |||
| *arg_stream >> value_as_uint; | |||
| if (arg_stream->fail()) { | |||
| arg_stream->clear(); | |||
| std::string value_as_string; | |||
| *arg_stream >> value_as_string; | |||
| std::string err_msg = "Invalid numeric value: " + value_as_string; | |||
| return Status(StatusCode::kMDSyntaxError, err_msg); | |||
| } else { | |||
| out_arg->push_back(value_as_uint); | |||
| } | |||
| } | |||
| if (out_arg->empty()) { | |||
| std::string err_msg = option + " option requires an argument field. Syntax: " + option + " <field>"; | |||
| return Status(StatusCode::kMDSyntaxError, err_msg); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| Status CacheAdminArgHandler::AssignArg(std::string option, int32_t *out_arg, std::stringstream *arg_stream, | |||
| CommandId command_id) { | |||
| // Detect if the user tried to provide this argument more than once | |||
| @@ -269,11 +315,7 @@ Status CacheAdminArgHandler::ParseArgStream(std::stringstream *arg_stream) { | |||
| break; | |||
| } | |||
| case ArgValue::kArgDestroySession: { | |||
| // session_id is an unsigned type. We may need to template the AssignArg function so that | |||
| // it can handle different flavours of integers instead of just int32_t. | |||
| int32_t session_int; | |||
| RETURN_IF_NOT_OK(AssignArg(tok, &session_int, arg_stream, CommandId::kCmdDestroySession)); | |||
| session_id_ = session_int; | |||
| RETURN_IF_NOT_OK(AssignArg(tok, &session_ids_, arg_stream, CommandId::kCmdDestroySession)); | |||
| break; | |||
| } | |||
| case ArgValue::kArgNumWorkers: { | |||
| @@ -376,11 +418,13 @@ Status CacheAdminArgHandler::RunCommand() { | |||
| CacheClientGreeter comm(hostname_, port_, 1); | |||
| RETURN_IF_NOT_OK(comm.ServiceStart()); | |||
| CacheClientInfo cinfo; | |||
| cinfo.set_session_id(session_id_); | |||
| auto rq = std::make_shared<DropSessionRequest>(cinfo); | |||
| RETURN_IF_NOT_OK(comm.HandleRequest(rq)); | |||
| RETURN_IF_NOT_OK(rq->Wait()); | |||
| std::cout << "Drop session successfully for server on port " << std::to_string(port_) << std::endl; | |||
| for (session_id_type id : session_ids_) { | |||
| cinfo.set_session_id(id); | |||
| auto rq = std::make_shared<DropSessionRequest>(cinfo); | |||
| RETURN_IF_NOT_OK(comm.HandleRequest(rq)); | |||
| RETURN_IF_NOT_OK(rq->Wait()); | |||
| std::cout << "Drop session " << id << " successfully for server on port " << std::to_string(port_) << std::endl; | |||
| } | |||
| break; | |||
| } | |||
| case CommandId::kCmdListSessions: { | |||
| @@ -22,6 +22,7 @@ | |||
| #include <string> | |||
| #include <sstream> | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "minddata/dataset/util/status.h" | |||
| #include "minddata/dataset/engine/cache/cache_client.h" | |||
| @@ -94,6 +95,9 @@ class CacheAdminArgHandler { | |||
| Status AssignArg(std::string option, float *out_arg, std::stringstream *arg_stream, | |||
| CommandId command_id = CommandId::kCmdUnknown); | |||
| Status AssignArg(std::string option, std::vector<uint32_t> *out_arg, std::stringstream *arg_stream, | |||
| CommandId command_id = CommandId::kCmdUnknown); | |||
| Status Validate(); | |||
| CommandId command_id_; | |||
| @@ -102,7 +106,7 @@ class CacheAdminArgHandler { | |||
| int32_t shm_mem_sz_; | |||
| int32_t log_level_; | |||
| float memory_cap_ratio_; | |||
| session_id_type session_id_; | |||
| std::vector<session_id_type> session_ids_; | |||
| std::string hostname_; | |||
| std::string spill_dir_; | |||
| std::string trailing_args_; | |||
| @@ -332,16 +332,16 @@ bash run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet201 | |||
| #### Evaluation while training | |||
| ```bash | |||
| # evaluation while distributed training Ascend example: | |||
| # evaluation with distributed training Ascend example: | |||
| bash run_distribute_train.sh [resnet18|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) | |||
| # evaluation while standalone training Ascend example: | |||
| # evaluation with standalone training Ascend example: | |||
| bash run_standalone_train.sh [resnet18|resnet50|resnet101|se-resnet50] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) | |||
| # evaluation while distributed training GPU example: | |||
| # evaluation with distributed training GPU example: | |||
| bash run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) | |||
| # evaluation while standalone training GPU example: | |||
| # evaluation with standalone training GPU example: | |||
| bash run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [RUN_EVAL](optional) [EVAL_DATASET_PATH](optional) | |||
| ``` | |||
| @@ -34,8 +34,8 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| batch_size(int): the batch size of dataset. Default: 32 | |||
| target(str): the device target. Default: Ascend | |||
| distribute(bool): data for distribute or not. Default: False | |||
| enable_cache(bool): whether tensor caching service is used for eval. | |||
| cache_session_id(int): If enable_cache, cache session_id need to be provided. | |||
| enable_cache(bool): whether tensor caching service is used for eval. Default: False | |||
| cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None | |||
| Returns: | |||
| dataset | |||
| @@ -104,8 +104,8 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| batch_size(int): the batch size of dataset. Default: 32 | |||
| target(str): the device target. Default: Ascend | |||
| distribute(bool): data for distribute or not. Default: False | |||
| enable_cache(bool): whether tensor caching service is used for eval. | |||
| cache_session_id(int): If enable_cache, cache session_id need to be provided. | |||
| enable_cache(bool): whether tensor caching service is used for eval. Default: False | |||
| cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None | |||
| Returns: | |||
| dataset | |||
| @@ -182,8 +182,8 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| batch_size(int): the batch size of dataset. Default: 32 | |||
| target(str): the device target. Default: Ascend | |||
| distribute(bool): data for distribute or not. Default: False | |||
| enable_cache(bool): whether tensor caching service is used for eval. | |||
| cache_session_id(int): If enable_cache, cache session_id need to be provided. | |||
| enable_cache(bool): whether tensor caching service is used for eval. Default: False | |||
| cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None | |||
| Returns: | |||
| dataset | |||
| @@ -259,8 +259,8 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target= | |||
| batch_size(int): the batch size of dataset. Default: 32 | |||
| target(str): the device target. Default: Ascend | |||
| distribute(bool): data for distribute or not. Default: False | |||
| enable_cache(bool): whether tensor caching service is used for eval. | |||
| cache_session_id(int): If enable_cache, cache session_id need to be provided. | |||
| enable_cache(bool): whether tensor caching service is used for eval. Default: False | |||
| cache_session_id(int): If enable_cache, cache session_id need to be provided. Default: None | |||
| Returns: | |||
| dataset | |||
| @@ -16,7 +16,7 @@ | |||
| # source the globals and functions for use with cache testing | |||
| export SKIP_ADMIN_COUNTER=false | |||
| declare failed_tests | |||
| declare session_id failed_tests | |||
| . cachetest_lib.sh | |||
| echo | |||
| @@ -160,6 +160,18 @@ cmd="${CACHE_ADMIN} -d 99999" | |||
| CacheAdminCmd "${cmd}" 1 | |||
| HandleRcExit $? 0 0 | |||
| # generate two new sessions to test multi-destroy | |||
| GetSession | |||
| HandleRcExit $? 0 0 | |||
| session_id1=$session_id | |||
| GetSession | |||
| HandleRcExit $? 0 0 | |||
| session_id2=$session_id | |||
| # test multi-session destroy | |||
| cmd="${CACHE_ADMIN} -d ${session_id1} ${session_id2}" | |||
| CacheAdminCmd "${cmd}" 0 | |||
| HandleRcExit $? 0 0 | |||
| # stop cache server at this point | |||
| StopServer | |||
| HandleRcExit $? 1 1 | |||