@@ -15,4 +15,4 @@
  url = https://gitee.com/mindspore/akg.git
[submodule "graphengine"]
  path = graphengine
  url = https://gitee.com/ms-incubator/graphengine.git
  url = https://gitee.com/mindspore/graphengine.git
@@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm
### Communication
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers.
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers.
- IRC channel at `#mindspore` (only for meeting minutes logging purposes)
- Video Conferencing: https://meet.jit.si
- Mailing-list: https://mailweb.mindspore.cn/postorius/lists
- Video Conferencing: TBD
- Mailing-list: <https://mailweb.mindspore.cn/postorius/lists>
## Contributing
@@ -1 +1 @@
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625
@@ -24,7 +24,7 @@ usage()
{
  echo "Usage:"
  echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
  echo " [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
  echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
  echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
  echo ""
  echo "Options:"
@@ -48,7 +48,6 @@ usage()
  echo " -P Enable dump anf graph to file in ProtoBuffer format, default on"
  echo " -Q Enable dump memory, default off"
  echo " -D Enable dumping of function graph ir, default on"
  echo " -S Enable async data dump, default off"
  echo " -z Compile dataset & mindrecord, default on"
  echo " -M Enable MPI and NCCL for GPU training, gpu default on"
  echo " -V Specify the minimum required cuda version, default CUDA 10.1"
@@ -89,7 +88,6 @@ checkopts()
  ENABLE_TIMELINE="off"
  ENABLE_DUMP2PROTO="on"
  ENABLE_DUMPE2E="off"
  ENABLE_DATA_DUMP="off"
  ENABLE_DUMP_IR="on"
  COMPILE_MINDDATA="on"
  ENABLE_MPI="off"
@@ -104,7 +102,7 @@ checkopts()
  ENABLE_PYTHON="on"
  # Process the options
  while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
  while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
  do
    OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
    case "${opt}" in
@@ -186,6 +184,7 @@ checkopts()
      elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then
        ENABLE_D="on"
        ENABLE_CPU="on"
        ENABLE_SERVING="on"
      elif [[ "X$OPTARG" == "Xcpu" ]]; then
        ENABLE_CPU="on"
      else
@@ -220,11 +219,6 @@ checkopts()
        ENABLE_DUMPE2E="$OPTARG"
        echo "enable dump end to end"
        ;;
      S)
        check_on_off $OPTARG S
        ENABLE_DATA_DUMP="$OPTARG"
        echo "enable data dump"
        ;;
      D)
        check_on_off $OPTARG D
        ENABLE_DUMP_IR="$OPTARG"
@@ -328,9 +322,6 @@ build_mindspore()
  if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
  fi
  if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
  fi
  CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
  CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
  if [[ "X$ENABLE_MPI" = "Xon" ]]; then
@@ -1,4 +1,4 @@
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}")
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
mindspore_add_pkg(glog
    VER 0.4.0
@@ -116,10 +116,10 @@ if(ENABLE_DUMP_E2E)
    add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DATA_DUMP)
    add_compile_definitions(ENABLE_DATA_DUMP)
endif()
if(ENABLE_DEBUGGER)
    add_compile_definitions(ENABLE_DEBUGGER)
endif()
if(ENABLE_TESTCASES)
    add_compile_definitions(ENABLE_TESTCASES)
endif()
@@ -1,13 +1,16 @@
# find exec
find_package(Python3 3.7 COMPONENTS Interpreter Development)
if (NOT Python3_FOUND)
    message("No python3 found.")
    return ()
    message(FATAL_ERROR "No python3 found.")
endif ()
set(PYTHON ${Python3_EXECUTABLE})
set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR})
if (NOT PYTHON_VERSION MATCHES "3.7")
    message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7")
endif ()
find_package(Git)
if (NOT GIT_FOUND)
    message("No git found.")
@@ -1 +1 @@
Subproject commit eee707935c066c16e9b9cd207f8125871b6b97cf
Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009
@@ -17,7 +17,7 @@
"""Resources for ast tree parse."""
import ast
import math
from mindspore import IndexedSlices
from mindspore import IndexedSlices, SparseTensor
from mindspore.ops.composite import multitype_ops
from mindspore.ops import functional as F, composite as C
from . import standard_method as M
@@ -140,4 +140,5 @@ convert_object_map = {
    # user defined
    IndexedSlices: F.make_indexed_slices,
    SparseTensor: F.make_sparse_tensor,
}
@@ -44,7 +44,7 @@ if(ENABLE_GPU)
        "backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
    )
    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
    list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
                                  "runtime/device/gpu/distribution/collective_wrapper.cc"
@@ -26,14 +26,6 @@ if (ENABLE_CPU)
        "cpu/*.cc"
    )
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
                                  "cpu/ps/pull_kernel.cc"
                                  "cpu/ps/embedding_look_up_ps_kernel.cc"
                                  "cpu/ps/embedding_look_up_proxy_kernel.cc"
                                  "cpu/ps/apply_momentum_ps_kernel.cc"
                                  "cpu/ps/sparse_apply_adam_ps_kernel.cc"
                                  "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
    if (NOT ENABLE_MPI)
        list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
        list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
@@ -41,6 +33,17 @@ if (ENABLE_CPU)
    endif ()
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE)
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
endif()
if (ENABLE_GPU)
    file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "gpu/*.cu"
@@ -18,6 +18,7 @@
#include <algorithm>
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/common/helper.h"
#include "backend/kernel_compiler/common_utils.h"
namespace mindspore {
namespace kernel {
@@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
  std::string dst_type;
  TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
  if (output_type == kFloat32->type_id()) {
    dst_type = "float32";
  } else if (output_type == kFloat16->type_id()) {
    dst_type = "float16";
  } else if (output_type == kInt32->type_id()) {
    dst_type = "int32";
  } else {
    MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
  }
  dst_type = TypeId2String(output_type);
  AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
}
@@ -21,9 +21,7 @@
#include <memory>
#include "framework/ge_runtime/task_info.h"
#include "backend/kernel_compiler/kernel.h"
#ifdef ENABLE_DATA_DUMP
#include "debug/data_dump_parser.h"
#endif
using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
namespace mindspore {
@@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod {
                      const std::vector<AddressPtr> &, uint32_t) = 0;
  uint32_t block_dim() { return block_dim_; }
  uint32_t stream_id() { return stream_id_; }
  virtual bool NeedDump() {
#ifdef ENABLE_DATA_DUMP
    return DataDumpParser::GetInstance().NeedDump(kernel_name_);
#else
    return false;
#endif
  }
  virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); }
 protected:
  uint32_t block_dim_{1};
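The NeedDump change above swaps a compile-time `#ifdef ENABLE_DATA_DUMP` gate for an unconditional runtime query of the `DataDumpParser` singleton, so a single binary can turn dumping on or off per kernel. A standalone C++ sketch of that pattern, using a hypothetical `DumpConfig` stand-in rather than the real `DataDumpParser` API:

// Sketch only: DumpConfig is a hypothetical stand-in for DataDumpParser.
#include <set>
#include <string>

class DumpConfig {
 public:
  static DumpConfig &GetInstance() {  // Meyers singleton, like GetInstance() above
    static DumpConfig instance;
    return instance;
  }
  void Enable(const std::string &kernel) { enabled_.insert(kernel); }
  bool NeedDump(const std::string &kernel) const { return enabled_.count(kernel) > 0; }

 private:
  std::set<std::string> enabled_;
};

bool NeedDump(const std::string &kernel_name) {
  // before: #ifdef ENABLE_DATA_DUMP ... #else return false; #endif
  return DumpConfig::GetInstance().NeedDump(kernel_name);
}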
@@ -20,6 +20,7 @@
#include <iostream>
#include <utility>
#include <fstream>
#include <algorithm>
#include <thread>
#include "nlohmann/json.hpp"
#include "backend/session/anf_runtime_algorithm.h"
@@ -499,235 +500,329 @@ int Sign(float x) {
  return 0;
}
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                              size_t outer_dim) {
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
  MS_EXCEPTION_IF_NULL(unique_grad);
  MS_EXCEPTION_IF_NULL(unique_grad->value_);
  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
  std::unordered_map<int, size_t> index_map;
  size_t unique_indices_size = 0;
  for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
    int index = origin_sparse_grad.indices_[i];
    if (index < 0 || IntToSize(index) >= first_dim) {
      continue;
    }
    auto iter = index_map.find(index);
    if (iter == index_map.end()) {
      index_map[index] = unique_indices_size;
      unique_grad->indices_[unique_indices_size] = index;
      size_t start_index = unique_indices_size * outer_dim;
      size_t end_index = start_index + outer_dim;
      for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
        unique_grad->value_[j] = origin_sparse_grad.value_[k];
      }
      unique_indices_size++;
    } else {
      size_t first_index = iter->second;
      size_t start_index = first_index * outer_dim;
      size_t end_index = start_index + outer_dim;
      for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
        unique_grad->value_[j] += origin_sparse_grad.value_[k];
      }
namespace {
struct BucketSparseGradient {
  float *value_;
  int *indices_;
  int *global_indices_;
  size_t indices_size_;
};
struct MultiThreadReduceSparseGradientParam {
  SparseGradient *input_grad_{nullptr};
  SparseGradient *workspace_grad_{nullptr};
  SparseGradient *output_grad_{nullptr};
  size_t max_index_{0};
  size_t value_stride_{0};
  size_t thread_num_{0};
  bool use_sort_reduce_{false};
};
void CalculateEachBucketSize(const std::shared_ptr<SparseGradient> &sparse_grad, size_t max_index,
                             std::vector<size_t> *each_bucket_size) {
  MS_LOG(DEBUG) << "Start";
  MS_EXCEPTION_IF_NULL(sparse_grad);
  MS_EXCEPTION_IF_NULL(sparse_grad->indices_);
  MS_EXCEPTION_IF_NULL(each_bucket_size);
  size_t bucket_num = each_bucket_size->size();
  for (size_t i = 0; i < sparse_grad->indices_size_; ++i) {
    int index = sparse_grad->indices_[i];
    if (index >= 0 && IntToSize(index) < max_index) {
      auto bucket_id = index % bucket_num;
      each_bucket_size->at(bucket_id)++;
    }
  }
  unique_grad->indices_size_ = unique_indices_size;
  MS_LOG(DEBUG) << "End";
}
struct WorkerParamsForReduceSparseGradient {
  size_t slice_start_{0};
  size_t slice_end_{0};
  size_t max_length_{0};
  size_t outer_dim_{0};
  std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
  std::vector<size_t> *slice_positions_{nullptr};
  float *src_value_{nullptr};
  SparseGradient *unique_grad_{nullptr};
};
void SplitAndCalculateSegmentBucketSize(const MultiThreadReduceSparseGradientParam &param,
                                        std::vector<std::shared_ptr<SparseGradient>> *segments_ptr,
                                        std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr) {
  MS_EXCEPTION_IF_NULL(param.input_grad_);
  MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
  MS_EXCEPTION_IF_NULL(segments_ptr);
  auto &segments = *segments_ptr;
  auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
  auto input_grad = param.input_grad_;
  if (param.thread_num_ < 1) {
    MS_EXCEPTION(ArgumentError) << "Input param thread num must > 0!";
  }
  size_t thread_indices_size = input_grad->indices_size_ / param.thread_num_;
  size_t left_indices_size = input_grad->indices_size_ % param.thread_num_;
  std::vector<std::thread> threads;
  threads.reserve(param.thread_num_);
  segments.reserve(param.thread_num_);
void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
  MS_EXCEPTION_IF_NULL(param.sorted_indices_);
  MS_EXCEPTION_IF_NULL(param.slice_positions_);
  MS_EXCEPTION_IF_NULL(param.src_value_);
  MS_EXCEPTION_IF_NULL(param.unique_grad_);
  auto outer_dim = param.outer_dim_;
  auto &sorted_indices = *(param.sorted_indices_);
  auto &slice_positions = *(param.slice_positions_);
  auto unique_grad = param.unique_grad_;
  for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
    size_t cur_pos = slice_positions[slice_id];
    int index = sorted_indices[cur_pos].first;
    unique_grad->indices_[slice_id] = index;
    size_t start_index = slice_id * outer_dim;
    auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
                             param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
    }
    cur_pos++;
    size_t end_pos;
    if (slice_id + 1 < slice_positions.size()) {
      end_pos = slice_positions[slice_id + 1];
    } else {
      end_pos = sorted_indices.size();
    }
    while (cur_pos < end_pos) {
      for (size_t i = 0; i < outer_dim; ++i) {
        unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i];
      }
      cur_pos++;
  size_t current_indices_offset = 0;
  for (size_t i = 0; i < param.thread_num_; ++i) {
    segment_bucket_sizes.emplace_back(std::make_shared<std::vector<size_t>>(param.thread_num_, 0));
    size_t indices_size = thread_indices_size;
    if (i < left_indices_size) {
      indices_size += 1;
    }
    segments.emplace_back(std::make_shared<SparseGradient>());
    segments[i]->value_ = input_grad->value_ + current_indices_offset * param.value_stride_;
    segments[i]->indices_ = input_grad->indices_ + current_indices_offset;
    segments[i]->indices_size_ = indices_size;
    threads.emplace_back(
      std::thread(CalculateEachBucketSize, segments[i], param.max_index_, segment_bucket_sizes[i].get()));
    current_indices_offset += indices_size;
  }
  for (size_t i = 0; i < param.thread_num_; ++i) {
    threads[i].join();
  }
}
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
                                        size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
                                        std::vector<size_t> *slice_positions) {
void CopySegmentIndicesToBucket(const MultiThreadReduceSparseGradientParam &param,
                                const std::shared_ptr<SparseGradient> &segment, size_t bucket_offset,
                                const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets) {
  MS_LOG(DEBUG) << "Start";
  size_t thread_num = 24;
  if (slice_positions->size() < thread_num) {
    thread_num = slice_positions->size();
  MS_EXCEPTION_IF_NULL(segment);
  MS_EXCEPTION_IF_NULL(segment->indices_);
  std::vector<size_t> bucket_data_num(param.thread_num_, 0);
  for (size_t i = 0; i < segment->indices_size_; ++i) {
    int index = segment->indices_[i];
    if (index >= 0 && IntToSize(index) < param.max_index_) {
      auto bucket_id = index % param.thread_num_;
      auto bucket_index = bucket_data_num[bucket_id];
      buckets[bucket_id]->indices_[bucket_index] = index;
      buckets[bucket_id]->global_indices_[bucket_index] = bucket_offset + i;
      bucket_data_num[bucket_id]++;
    }
  }
  size_t stride = (slice_positions->size() + thread_num - 1) / thread_num;
  thread_num = (slice_positions->size() + stride - 1) / stride;
  std::vector<std::thread> threads;
  size_t max_length = sorted_indices->size() * outer_dim;
  MS_LOG(DEBUG) << "End";
}
void GatherSegmentIndicesToOutputBucket(const MultiThreadReduceSparseGradientParam &param,
                                        const std::vector<std::shared_ptr<SparseGradient>> &segments,
                                        const std::vector<std::shared_ptr<std::vector<size_t>>> &segment_bucket_sizes,
                                        std::vector<std::shared_ptr<BucketSparseGradient>> *buckets_ptr) {
  MS_EXCEPTION_IF_NULL(param.output_grad_);
  MS_EXCEPTION_IF_NULL(param.output_grad_->value_);
  MS_EXCEPTION_IF_NULL(param.output_grad_->indices_);
  MS_EXCEPTION_IF_NULL(buckets_ptr);
  auto &buckets = *buckets_ptr;
  size_t thread_num = param.thread_num_;
  if (thread_num != segment_bucket_sizes.size()) {
    MS_EXCEPTION(ArgumentError) << "Input param thread num not equal to segment size!";
  }
  std::vector<size_t> bucket_data_size(thread_num, 0);
  for (size_t i = 0; i < thread_num; ++i) {
    size_t slice_start = i * stride;
    size_t slice_end = 0;
    if (i == thread_num - 1) {
      slice_end = slice_positions->size();
    } else {
      slice_end = slice_start + stride;
    for (size_t j = 0; j < thread_num; ++j) {
      bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
    }
    WorkerParamsForReduceSparseGradient params{
      slice_start, slice_end, max_length, outer_dim, sorted_indices, slice_positions, origin_sparse_grad.value_,
      unique_grad};
    threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
  }
  size_t current_indices_offset = 0;
  for (size_t i = 0; i < thread_num; ++i) {
    buckets.emplace_back(std::make_shared<BucketSparseGradient>());
    buckets[i]->value_ = param.output_grad_->value_ + current_indices_offset * param.value_stride_;
    buckets[i]->indices_ = param.output_grad_->indices_ + current_indices_offset;
    buckets[i]->global_indices_ = param.workspace_grad_->indices_ + current_indices_offset;
    buckets[i]->indices_size_ = bucket_data_size[i];
    current_indices_offset += bucket_data_size[i];
  }
  std::vector<size_t> tmp_bucket_data_size(thread_num, 0);
  std::vector<std::vector<std::shared_ptr<BucketSparseGradient>>> each_thread_buckets;
  for (size_t i = 0; i < thread_num; ++i) {
    std::vector<std::shared_ptr<BucketSparseGradient>> thread_buckets;
    for (size_t j = 0; j < thread_num; ++j) {
      thread_buckets.emplace_back(std::make_shared<BucketSparseGradient>());
      thread_buckets[j]->indices_ = buckets[j]->indices_ + tmp_bucket_data_size[j];
      thread_buckets[j]->global_indices_ = buckets[j]->global_indices_ + tmp_bucket_data_size[j];
      thread_buckets[j]->value_ = buckets[j]->value_ + tmp_bucket_data_size[j] * param.value_stride_;
      thread_buckets[j]->indices_size_ = segment_bucket_sizes[i]->at(j);
      tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
    }
    each_thread_buckets.emplace_back(thread_buckets);
  }
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  current_indices_offset = 0;
  for (size_t i = 0; i < thread_num; ++i) {
    threads.emplace_back(
      std::thread(CopySegmentIndicesToBucket, param, segments[i], current_indices_offset, each_thread_buckets[i]));
    current_indices_offset += segments[i]->indices_size_;
  }
  for (size_t i = 0; i < thread_num; ++i) {
    threads[i].join();
  }
  MS_LOG(DEBUG) << "End";
}
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                          size_t outer_dim, bool use_multi_threads) {
void SortAndReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
                                       const std::shared_ptr<BucketSparseGradient> &bucket,
                                       const std::shared_ptr<SparseGradient> &reduced_bucket) {
  MS_LOG(DEBUG) << "Start";
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
  MS_EXCEPTION_IF_NULL(unique_grad);
  MS_EXCEPTION_IF_NULL(unique_grad->value_);
  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
  std::vector<std::pair<int, size_t>> sorted_indices;
  sorted_indices.reserve(origin_sparse_grad.indices_size_);
  for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
    int index = origin_sparse_grad.indices_[i];
    if (index >= 0 && IntToSize(index) < first_dim) {
      sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
    }
  }
  std::sort(
    sorted_indices.begin(), sorted_indices.end(),
    [](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
  int last_index = 0;
  std::vector<size_t> slice_positions;
  slice_positions.reserve(sorted_indices.size());
  MS_EXCEPTION_IF_NULL(bucket);
  MS_EXCEPTION_IF_NULL(bucket->value_);
  MS_EXCEPTION_IF_NULL(bucket->indices_);
  MS_EXCEPTION_IF_NULL(reduced_bucket);
  MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
  MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
  std::vector<std::pair<int, int>> sorted_indices;
  sorted_indices.reserve(bucket->indices_size_);
  for (size_t i = 0; i < bucket->indices_size_; ++i) {
    int index = bucket->indices_[i];
    int global_index = bucket->global_indices_[i];
    sorted_indices.emplace_back(std::pair<int, int>(index, global_index));
  }
  std::sort(sorted_indices.begin(), sorted_indices.end());
  float *global_value = param.input_grad_->value_;
  size_t unique_indices_size = 0;
  size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
  int last_index{0};
  size_t value_offset{0};
  for (size_t i = 0; i < sorted_indices.size(); ++i) {
    if (i == 0 || last_index != sorted_indices[i].first) {
      slice_positions.emplace_back(i);
    int index = sorted_indices[i].first;
    int global_index = sorted_indices[i].second;
    int global_value_offset = global_index * param.value_stride_;
    if (i == 0 || index != last_index) {
      if (i != 0) {
        unique_indices_size++;
      }
      reduced_bucket->indices_[unique_indices_size] = index;
      value_offset = unique_indices_size * param.value_stride_;
      auto ret_code = memcpy_s(reduced_bucket->value_ + value_offset, (max_length - value_offset) * sizeof(float),
                               global_value + global_value_offset, param.value_stride_ * sizeof(float));
      if (ret_code != EOK) {
        MS_LOG(EXCEPTION) << "Failed to copy data!";
      }
    } else {
      for (size_t j = 0; j < param.value_stride_; ++j) {
        reduced_bucket->value_[value_offset + j] += global_value[global_value_offset + j];
      }
    }
    last_index = sorted_indices[i].first;
    last_index = index;
  }
  if (use_multi_threads) {
    RunMultiThreadReduceSparseGradient(origin_sparse_grad, unique_grad, outer_dim, &sorted_indices, &slice_positions);
  } else {
    size_t max_length = sorted_indices.size() * outer_dim;
    WorkerParamsForReduceSparseGradient params{0,
                                               slice_positions.size(),
                                               max_length,
                                               outer_dim,
                                               &sorted_indices,
                                               &slice_positions,
                                               origin_sparse_grad.value_,
                                               unique_grad};
    WorkerForReduceSparseGradient(params);
  }
  unique_grad->indices_size_ = slice_positions.size();
  reduced_bucket->indices_size_ = unique_indices_size;
  MS_LOG(DEBUG) << "End";
}
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
                               SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
                               size_t outer_dim) {
void ReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
                                const std::shared_ptr<BucketSparseGradient> &bucket,
                                const std::shared_ptr<SparseGradient> &reduced_bucket) {
  MS_LOG(DEBUG) << "Start";
  if (unique_slice_grads.empty()) {
    return;
  }
  size_t index_data_size = outer_dim * sizeof(float);
  MS_EXCEPTION_IF_NULL(bucket);
  MS_EXCEPTION_IF_NULL(bucket->value_);
  MS_EXCEPTION_IF_NULL(bucket->indices_);
  MS_EXCEPTION_IF_NULL(reduced_bucket);
  MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
  MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
  float *global_value = param.input_grad_->value_;
  std::unordered_map<int, size_t> index_map;
  size_t unique_indices_size = 0;
  for (size_t i = 0; i < unique_slice_grads.size(); ++i) {
    auto &slice_grad = unique_slice_grads[i];
    auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim,
                             (tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_,
                             slice_grad->indices_size_ * index_data_size);
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
    }
    ret_code =
      memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int),
               slice_grad->indices_, slice_grad->indices_size_ * sizeof(int));
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
  size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
  for (size_t i = 0; i < bucket->indices_size_; ++i) {
    int index = bucket->indices_[i];
    int global_index = bucket->global_indices_[i];
    auto iter = index_map.find(index);
    if (iter == index_map.end()) {
      reduced_bucket->indices_[unique_indices_size] = index;
      size_t start_index = unique_indices_size * param.value_stride_;
      index_map[index] = start_index;
      auto ret_code = memcpy_s(reduced_bucket->value_ + start_index, (max_length - start_index) * sizeof(float),
                               global_value + global_index * param.value_stride_, param.value_stride_ * sizeof(float));
      if (ret_code != EOK) {
        MS_LOG(EXCEPTION) << "Failed to copy data!";
      }
      unique_indices_size++;
    } else {
      size_t start_index = iter->second;
      size_t end_index = start_index + param.value_stride_;
      for (size_t j = start_index, k = global_index * param.value_stride_; j < end_index; ++j, ++k) {
        reduced_bucket->value_[j] += global_value[k];
      }
    }
    unique_indices_size += slice_grad->indices_size_;
  }
  tmp_grad->indices_size_ = unique_indices_size;
  ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim);
  reduced_bucket->indices_size_ = unique_indices_size;
  MS_LOG(DEBUG) << "End";
}
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
                                  SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) {
  MS_LOG(DEBUG) << "Start";
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
  MS_EXCEPTION_IF_NULL(unique_grad);
  MS_EXCEPTION_IF_NULL(unique_grad->value_);
  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
  MS_EXCEPTION_IF_NULL(tmp_grad);
  MS_EXCEPTION_IF_NULL(tmp_grad->value_);
  MS_EXCEPTION_IF_NULL(tmp_grad->indices_);
  size_t thread_num = 24;
  if (origin_sparse_grad.indices_size_ < thread_num) {
    thread_num = origin_sparse_grad.indices_size_;
  }
  size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num;
  size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num;
void ReduceBucketSparseGradientToWorkspace(const MultiThreadReduceSparseGradientParam &param,
                                           const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets,
                                           std::vector<std::shared_ptr<SparseGradient>> *reduced_buckets_ptr) {
  MS_EXCEPTION_IF_NULL(param.workspace_grad_);
  MS_EXCEPTION_IF_NULL(param.workspace_grad_->value_);
  MS_EXCEPTION_IF_NULL(param.workspace_grad_->indices_);
  MS_EXCEPTION_IF_NULL(reduced_buckets_ptr);
  auto &reduced_buckets = *reduced_buckets_ptr;
  size_t thread_num = buckets.size();
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  std::vector<std::shared_ptr<SparseGradient>> unique_slice_grads;
  size_t current_indices_offset = 0;
  for (size_t i = 0; i < thread_num; ++i) {
    size_t indices_size = thread_indices_size;
    if (i == thread_num - 1) {
      indices_size = thread_indices_size + left_indices_size;
    reduced_buckets.emplace_back(std::make_shared<SparseGradient>());
    reduced_buckets[i]->value_ = param.workspace_grad_->value_ + current_indices_offset * param.value_stride_;
    reduced_buckets[i]->indices_ = param.workspace_grad_->indices_ + current_indices_offset;
    reduced_buckets[i]->indices_size_ = buckets[i]->indices_size_;
    if (param.use_sort_reduce_) {
      threads.emplace_back(std::thread(SortAndReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
    } else {
      threads.emplace_back(std::thread(ReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
    }
    size_t value_offset = i * thread_indices_size * outer_dim;
    size_t indices_offset = i * thread_indices_size;
    auto slice_grad = SparseGradient(
      {origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size});
    unique_slice_grads.emplace_back(std::make_shared<SparseGradient>());
    unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset;
    unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset;
    unique_slice_grads[i]->indices_size_ = indices_size;
    threads.emplace_back(
      std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim, false));
    current_indices_offset += buckets[i]->indices_size_;
  }
  for (size_t i = 0; i < thread_num; ++i) {
    threads[i].join();
  }
  ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim);
}
void MergeReduceSparseGradient(const MultiThreadReduceSparseGradientParam &param,
                               const std::vector<std::shared_ptr<SparseGradient>> &reduced_buckets) {
  MS_EXCEPTION_IF_NULL(param.output_grad_);
  auto output_grad = param.output_grad_;
  MS_EXCEPTION_IF_NULL(output_grad->value_);
  MS_EXCEPTION_IF_NULL(output_grad->indices_);
  size_t stride_data_size = param.value_stride_ * sizeof(float);
  size_t unique_indices_size = 0;
  for (size_t i = 0; i < reduced_buckets.size(); ++i) {
    auto &bucket = reduced_buckets[i];
    MS_EXCEPTION_IF_NULL(bucket);
    if (bucket->indices_size_ == 0) {
      continue;
    }
    auto ret_code = memcpy_s(output_grad->value_ + unique_indices_size * param.value_stride_,
                             (output_grad->indices_size_ - unique_indices_size) * stride_data_size, bucket->value_,
                             bucket->indices_size_ * stride_data_size);
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
    }
    ret_code = memcpy_s(output_grad->indices_ + unique_indices_size,
                        (output_grad->indices_size_ - unique_indices_size) * sizeof(int), bucket->indices_,
                        bucket->indices_size_ * sizeof(int));
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
    }
    unique_indices_size += bucket->indices_size_;
  }
  output_grad->indices_size_ = unique_indices_size;
}
}  // namespace
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param) {
  MS_LOG(DEBUG) << "Start";
  MS_EXCEPTION_IF_NULL(param.input_grad_);
  size_t thread_num = 23;
  if (param.input_grad_->indices_size_ < thread_num) {
    thread_num = param.input_grad_->indices_size_;
  }
  MultiThreadReduceSparseGradientParam multi_thread_param({param.input_grad_, param.workspace_grad_, param.output_grad_,
                                                           param.max_index_, param.value_stride_, thread_num,
                                                           param.use_sort_reduce_});
  std::vector<std::shared_ptr<SparseGradient>> segments;
  std::vector<std::shared_ptr<std::vector<size_t>>> segment_bucket_sizes;
  SplitAndCalculateSegmentBucketSize(multi_thread_param, &segments, &segment_bucket_sizes);
  std::vector<std::shared_ptr<BucketSparseGradient>> buckets;
  GatherSegmentIndicesToOutputBucket(multi_thread_param, segments, segment_bucket_sizes, &buckets);
  std::vector<std::shared_ptr<SparseGradient>> reduced_buckets;
  ReduceBucketSparseGradientToWorkspace(multi_thread_param, buckets, &reduced_buckets);
  MergeReduceSparseGradient(multi_thread_param, reduced_buckets);
  MS_LOG(DEBUG) << "End";
}
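The hunk above replaces the old sort-based two-level reduction with a bucketed pipeline: split the input into per-thread segments, scatter each row into one of `thread_num` buckets keyed by `index % thread_num`, reduce every bucket in its own thread, then concatenate the reduced buckets. The key invariant is that rows with the same index always land in the same bucket, so buckets can be reduced independently with no cross-bucket duplicates. A minimal single-threaded C++ sketch of that invariant, using hypothetical standalone types rather than the kernel's `SparseGradient`:

// Standalone sketch (not MindSpore code) of the bucket-reduce idea above.
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Row { int index; std::vector<float> value; };

std::vector<Row> BucketReduce(const std::vector<Row> &grad, size_t bucket_num) {
  std::vector<std::vector<Row>> buckets(bucket_num);
  for (const auto &row : grad) {
    buckets[row.index % bucket_num].push_back(row);  // same index -> same bucket
  }
  std::vector<Row> out;
  for (auto &bucket : buckets) {            // each bucket reduces independently
    std::unordered_map<int, size_t> pos;    // index -> slot in `out`
    for (auto &row : bucket) {
      auto it = pos.find(row.index);
      if (it == pos.end()) {
        pos[row.index] = out.size();
        out.push_back(row);                 // first occurrence: copy the row
      } else {
        auto &acc = out[it->second].value;  // duplicate index: accumulate values
        for (size_t j = 0; j < acc.size(); ++j) acc[j] += row.value[j];
      }
    }
  }
  return out;
}

int main() {
  std::vector<Row> grad = {{5, {1, 1}}, {2, {2, 2}}, {5, {3, 3}}, {0, {4, 4}}};
  for (const auto &row : BucketReduce(grad, 3)) {
    std::printf("index %d -> %.0f %.0f\n", row.index, row.value[0], row.value[1]);
  }
  return 0;  // index 5 appears once, with accumulated value {4, 4}
}

The production code additionally tracks `global_indices_` so values can be copied from the original gradient buffer instead of being moved through the buckets, and offers a sort-based per-bucket reducer (`use_sort_reduce_`) as an alternative to the hash map.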
@@ -73,9 +73,18 @@ class KernelMeta {
};
struct SparseGradient {
  float *value_;
  int *indices_;
  size_t indices_size_;
  float *value_{nullptr};
  int *indices_{nullptr};
  size_t indices_size_{0};
};
struct ReduceSparseGradientParam {
  SparseGradient *input_grad_{nullptr};
  SparseGradient *workspace_grad_{nullptr};
  SparseGradient *output_grad_{nullptr};
  size_t max_index_{0};
  size_t value_stride_{0};
  bool use_sort_reduce_{false};
};
struct MultiThreadComputeParams {
@@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
std::string GetProcessor(const AnfNodePtr &anf_node);
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
int Sign(float x);
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                              size_t outer_dim);
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                          size_t outer_dim, bool use_multi_threads = true);
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
                                                                            const std::vector<AnfNodePtr> &input_list);
@@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
bool IsWeightBoundary(const AnfNodePtr &node);
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
                        size_t total_compute_size);
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
                                        size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
                                        std::vector<size_t> *slice_positions);
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
                               SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
                               size_t outer_dim);
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
                                  SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param);
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
}  // namespace kernel
}  // namespace mindspore
@@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
 protected:
  void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
                   float **output_addr);
  void CheckParam(const CNodePtr &kernel_node);
@@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i
  size_t output_size = outputs[0]->size;
  size_t size = input_size / sizeof(float);
  ::ps::SArray<float> lookup_ids(size, 0);
  ::ps::SArray<int> lookup_ids(size, 0);
  ::ps::SArray<int> lengths{size};
  ::ps::SArray<float> lookup_result;
  ::ps::SArray<float> lookup_result(output_size / sizeof(float), 0);
  auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size);
  if (ret != EOK) {
    MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
  }
  parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result,
  parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result,
                                                                 parallel::ps::kEmbeddingLookupCmd);
  auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size);
@@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel(
  split_num_ = pserver_num_;
  // input shape should be sharded after computing offset_;
  Shard(input_shape_, axis_);
  Shard(&input_shape_, axis_);
  size_t output_size =
    std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies<size_t>());
@@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push,
MS_REG_CPU_KERNEL_T(
  Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64),
  PushKernel, float);
MS_REG_CPU_KERNEL_T(Push,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddOutputAttr(kNumberTypeUInt64),
                    PushKernel, float);
}  // namespace kernel
}  // namespace mindspore
@@ -43,7 +43,7 @@ class PushKernel : public CPUKernel {
      sizes.push_back(SizeToInt(input->size) / sizeof(T));
    }
    parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
    memcpy(outputs[0]->addr, &key_, sizeof(size_t));
    memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
    return true;
  }
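The hunk above swaps an unbounded `memcpy` for the bounded `memcpy_s`; elsewhere in this change set the `memcpy_s` return value is also checked against `EOK`. A hedged standalone sketch of that defensive pattern (the `WriteKey` helper is hypothetical, not part of the kernel):

// securec's memcpy_s(dest, destMax, src, count) fails instead of overflowing
// when count > destMax; plain memcpy would silently write past the buffer.
#include <securec.h>  // assumption: the Huawei securec library MindSpore links against

bool WriteKey(void *out_addr, size_t out_size, size_t key) {
  auto ret = memcpy_s(out_addr, out_size, &key, sizeof(key));
  return ret == EOK;  // caller can log or raise on failure, as the other kernels do
}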
@@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
  const auto &indices_addr = inputs[10];
  indices_size_ = indices_addr->size;
  indices_size_ = indices_addr->size / sizeof(int);
  workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
  workspace_size_list_[1] = indices_size_ * sizeof(int);
}
@@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
  const auto &indices_addr = inputs[4];
  indices_size_ = indices_addr->size;
  indices_size_ = indices_addr->size / sizeof(int);
  workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
  workspace_size_list_[1] = indices_size_ * sizeof(int);
}
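Both ReInit fixes above address the same bug class: `AddressPtr::size` is a byte count, while `indices_size_` is consumed as an element count, so the old code inflated the workspace sizes by a factor of `sizeof(int)`. A tiny standalone illustration:

// Standalone illustration of the byte-count vs element-count fix above.
#include <cassert>
#include <cstddef>

int main() {
  size_t buffer_bytes = 40;                           // e.g. AddressPtr::size for 10 int32 indices
  size_t wrong_count = buffer_bytes;                  // old code: treats bytes as elements
  size_t element_count = buffer_bytes / sizeof(int);  // fixed code
  assert(element_count == 10 && wrong_count == 40);
  return 0;
}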
@@ -81,6 +81,8 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node)
  MS_EXCEPTION_IF_NULL(kernel_node);
  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
  workspace_size_list_.emplace_back(var_first_dim_size_ * var_outer_dim_size_ * sizeof(float));
}
@@ -142,11 +144,21 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
  auto indices = reinterpret_cast<int *>(inputs[10]->addr);
  auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
  auto m_t = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
  auto m_t = reinterpret_cast<float *>(workspace[4]->addr);
  SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
  ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
                       var_outer_dim_size_);
  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);
  size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_;
  lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
@@ -132,12 +132,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
  auto indices = reinterpret_cast<int *>(inputs[4]->addr);
  auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
  auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
  SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
  SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
  TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
                               var_first_dim_size_, var_outer_dim_size_);
  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);
  MultiThreadComputeParams input_params;
  input_params.var_ = var;
@@ -123,13 +123,19 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
  auto indices = reinterpret_cast<int *>(inputs[10]->addr);
  auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
  auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
  SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
  SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
  TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
                               var_first_dim_size_, var_outer_dim_size_);
  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);
  lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
  MultiThreadComputeParams input_params;
@@ -61,6 +61,8 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke
  MS_EXCEPTION_IF_NULL(kernel_node);
  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
}
void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@@ -119,9 +121,19 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::Addre
  auto indices = reinterpret_cast<int *>(inputs[6]->addr);
  auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
  SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
  ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
                       var_outer_dim_size_);
  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);
  MultiThreadComputeParams input_params;
  input_params.var_ = var;
@@ -0,0 +1,26 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                      BroadcastToGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
                      BroadcastToGpuKernel, half)
}  // namespace kernel
}  // namespace mindspore
@@ -0,0 +1,83 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class BroadcastToGpuKernel : public GpuKernel {
 public:
  BroadcastToGpuKernel() {}
  ~BroadcastToGpuKernel() = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *input_addr = GetDeviceAddress<T>(inputs, 0);
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], output_shape_[0], output_shape_[1],
                output_shape_[2], output_shape_[3], input_addr, output_addr,
                reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    if (input_shapes.size() > 4 || output_shapes.size() > 4) {
      MS_LOG(EXCEPTION) << "BroadcastTo operation not support dim greater than 4";
    }
    for (int i = input_shapes.size() - 1; i >= 0; i--) {
      input_shape_[i] = input_shapes[i];
    }
    for (int j = output_shapes.size() - 1; j >= 0; j--) {
      output_shape_[j] = output_shapes[j];
    }
    InitSizeLists();
    return true;
  }
 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
    output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
  }
 private:
  int input_shape_[4] = {1, 1, 1, 1};
  int output_shape_[4] = {1, 1, 1, 1};
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
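Init() above copies each shape element to its own index, so a shape of rank < 4 occupies the leading slots of the fixed int[4] buffers and the remaining trailing dimensions keep their default of 1; the CUDA kernel can then always be launched with exactly four extents. A standalone sketch of that padding (the `PadTo4d` helper is hypothetical, not part of the kernel API):

// Standalone sketch of the shape padding done in Init() above.
#include <cstdio>
#include <vector>

void PadTo4d(const std::vector<size_t> &shape, int out[4]) {
  // same effect as the backwards loop in Init(): element i keeps position i,
  // untouched trailing slots stay at their default of 1
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; i--) {
    out[i] = static_cast<int>(shape[i]);
  }
}

int main() {
  int dims[4] = {1, 1, 1, 1};
  PadTo4d({5, 3}, dims);  // rank-2 input
  std::printf("%d %d %d %d\n", dims[0], dims[1], dims[2], dims[3]);  // prints: 5 3 1 1
  return 0;
}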
@@ -18,6 +18,7 @@
#define MINDSPORE_CCSRC_KERNEL_GPU_CONCATV2_GPU_KERNEL_H
#include <vector>
#include <memory>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
@@ -27,40 +28,35 @@ namespace kernel {
template <typename T>
class ConcatV2GpuFwdKernel : public GpuKernel {
 public:
  ConcatV2GpuFwdKernel() : axis_(0), output_size_(0) {}
  ConcatV2GpuFwdKernel()
      : axis_(0),
        input_num_(1),
        output_size_(0),
        all_size_before_axis_(1),
        all_size_axis_(1),
        inputs_host_(nullptr),
        len_axis_(nullptr) {}
  ~ConcatV2GpuFwdKernel() override = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (inputs.size() == 2) {
      T *input_0 = GetDeviceAddress<T>(inputs, 0);
      T *input_1 = GetDeviceAddress<T>(inputs, 1);
      T *output = GetDeviceAddress<T>(outputs, 0);
      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], input_0, input_1, output,
                   reinterpret_cast<cudaStream_t>(stream_ptr));
    }
    if (inputs.size() == 3) {
      T *input_0 = GetDeviceAddress<T>(inputs, 0);
      T *input_1 = GetDeviceAddress<T>(inputs, 1);
      T *input_2 = GetDeviceAddress<T>(inputs, 2);
      T *output = GetDeviceAddress<T>(outputs, 0);
      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], input_0, input_1, input_2, output,
                   reinterpret_cast<cudaStream_t>(stream_ptr));
    }
    if (inputs.size() == 4) {
      T *input_0 = GetDeviceAddress<T>(inputs, 0);
      T *input_1 = GetDeviceAddress<T>(inputs, 1);
      T *input_2 = GetDeviceAddress<T>(inputs, 2);
      T *input_3 = GetDeviceAddress<T>(inputs, 3);
      T *output = GetDeviceAddress<T>(outputs, 0);
      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], w_[3], input_0, input_1, input_2, input_3, output,
                   reinterpret_cast<cudaStream_t>(stream_ptr));
    T *output = GetDeviceAddress<T>(outputs, 0);
    T **inputs_device = GetDeviceAddress<T *>(workspace, 0);
    int *len_axis_device = GetDeviceAddress<int>(workspace, 1);
    for (size_t i = 0; i < inputs.size(); i++) {
      inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
    }
    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "ConcatV2 opt cudaMemcpyAsync inputs failed");
    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "ConcatV2 opt cudaMemcpyAsync length on axis failed");
    ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device,
                 output, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
@@ -74,25 +70,34 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
      axis_ += SizeToInt(input_shape.size());
    }
    auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    for (size_t i = 0; i < input_num; i++) {
      auto input_size = sizeof(T);
    input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
    inputs_host_ = std::make_unique<T *[]>(input_num_);
    len_axis_ = std::make_unique<int[]>(input_num_);
    for (int i = 0; i < input_num_; i++) {
      int input_size = 1;
      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
      for (size_t j = 0; j < input_shape.size(); j++) {
        input_size *= SizeToInt(input_shape[j]);
        if (j >= IntToSize(axis_)) {
          w_[i] *= SizeToInt(input_shape[j]);
        }
        input_size_list_.push_back(input_size);
      }
      input_size_list_.push_back(IntToSize(input_size * sizeof(T)));
      len_axis_[i] = SizeToInt(input_shape[axis_]);
    }
    workspace_size_list_.push_back(sizeof(T *) * input_num_);
    workspace_size_list_.push_back(sizeof(int) * input_num_);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    output_size_ = sizeof(T);
    for (size_t i = 0; i < output_shape.size(); i++) {
    output_size_ = 1;
    for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
      output_size_ *= output_shape[i];
      if (i > axis_) {
        all_size_before_axis_ *= output_shape[i];
        all_size_axis_ *= output_shape[i];
      }
      if (i == axis_) {
        all_size_before_axis_ *= output_shape[i];
      }
    }
    output_size_list_.push_back(output_size_);
    output_size_list_.push_back(IntToSize(output_size_ * sizeof(T)));
    InitSizeLists();
    return true;
| @@ -103,11 +108,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel { | |||
| private: | |||
| bool CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num < 2 || input_num > 4) { | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but ConcatV2GpuFwdKernel needs inputs between 2 and 4."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(ERROR) << "Output number is " << output_num << ", but ConcatV2GpuFwdKernel needs 1 output."; | |||
| @@ -115,9 +115,13 @@ class ConcatV2GpuFwdKernel : public GpuKernel { | |||
| } | |||
| return true; | |||
| } | |||
| int w_[4] = {1, 1, 1, 1}; | |||
| int axis_; | |||
| size_t output_size_; | |||
| int input_num_; | |||
| int output_size_; | |||
| int all_size_before_axis_; | |||
| int all_size_axis_; | |||
| std::unique_ptr<T *[]> inputs_host_; | |||
| std::unique_ptr<int[]> len_axis_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
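The rewritten ConcatV2 kernel flattens each output position into three coordinates: everything before the concat axis, the coordinate along the axis, and everything after it. The sketch below is a minimal host-side C++ reference of that mapping (not part of the patch), assuming `all_size_axis` is the product of the dimensions after the axis and `all_size_before_axis` additionally multiplies in the output length along the axis:

    #include <vector>

    // Reference concat over flattened tensors; len_axis[i] is input i's
    // length along the concat axis.
    void ConcatReference(const std::vector<std::vector<float>> &inputs,
                         const std::vector<int> &len_axis,
                         int all_size_before_axis, int all_size_axis,
                         std::vector<float> *output) {
      for (int pos = 0; pos < static_cast<int>(output->size()); ++pos) {
        int num = pos % all_size_before_axis / all_size_axis;  // coordinate along the axis
        int block = -1;
        int axis_inc = 0;
        // Walk the inputs until the axis coordinate falls inside one of them.
        while (block + 1 < static_cast<int>(inputs.size()) && axis_inc <= num) {
          ++block;
          axis_inc += len_axis[block];
        }
        int block_len = len_axis[block];
        axis_inc -= len_axis[block];
        int block_pos = pos / all_size_before_axis * block_len * all_size_axis +
                        (num - axis_inc) * all_size_axis + pos % all_size_axis;
        (*output)[pos] = inputs[block][block_pos];
      }
    }

For example, concatenating shapes (2,2) and (2,3) along axis 1 gives all_size_axis = 1, all_size_before_axis = 5, len_axis = {2, 3}; output position 7 yields num = 2, block = 1, block_pos = 3, i.e. row 1, column 0 of the second input, as expected.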
| @@ -0,0 +1,31 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| SplitGpuFwdKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Split, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| SplitGpuFwdKernel, int) | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| SplitGpuFwdKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,153 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| class SplitGpuFwdKernel : public GpuKernel { | |||
| public: | |||
| SplitGpuFwdKernel() | |||
| : axis_(0), | |||
| output_num_(1), | |||
| input_size_(1), | |||
| axis_step_(1), | |||
| all_size_before_axis_(1), | |||
| all_size_axis_(1), | |||
| outputs_host_(nullptr) {} | |||
| ~SplitGpuFwdKernel() override = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| T *input = GetDeviceAddress<T>(inputs, 0); | |||
| T **outputs_device = GetDeviceAddress<T *>(workspace, 0); | |||
| for (size_t i = 0; i < outputs.size(); i++) { | |||
| outputs_host_[i] = GetDeviceAddress<T>(outputs, i); | |||
| } | |||
| CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_, | |||
| cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), | |||
| "Split opt cudaMemcpyAsync outputs failed"); | |||
| SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| axis_ = GetAttr<int>(kernel_node, "axis"); | |||
| if (axis_ < 0) { | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| axis_ += SizeToInt(input_shape.size()); | |||
| } | |||
| output_num_ = GetAttr<int>(kernel_node, "output_num"); | |||
| if (!CheckParam(kernel_node)) { | |||
| return false; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| input_size_ = 1; | |||
| all_size_before_axis_ = 1; | |||
| all_size_axis_ = 1; | |||
| for (int i = 0; i < SizeToInt(input_shape.size()); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| if (i > axis_) { | |||
| all_size_before_axis_ *= input_shape[i]; | |||
| all_size_axis_ *= input_shape[i]; | |||
| } | |||
| if (i == axis_) { | |||
| all_size_before_axis_ *= input_shape[i]; | |||
| } | |||
| } | |||
| input_size_list_.push_back(IntToSize(input_size_ * sizeof(T))); | |||
| axis_step_ = input_shape[axis_] / output_num_; | |||
| for (int i = 0; i < output_num_; i++) { | |||
| size_t output_size = 1; | |||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i); | |||
| for (size_t j = 0; j < output_shape.size(); j++) { | |||
| output_size *= output_shape[j]; | |||
| } | |||
| output_size_list_.push_back(output_size * sizeof(T)); | |||
| } | |||
| workspace_size_list_.push_back(sizeof(T *) * output_num_); | |||
| InitSizeLists(); | |||
| outputs_host_ = std::make_unique<T *[]>(output_num_); | |||
| return true; | |||
| } | |||
| protected: | |||
| void InitSizeLists() override {} | |||
| private: | |||
| bool CheckParam(const CNodePtr &kernel_node) { | |||
| auto input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| int dims = SizeToInt(input_shape.size()); | |||
| int output_num = SizeToInt(AnfAlgo::GetOutputTensorNum(kernel_node)); | |||
| if (input_num != 1) { | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but Split needs 1 input."; | |||
| return false; | |||
| } | |||
| if (dims == 0) { | |||
| MS_LOG(ERROR) << "Input dims is " << dims << ", scalar is not supported."; | |||
| return false; | |||
| } | |||
| if (axis_ < -dims || axis_ >= dims) { | |||
| MS_LOG(ERROR) << "Attr axis " << axis_ << " must be in " << -dims << "~" << dims; | |||
| return false; | |||
| } | |||
| if (output_num_ > SizeToInt(input_shape[axis_])) { | |||
| MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must less than" << input_shape[axis_]; | |||
| return false; | |||
| } | |||
| if (input_shape[axis_] % output_num_ != 0) { | |||
| MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must be divided by" << input_shape[axis_]; | |||
| return false; | |||
| } | |||
| if (output_num_ != output_num) { | |||
| MS_LOG(ERROR) << "Output num is " << output_num << ", but need " << output_num_; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| int axis_; | |||
| int output_num_; | |||
| int input_size_; | |||
| int axis_step_; | |||
| int all_size_before_axis_; | |||
| int all_size_axis_; | |||
| std::unique_ptr<T *[]> outputs_host_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H | |||
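Split inverts the concat mapping: `axis_step` is each output's length along the split axis, so `num / axis_step` picks the target output tensor directly, with no per-input scan. A host-side sketch of the same arithmetic, assuming an even split (`input_shape[axis] % output_num == 0`, which CheckParam enforces):

    #include <vector>

    // CPU reference of the Split index mapping used by the kernel above.
    void SplitReference(const std::vector<float> &input, int axis_step,
                        int all_size_before_axis, int all_size_axis,
                        std::vector<std::vector<float>> *outputs) {
      for (int pos = 0; pos < static_cast<int>(input.size()); ++pos) {
        int num = pos % all_size_before_axis / all_size_axis;  // coordinate along the axis
        int block = num / axis_step;                           // target output tensor
        int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
                        num % axis_step * all_size_axis + pos % all_size_axis;
        (*outputs)[block][block_pos] = input[pos];
      }
    }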
| @@ -0,0 +1,29 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_TWO(TopK, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeInt32), | |||
| TopKGpuKernel, float, int) | |||
| }  // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,110 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename S> | |||
| class TopKGpuKernel : public GpuKernel { | |||
| public: | |||
| TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), use_share_mem_(true), ceil_power2_(0) {} | |||
| ~TopKGpuKernel() override = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| T *input_addr = GetDeviceAddress<T>(inputs, 0); | |||
| S *k = GetDeviceAddress<S>(inputs, 1); | |||
| T *output_addr = GetDeviceAddress<T>(outputs, 0); | |||
| S *indices = GetDeviceAddress<S>(outputs, 1); | |||
| T *data_buff = nullptr; | |||
| S *index_buff = nullptr; | |||
| if (use_share_mem_ == false) { | |||
| data_buff = GetDeviceAddress<T>(workspaces, 0); | |||
| index_buff = GetDeviceAddress<S>(workspaces, 1); | |||
| } | |||
| TopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, data_buff, index_buff, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| if (sorted_ == false) { | |||
| BitonicSortByKey(outer_size_, k_, output_addr, indices, data_buff, index_buff, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| } | |||
| return true; | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < input_shapes.size() - 1; i++) { | |||
| outer_size_ *= input_shapes[i]; | |||
| } | |||
| inner_size_ = input_shapes[input_shapes.size() - 1]; | |||
| k_ = output_shapes[output_shapes.size() - 1]; | |||
| sorted_ = GetAttr<bool>(kernel_node, "sorted"); | |||
| ceil_power2_ = RoundUpPower2(inner_size_); | |||
| size_t buffer_size = ceil_power2_ * (sizeof(T) + sizeof(S)); | |||
| if (buffer_size > SHARED_MEM_PER_BLOCK) { | |||
| use_share_mem_ = false; | |||
| MS_LOG(WARNING) << "CUDA share memory not enough, sort with RAM"; | |||
| } | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| protected: | |||
| void InitSizeLists() override { | |||
| input_size_list_.push_back(outer_size_ * inner_size_ * sizeof(T)); | |||
| input_size_list_.push_back(sizeof(S)); | |||
| output_size_list_.push_back(outer_size_ * k_ * sizeof(T)); | |||
| output_size_list_.push_back(outer_size_ * k_ * sizeof(S)); | |||
| if (use_share_mem_ == false) { | |||
| workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(T)); | |||
| workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(S)); | |||
| } | |||
| } | |||
| private: | |||
| bool sorted_; | |||
| int outer_size_; | |||
| int inner_size_; | |||
| int k_; | |||
| bool use_share_mem_; | |||
| int ceil_power2_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_ | |||
| @@ -116,16 +116,16 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const | |||
| output); | |||
| case BROADCAST_TYPE_REALDIV: | |||
| return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| output); | |||
| case BROADCAST_TYPE_MUL: | |||
| return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| output); | |||
| case BROADCAST_TYPE_SUB: | |||
| return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| output); | |||
| case BROADCAST_TYPE_ADD: | |||
| return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| output); | |||
| } | |||
| } | |||
| @@ -176,6 +176,28 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons | |||
| NoBroadcastKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, op, input0, input1, output); | |||
| } | |||
| template <typename T> | |||
| __global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, | |||
| const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) { | |||
| int i = pos / (o1 * o2 * o3) % o0; | |||
| int j = pos / (o2 * o3) % o1; | |||
| int k = pos / o3 % o2; | |||
| int l = pos % o3; | |||
| int input_idx = Index(i, i0) * i1 * i2 * i3 + Index(j, i1) * i2 * i3 + Index(k, i2) * i3 + Index(l, i3); | |||
| output_addr[pos] = input_addr[input_idx]; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) { | |||
| int nums = o0 * o1 * o2 * o3; | |||
| BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr, | |||
| output_addr); | |||
| } | |||
| template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, | |||
| const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, | |||
| enum BroadcastOpType op, const float *input0, const float *input1, bool *output, | |||
| @@ -204,5 +226,11 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half * | |||
| bool *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1, | |||
| half *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, | |||
| int *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output, | |||
| cudaStream_t stream); | |||
| template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const float *input_addr, float *output_addr, | |||
| cudaStream_t stream); | |||
| template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const half *input_addr, half *output_addr, cudaStream_t stream); | |||
| @@ -41,4 +41,8 @@ template <typename T, typename S> | |||
| void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output, | |||
| cudaStream_t stream); | |||
| template <typename T> | |||
| void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_ | |||
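BroadcastToKernel decomposes each output offset into 4-D coordinates and remaps them through the `Index` helper, which in this codebase's broadcast utilities collapses size-1 input dimensions. A CPU sketch under that assumption (`Index(i, d)` returns 0 when d == 1, else i — the usual NumPy-style rule):

    #include <vector>

    inline int IndexRef(int i, int d) { return d == 1 ? 0 : i; }

    // Host-side reference of the BroadcastTo mapping above.
    void BroadcastToReference(int i0, int i1, int i2, int i3, int o0, int o1, int o2, int o3,
                              const std::vector<float> &input, std::vector<float> *output) {
      for (int pos = 0; pos < o0 * o1 * o2 * o3; ++pos) {
        int i = pos / (o1 * o2 * o3) % o0;
        int j = pos / (o2 * o3) % o1;
        int k = pos / o3 % o2;
        int l = pos % o3;
        int src = IndexRef(i, i0) * i1 * i2 * i3 + IndexRef(j, i1) * i2 * i3 +
                  IndexRef(k, i2) * i3 + IndexRef(l, i3);
        (*output)[pos] = input[src];
      }
    }

For instance, broadcasting a (1,1,1,4) tensor to (2,1,1,4) maps both values of i to the same four source elements.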
| @@ -19,90 +19,51 @@ | |||
| #include <cuda_runtime.h> | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh" | |||
| template <typename T> | |||
| __global__ void Concat(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { | |||
| int n = pos / (w1 + w2); | |||
| int m = pos % (w1 + w2); | |||
| output[pos] = m >= w1 ? input_2[n * w2 + m - w1] : input_1[n * w1 + m]; | |||
| __global__ void Concat(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, T** inputs, T* output) { | |||
| for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { | |||
| int num = pos % all_size_before_axis / all_size_axis; | |||
| int block = -1; | |||
| int axis_inc = 0; | |||
| int block_len = 0; | |||
| for (int i = 0; i < input_num; i++) { | |||
| if (axis_inc <= num) { | |||
| block++; | |||
| axis_inc += len_axis[i]; | |||
| } else { | |||
| break; | |||
| } | |||
| } | |||
| block_len = len_axis[block]; | |||
| axis_inc -= len_axis[block]; | |||
| int block_pos = pos / all_size_before_axis * block_len * all_size_axis + | |||
| (num - axis_inc) * all_size_axis + pos % all_size_axis; | |||
| output[pos] = inputs[block][block_pos]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void Concat(const size_t size, const int w1, const int w2, const int w3, | |||
| const T* input_1, const T* input_2, const T* input_3, T* output) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { | |||
| int n = pos / (w1 + w2 + w3); | |||
| int m = pos % (w1 + w2 + w3); | |||
| output[pos] = m < w1 ? input_1[n * w1 + m] : | |||
| m < w1 + w2 ? input_2[n * w2 + m - w1] : | |||
| input_3[n * w3 + m - w1 - w2]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void Concat(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { | |||
| int n = pos / (w1 + w2 + w3 + w4); | |||
| int m = pos % (w1 + w2 + w3 + w4); | |||
| output[pos] = m < w1 ? input_1[n * w1 + m] : | |||
| m < w1 + w2 ? input_2[n * w2 + m - w1]: | |||
| m < w1 + w2 + w3 ? input_3[n * w3 + m - w1 - w2]: | |||
| input_4[n * w4 + m - w1 - w2 - w3]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output, | |||
| cudaStream_t cuda_stream) { | |||
| Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, input_1, input_2, output); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const T* input_1, const T* input_2, const T* input_3, T* output, | |||
| void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, T** inputs, T* output, | |||
| cudaStream_t cuda_stream) { | |||
| Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, input_1, input_2, input_3, output); | |||
| Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num, | |||
| all_size_before_axis, all_size_axis, | |||
| len_axis, inputs, output); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output, | |||
| cudaStream_t cuda_stream) { | |||
| Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, w4, input_1, | |||
| input_2, input_3, input_4, output); | |||
| return; | |||
| } | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const float* input_1, const float* input_2, | |||
| float* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int* input_1, const int* input_2, | |||
| int* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const half* input_1, const half* input_2, | |||
| half* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const float* input_1, const float* input_2, const float* input_3, | |||
| float* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const int* input_1, const int* input_2, const int* input_3, | |||
| int* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const half* input_1, const half* input_2, const half* input_3, | |||
| half* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const float* input_1, const float* input_2, const float* input_3, const float* input_4, | |||
| float* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const int* input_1, const int* input_2, const int* input_3, const int* input_4, | |||
| int* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const half* input_1, const half* input_2, const half* input_3, const half* input_4, | |||
| half* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, float** inputs, float* output, | |||
| cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, int** inputs, int* output, | |||
| cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, half** inputs, half* output, | |||
| cudaStream_t cuda_stream); | |||
| @@ -19,13 +19,8 @@ | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output, | |||
| cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const T* input_1, const T* input_2, const T* input_3, T* output, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output, | |||
| void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, T** inputs, T* output, | |||
| cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CONCATV2IMPL_H_ | |||
| @@ -15,9 +15,9 @@ | |||
| */ | |||
| #include "momentum_impl.cuh" | |||
| template <typename T, typename S> | |||
| template <typename T, typename S, typename G> | |||
| __global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate, | |||
| const T *gradient, const S *momentum) { | |||
| const G *gradient, const S *momentum) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { | |||
| accumulation[i] = momentum[0] * accumulation[i] + gradient[i]; | |||
| variable[i] -= learning_rate[0] * accumulation[i]; | |||
| @@ -34,19 +34,32 @@ __global__ void MomentumUpdateVariableKernel(const size_t size, half *variable, | |||
| } | |||
| return; | |||
| } | |||
| template <typename T, typename S> | |||
| void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient, | |||
| template <> | |||
| __global__ void MomentumUpdateVariableKernel(const size_t size, float *variable, float *accumulation, | |||
| const float *learning_rate, const half *gradient, | |||
| const float *momentum) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { | |||
| accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]); | |||
| variable[i] -= learning_rate[0] * accumulation[i]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T, typename S, typename G> | |||
| void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient, | |||
| const S *momentum, cudaStream_t cuda_stream) { | |||
| MomentumUpdateVariableKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, accumulation, | |||
| learning_rate, gradient, momentum); | |||
| return; | |||
| } | |||
| template void MomentumUpdateVariable<float, float>(const size_t size, float *variable, float *accumulation, | |||
| template void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable, float *accumulation, | |||
| const float *learning_rate, const float *gradient, | |||
| const float *momentum, cudaStream_t cuda_stream); | |||
| template void MomentumUpdateVariable<half, half>(const size_t size, half *variable, half *accumulation, | |||
| template void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable, half *accumulation, | |||
| const half *learning_rate, const half *gradient, | |||
| const half *momentum, cudaStream_t cuda_stream); | |||
| template void MomentumUpdateVariable<half, float>(const size_t size, half *variable, half *accumulation, | |||
| template void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable, half *accumulation, | |||
| const float *learning_rate, const half *gradient, | |||
| const float *momentum, cudaStream_t cuda_stream); | |||
| template void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable, float *accumulation, | |||
| const float *learning_rate, const half *gradient, | |||
| const float *momentum, cudaStream_t cuda_stream); | |||
| @@ -18,8 +18,8 @@ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T, typename S> | |||
| void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient, | |||
| template <typename T, typename S, typename G> | |||
| void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient, | |||
| const S *momentum, cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ | |||
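All instantiations implement the same two-line update; the new G parameter only changes the gradient's storage type, and the float/half specialization widens via __half2float before accumulating. A scalar host-side sketch of one step, for reference only:

    // One scalar step of ApplyMomentum, mirroring the kernel body above.
    inline void MomentumStepRef(float *variable, float *accumulation,
                                float learning_rate, float gradient, float momentum) {
      *accumulation = momentum * *accumulation + gradient;  // accumulate gradient
      *variable -= learning_rate * *accumulation;           // apply update
    }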
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <stdio.h> | |||
| #include <stdint.h> | |||
| #include <cuda_runtime.h> | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh" | |||
| template <typename T> | |||
| __global__ void Split(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const T* input, T** outputs) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { | |||
| int num = pos % all_size_before_axis / all_size_axis; | |||
| int block = num / axis_step; | |||
| int block_pos = pos / all_size_before_axis * axis_step * all_size_axis + | |||
| num % axis_step * all_size_axis + pos % all_size_axis; | |||
| outputs[block][block_pos] = input[pos]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) { | |||
| Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis, | |||
| all_size_axis, input, outputs); | |||
| return; | |||
| } | |||
| template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const float* input, float** outputs, | |||
| cudaStream_t cuda_stream); | |||
| template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const int* input, int** outputs, | |||
| cudaStream_t cuda_stream); | |||
| template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const half* input, half** outputs, | |||
| cudaStream_t cuda_stream); | |||
| @@ -0,0 +1,24 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T> | |||
| void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ | |||
| @@ -0,0 +1,162 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh" | |||
| #include <limits> | |||
| #include <algorithm> | |||
| int RoundUpPower2(int v) { | |||
| v--; | |||
| v |= v >> 1; | |||
| v |= v >> 2; | |||
| v |= v >> 4; | |||
| v |= v >> 8; | |||
| v |= v >> 16; | |||
| v++; | |||
| return v; | |||
| } | |||
| template <typename T> | |||
| __inline__ __device__ void Swap(T *lhs, T *rhs) { | |||
| T tmp = lhs[0]; | |||
| lhs[0] = rhs[0]; | |||
| rhs[0] = tmp; | |||
| } | |||
| template <typename T, typename S> | |||
| __global__ void TopkKernel(const int outer, const int inner, const int ceil_power2, const T *input, const S *k, | |||
| T *output, S *indices, T *data_buff, S *index_buff) { | |||
| // default: sort in shared memory | |||
| extern __shared__ T share_mem[]; | |||
| T *data_arr = share_mem; | |||
| S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2); | |||
| // fall back to global-memory (workspace) buffers | |||
| if (data_buff != nullptr && index_buff != nullptr) { | |||
| data_arr = data_buff + blockIdx.x * ceil_power2; | |||
| index_arr = index_buff + blockIdx.x * ceil_power2; | |||
| } | |||
| for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) { | |||
| data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max(); | |||
| index_arr[i] = i; | |||
| } | |||
| __syncthreads(); | |||
| for (size_t i = 2; i <= ceil_power2; i <<= 1) { | |||
| for (size_t j = (i >> 1); j > 0; j >>= 1) { | |||
| for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) { | |||
| size_t tid_comp = tid ^ j; | |||
| if (tid_comp > tid) { | |||
| if ((tid & i) == 0) { | |||
| if (data_arr[tid] > data_arr[tid_comp]) { | |||
| Swap(&data_arr[tid], &data_arr[tid_comp]); | |||
| Swap(&index_arr[tid], &index_arr[tid_comp]); | |||
| } | |||
| } else { | |||
| if (data_arr[tid] < data_arr[tid_comp]) { | |||
| Swap(&data_arr[tid], &data_arr[tid_comp]); | |||
| Swap(&index_arr[tid], &index_arr[tid_comp]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| __syncthreads(); | |||
| } | |||
| } | |||
| for (size_t tid = threadIdx.x; tid < k[0]; tid += blockDim.x) { | |||
| output[blockIdx.x * k[0] + tid] = data_arr[inner - tid - 1]; | |||
| indices[blockIdx.x * k[0] + tid] = index_arr[inner - tid - 1]; | |||
| } | |||
| } | |||
| template <typename T, typename S> | |||
| void TopK(const int &outer, const int &inner, const T *input, const S *k, T *output, S *indices, T *data_buff, | |||
| S *index_buff, cudaStream_t stream) { | |||
| int ceil_power2 = RoundUpPower2(inner); | |||
| int share_mem = (data_buff == nullptr) ? ceil_power2 * (sizeof(T) + sizeof(S)) : 0; | |||
| int thread = std::min(ceil_power2, GET_THREADS); | |||
| TopkKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, k, output, indices, data_buff, | |||
| index_buff); | |||
| } | |||
| template <typename T, typename S> | |||
| __global__ void BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input, | |||
| S *indices, T *data_buff, S *index_buff) { | |||
| // default: sort in shared memory | |||
| extern __shared__ T share_mem[]; | |||
| T *data_arr = share_mem; | |||
| S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2); | |||
| // fall back to global-memory (workspace) buffers | |||
| if (data_buff != nullptr && index_buff != nullptr) { | |||
| data_arr = data_buff + blockIdx.x * ceil_power2; | |||
| index_arr = index_buff + blockIdx.x * ceil_power2; | |||
| } | |||
| for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) { | |||
| data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max(); | |||
| index_arr[i] = (i < inner) ? indices[blockIdx.x * inner + i] : std::numeric_limits<S>::max(); | |||
| } | |||
| __syncthreads(); | |||
| for (size_t i = 2; i <= ceil_power2; i <<= 1) { | |||
| for (size_t j = (i >> 1); j > 0; j >>= 1) { | |||
| for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) { | |||
| size_t tid_comp = tid ^ j; | |||
| if (tid_comp > tid) { | |||
| if ((tid & i) == 0) { | |||
| if (index_arr[tid] > index_arr[tid_comp]) { | |||
| Swap(&data_arr[tid], &data_arr[tid_comp]); | |||
| Swap(&index_arr[tid], &index_arr[tid_comp]); | |||
| } | |||
| } else { | |||
| if (index_arr[tid] < index_arr[tid_comp]) { | |||
| Swap(&data_arr[tid], &data_arr[tid_comp]); | |||
| Swap(&index_arr[tid], &index_arr[tid_comp]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| __syncthreads(); | |||
| } | |||
| } | |||
| for (size_t tid = threadIdx.x; tid < inner; tid += blockDim.x) { | |||
| input[blockIdx.x * inner + tid] = data_arr[tid]; | |||
| indices[blockIdx.x * inner + tid] = index_arr[tid]; | |||
| } | |||
| } | |||
| template <typename T, typename S> | |||
| void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff, | |||
| cudaStream_t stream) { | |||
| int ceil_power2 = RoundUpPower2(inner); | |||
| size_t share_mem = ceil_power2 * (sizeof(T) + sizeof(S)); | |||
| if (share_mem > SHARED_MEM_PER_BLOCK) { | |||
| share_mem = 0; | |||
| } else { | |||
| data_buff = nullptr; | |||
| index_buff = nullptr; | |||
| } | |||
| int thread = std::min(ceil_power2, GET_THREADS); | |||
| BitonicSortByKeyKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, indices, data_buff, | |||
| index_buff); | |||
| } | |||
| template void TopK(const int &outer, const int &inner, const float *input_addr, const int *k, float *output, | |||
| int *indices, float *data_buff, int *index_buff, cudaStream_t stream); | |||
| template void BitonicSortByKey(const int &outer, const int &inner, float *input, int *indices, float *data_buff, | |||
| int *index_buff, cudaStream_t stream); | |||
| @@ -0,0 +1,32 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ | |||
| #include <cuda_runtime.h> | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T, typename S> | |||
| void TopK(const int &outer, const int &inner, const T *input_addr, const S *k, T *output, S *indices, T *data_buff, | |||
| S *index_buff, cudaStream_t stream); | |||
| template <typename T, typename S> | |||
| void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff, | |||
| cudaStream_t stream); | |||
| int RoundUpPower2(int v); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ | |||
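RoundUpPower2 is the classic bit-smearing trick: it propagates the highest set bit into every lower bit, then adds one. The bitonic network above requires a power-of-two element count, and the padding slots are initialized to numeric_limits::max() so they sort past all real values. A few worked values, for reference:

    // RoundUpPower2(5)    == 8
    // RoundUpPower2(8)    == 8
    // RoundUpPower2(1000) == 1024
    // so TopK/BitonicSortByKey always operate on ceil_power2 elements,
    // with out-of-range slots padded by numeric_limits::max().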
| @@ -103,6 +103,35 @@ __global__ void ZeroslikeKernel(T *output, size_t count) { | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void AbsKernel(T *input, T *output, size_t count) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = abs(input[i]); | |||
| } | |||
| return; | |||
| } | |||
| template <> | |||
| __global__ void AbsKernel(half *input, half *output, size_t count) { | |||
| half zero = 0.0; | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = input[i] < zero ? -input[i] : input[i]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void FloorKernel(T *input, T *output, size_t count) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = floor(input[i]); | |||
| } | |||
| return; | |||
| } | |||
| template <> | |||
| __global__ void FloorKernel(half *input, half *output, size_t count) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = hfloor(input[i]); | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| @@ -147,6 +176,16 @@ void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream) { | |||
| ZeroslikeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(output, count); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| AbsKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| FloorKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| } | |||
| template void Exponential<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Logarithm<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| @@ -156,6 +195,8 @@ template void Square<float>(float *input, float *output, size_t count, cudaStrea | |||
| template void Sqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Rsqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Zeroslike<float>(float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Abs<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Floor<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Exponential<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Logarithm<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Negative<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| @@ -164,3 +205,5 @@ template void Square<half>(half *input, half *output, size_t count, cudaStream_t | |||
| template void Sqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Rsqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Zeroslike<half>(half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Abs<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Floor<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| @@ -34,5 +34,9 @@ template <typename T> | |||
| void Rsqrt(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_ | |||
| @@ -88,6 +88,12 @@ class GpuKernelRegister { | |||
| static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \ | |||
| static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \ | |||
| []() { return new OPCLASS<T, S>(); }); | |||
| // Register of mixed-precision kernels whose class template takes three typenames. | |||
| #define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \ | |||
| static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \ | |||
| static const GpuKernelRegister g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg( \ | |||
| #OPNAME, ATTR, []() { return new OPCLASS<T, S, G>(); }); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_GPUKERNELFACTORY_H_ | |||
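For illustration (the real usages are the ApplyMomentum registrations later in this diff), the three-typename macro expands roughly as follows:

    // MS_REG_GPU_KERNEL_THREE(ApplyMomentum, attr, MomentumGpuKernel, float, float, half)
    // expands (roughly) to:
    //   static_assert(std::is_base_of<GpuKernel, MomentumGpuKernel<float, float, half>>::value,
    //                 " must be base of GpuKernel");
    //   static const GpuKernelRegister g_ApplyMomentum_float_float_half_gpu_kernel_reg(
    //       "ApplyMomentum", attr, []() { return new MomentumGpuKernel<float, float, half>(); });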
| @@ -46,5 +46,13 @@ MS_REG_GPU_KERNEL_ONE(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOut | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Rsqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| UnaryOpGpuKernel, half) | |||
| MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| UnaryOpGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,6 +36,8 @@ enum UnaryOptype { | |||
| UNARY_OP_SQUARE, | |||
| UNARY_OP_SQRT, | |||
| UNARY_OP_RSQRT, | |||
| UNARY_OP_ABS, | |||
| UNARY_OP_FLOOR, | |||
| UNARY_OP_INVALID_TYPE = 255 | |||
| }; | |||
| static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP}, | |||
| @@ -45,7 +47,9 @@ static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY | |||
| {"ZerosLike", UNARY_OP_ZEROSLIKE}, | |||
| {"Square", UNARY_OP_SQUARE}, | |||
| {"Sqrt", UNARY_OP_SQRT}, | |||
| {"Rsqrt", UNARY_OP_RSQRT}}; | |||
| {"Rsqrt", UNARY_OP_RSQRT}, | |||
| {"Abs", UNARY_OP_ABS}, | |||
| {"Floor", UNARY_OP_FLOOR}}; | |||
| template <typename T> | |||
| class UnaryOpGpuKernel : public GpuKernel { | |||
| public: | |||
| @@ -100,6 +104,14 @@ class UnaryOpGpuKernel : public GpuKernel { | |||
| Zeroslike(output_addr, output_size_ / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| case UNARY_OP_ABS: { | |||
| Abs(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| break; | |||
| } | |||
| case UNARY_OP_FLOOR: { | |||
| Floor(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| break; | |||
| } | |||
| default: { | |||
| MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported."; | |||
| } | |||
| @@ -34,15 +34,15 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNorm, | |||
| MS_REG_GPU_KERNEL_ONE(FusedBatchNorm, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| FusedBatchNormGpuKernel, half) | |||
| MS_REG_GPU_KERNEL_ONE(BatchNorm, | |||
| KernelAttr() | |||
| @@ -60,15 +60,15 @@ MS_REG_GPU_KERNEL_ONE(BatchNorm, | |||
| MS_REG_GPU_KERNEL_ONE(BatchNorm, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| FusedBatchNormGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -56,17 +56,17 @@ class FusedBatchNormGpuKernel : public GpuKernel { | |||
| return true; | |||
| } | |||
| auto x = GetDeviceAddress<T>(inputs, 0); | |||
| auto scale = GetDeviceAddress<T>(inputs, 1); | |||
| auto bias = GetDeviceAddress<T>(inputs, 2); | |||
| auto runing_mean = GetDeviceAddress<T>(inputs, 3); | |||
| auto runnig_variance = GetDeviceAddress<T>(inputs, 4); | |||
| auto scale = GetDeviceAddress<float>(inputs, 1); | |||
| auto bias = GetDeviceAddress<float>(inputs, 2); | |||
| auto runing_mean = GetDeviceAddress<float>(inputs, 3); | |||
| auto runnig_variance = GetDeviceAddress<float>(inputs, 4); | |||
| auto y = GetDeviceAddress<T>(outputs, 0); | |||
| const float alpha = 1; | |||
| const float beta = 0; | |||
| if (is_train_) { | |||
| auto save_mean = GetDeviceAddress<T>(outputs, 3); | |||
| auto save_variance = GetDeviceAddress<T>(outputs, 4); | |||
| auto save_mean = GetDeviceAddress<float>(outputs, 3); | |||
| auto save_variance = GetDeviceAddress<float>(outputs, 4); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||
| cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y, | |||
| scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean, | |||
| @@ -33,12 +33,12 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNormGrad, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| FusedBatchNormGradGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -55,12 +55,12 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { | |||
| } | |||
| auto dy = GetDeviceAddress<T>(inputs, 0); | |||
| auto x = GetDeviceAddress<T>(inputs, 1); | |||
| auto scale = GetDeviceAddress<T>(inputs, 2); | |||
| auto save_mean = GetDeviceAddress<T>(inputs, 3); | |||
| auto save_variance = GetDeviceAddress<T>(inputs, 4); | |||
| auto scale = GetDeviceAddress<float>(inputs, 2); | |||
| auto save_mean = GetDeviceAddress<float>(inputs, 3); | |||
| auto save_variance = GetDeviceAddress<float>(inputs, 4); | |||
| auto dx = GetDeviceAddress<T>(outputs, 0); | |||
| auto bn_scale = GetDeviceAddress<T>(outputs, 1); | |||
| auto bn_bias = GetDeviceAddress<T>(outputs, 2); | |||
| auto bn_scale = GetDeviceAddress<float>(outputs, 1); | |||
| auto bn_bias = GetDeviceAddress<float>(outputs, 2); | |||
| const float alpha_data_diff = 1; | |||
| const float beta_data_diff = 0; | |||
| @@ -18,32 +18,41 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_TWO(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| MomentumGpuKernel, float, float) | |||
| MS_REG_GPU_KERNEL_TWO(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MomentumGpuKernel, half, half) | |||
| MS_REG_GPU_KERNEL_TWO(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MomentumGpuKernel, half, float) | |||
| MS_REG_GPU_KERNEL_THREE(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| MomentumGpuKernel, float, float, float) | |||
| MS_REG_GPU_KERNEL_THREE(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MomentumGpuKernel, half, half, half) | |||
| MS_REG_GPU_KERNEL_THREE(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MomentumGpuKernel, half, float, half) | |||
| MS_REG_GPU_KERNEL_THREE(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| MomentumGpuKernel, float, float, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
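| Note: moving from MS_REG_GPU_KERNEL_TWO to MS_REG_GPU_KERNEL_THREE adds a third dtype axis for the gradient, so a parameter in one precision can consume a gradient in another without an explicit Cast. The four registrations above pin these instantiations (T = variable/accumulation dtype, S = learning-rate/momentum dtype, G = gradient dtype): | |||
| // MomentumGpuKernel<float, float, float> - pure fp32 | |||
| // MomentumGpuKernel<half, half, half>    - pure fp16 | |||
| // MomentumGpuKernel<half, float, half>   - fp16 params, fp32 lr/momentum | |||
| // MomentumGpuKernel<float, float, half>  - fp32 params, fp16 gradient | |||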
| @@ -23,7 +23,7 @@ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename S> | |||
| template <typename T, typename S, typename G> | |||
| class MomentumGpuKernel : public GpuKernel { | |||
| public: | |||
| MomentumGpuKernel() | |||
| @@ -38,7 +38,7 @@ class MomentumGpuKernel : public GpuKernel { | |||
| T *variable = GetDeviceAddress<T>(inputs, 0); | |||
| T *accumulation = GetDeviceAddress<T>(inputs, 1); | |||
| S *learning_rate = GetDeviceAddress<S>(inputs, 2); | |||
| T *gradient = GetDeviceAddress<T>(inputs, 3); | |||
| G *gradient = GetDeviceAddress<G>(inputs, 3); | |||
| S *momentum = GetDeviceAddress<S>(inputs, 4); | |||
| MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| @@ -54,7 +54,7 @@ class MomentumGpuKernel : public GpuKernel { | |||
| variable_size_ = sizeof(T); | |||
| accumulation_size_ = sizeof(T); | |||
| learning_rate_size_ = sizeof(S); | |||
| gradient_size_ = sizeof(T); | |||
| gradient_size_ = sizeof(G); | |||
| momentum_size_ = sizeof(S); | |||
| auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
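| Note: for reference, a scalar sketch of the per-element update that MomentumUpdateVariable performs on device, assuming the standard (non-Nesterov) ApplyMomentum semantics; the names here are illustrative, not the CUDA implementation: | |||
| template <typename T, typename S, typename G> | |||
| void MomentumStep(T *var, T *accum, S lr, const G *grad, S momentum, size_t n) { | |||
| for (size_t i = 0; i < n; ++i) { | |||
| accum[i] = static_cast<T>(momentum) * accum[i] + static_cast<T>(grad[i]); | |||
| var[i] -= static_cast<T>(lr) * accum[i]; | |||
| } | |||
| } | |||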
| @@ -81,6 +81,7 @@ static std::map<string, string> tbe_func_adapter_map = { | |||
| {"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"}, | |||
| {"apply_add_sign", "apply_add_sign_d"}, | |||
| {"apply_power_sign", "apply_power_sign_d"}, | |||
| {"apply_centered_rms_prop", "apply_centered_rms_prop_d"}, | |||
| {"transpose", "transpose_d"}, | |||
| {"fill", "fill_d"}, | |||
| {"unsorted_segment_sum", "unsorted_segment_sum_d"}, | |||
| @@ -43,6 +43,7 @@ constexpr auto kJInputs = "inputs"; | |||
| constexpr auto kJOutputs = "outputs"; | |||
| constexpr auto kJAttrs = "attrs"; | |||
| constexpr auto kJKernelName = "kernel_name"; | |||
| constexpr auto kJFullName = "full_name"; | |||
| constexpr auto kJOpInfo = "op_info"; | |||
| constexpr auto kJDtype = "dtype"; | |||
| constexpr auto kJtype = "type"; | |||
| @@ -125,6 +126,7 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspor | |||
| op_info_json[kJKernelName] = json_name_; | |||
| } | |||
| (*kernel_json)[kJOpInfo] = op_info_json; | |||
| (*kernel_json)[kJFullName] = anf_node->fullname_with_scope(); | |||
| if (creater_type_ == SINGLE_BUILD) { | |||
| TbeUtils::SaveJsonInfo(json_name_, json_info_); | |||
| } | |||
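| Note: with the added line, every generated TBE kernel JSON also records the node's full scope name next to the op_info block, which makes it possible to map a compiled kernel back to its graph node. Illustratively (field names come from the constants above; the values are made up): | |||
| // { "op_info": { "kernel_name": "te_relu_1", ... }, | |||
| //   "full_name": "Default/network/relu-ReLU" } | |||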
| @@ -97,6 +97,7 @@ | |||
| #include "backend/optimizer/ascend/format_type/modify_ops_attrs.h" | |||
| #include "backend/optimizer/ascend/format_type/remove_no_use_reshape_op.h" | |||
| #include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h" | |||
| #include "backend/optimizer/ascend/format_type/remove_internal_output.h" | |||
| #include "utils/context/ms_context.h" | |||
| #include "utils/config_manager.h" | |||
| #include "debug/anf_ir_dump.h" | |||
| @@ -201,6 +202,7 @@ void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph) | |||
| data_layout_pm->AddPass(std::make_shared<OptimizeDependence>()); | |||
| data_layout_pm->AddPass(std::make_shared<TransDataSplit>()); | |||
| data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>()); | |||
| data_layout_pm->AddPass(std::make_shared<RemoveInternalOutputTransOp>()); | |||
| optimizer->AddPassManager(data_layout_pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| @@ -222,6 +224,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap | |||
| mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>()); | |||
| mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>()); | |||
| mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>()); | |||
| mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>()); | |||
| optimizer->AddPassManager(mixed_precision_pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| @@ -142,6 +142,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| std::vector<AnfNodePtr> make_tuple_inputs; | |||
| make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(node); ++output_idx) { | |||
| std::string output_format = AnfAlgo::GetOutputFormat(node, output_idx); | |||
| if (output_format == kOpFormat_NC1KHKWHWC0) { | |||
| @@ -151,7 +152,11 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const | |||
| auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx); | |||
| std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx); | |||
| if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { | |||
| make_tuple_inputs.emplace_back(AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false)); | |||
| auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||
| kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0); | |||
| } | |||
| make_tuple_inputs.emplace_back(trans_op); | |||
| } else { | |||
| // No need insert trans op. | |||
| make_tuple_inputs.push_back(tuple_getitem); | |||
| @@ -249,9 +254,14 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP | |||
| if (outputs_num == 0) { | |||
| return node; | |||
| } | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| // Single output | |||
| if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) { | |||
| return InsertTransOpForSingleOutput(func_graph, node, kernel_select); | |||
| auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||
| kernel_graph->ReplaceInternalOutput(node, new_node); | |||
| } | |||
| return new_node; | |||
| } | |||
| // Multiple output | |||
| return InsertTransOpForMultipleOutput(func_graph, node, kernel_select); | |||
| @@ -40,6 +40,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo | |||
| std::vector<AnfNodePtr> make_tuple_inputs; | |||
| AbstractBasePtrList abstract_list; | |||
| make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) { | |||
| AnfNodePtr replace_node = nullptr; | |||
| const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx); | |||
| @@ -64,6 +65,9 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo | |||
| MS_EXCEPTION_IF_NULL(replace_node); | |||
| replace_node->set_scope(cnode->scope()); | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) { | |||
| kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0); | |||
| } | |||
| } else { | |||
| replace_node = getitem; | |||
| } | |||
| @@ -87,6 +91,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c | |||
| return cnode; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(cnode->Type()); | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| // Single output | |||
| if (!cnode->Type()->isa<Tuple>()) { | |||
| if (!need_insert_cast[0]) { | |||
| @@ -109,6 +114,9 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c | |||
| MS_EXCEPTION_IF_NULL(replace_node); | |||
| replace_node->set_scope(cnode->scope()); | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) { | |||
| kernel_graph->ReplaceInternalOutput(cnode, replace_node); | |||
| } | |||
| } | |||
| return replace_node; | |||
| } | |||
| @@ -188,6 +196,10 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo | |||
| CNodePtr cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| auto new_node = InsertCastForInput(func_graph, cnode); | |||
| auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>(); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||
| kernel_graph->ReplaceInternalOutput(node, new_node); | |||
| } | |||
| // process output | |||
| return InsertCastForOutput(func_graph, new_node, std::vector<bool>(AnfAlgo::GetOutputTensorNum(new_node), true)); | |||
| } | |||
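| Note: the hunks above all preserve one invariant: whenever a pass replaces a node that the kernel graph exposes as an internal output, the graph's internal-output map must be re-pointed at the replacement. Schematically, using the calls as they appear above (the four-argument form re-points a single tuple slot): | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(old_node)) { | |||
| kernel_graph->ReplaceInternalOutput(old_node, new_node); // single output | |||
| // kernel_graph->ReplaceInternalOutput(old_node, new_node, output_idx, 0); // one tuple slot | |||
| } | |||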
| @@ -46,14 +46,13 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An | |||
| if (node == nullptr || !AnfAlgo::IsRealKernel(node)) { | |||
| return nullptr; | |||
| } | |||
| AnfNodePtr front_node; | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); | |||
| MS_LOG(DEBUG) << "process op: " << node->DebugString(); | |||
| AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_); | |||
| auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>(); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||
| front_node = kernel_graph->GetFrontNodeByInternalOutput(node); | |||
| kernel_graph->ReplaceInternalOutput(node, new_node); | |||
| } | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); | |||
| MS_LOG(DEBUG) << "====process op: " << node->DebugString(); | |||
| AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_); | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) { | |||
| @@ -61,12 +60,7 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An | |||
| return new_node; | |||
| } | |||
| } | |||
| auto final_node = InsertTransOpForOutput(func_graph, new_node, kernel_select_); | |||
| if (kernel_graph != nullptr && front_node != nullptr) { | |||
| auto old_node = kernel_graph->GetInternalOutputByFrontNode(front_node); | |||
| kernel_graph->ReplaceInternalOutput(old_node, final_node); | |||
| } | |||
| return final_node; | |||
| return InsertTransOpForOutput(func_graph, new_node, kernel_select_); | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,83 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/format_type/remove_internal_output.h" | |||
| #include <memory> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| namespace { | |||
| bool UsedForOutputOnly(const FuncGraphPtr &func_graph, const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| auto manager = func_graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| auto &node_users = manager->node_users(); | |||
| auto iter = node_users.find(node); | |||
| if (iter == node_users.end()) { | |||
| return false; | |||
| } | |||
| const auto &node_set = iter->second; | |||
| for (const auto &node_index : node_set) { | |||
| if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimMakeTuple)) { | |||
| return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace | |||
| const BaseRef RemoveInternalOutputTransOp::DefinePattern() const { | |||
| VarPtr X = std::make_shared<Var>(); | |||
| auto prim = std::make_shared<Primitive>(kTransDataOpName); | |||
| return VectorRef({prim, X}); | |||
| } | |||
| const BaseRef RemoveInternalOutputCast::DefinePattern() const { | |||
| VarPtr X = std::make_shared<Var>(); | |||
| return VectorRef({prim::kPrimCast, X}); | |||
| } | |||
| const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &) const { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| if (kernel_graph == nullptr) { | |||
| return nullptr; | |||
| } | |||
| if (!kernel_graph->IsInternalOutput(node)) { | |||
| return nullptr; | |||
| } | |||
| if (!UsedForOutputOnly(func_graph, node)) { | |||
| return nullptr; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| CheckCNodeInputSize(cnode, kTransOpInputNum); | |||
| auto input_node = cnode->input(1); | |||
| if (!AnfAlgo::CheckPrimitiveType(input_node, prim::kPrimTupleGetItem)) { | |||
| kernel_graph->ReplaceInternalOutput(node, input_node); | |||
| } else { | |||
| auto tuple_getitem = input_node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(tuple_getitem); | |||
| int idx = AnfAlgo::GetTupleGetItemOutIndex(tuple_getitem); | |||
| AnfNodePtr real_input_node = AnfAlgo::GetTupleGetItemRealInput(tuple_getitem); | |||
| kernel_graph->ReplaceInternalOutput(node, real_input_node, 0, idx); | |||
| } | |||
| return input_node; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
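| Note: the new pass strips a TransData or Cast that sits between an internal output and the graph's MakeTuple, re-pointing the internal-output map at the producer instead. A minimal sketch of wiring it up standalone, assuming the GraphOptimizer/PassManager plumbing used in the pass-manager hunks above (the pass-manager name is ours): | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>("remove_internal_output_pm"); | |||
| pm->AddPass(std::make_shared<opt::RemoveInternalOutputTransOp>()); | |||
| pm->AddPass(std::make_shared<opt::RemoveInternalOutputCast>()); | |||
| optimizer->AddPassManager(pm); | |||
| (void)optimizer->Optimize(kernel_graph); // kernel_graph: KernelGraphPtr | |||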
| @@ -0,0 +1,51 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_ | |||
| #define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_ | |||
| #include <string> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class RemoveInternalOutput : public PatternProcessPass { | |||
| public: | |||
| explicit RemoveInternalOutput(const std::string &name, bool multigraph = true) | |||
| : PatternProcessPass(name, multigraph) {} | |||
| ~RemoveInternalOutput() override = default; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| }; | |||
| class RemoveInternalOutputTransOp : public RemoveInternalOutput { | |||
| public: | |||
| explicit RemoveInternalOutputTransOp(bool multigraph = true) | |||
| : RemoveInternalOutput("remove_internal_output_trans_op", multigraph) {} | |||
| ~RemoveInternalOutputTransOp() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| class RemoveInternalOutputCast : public RemoveInternalOutput { | |||
| public: | |||
| explicit RemoveInternalOutputCast(bool multigraph = true) | |||
| : RemoveInternalOutput("remove_internal_output_cast", multigraph) {} | |||
| ~RemoveInternalOutputCast() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_ | |||
| @@ -13,8 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| @@ -53,4 +53,4 @@ class AdamFusion : public PatternProcessPass { | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ | |||
| @@ -13,8 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| @@ -55,4 +55,4 @@ class AdamWeightDecayFusion : public PatternProcessPass { | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| @@ -0,0 +1,65 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_addn_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceAddNFusion::DefinePattern() const { | |||
| VectorRef addn = VectorRef({prim::kPrimAddN, A, B}); | |||
| return addn; | |||
| } | |||
| const AnfNodePtr ReplaceAddNFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto A = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0); | |||
| auto B = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 1); | |||
| MS_EXCEPTION_IF_NULL(A); | |||
| MS_EXCEPTION_IF_NULL(B); | |||
| int num_input = AnfAlgo::GetNodeAttr<int>(node, "n"); | |||
| if (num_input == 2) { | |||
| auto prim = std::make_shared<Primitive>(prim::kPrimTensorAdd->name()); | |||
| MS_EXCEPTION_IF_NULL(prim); | |||
| std::vector<AnfNodePtr> inputs = {NewValueNode(prim), A, B}; | |||
| auto add_new = graph->NewCNode(inputs); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(A, 0)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(A, 0)); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, add_new.get()); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(add_new)); | |||
| return add_new; | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
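| Note: the pass only fires when AddN carries attr n == 2, in which case a two-input AddN is exactly a TensorAdd; the new node inherits its infer type and shape from input A. Schematically: | |||
| // before: out = AddN(A, B)      // attr n = 2 | |||
| // after:  out = TensorAdd(A, B) // same infer type/shape as A's output | |||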
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceAddNFusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceAddNFusion(bool multigraph = true) : PatternProcessPass("replace_addn", multigraph) { | |||
| A = std::make_shared<Var>(); | |||
| B = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceAddNFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr A; | |||
| VarPtr B; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ | |||
| @@ -0,0 +1,92 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_bn_cast_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceBNCastFusion::DefinePattern() const { | |||
| VectorRef in_cast = VectorRef({prim::kPrimCast, x_}); | |||
| VectorRef fbn2 = VectorRef({prim::kPrimFusedBatchNorm, in_cast, scale_, bias_, mean_, var_}); | |||
| VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2, index_}); | |||
| VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); | |||
| return out_cast; | |||
| } | |||
| const AnfNodePtr ReplaceBNCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0); | |||
| auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1); | |||
| MS_EXCEPTION_IF_NULL(index_node); | |||
| auto value_node = index_node->cast<ValueNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(value_node); | |||
| int item_idx = GetValue<int>(value_node->value()); | |||
| auto fbn2 = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0); | |||
| auto x_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 0); | |||
| auto x_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(x_after), 0); | |||
| if (item_idx != 0) { | |||
| return nullptr; | |||
| } | |||
| auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 1); | |||
| auto bias = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 2); | |||
| auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 3); | |||
| auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 4); | |||
| MS_EXCEPTION_IF_NULL(fbn2); | |||
| MS_EXCEPTION_IF_NULL(x_after); | |||
| MS_EXCEPTION_IF_NULL(x_before); | |||
| MS_EXCEPTION_IF_NULL(scale); | |||
| MS_EXCEPTION_IF_NULL(bias); | |||
| MS_EXCEPTION_IF_NULL(mean); | |||
| MS_EXCEPTION_IF_NULL(var); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(x_after), utils::cast<CNodePtr>(x_before)); | |||
| manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple)); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(fbn2); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2, i)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2, i)); | |||
| } | |||
| outputs_type[0] = kNumberTypeFloat16; | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2.get()); | |||
| outputs_type.clear(); | |||
| outputs_shape.clear(); | |||
| outputs_type.push_back(kNumberTypeFloat16); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); | |||
| return tuple; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
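| Note: this fusion elides the fp16-to-fp32 Cast feeding FusedBatchNorm and the fp32-to-fp16 Cast on its first output, then retypes that output slot to float16; that is exactly the mixed-dtype signature the FusedBatchNorm GPU kernel registration earlier in this change accepts. Schematically (dtypes in brackets): | |||
| // before: y[f16] = Cast( TupleGetItem( FusedBatchNorm( Cast(x[f16])->[f32], scale, bias, mean, var ), 0 )->[f16] ) | |||
| // after:  y[f16] = TupleGetItem( FusedBatchNorm( x[f16], scale, bias, mean, var ), 0 ) | |||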
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceBNCastFusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceBNCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_cast", multigraph) { | |||
| x_ = std::make_shared<Var>(); | |||
| scale_ = std::make_shared<Var>(); | |||
| bias_ = std::make_shared<Var>(); | |||
| mean_ = std::make_shared<Var>(); | |||
| var_ = std::make_shared<Var>(); | |||
| y_ = std::make_shared<Var>(); | |||
| running_mean_ = std::make_shared<Var>(); | |||
| running_var_ = std::make_shared<Var>(); | |||
| save_mean_ = std::make_shared<Var>(); | |||
| save_var_ = std::make_shared<Var>(); | |||
| index_ = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceBNCastFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr x_; | |||
| VarPtr scale_; | |||
| VarPtr bias_; | |||
| VarPtr mean_; | |||
| VarPtr var_; | |||
| VarPtr y_; | |||
| VarPtr running_mean_; | |||
| VarPtr running_var_; | |||
| VarPtr save_mean_; | |||
| VarPtr save_var_; | |||
| VarPtr index_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ | |||
| @@ -0,0 +1,88 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const { | |||
| VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_}); | |||
| VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_}); | |||
| VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); | |||
| return out_cast; | |||
| } | |||
| const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0); | |||
| auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1); | |||
| MS_EXCEPTION_IF_NULL(index_node); | |||
| auto value_node = index_node->cast<ValueNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(value_node); | |||
| int item_idx = GetValue<int>(value_node->value()); | |||
| if (item_idx != 0) { | |||
| return nullptr; | |||
| } | |||
| auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0); | |||
| auto dy_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0); | |||
| auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1); | |||
| auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2); | |||
| auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3); | |||
| auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4); | |||
| MS_EXCEPTION_IF_NULL(fbn2g); | |||
| MS_EXCEPTION_IF_NULL(dy_); | |||
| MS_EXCEPTION_IF_NULL(scale); | |||
| MS_EXCEPTION_IF_NULL(x_); | |||
| MS_EXCEPTION_IF_NULL(mean); | |||
| MS_EXCEPTION_IF_NULL(var); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple)); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i)); | |||
| } | |||
| outputs_type[0] = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get()); | |||
| outputs_type.clear(); | |||
| outputs_shape.clear(); | |||
| outputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); | |||
| return tuple; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceBNGradCast2Fusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) { | |||
| dy_ = std::make_shared<Var>(); | |||
| x_ = std::make_shared<Var>(); | |||
| scale_ = std::make_shared<Var>(); | |||
| mean_ = std::make_shared<Var>(); | |||
| var_ = std::make_shared<Var>(); | |||
| dx_ = std::make_shared<Var>(); | |||
| bn_scale_ = std::make_shared<Var>(); | |||
| bn_bias_ = std::make_shared<Var>(); | |||
| index_ = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceBNGradCast2Fusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr dy_; | |||
| VarPtr x_; | |||
| VarPtr scale_; | |||
| VarPtr mean_; | |||
| VarPtr var_; | |||
| VarPtr dx_; | |||
| VarPtr bn_scale_; | |||
| VarPtr bn_bias_; | |||
| VarPtr index_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ | |||
| @@ -0,0 +1,91 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceBNGradCastFusion::DefinePattern() const { | |||
| VectorRef dy_cast = VectorRef({prim::kPrimCast, dy_}); | |||
| VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_cast, x_, scale_, mean_, var_}); | |||
| VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_}); | |||
| VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); | |||
| return out_cast; | |||
| } | |||
| const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0); | |||
| auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1); | |||
| MS_EXCEPTION_IF_NULL(index_node); | |||
| auto value_node = index_node->cast<ValueNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(value_node); | |||
| int item_idx = GetValue<int>(value_node->value()); | |||
| if (item_idx != 0) { | |||
| return nullptr; | |||
| } | |||
| auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0); | |||
| auto dy_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0); | |||
| auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0); | |||
| auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1); | |||
| auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2); | |||
| auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3); | |||
| auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4); | |||
| MS_EXCEPTION_IF_NULL(fbn2g); | |||
| MS_EXCEPTION_IF_NULL(dy_after); | |||
| MS_EXCEPTION_IF_NULL(dy_before); | |||
| MS_EXCEPTION_IF_NULL(scale); | |||
| MS_EXCEPTION_IF_NULL(x_); | |||
| MS_EXCEPTION_IF_NULL(mean); | |||
| MS_EXCEPTION_IF_NULL(var); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before)); | |||
| manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple)); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i)); | |||
| } | |||
| outputs_type[0] = kNumberTypeFloat16; | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get()); | |||
| outputs_type.clear(); | |||
| outputs_shape.clear(); | |||
| outputs_type.push_back(kNumberTypeFloat16); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); | |||
| return tuple; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceBNGradCastFusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceBNGradCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_grad_cast", multigraph) { | |||
| dy_ = std::make_shared<Var>(); | |||
| x_ = std::make_shared<Var>(); | |||
| scale_ = std::make_shared<Var>(); | |||
| mean_ = std::make_shared<Var>(); | |||
| var_ = std::make_shared<Var>(); | |||
| dx_ = std::make_shared<Var>(); | |||
| bn_scale_ = std::make_shared<Var>(); | |||
| bn_bias_ = std::make_shared<Var>(); | |||
| index_ = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceBNGradCastFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr dy_; | |||
| VarPtr x_; | |||
| VarPtr scale_; | |||
| VarPtr mean_; | |||
| VarPtr var_; | |||
| VarPtr dx_; | |||
| VarPtr bn_scale_; | |||
| VarPtr bn_bias_; | |||
| VarPtr index_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ | |||
| @@ -0,0 +1,63 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_momentum_cast_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceMomentumCastFusion::DefinePattern() const { | |||
| VectorRef grad_cast = VectorRef({prim::kPrimCast, grad_}); | |||
| VectorRef momentum = VectorRef({prim::kPrimApplyMomentum, var_, acc_, lr_, grad_cast, mom_}); | |||
| return momentum; | |||
| } | |||
| const AnfNodePtr ReplaceMomentumCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto grad_cast = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 3); | |||
| auto grad = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(grad_cast), 0); | |||
| MS_EXCEPTION_IF_NULL(grad_cast); | |||
| MS_EXCEPTION_IF_NULL(grad); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(grad_cast), utils::cast<CNodePtr>(grad)); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(node); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(node, i)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(node, i)); | |||
| } | |||
| outputs_type[3] = AnfAlgo::GetPrevNodeOutputInferDataType(grad_cast, 0); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, node.get()); | |||
| return node; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
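| Note: here the Cast on ApplyMomentum's gradient input (input index 3) is removed and the matching dtype slot is retyped, which is only legal because of the <T, S, G> ApplyMomentum registrations earlier in this change that accept an fp16 gradient alongside fp32 parameters. Schematically: | |||
| // before: ApplyMomentum(var[f32], accum[f32], lr[f32], Cast(grad[f16])->[f32], mom[f32]) | |||
| // after:  ApplyMomentum(var[f32], accum[f32], lr[f32], grad[f16], mom[f32]) | |||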
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceMomentumCastFusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceMomentumCastFusion(bool multigraph = true) : PatternProcessPass("replace_momentum_cast", multigraph) { | |||
| var_ = std::make_shared<Var>(); | |||
| acc_ = std::make_shared<Var>(); | |||
| lr_ = std::make_shared<Var>(); | |||
| grad_ = std::make_shared<Var>(); | |||
| mom_ = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceMomentumCastFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr var_; | |||
| VarPtr acc_; | |||
| VarPtr lr_; | |||
| VarPtr grad_; | |||
| VarPtr mom_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ | |||
| @@ -25,7 +25,8 @@ | |||
| namespace mindspore { | |||
| namespace memreuse { | |||
| enum RefCountType { kDynamicRefCount, kStaticRefCount }; | |||
| enum NodeType { NORMAL, SPECIAL }; | |||
| enum NodeType { COMMON_NODE, COMMUNICATION_NODE }; | |||
| enum KernelRefType { COMMON, REFNODE_OUTPUT, COMM_NOTREUSE, COMM_REUSE, SUMMARY }; | |||
| static constexpr int kInitIndex = -1; | |||
| class KernelRefCount { | |||
| public: | |||
| @@ -36,6 +37,7 @@ class KernelRefCount { | |||
| size_t offset_; | |||
| size_t size_; | |||
| int index_; | |||
| KernelRefType type_; | |||
| // remember to reset offset | |||
| KernelRefCount() | |||
| : stream_id_(0), | |||
| @@ -44,6 +46,7 @@ class KernelRefCount { | |||
| offset_(0), | |||
| size_(0), | |||
| index_(kInitIndex), | |||
| type_(COMMON), | |||
| reftype_(kStaticRefCount) {} | |||
| ~KernelRefCount() = default; | |||
| void SetKernelRefCountInfo(int index, size_t size, RefCountType reftype); | |||
| @@ -65,7 +68,7 @@ class KernelDef { | |||
| KernelMap inputs_; | |||
| KernelMap outputs_; | |||
| KernelMap wk_space_; | |||
| NodeType dirty = NORMAL; | |||
| NodeType type_ = COMMON_NODE; | |||
| KernelDef() = default; | |||
| ~KernelDef() = default; | |||
| void set_input_refs(const KernelRefCountPtrList &kernelRefPtrList) { input_refs_ = kernelRefPtrList; } | |||
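| Note: the old NORMAL/SPECIAL flag becomes an explicit node taxonomy plus a per-tensor KernelRefType that drives the allocator's reuse policy. As inferred from the hunks below (our summary, not source comments): | |||
| // COMMON         - ordinary tensor, standard best-fit reuse | |||
| // REFNODE_OUTPUT - output aliased through the ref-output map; skipped by the allocator | |||
| // COMM_REUSE     - communication tensor laid out with 512-byte alignment borders | |||
| // COMM_NOTREUSE  - input of a multi-input communication op, tracked but not border-reused | |||
| // SUMMARY        - summary node output, pinned alive by a max ref count | |||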
| @@ -46,6 +46,8 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() { | |||
| if (iter == kernel_output_refs_.end()) { | |||
| auto output_sizes = kernel_mod->GetOutputSizeList(); | |||
| KernelRefCountPtrList kernel_refs; | |||
| bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel_cnode); | |||
| size_t output_index = 0; | |||
| for (auto size : output_sizes) { | |||
| total_dy_size_ += size; | |||
| // do not MallocDynamicMem; just record it | |||
| @@ -54,9 +56,20 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() { | |||
| auto curr_stream_id = AnfAlgo::GetStreamId(kernel_cnode); | |||
| kernel_ref->stream_id_ = curr_stream_id; | |||
| kernel_ref->SetKernelRefCountInfo(index, size, kDynamicRefCount); | |||
| if (is_comm_op) { | |||
| kernel_ref->type_ = COMM_REUSE; | |||
| } else { | |||
| session::AnfWithOutIndex out_pair(kernel_cnode, output_index); | |||
| if (graph_->IsInRefOutputMap(out_pair)) { | |||
| kernel_ref->type_ = REFNODE_OUTPUT; | |||
| } else { | |||
| kernel_ref->type_ = COMMON; | |||
| } | |||
| } | |||
| kernel_refs.push_back(kernel_ref); | |||
| kernel_out_ref_num++; | |||
| total_refs_list_.push_back(kernel_ref); | |||
| output_index++; | |||
| } | |||
| if (!kernel_refs.empty()) { | |||
| kernel_output_refs_[key] = kernel_refs; | |||
| @@ -155,9 +168,19 @@ void MemReuseUtil::SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| MS_EXCEPTION_IF_NULL(kernel_def_ptr); | |||
| auto key = kernel.get(); | |||
| for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { | |||
| bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel); | |||
| size_t input_tensor_num = AnfAlgo::GetInputTensorNum(kernel); | |||
| for (size_t i = 0; i < input_tensor_num; ++i) { | |||
| auto ref_ptr = GetKernelInputRef(kernel, i); | |||
| if (ref_ptr != nullptr) { | |||
| if (is_comm_op) { | |||
| if (input_tensor_num == 1) { | |||
| ref_ptr->type_ = COMM_REUSE; | |||
| } else { | |||
| ref_ptr->type_ = COMM_NOTREUSE; | |||
| } | |||
| } | |||
| if (ref_ptr->reftype() == kStaticRefCount) { | |||
| continue; | |||
| } else if (ref_ptr->reftype() == kDynamicRefCount) { | |||
| @@ -258,6 +281,11 @@ void MemReuseUtil::SetKernelDefMap() { | |||
| auto key = kernel.get(); | |||
| kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]); | |||
| kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]); | |||
| if (AnfAlgo::IsCommunicationOp(kernel)) { | |||
| kernel_def_ptr->type_ = COMMUNICATION_NODE; | |||
| } else { | |||
| kernel_def_ptr->type_ = COMMON_NODE; | |||
| } | |||
| kernel_def_ptr_list_.push_back(kernel_def_ptr); | |||
| kernel_map_[key] = kernel_def_ptr; | |||
| } | |||
| @@ -337,6 +365,7 @@ void MemReuseUtil::SetSummaryNodesRefCount() { | |||
| KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index]; | |||
| kernel_ref->ref_count_ = kMaxRefCount; | |||
| kernel_ref->ref_count_dynamic_use_ = kMaxRefCount; | |||
| kernel_ref->type_ = SUMMARY; | |||
| total_summary_size += kernel_ref->size_; | |||
| MS_LOG(INFO) << "Set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index; | |||
| } else { | |||
| @@ -83,6 +83,7 @@ class MemReuseUtil { | |||
| void set_mem_base(uint8_t *mem_base) { mem_base_ = mem_base; } | |||
| uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; | |||
| uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; | |||
| bool is_all_nop_node() const { return is_all_nop_node_; } | |||
| private: | |||
| int util_index_; | |||
| @@ -33,11 +33,11 @@ void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) { | |||
| set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list()); | |||
| // check info correctness | |||
| for (auto &tensor : tensor_ptr_list_) { | |||
| tensor->size_ = AlignMemorySize(tensor->size_); | |||
| tensor->size_ = AlignCommonMemorySize(tensor->size_); | |||
| } | |||
| // align workspace sizes to 512 and force refcount == 1 | |||
| for (auto &wk : wk_tensor_list_) { | |||
| wk->size_ = AlignMemorySize(wk->size_); | |||
| wk->size_ = AlignCommonMemorySize(wk->size_); | |||
| wk->ref_count_ = 1; | |||
| } | |||
| #ifdef ENABLE_D | |||
| @@ -135,11 +135,23 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr | |||
| return false; | |||
| } | |||
| void BestFitMemReuse::AssignNodeOutputOffset() { | |||
| void BestFitMemReuse::AssignCommonNodeOutputOffset() { | |||
| MS_EXCEPTION_IF_NULL(current_kernel_); | |||
| for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| if (tensor_desc->type_ == REFNODE_OUTPUT) { | |||
| total_refoutput_size += tensor_desc->size_; | |||
| continue; | |||
| } else if (tensor_desc->type_ == COMM_NOTREUSE) { | |||
| total_comm_not_reuse_size += tensor_desc->size_; | |||
| } else if (tensor_desc->type_ == COMM_REUSE) { | |||
| // get align size for communication op's single input | |||
| tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_); | |||
| total_comm_reuse_size += tensor_desc->size_; | |||
| } | |||
| auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_); | |||
| if (!reusable_membuf_map.empty()) { | |||
| auto membuf_index = reusable_membuf_map.begin()->second; | |||
| @@ -152,6 +164,93 @@ void BestFitMemReuse::AssignNodeOutputOffset() { | |||
| MemReuseChecker::GetInstance().IsAddNewMembuf_ = true; | |||
| #endif | |||
| } | |||
| // skip the left alignment border so the communication op's single input starts at usable memory | |||
| if (tensor_desc->type_ == COMM_REUSE) { | |||
| tensor_desc->offset_ += kDefaultMemAlignSize; | |||
| } | |||
| } | |||
| } | |||
| void BestFitMemReuse::AssignCommunicationNodeOutputOffset() { | |||
| size_t total_kernel_output_size = 0; | |||
| size_t output_num = 0; | |||
| // get all output size | |||
| MS_EXCEPTION_IF_NULL(current_kernel_); | |||
| for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| output_num++;  // count outputs here; the first/last border logic below relies on it | |||
| if (tensor_desc->type_ == COMM_REUSE) { | |||
| total_comm_reuse_size += tensor_desc->size_; | |||
| total_comm_output_reuse_size += tensor_desc->size_; | |||
| total_kernel_output_size += tensor_desc->size_; | |||
| } else { | |||
| MS_LOG(ERROR) << "All communication op's outputs should be memory reuse, Kernel:" | |||
| << current_kernel_->scope_full_name(); | |||
| continue; | |||
| } | |||
| } | |||
| total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size); | |||
| // add a left align border to the first output and a right align border to the last output so the border memory gets allocated | |||
| size_t output_index = 0; | |||
| auto output_ref_indexes = current_kernel_->GetOutputRefIndexs(); | |||
| for (auto &tensor_idx : output_ref_indexes) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| if (output_index == 0 || output_index == output_num - 1) { | |||
| tensor_desc->size_ += kDefaultMemAlignSize; | |||
| } | |||
| if ((output_index == 0) && (output_ref_indexes.size() == 1)) { | |||
| // add right align border for single output | |||
| tensor_desc->size_ += kDefaultMemAlignSize; | |||
| } | |||
| output_index++; | |||
| } | |||
| auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size); | |||
| if (!reusable_membuf_map.empty()) { | |||
| auto membuf_index = reusable_membuf_map.begin()->second; | |||
| output_index = 0; | |||
| for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| ReuseExistMembuf(tensor_desc.get(), membuf_index + output_index, kDynamicMem); | |||
| // skip the left align border so the communication op's first output can be used | |||
| if (output_index == 0) { | |||
| tensor_desc->offset_ += kDefaultMemAlignSize; | |||
| } | |||
| output_index++; | |||
| } | |||
| } else { | |||
| // no membuf can be reused; append a new membuf to membuf_ptr_list | |||
| output_index = 0; | |||
| for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| AddNewMembufPtr(tensor_desc.get(), kDynamicMem); | |||
| // skip the left align border offset so the first output can be used | |||
| if (output_index == 0) { | |||
| tensor_desc->offset_ += kDefaultMemAlignSize; | |||
| } | |||
| output_index++; | |||
| #ifdef MEM_REUSE_DEBUG | |||
| MemReuseChecker::GetInstance().IsAddNewMembuf_ = true; | |||
| #endif | |||
| } | |||
| } | |||
| } | |||
| void BestFitMemReuse::AssignNodeOutputOffset() { | |||
| if (current_kernel_->type_ == COMMUNICATION_NODE) { | |||
| AssignCommunicationNodeOutputOffset(); | |||
| } else { | |||
| AssignCommonNodeOutputOffset(); | |||
| } | |||
| } | |||
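AssignNodeOutputOffset now dispatches on the kernel type: a communication kernel gets all of its outputs packed into one contiguous block with a 512-byte align border on each side, while common kernels keep per-tensor placement. A minimal standalone sketch of the resulting layout (constants and sizes are assumed for illustration, not taken from this diff):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Assumed to be 512, mirroring kDefaultMemAlignSize in the diff.
constexpr std::size_t kAlign = 512;

std::size_t AlignUp(std::size_t size) { return (size + kAlign - 1) / kAlign * kAlign; }

int main() {
  // Toy output sizes of one fused communication kernel.
  const std::vector<std::size_t> outputs = {1000, 300, 2048};
  std::size_t total = 0;
  for (std::size_t s : outputs) total += s;
  // One contiguous block: left border + aligned payload + right border.
  const std::size_t block = kAlign + AlignUp(total) + kAlign;
  std::printf("block size: %zu bytes\n", block);
  // The first output starts just past the left border; the rest follow back to back.
  std::size_t offset = kAlign;
  for (std::size_t i = 0; i < outputs.size(); ++i) {
    std::printf("output %zu at offset %zu\n", i, offset);
    offset += outputs[i];
  }
  return 0;
}
```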
| @@ -319,11 +418,17 @@ void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) { | |||
| } | |||
| } | |||
| size_t BestFitMemReuse::AlignMemorySize(size_t size) const { | |||
| size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const { | |||
| // align memory size to 512 bytes | |||
| return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize; | |||
| } | |||
| size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const { | |||
| // align memory size to 512 bytes and add the communication borders: left align border + data + right align border | |||
| return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize + | |||
| kDefaultMemAlignSize; | |||
| } | |||
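The two helpers differ in shape: the common variant over-allocates by a full alignment unit plus kAttAlignSize before rounding, while the communication variant rounds the payload up and then reserves one border on each side. A worked example, assuming both constants are 512 (their definitions sit outside this diff):

```cpp
#include <cassert>
#include <cstddef>

// Assumed values; the real definitions are not part of this diff.
constexpr std::size_t kDefaultMemAlignSize = 512;
constexpr std::size_t kAttAlignSize = 512;

std::size_t AlignCommonMemorySize(std::size_t size) {
  return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}

std::size_t AlignCommunicationMemorySize(std::size_t size) {
  return kDefaultMemAlignSize +
         (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
         kDefaultMemAlignSize;
}

int main() {
  assert(AlignCommonMemorySize(100) == 1024);         // (100 + 512 + 512) / 512 * 512
  assert(AlignCommunicationMemorySize(100) == 1536);  // 512 border + 512 payload + 512 border
  assert(AlignCommunicationMemorySize(513) == 2048);  // payload rounds up to 1024
  return 0;
}
```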
| size_t BestFitMemReuse::GetAllocatedSize() { | |||
| size_t AllocatedSize = kTotalSize; | |||
| if (membuf_ptr_list_.empty()) { | |||
| @@ -412,6 +517,9 @@ void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) { | |||
| ++op_num; | |||
| #endif | |||
| } | |||
| MS_LOG(INFO) << "Special Tensor total size: RefOutput: " << total_refoutput_size | |||
| << " CommReuse: " << total_comm_reuse_size << " CommOutputReuse: " << total_comm_output_reuse_size | |||
| << " CommNotReuse: " << total_comm_not_reuse_size; | |||
| #ifdef MEM_REUSE_DEBUG | |||
| MemReuseChecker::GetInstance().ExportMembufInfoIR(); | |||
| MemReuseChecker::GetInstance().ExportAddNewMmebufIR(); | |||
| @@ -74,6 +74,14 @@ class BestFitMemReuse { | |||
| * Assign output tensor memory offset of current kernel | |||
| */ | |||
| void AssignNodeOutputOffset(); | |||
| /** | |||
| * Assign output tensor memory offset of common kernel | |||
| */ | |||
| void AssignCommonNodeOutputOffset(); | |||
| /** | |||
| * Assign output tensor memory offset of communication kernel | |||
| */ | |||
| void AssignCommunicationNodeOutputOffset(); | |||
| /** | |||
| * Update input tensor's status of current kernel, and the status of membuf used by current kernel | |||
| */ | |||
| @@ -110,8 +118,10 @@ class BestFitMemReuse { | |||
| void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag); | |||
| // Merge unused membuf | |||
| void ReleaseMembuf(size_t tensor_index, int flag); | |||
| // Memory address alignment 512 | |||
| size_t AlignMemorySize(size_t size) const; | |||
| // Memory address alignment for common memory | |||
| size_t AlignCommonMemorySize(size_t size) const; | |||
| // Memory address alignment for communication used memory | |||
| size_t AlignCommunicationMemorySize(size_t size) const; | |||
| int GetRealIndex(size_t index, int flag = kDynamicMem) const; | |||
| size_t GetTensorIndex(int index) const; | |||
| size_t GetWorkspaceIndex(int index) const; | |||
| @@ -153,6 +163,10 @@ class BestFitMemReuse { | |||
| // kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def | |||
| std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_; | |||
| std::vector<std::vector<uint32_t>> stream_groups_; | |||
| size_t total_refoutput_size{0}; | |||
| size_t total_comm_reuse_size{0}; | |||
| size_t total_comm_output_reuse_size{0}; | |||
| size_t total_comm_not_reuse_size{0}; | |||
| }; | |||
| } // namespace memreuse | |||
| } // namespace mindspore | |||
| @@ -170,12 +170,14 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li | |||
| ofs << "all_tensor_refs:\n"; | |||
| ofs << "index:" | |||
| << "\tsize:" | |||
| << "\trefcount:\n"; | |||
| << "\trefcount:" | |||
| << "\ttype:\n"; | |||
| for (auto &ref : total_refs_list) { | |||
| ofs << "%" << ref->index_ << "T" | |||
| << "\t" | |||
| << "#" << ref->size_ << "S" | |||
| << "\t" << ref->ref_count_ << "C" | |||
| << "\t" << ref->type_ << "t" | |||
| << "\n"; | |||
| } | |||
| ofs << "kernel_def exc_order:\n"; | |||
| @@ -241,7 +243,7 @@ bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph | |||
| void MemReuseChecker::ExportMemOpIr(const KernelDef *def, std::ofstream &ofs, int def_idx) { | |||
| auto scope_name = def->scope_full_name(); | |||
| std::string split_name = GetSplitName(scope_name); | |||
| ofs << "$" << def_idx << "\t" << split_name << "\t"; | |||
| ofs << "$" << def_idx << "\t" << split_name << "\t" << static_cast<int>(def->type_) << "\t"; | |||
| ofs << "inputs["; | |||
| for (auto &in : def->inputs_) { | |||
| for (auto &in_ref : in.second) { | |||
| @@ -100,7 +100,10 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic | |||
| auto parallel_context = parallel::ParallelContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(parallel_context); | |||
| const auto &split_indices = parallel_context->GetAllReduceFusionSplitIndices(group); | |||
| std::vector<uint32_t> split_indices; | |||
| if (!parallel_context->enable_parallel_optimizer()) { | |||
| split_indices = parallel_context->GetAllReduceFusionSplitIndices(group); | |||
| } | |||
| size_t segments = 0; | |||
| if (split_indices.size() != 0) { | |||
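With the parallel optimizer enabled, split_indices stays empty and the communication ops collapse into a single fused segment; otherwise the user-provided indices partition them. A toy sketch of that mapping (CountSegments is a hypothetical helper, not code from this diff):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical helper: count the fusion segments produced by sorted split
// indices over n communication ops; an empty list fuses everything into one.
std::size_t CountSegments(const std::vector<std::uint32_t> &split_indices, std::size_t n) {
  if (split_indices.empty()) return n == 0 ? 0 : 1;
  std::size_t segments = split_indices.size();
  // Ops left over after the last split index form one more segment.
  if (split_indices.back() + 1 < n) ++segments;
  return segments;
}

int main() {
  std::printf("%zu\n", CountSegments({9, 19}, 30));  // ops 0-9, 10-19, 20-29 -> 3
  std::printf("%zu\n", CountSegments({}, 30));       // parallel optimizer on -> 1
  return 0;
}
```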
| @@ -71,7 +71,6 @@ bool ReplaceNodeByProxy::Run(const FuncGraphPtr &func_graph) { | |||
| AbstractBasePtrList abstract_list; | |||
| AnfAlgo::CopyNodeAttr(kAttrPsKey, cnode, proxy_node); | |||
| AnfAlgo::CopyNodeAttr("reduce_scatter_flag", cnode, proxy_node); | |||
| AnfAlgo::CopyNodeAttr("offset", cnode, proxy_node); | |||
| abstract_list.push_back(cnode->abstract()); | |||
| auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list); | |||
| @@ -18,9 +18,12 @@ | |||
| #include <utility> | |||
| #include <memory> | |||
| #include <algorithm> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/union_find_set.h" | |||
| #include "runtime/device/ascend/ascend_label_assign.h" | |||
| #include "utils/context/ms_context.h" | |||
| #include "debug/anf_ir_dump.h" | |||
| static constexpr size_t kCNodePrim = 0; | |||
| static constexpr size_t kCNodeCallArg = 1; | |||
| @@ -104,7 +107,7 @@ static void ReuseParameter(NotNull<KernelGraphPtr> root_kg, | |||
| static CNodePtr GetNextRealKernel(const std::vector<CNodePtr> &list, size_t start) { | |||
| for (size_t i = start; i < list.size() - 1; ++i) { | |||
| if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) { | |||
| if (AnfAlgo::IsRealKernel(list[i])) { | |||
| return list[i]; | |||
| } | |||
| } | |||
| @@ -168,18 +171,43 @@ static void EraseNodeFromExecOrder(const AnfNodePtr &node, const NotNull<std::ve | |||
| exec_order->erase(exec_iter); | |||
| } | |||
| void AscendControlParser::AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph, | |||
| const NotNull<std::set<KernelGraphPtr> *> memo) { | |||
| if (memo->find(graph) != memo->end()) { | |||
| return; | |||
| } | |||
| memo->insert(graph.get()); | |||
| const std::vector<std::shared_ptr<KernelGraph>> &child_graph_order = graph->child_graph_order(); | |||
| if (child_graph_order.empty()) { | |||
| return; | |||
| } | |||
| std::vector<AnfNodePtr> depend_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimPartial->name()))}; | |||
| for (auto &cg : child_graph_order) { | |||
| MS_EXCEPTION_IF_NULL(cg); | |||
| auto fg = cg->cast<FuncGraphPtr>(); | |||
| MS_EXCEPTION_IF_NULL(fg); | |||
| depend_inputs.emplace_back(NewValueNode(fg)); | |||
| AttachChildGraphToReturnNode(NOT_NULL(cg), memo); | |||
| } | |||
| auto child_graphs = graph->NewCNode(depend_inputs); | |||
| InsertDependToGraph(graph, NOT_NULL(child_graphs)); | |||
| } | |||
| void AscendControlParser::LinkGraph(NotNull<KernelGraphPtr> kg) { | |||
| std::set<KernelGraphPtr> memo; | |||
| std::vector<std::pair<AnfNodePtr, AnfNodePtr>> link_list; | |||
| // Insert Assign | |||
| ChildGraphDataAssign(kg, NOT_NULL(&link_list), NOT_NULL(&memo)); | |||
| memo.clear(); | |||
| // Reuse Parameter | |||
| ReuseParameter(kg, link_list); | |||
| // replace call by label goto / label switch | |||
| memo.clear(); | |||
| (void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo)); | |||
| memo.clear(); | |||
| // assign label resource | |||
| device::ascend::AscendLabelAssign::GetInstance().AssignLabel(kg); | |||
| AttachChildGraphToReturnNode(kg, NOT_NULL(&memo)); | |||
| } | |||
| void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph, | |||
| @@ -248,10 +276,14 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph, | |||
| } | |||
| MS_LOG(INFO) << "Erase " << assign_node->DebugString(5); | |||
| EraseNodeFromExecOrder(assign_node, NOT_NULL(&exec_order)); | |||
| auto source = AnfAlgo::VisitKernelWithReturnType(assign_node->input(kCNodeAssignSource), 0).first; | |||
| parameter_count.AddReadCount(source, -1); | |||
| auto source = assign_node->input(kCNodeAssignSource); | |||
| MS_EXCEPTION_IF_NULL(source); | |||
| auto visit_source = AnfAlgo::VisitKernelWithReturnType(source, 0).first; | |||
| parameter_count.AddWriteCount(para, -1); | |||
| parameter_count.AddReadCount(para, -1); | |||
| if (visit_source->isa<Parameter>()) { | |||
| parameter_count.AddReadCount(visit_source, read - 1); | |||
| } | |||
| for (auto &node : all_nodes) { | |||
| for (size_t i = 0; i < node->size(); ++i) { | |||
| if (node->input(i) == para) { | |||
| @@ -260,8 +292,6 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph, | |||
| } | |||
| } | |||
| } | |||
| parameter_count.AddReadCount(source, 1); | |||
| parameter_count.AddReadCount(para, -1); | |||
| } | |||
| root_graph->set_execution_order(exec_order); | |||
| } | |||
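The reworked bookkeeping erases an Assign(para, source) node and rewires para's readers to the source: para loses its one write and one read, and the source gains read - 1 (the rewired readers, minus the Assign's own read of source that disappears with it). A toy counter mirroring the net adjustments applied in the diff, with illustrative names:

```cpp
#include <map>
#include <string>

// Stand-in for the parameter read/write counters used in EraseParameter.
struct ParameterCount {
  std::map<std::string, int> reads, writes;
  void AddReadCount(const std::string &n, int d) { reads[n] += d; }
  void AddWriteCount(const std::string &n, int d) { writes[n] += d; }
};

int main() {
  ParameterCount pc;
  const int read = 3;            // para is read 3 times before the rewrite
  pc.AddWriteCount("para", 1);   // written once, by the Assign
  pc.AddReadCount("para", read);
  pc.AddReadCount("source", 1);  // the Assign itself reads source once
  // Erase Assign(para, source): net adjustments as in the diff.
  pc.AddWriteCount("para", -1);
  pc.AddReadCount("para", -1);
  pc.AddReadCount("source", read - 1);  // rewired readers (+3) minus the erased read (-1)
  return 0;
}
```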
| @@ -318,6 +348,17 @@ void AscendControlParser::ExecutorValidate(NotNull<KernelGraphPtr> root_graph) { | |||
| (void)RecurseGraph(root_graph, NOT_NULL(&memo)); | |||
| EraseParameter(root_graph, memo); | |||
| EraseLabel(root_graph); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| auto save_graphs_path = context_ptr->save_graphs_path(); | |||
| if (save_graphs_path.empty()) { | |||
| save_graphs_path = "."; | |||
| } | |||
| if (context_ptr->save_graphs_flag()) { | |||
| std::string file_path = save_graphs_path + "/after_erase_label_and_parameter.ir"; | |||
| DumpIR(file_path, root_graph.get()); | |||
| } | |||
| } | |||
| std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> AscendControlParser::ParseCallNode( | |||
| @@ -66,7 +66,8 @@ class AscendControlParser { | |||
| static AnfNodePtr InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to); | |||
| static std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> ParseCallNode(NotNull<CNodePtr> call_node); | |||
| static std::tuple<KernelGraphPtr, std::vector<AnfNodePtr>> ParsePartial(NotNull<AnfNodePtr> node); | |||
| static void AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph, | |||
| const NotNull<std::set<KernelGraphPtr> *> memo); | |||
| // root graph order | |||
| static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode, | |||
| NotNull<KernelGraphPtr> graph); | |||
| @@ -353,6 +353,10 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) { | |||
| RootGraphExecutorValidate(NOT_NULL(root_graph)); | |||
| // adjust kernel | |||
| AdjustKernel(root_graph); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| // Assign parameter keys. | |||
| AssignParamKey(root_graph); | |||
| #endif | |||
| // assign stream | |||
| AssignStream(NOT_NULL(root_graph)); | |||
| // insert profiling point | |||
| @@ -511,6 +515,12 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor:: | |||
| } | |||
| // load input data from user input | |||
| LoadInputData(kernel_graph, inputs); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| // Initialize parameter server | |||
| if (!ps_init_) { | |||
| InitPSParamAndOptim(kernel_graph, inputs); | |||
| } | |||
| #endif | |||
| // convert inputs to model | |||
| predictmodel::StepConvertWeight(inputs); | |||
| { | |||
| @@ -16,6 +16,7 @@ | |||
| #include "backend/session/cpu_session.h" | |||
| #include <algorithm> | |||
| #include <sstream> | |||
| #include "ir/tensor.h" | |||
| #include "ir/anf.h" | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| @@ -25,9 +26,15 @@ | |||
| #include "predict/predict.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "runtime/device/cpu/kernel_select_cpu.h" | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| #include "backend/optimizer/common/pass_manager.h" | |||
| #include "backend/optimizer/pass/replace_node_by_proxy.h" | |||
| #ifdef ENABLE_DEBUGGER | |||
| #include "debug/debugger/debugger.h" | |||
| #endif | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| #include "frontend/parallel/ps/util.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace session { | |||
| @@ -49,12 +56,29 @@ ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf, | |||
| return new_parameter; | |||
| } | |||
| void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) { | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| std::string pass_name = "replace_node_by_proxy"; | |||
| pass_name.append(std::to_string(graph_sum_)); | |||
| pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name)); | |||
| optimizer->AddPassManager(pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| } | |||
| GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { | |||
| auto graph_id = graph_sum_; | |||
| auto graph = ConstructKernelGraph(lst, outputs); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_LOG(INFO) << "Set kernel info"; | |||
| SetKernelInfo(graph.get()); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| AssignParamKey(graph); | |||
| if (parallel::ps::Util::IsRoleOfWorker()) { | |||
| Optimize(graph); | |||
| } | |||
| #endif | |||
| predictmodel::StepConvertGraph(graph); | |||
| MS_LOG(INFO) << "Build kernel"; | |||
| BuildKernel(graph.get()); | |||
| @@ -66,6 +90,12 @@ GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList | |||
| void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) { | |||
| auto &kernel_graph = graphs_[graph_id]; | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| // Initialize parameter server | |||
| if (!ps_init_) { | |||
| InitPSParamAndOptim(kernel_graph, inputs); | |||
| } | |||
| #endif | |||
| MS_LOG(INFO) << "Bind input output address"; | |||
| std::vector<tensor::TensorPtr> need_sync_outputs; | |||
| runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs); | |||
| @@ -119,6 +149,48 @@ void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) { | |||
| } | |||
| } | |||
| namespace { | |||
| void KernelNotSupportException(const AnfNodePtr &kernel_node) { | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::stringstream operator_info; | |||
| operator_info << "Operator[" << kernel_name << "] "; | |||
| auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info()); | |||
| if (kernel_info == nullptr) { | |||
| operator_info << "is not support."; | |||
| MS_LOG(EXCEPTION) << operator_info.str(); | |||
| } | |||
| auto kernel_build_Info = kernel_info->select_kernel_build_info(); | |||
| if (kernel_build_Info == nullptr) { | |||
| operator_info << "is not support."; | |||
| MS_LOG(EXCEPTION) << operator_info.str(); | |||
| } | |||
| size_t input_num = kernel_build_Info->GetInputNum(); | |||
| if (input_num > 0) { | |||
| operator_info << " input("; | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i)); | |||
| if (i != input_num - 1) { | |||
| operator_info << ","; | |||
| } | |||
| } | |||
| operator_info << ") "; | |||
| } | |||
| size_t output_num = kernel_build_Info->GetOutputNum(); | |||
| if (output_num > 0) { | |||
| operator_info << "output("; | |||
| for (size_t i = 0; i < output_num; ++i) { | |||
| operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i)); | |||
| if (i != output_num - 1) { | |||
| operator_info << ","; | |||
| } | |||
| } | |||
| operator_info << ") "; | |||
| } | |||
| operator_info << "is not support."; | |||
| MS_LOG(EXCEPTION) << operator_info.str(); | |||
| } | |||
| } // namespace | |||
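With this helper in place, an unsupported CPU kernel now reports the device types of the kernel build info that was selected, producing a message of the form `Operator[Conv2D] input(float32,float32) output(float32) is not supported.` (operator and type names here are hypothetical), instead of the bare operator name the old exception carried.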
| void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| auto &kernel_nodes = kernel_graph->execution_order(); | |||
| @@ -129,7 +201,7 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { | |||
| std::shared_ptr<kernel::CPUKernel> cpu_kernel = | |||
| kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node); | |||
| if (cpu_kernel == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Operator[" << kernel_name << "] is not support."; | |||
| KernelNotSupportException(kernel_node); | |||
| } | |||
| cpu_kernel->Init(kernel_node); | |||
| AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get()); | |||
| @@ -37,6 +37,7 @@ class CPUSession : public SessionBasic { | |||
| protected: | |||
| ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override; | |||
| void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph); | |||
| private: | |||
| void SetKernelInfo(const KernelGraph *kernel_graph); | |||
| @@ -25,6 +25,11 @@ | |||
| #include "backend/optimizer/pass/getitem_tuple.h" | |||
| #include "backend/optimizer/gpu/adam_weight_decay_fusion.h" | |||
| #include "backend/optimizer/gpu/adam_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_bn_cast_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_momentum_cast_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_addn_fusion.h" | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| #include "predict/predict.h" | |||
| #include "common/utils.h" | |||
| @@ -59,6 +64,11 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) { | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>()); | |||
| pm->AddPass(std::make_shared<opt::AdamFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceBNGradCast2Fusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>()); | |||
| optimizer->AddPassManager(pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| @@ -167,6 +177,10 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList | |||
| Optimize(graph); | |||
| // Select kernel build info | |||
| SelectKernel(graph); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| // Assign parameter keys. | |||
| AssignParamKey(graph); | |||
| #endif | |||
| // Convert kernel Graph to model | |||
| predictmodel::StepConvertGraph(graph); | |||
| // Start gpu kernel runtime | |||
| @@ -204,6 +218,10 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten | |||
| auto &kernel_graph = graphs_[graph_id]; | |||
| // Load input data from user input | |||
| LoadInputData(kernel_graph, inputs); | |||
| // Initialize parameter server | |||
| if (!ps_init_) { | |||
| InitPSParamAndOptim(kernel_graph, inputs); | |||
| } | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| // Convert inputs to model | |||
| predictmodel::StepConvertWeight(inputs); | |||
| @@ -307,7 +307,7 @@ CNodePtr KernelGraph::NewCNode(const std::vector<AnfNodePtr> &inputs) { | |||
| if (inputs.size() == 1 || !feature_map_input_indexs.empty()) { | |||
| kernel_info->SetFeatureMapFlag(true); | |||
| } | |||
| if (AnfAlgo::IsRealCNodeKernel(cnode)) { | |||
| if (AnfAlgo::IsRealKernel(cnode)) { | |||
| AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode); | |||
| AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode); | |||
| } | |||
| @@ -929,10 +929,15 @@ void KernelGraph::AddInternalOutput(const AnfNodePtr &front_node, const AnfNodeP | |||
| } | |||
| MS_LOG(INFO) << "Add internal node " << node->DebugString() << " with front node " << front_node->DebugString(); | |||
| front_to_internal_outputs_map_[front_node] = node; | |||
| internal_outputs_to_front_map_[node] = front_node; | |||
| int output_idx = 0; | |||
| if (AnfAlgo::CheckPrimitiveType(front_node, prim::kPrimTupleGetItem)) { | |||
| output_idx = AnfAlgo::GetTupleGetItemOutIndex(front_node->cast<CNodePtr>()); | |||
| } | |||
| internal_outputs_to_front_map_[node][output_idx] = front_node; | |||
| } | |||
| void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node) { | |||
| void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx, | |||
| int dst_output_idx) { | |||
| if (new_node == nullptr || node == nullptr) { | |||
| MS_LOG(INFO) << "New node or node is nullptr"; | |||
| return; | |||
| @@ -947,9 +952,30 @@ void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString(); | |||
| internal_outputs_to_front_map_[new_node] = iter->second; | |||
| front_to_internal_outputs_map_[iter->second] = new_node; | |||
| internal_outputs_to_front_map_.erase(iter); | |||
| auto &front_nodes = iter->second; | |||
| // Move all front nodes to new node mapping | |||
| if (src_output_idx == -1) { | |||
| internal_outputs_to_front_map_[new_node] = front_nodes; | |||
| for (const auto &front_node_iter : front_nodes) { | |||
| front_to_internal_outputs_map_[front_node_iter.second] = new_node; | |||
| } | |||
| internal_outputs_to_front_map_.erase(iter); | |||
| return; | |||
| } | |||
| // Move specified front node to new node mapping | |||
| int index = SizeToInt(src_output_idx); | |||
| auto front_node_iter = front_nodes.find(index); | |||
| if (front_node_iter == front_nodes.end()) { | |||
| MS_LOG(INFO) << "The output " << src_output_idx << " of node " << node->DebugString() << " is not an internal node"; | |||
| return; | |||
| } | |||
| auto front_node = front_node_iter->second; | |||
| internal_outputs_to_front_map_[new_node][dst_output_idx] = front_node; | |||
| front_to_internal_outputs_map_[front_node] = new_node; | |||
| front_nodes.erase(index); | |||
| if (front_nodes.empty()) { | |||
| internal_outputs_to_front_map_.erase(iter); | |||
| } | |||
| } | |||
| AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const { | |||
| @@ -967,14 +993,6 @@ bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const { | |||
| return false; | |||
| } | |||
| AnfNodePtr KernelGraph::GetFrontNodeByInternalOutput(const AnfNodePtr &node) const { | |||
| auto iter = internal_outputs_to_front_map_.find(node); | |||
| if (iter != internal_outputs_to_front_map_.end()) { | |||
| return iter->second; | |||
| } | |||
| return nullptr; | |||
| } | |||
| void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) { | |||
| if (node == nullptr) { | |||
| return; | |||
| @@ -148,10 +148,10 @@ class KernelGraph : public FuncGraph { | |||
| const std::map<std::string, std::pair<AnfNodePtr, int>> &summary_nodes() const { return summary_nodes_; } | |||
| void set_summary_nodes(const std::map<std::string, std::pair<AnfNodePtr, int>> &nodes) { summary_nodes_ = nodes; } | |||
| void AddInternalOutput(const AnfNodePtr &front_node, const AnfNodePtr &node); | |||
| void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node); | |||
| void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1, | |||
| int dst_output_idx = -1); | |||
| AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const; | |||
| bool IsInternalOutput(const AnfNodePtr &node) const; | |||
| AnfNodePtr GetFrontNodeByInternalOutput(const AnfNodePtr &node) const; | |||
| void AddFinalOutputKernel(const AnfNodePtr &node); | |||
| bool IsFinalOutputKernel(const AnfNodePtr &node) const; | |||
| uint32_t current_epoch() const { return current_epoch_; } | |||
| @@ -223,7 +223,7 @@ class KernelGraph : public FuncGraph { | |||
| CNodePtr end_goto_; | |||
| bool null_output_; | |||
| std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_; | |||
| std::unordered_map<AnfNodePtr, AnfNodePtr> internal_outputs_to_front_map_; | |||
| std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_; | |||
| std::set<AnfNodePtr> final_output_kernels_; | |||
| uint32_t current_epoch_; | |||
| }; | |||
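internal_outputs_to_front_map_ changes from a flat node-to-node map into a two-level map keyed by output index, so one backend kernel can back several front-end TupleGetItem nodes at once. A minimal sketch of the new shape and of the move-one-output path that ReplaceInternalOutput takes when src_output_idx != -1 (strings stand in for AnfNodePtr):

```cpp
#include <string>
#include <unordered_map>

using Node = std::string;  // illustrative stand-in for AnfNodePtr

int main() {
  std::unordered_map<Node, std::unordered_map<int, Node>> internal_outputs_to_front;
  // A two-output backend kernel backing two front-end getitem nodes.
  internal_outputs_to_front["fused_kernel"][0] = "getitem_0";
  internal_outputs_to_front["fused_kernel"][1] = "getitem_1";
  // Move only output 1 to a new backend node (dst_output_idx = 0).
  const Node moved = internal_outputs_to_front["fused_kernel"][1];
  internal_outputs_to_front["new_kernel"][0] = moved;
  internal_outputs_to_front["fused_kernel"].erase(1);
  // Drop the old entry once no outputs remain mapped.
  if (internal_outputs_to_front["fused_kernel"].empty()) {
    internal_outputs_to_front.erase("fused_kernel");
  }
  return 0;
}
```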
| @@ -35,6 +35,11 @@ | |||
| #include "ir/dtype.h" | |||
| #include "ir/anf.h" | |||
| #include "ir/func_graph_cloner.h" | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| #include "frontend/parallel/ps/worker.h" | |||
| #include "frontend/parallel/ps/common.h" | |||
| #include "frontend/parallel/ps/util.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace session { | |||
| @@ -295,7 +300,11 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const | |||
| MS_LOG(INFO) << "No corresponding internal output for output node"; | |||
| return; | |||
| } | |||
| auto real_kernel = AnfAlgo::VisitKernel(ref_node, 0); | |||
| size_t output_idx = 0; | |||
| if (AnfAlgo::CheckPrimitiveType(out_node, prim::kPrimTupleGetItem)) { | |||
| output_idx = AnfAlgo::GetTupleGetItemOutIndex(out_node->cast<CNodePtr>()); | |||
| } | |||
| auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx); | |||
| auto ref_real_node = real_kernel.first; | |||
| auto ref_real_node_index = real_kernel.second; | |||
| if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) && | |||
| @@ -320,6 +329,7 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const | |||
| builder.SetOutputsFormat({format}); | |||
| d_kernel_info->set_select_kernel_build_info(builder.Build()); | |||
| AnfAlgo::SetOutputAddr(address, 0, parameter.get()); | |||
| AnfAlgo::SetOutputInferTypeAndShape({type}, {AnfAlgo::GetOutputInferShape(parameter, 0)}, parameter.get()); | |||
| } | |||
| } | |||
| @@ -973,6 +983,16 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std: | |||
| bool internal_output = true; | |||
| std::string kernel_target = GetCNodeTarget(front_real_kernel.first); | |||
| for (auto user : users) { | |||
| auto cnode = user.first->cast<CNodePtr>(); | |||
| if (cnode == nullptr) { | |||
| internal_output = false; | |||
| break; | |||
| } | |||
| auto prim = cnode->input(kAnfPrimitiveIndex); | |||
| if (prim == nullptr || !prim->isa<ValueNode>()) { | |||
| internal_output = false; | |||
| break; | |||
| } | |||
| if (!AnfAlgo::IsRealKernel(user.first) || kernel_target != GetCNodeTarget(user.first)) { | |||
| internal_output = false; | |||
| break; | |||
| @@ -1097,5 +1117,92 @@ KernelGraphPtr SessionBasic::NewKernelGraph() { | |||
| graphs_[graph_sum_++] = graph; | |||
| return graph; | |||
| } | |||
| AnfNodePtr SessionBasic::FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list) { | |||
| MS_EXCEPTION_IF_NULL(push_node); | |||
| for (auto &node : node_list) { | |||
| if (node != nullptr && node->isa<CNode>()) { | |||
| for (auto input : node->cast<CNodePtr>()->inputs()) { | |||
| if (push_node == AnfAlgo::VisitKernel(input, 0).first) { | |||
| if (AnfAlgo::GetCNodeName(node) != kPullOpName) { | |||
| MS_LOG(EXCEPTION) << "The edge between Push and Pull node is invalid."; | |||
| } | |||
| return node; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return nullptr; | |||
| } | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| void SessionBasic::AssignParamKey(const KernelGraphPtr &kernel_graph) { | |||
| if (!parallel::ps::Util::IsRoleOfWorker()) { | |||
| MS_LOG(INFO) << "Not parameter server mode."; | |||
| return; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph->get_return()); | |||
| for (auto &node : node_list) { | |||
| if (node != nullptr && node->isa<CNode>()) { | |||
| // Assign key for forward kernel EmbeddingLookup. | |||
| // The key will be assigned to the embedding table and the Push kernel as well. | |||
| if (AnfAlgo::GetCNodeName(node) == kEmbeddingLookupOpName) { | |||
| size_t embedding_table_idx = 0; | |||
| auto embedding_table = AnfAlgo::GetInputNode(node->cast<CNodePtr>(), embedding_table_idx); | |||
| size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(embedding_table->fullname_with_scope()); | |||
| AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node); | |||
| } else if (AnfAlgo::GetCNodeName(node) == kPushOpName) { | |||
| auto pull_node = FindPullNode(node, node_list); | |||
| if (!pull_node) { | |||
| MS_LOG(EXCEPTION) << "Assigning parameter key failed: can't find Pull node of the Push node."; | |||
| } | |||
| // Second input of Pull node is the trainable parameter. | |||
| size_t parameter_index = 1; | |||
| auto parameter_node = AnfAlgo::GetInputNode(pull_node->cast<CNodePtr>(), parameter_index); | |||
| size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(parameter_node->fullname_with_scope()); | |||
| AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node); | |||
| AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), pull_node); | |||
| std::string optimizer_name = AnfAlgo::GetNodeAttr<std::string>(node, kAttrOptimizerType); | |||
| parallel::ps::Worker<float>::GetInstance().SetKeyOptimId(key, optimizer_name); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, | |||
| const std::vector<tensor::TensorPtr> &inputs_const) { | |||
| if (!parallel::ps::Util::IsRoleOfWorker()) { | |||
| return; | |||
| } | |||
| std::vector<tensor::TensorPtr> inputs(inputs_const); | |||
| size_t input_ctrl_size = 1; | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| if (kernel_graph->input_ctrl_tensors()) { | |||
| input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs); | |||
| } | |||
| auto input_nodes = kernel_graph->inputs(); | |||
| if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) { | |||
| MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size() | |||
| << ", input_ctrl_size:" << input_ctrl_size; | |||
| } | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| for (size_t i = 0; i < inputs.size(); ++i) { | |||
| auto tensor = inputs[i]; | |||
| MS_EXCEPTION_IF_NULL(tensor); | |||
| auto input_node = input_nodes[i]; | |||
| MS_EXCEPTION_IF_NULL(input_node); | |||
| if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) { | |||
| auto pk_node = input_node->cast<ParameterPtr>(); | |||
| mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim( | |||
| pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes())); | |||
| } | |||
| } | |||
| ps_init_ = true; | |||
| } | |||
| #endif | |||
| } // namespace session | |||
| } // namespace mindspore | |||
| @@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr<OpRunInfo>; | |||
| class SessionBasic { | |||
| public: | |||
| SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) { | |||
| SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) { | |||
| #ifdef ENABLE_DEBUGGER | |||
| debugger_ = nullptr; | |||
| #endif | |||
| @@ -104,6 +104,8 @@ class SessionBasic { | |||
| virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; } | |||
| virtual void SetActive(GraphId, GraphId) {} | |||
| virtual void GetSummaryNodes(KernelGraph *graph); | |||
| void AssignParamKey(const KernelGraphPtr &kernel_graph); | |||
| void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const); | |||
| #ifdef ENABLE_DEBUGGER | |||
| // set debugger | |||
| @@ -140,6 +142,7 @@ class SessionBasic { | |||
| AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph); | |||
| void AddParameterToGraphInputs(const std::vector<AnfNodePtr> ¶meters, KernelGraph *graph); | |||
| void InitInternalOutputParameter(const AnfNodePtr &out_node, const AnfNodePtr ¶meter); | |||
| AnfNodePtr FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list); | |||
| std::unordered_map<GraphId, std::shared_ptr<KernelGraph>> graphs_; | |||
| std::unordered_map<GraphInfo, std::shared_ptr<KernelGraph>> run_op_graphs_; | |||
| @@ -148,6 +151,7 @@ class SessionBasic { | |||
| CallBackFunc summary_callback_; | |||
| static GraphId graph_sum_; | |||
| uint32_t device_id_; | |||
| bool ps_init_; | |||
| #ifdef ENABLE_DEBUGGER | |||
| std::shared_ptr<Debugger> debugger_; | |||
| #endif | |||
| @@ -23,9 +23,7 @@ if (ENABLE_D) | |||
| list(APPEND _DEBUG_SRC_LIST | |||
| "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" | |||
| ) | |||
| if (ENABLE_DATA_DUMP) | |||
| list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc") | |||
| endif(ENABLE_DATA_DUMP) | |||
| list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc") | |||
| endif() | |||
| if (ENABLE_DUMP_E2E) | |||
| @@ -120,6 +120,10 @@ std::optional<std::string> Common::GetConfigFile(const std::string &env) { | |||
| MS_LOG(ERROR) << dump_config_file << " not exist."; | |||
| return {}; | |||
| } | |||
| auto suffix = dump_config_file.substr(dump_config_file.find_last_of('.') + 1); | |||
| if (suffix != "json") { | |||
| MS_LOG(EXCEPTION) << "[DataDump] dump config file suffix only support json! But got:." << suffix; | |||
| } | |||
| return dump_config_file; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -29,13 +29,13 @@ void DataDumpParser::ResetParam() { | |||
| net_name_.clear(); | |||
| dump_mode_ = 0; | |||
| dump_step_ = 0; | |||
| kernel_set_.clear(); | |||
| kernel_map_.clear(); | |||
| } | |||
| bool DataDumpParser::DumpEnabled() const { | |||
| auto enable_dump = std::getenv(kEnableDataDump); | |||
| if (!enable_dump) { | |||
| MS_LOG(WARNING) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP"; | |||
| MS_LOG(INFO) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP"; | |||
| return false; | |||
| } | |||
| @@ -60,9 +60,18 @@ std::optional<std::string> DataDumpParser::GetDumpPath() const { | |||
| return {}; | |||
| } | |||
| std::string dump_path_str(dump_path); | |||
| if (!std::all_of(dump_path_str.begin(), dump_path_str.end(), ::isalpha)) { | |||
| MS_LOG(EXCEPTION) << "[DataDump] dump path only support alphas, but got:" << dump_path_str; | |||
| } | |||
| return dump_path_str; | |||
| } | |||
| std::string GetIfstreamString(const std::ifstream &ifstream) { | |||
| std::stringstream buffer; | |||
| buffer << ifstream.rdbuf(); | |||
| return buffer.str(); | |||
| } | |||
| void DataDumpParser::ParseDumpConfig() { | |||
| std::lock_guard<std::mutex> guard(lock_); | |||
| MS_LOG(INFO) << "[DataDump] parse start"; | |||
| @@ -84,7 +93,12 @@ void DataDumpParser::ParseDumpConfig() { | |||
| } | |||
| nlohmann::json j; | |||
| json_file >> j; | |||
| try { | |||
| json_file >> j; | |||
| } catch (nlohmann::json::parse_error &e) { | |||
| MS_LOG(ERROR) << "[DataDump] json contents:" << GetIfstreamString(json_file); | |||
| MS_LOG(EXCEPTION) << "[DataDump] parse json failed, error:" << e.what(); | |||
| } | |||
| if (j.find("DumpSettings") == j.end()) { | |||
| MS_LOG(EXCEPTION) << "[DataDump] DumpSettings is not exist."; | |||
| } | |||
| @@ -111,8 +125,8 @@ bool DataDumpParser::NeedDump(const std::string &op_full_name) const { | |||
| if (dump_mode_ == 0) { | |||
| return true; | |||
| } | |||
| auto iter = kernel_set_.find(op_full_name); | |||
| return iter != kernel_set_.end(); | |||
| auto iter = kernel_map_.find(op_full_name); | |||
| return iter != kernel_map_.end(); | |||
| } | |||
| bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const { | |||
| @@ -145,8 +159,25 @@ bool DataDumpParser::ParseDumpSetting(const nlohmann::json &dump_settings) { | |||
| auto kernel_str = kernel.dump(); | |||
| kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end()); | |||
| MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str; | |||
| kernel_set_.insert(kernel_str); | |||
| kernel_map_.insert({kernel_str, 0}); | |||
| } | |||
| return true; | |||
| } | |||
| void DataDumpParser::MatchKernel(const std::string &kernel_name) { | |||
| auto iter = kernel_map_.find(kernel_name); | |||
| if (iter == kernel_map_.end()) { | |||
| return; | |||
| } | |||
| iter->second = iter->second + 1; | |||
| MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second; | |||
| } | |||
| void DataDumpParser::PrintUnusedKernel() { | |||
| for (const auto &iter : kernel_map_) { | |||
| if (iter.second == 0) { | |||
| MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first; | |||
| } | |||
| } | |||
| } | |||
| } // namespace mindspore | |||
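Replacing kernel_set_ with kernel_map_ lets the parser count how often each configured kernel actually matched, so PrintUnusedKernel can flag config entries that never appeared in the graph. A self-contained sketch of that counting scheme (kernel names are made up):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

std::map<std::string, std::uint32_t> kernel_map;  // name -> match count

void MatchKernel(const std::string &name) {
  auto it = kernel_map.find(name);
  if (it == kernel_map.end()) return;  // not in the dump config: ignore
  ++it->second;
}

void PrintUnusedKernel() {
  for (const auto &kv : kernel_map) {
    if (kv.second == 0) std::printf("Unused kernel in json: %s\n", kv.first.c_str());
  }
}

int main() {
  kernel_map = {{"Conv2D-op1", 0}, {"MatMul-op2", 0}};  // as parsed from the config
  MatchKernel("Conv2D-op1");  // present in the executed graph
  MatchKernel("ReLU-op3");    // never configured: silently skipped
  PrintUnusedKernel();        // warns about MatMul-op2 only
  return 0;
}
```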
| @@ -18,7 +18,7 @@ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_ | |||
| #include <string> | |||
| #include <set> | |||
| #include <map> | |||
| #include <mutex> | |||
| #include <optional> | |||
| #include "nlohmann/json.hpp" | |||
| @@ -39,7 +39,8 @@ class DataDumpParser { | |||
| const std::string &net_name() const { return net_name_; } | |||
| uint32_t dump_mode() const { return dump_mode_; } | |||
| uint32_t dump_step() const { return dump_step_; } | |||
| const std::set<std::string> &kernel_set() const { return kernel_set_; } | |||
| void MatchKernel(const std::string &kernel_name); | |||
| void PrintUnusedKernel(); | |||
| private: | |||
| DataDumpParser() = default; | |||
| @@ -55,7 +56,7 @@ class DataDumpParser { | |||
| std::string net_name_; | |||
| uint32_t dump_mode_{0}; | |||
| uint32_t dump_step_{0}; | |||
| std::set<std::string> kernel_set_; | |||
| std::map<std::string, uint32_t> kernel_map_; | |||
| }; | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_ | |||
| @@ -124,6 +124,8 @@ void ProtoExporter::SetNodeOutputType(const TypePtr &type, const BaseShapePtr &s | |||
| // Do Nothing | |||
| } else if (type->isa<UndeterminedType>()) { | |||
| // Do Nothing | |||
| } else if (type->isa<SparseTensorType>()) { | |||
| // Do Nothing | |||
| } else if (type->isa<Tuple>()) { | |||
| TuplePtr tuple_type = dyn_cast<Tuple>(type); | |||
| type_proto->set_data_type(irpb::DT_TUPLE); | |||