From: @zhao_ting_v Reviewed-by: @liangchenghui,@oacjiewen Signed-off-by: @liangchenghui tags/v1.3.0
| @@ -103,8 +103,7 @@ void AdamDeltaCPUKernel::CheckParams(const std::vector<kernel::AddressPtr> &inpu | |||
| } | |||
| } | |||
| bool AdamDeltaCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| bool AdamDeltaCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CheckParams(inputs, outputs); | |||
| auto m = reinterpret_cast<float *>(inputs[0]->addr); | |||
| @@ -52,7 +52,7 @@ void ArgmaxCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t shape_len = shape_.size(); | |||
| int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| axis += shape_len; | |||
| axis += SizeToLong(shape_len); | |||
| if (axis < 0) { | |||
| MS_LOG(EXCEPTION) << "Invalid axis:" << axis << ", should in range [-1, " << (shape_len - 1) << "]"; | |||
| } | |||
| @@ -23,7 +23,7 @@ namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = input1[i] + input2[i]; | |||
| input1[i] = out[i]; | |||
| @@ -34,7 +34,7 @@ void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = input1[i] + input2[i]; | |||
| } | |||
| @@ -44,7 +44,7 @@ void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = input1[i] - input2[i]; | |||
| } | |||
| @@ -54,7 +54,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = input1[i] * input2[i]; | |||
| } | |||
| @@ -64,7 +64,7 @@ void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| auto dividend = input1[i]; | |||
| auto divisor = input2[i]; | |||
| @@ -89,7 +89,7 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| auto dividend = input1[i]; | |||
| auto divisor = input2[i]; | |||
| @@ -114,7 +114,7 @@ void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| auto dividend = input1[i]; | |||
| auto divisor = input2[i]; | |||
| @@ -139,7 +139,7 @@ void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = static_cast<double>(input1[i]); | |||
| auto y = static_cast<double>(input2[i]); | |||
| @@ -157,7 +157,7 @@ void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = static_cast<double>(input1[i]); | |||
| auto y = static_cast<double>(input2[i]); | |||
| @@ -170,7 +170,7 @@ void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| auto x = static_cast<double>(input1[i]); | |||
| auto y = static_cast<double>(input2[i]); | |||
| @@ -182,7 +182,7 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) { | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| T diff = input1[i] - input2[i]; | |||
| out[i] = diff * diff; | |||
| @@ -193,7 +193,7 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2, | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = (T)atan2(static_cast<double>(input1[i]), static_cast<double>(input2[i])); | |||
| } | |||
| @@ -147,7 +147,7 @@ template <typename T> | |||
| void Asin(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = asin(in[i]); | |||
| out[i] = static_cast<T>(asin(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -157,7 +157,7 @@ template <typename T> | |||
| void ACos(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = acos(in[i]); | |||
| out[i] = static_cast<T>(acos(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -167,7 +167,7 @@ template <typename T> | |||
| void Atan(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = atan(in[i]); | |||
| out[i] = static_cast<T>(atan(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -177,7 +177,7 @@ template <typename T> | |||
| void Sin(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = sin(in[i]); | |||
| out[i] = static_cast<T>(sin(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -187,7 +187,7 @@ template <typename T> | |||
| void Cos(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = cos(in[i]); | |||
| out[i] = static_cast<T>(cos(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -197,7 +197,7 @@ template <typename T> | |||
| void Tan(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = tan(in[i]); | |||
| out[i] = static_cast<T>(tan(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -207,7 +207,7 @@ template <typename T> | |||
| void Sinh(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = sinh(in[i]); | |||
| out[i] = static_cast<T>(sinh(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -217,7 +217,7 @@ template <typename T> | |||
| void Cosh(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = cosh(in[i]); | |||
| out[i] = static_cast<T>(cosh(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -227,7 +227,7 @@ template <typename T> | |||
| void Asinh(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = asinh(in[i]); | |||
| out[i] = static_cast<T>(asinh(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -237,7 +237,7 @@ template <typename T> | |||
| void Acosh(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = acosh(in[i]); | |||
| out[i] = static_cast<T>(acosh(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -247,7 +247,7 @@ template <typename T> | |||
| void Atanh(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = atanh(in[i]); | |||
| out[i] = static_cast<T>(atanh(static_cast<double>(in[i]))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| @@ -127,7 +127,8 @@ std::pair<bool, size_t> CPUKernelFactory::CPUKernelAttrCheck(const std::string & | |||
| return std::make_pair(false, 0); | |||
| } | |||
| bool CPUKernelFactory::CPUKernelSingleAttrCheck(const KernelAttr &kernel_attr, const KernelBuildInfo &kernel_info) { | |||
| bool CPUKernelFactory::CPUKernelSingleAttrCheck(const KernelAttr &kernel_attr, | |||
| const KernelBuildInfo &kernel_info) const { | |||
| for (size_t i = 0; i < kernel_info.GetInputNum(); ++i) { | |||
| auto dtype = kernel_attr.GetAllSame() ? kernel_attr.GetInputAttr(0).first : kernel_attr.GetInputAttr(i).first; | |||
| if (kernel_info.GetInputDeviceType(i) != dtype) { | |||
| @@ -46,7 +46,7 @@ class CPUKernelFactory { | |||
| ~CPUKernelFactory() = default; | |||
| DISABLE_COPY_AND_ASSIGN(CPUKernelFactory) | |||
| std::pair<bool, size_t> CPUKernelAttrCheck(const std::string &kernel_name, const KernelBuildInfo &kernel_info); | |||
| bool CPUKernelSingleAttrCheck(const KernelAttr &kernel_attr, const KernelBuildInfo &kernel_info); | |||
| bool CPUKernelSingleAttrCheck(const KernelAttr &kernel_attr, const KernelBuildInfo &kernel_info) const; | |||
| std::map<std::string, std::vector<std::pair<KernelAttr, CPUKernelCreator>>> name_to_attr_creator_; | |||
| }; | |||
| @@ -32,7 +32,7 @@ bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const | |||
| auto output = reinterpret_cast<int *>(outputs[0]->addr); | |||
| size_t elem_num = inputs[0]->size / sizeof(int); | |||
| for (size_t i = 0; i < elem_num; i++) { | |||
| output[i] = val[i]; | |||
| output[i] = static_cast<int>(val[i]); | |||
| } | |||
| return true; | |||
| @@ -55,7 +55,7 @@ bool IsFiniteCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, co | |||
| } | |||
| void IsFiniteCPUKernel::LaunchKernelFloat16(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| const std::vector<kernel::AddressPtr> &outputs) const { | |||
| float16 *input = reinterpret_cast<float16 *>(inputs[0]->addr); | |||
| bool *output = reinterpret_cast<bool *>(outputs[0]->addr); | |||
| @@ -39,7 +39,7 @@ class IsFiniteCPUKernel : public CPUKernel { | |||
| void LaunchKernelOther(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs); | |||
| void LaunchKernelFloat16(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs); | |||
| void LaunchKernelFloat16(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) const; | |||
| private: | |||
| std::map<TypeId, size_t> dtype_map_ = {{kNumberTypeBool, sizeof(bool)}, {kNumberTypeInt8, sizeof(int8_t)}, | |||
| @@ -34,16 +34,16 @@ void LayerNormCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| if (begin_params_axis < 0) { | |||
| begin_params_axis += x_shape.size(); | |||
| } | |||
| for (size_t i = 0; i < IntToSize(begin_norm_axis); i++) { | |||
| for (size_t i = 0; i < LongToSize(begin_norm_axis); i++) { | |||
| block_num_ *= x_shape[i]; | |||
| } | |||
| for (size_t i = IntToSize(begin_norm_axis); i < x_shape.size(); i++) { | |||
| for (size_t i = LongToSize(begin_norm_axis); i < x_shape.size(); i++) { | |||
| block_size_ *= x_shape[i]; | |||
| } | |||
| for (size_t i = IntToSize(begin_params_axis); i < x_shape.size(); i++) { | |||
| for (size_t i = LongToSize(begin_params_axis); i < x_shape.size(); i++) { | |||
| param_num_ *= x_shape[i]; | |||
| } | |||
| if (block_num_ <= 0 || block_size_ <= 0) { | |||
| if (block_num_ == 0 || block_size_ == 0) { | |||
| MS_LOG(EXCEPTION) << "LayerNormCPUKernel input shape error, input shape: " << x_shape; | |||
| } | |||
| } | |||
| @@ -93,8 +93,8 @@ void LayerNormCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, con | |||
| sum += x[j]; | |||
| square_sum += x[j] * x[j]; | |||
| } | |||
| T block_mean = sum / block_size_; | |||
| T block_var = square_sum / block_size_ - block_mean * block_mean; | |||
| T block_mean = sum / static_cast<T>(block_size_); | |||
| T block_var = square_sum / static_cast<T>(block_size_) - block_mean * block_mean; | |||
| for (size_t j = i * block_size_; j < (i + 1) * block_size_; ++j) { | |||
| auto param_shift = j % param_num_; | |||
| y[j] = (x[j] - block_mean) / (T)std::sqrt(static_cast<double>(block_var) + eps_) * gamma[param_shift] + | |||
| @@ -33,30 +33,30 @@ void LayerNormGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| if (begin_params_axis < 0) { | |||
| begin_params_axis += x_shape.size(); | |||
| } | |||
| for (size_t i = 0; i < IntToSize(begin_norm_axis); i++) { | |||
| for (size_t i = 0; i < LongToSize(begin_norm_axis); i++) { | |||
| block_num_ *= x_shape[i]; | |||
| } | |||
| for (size_t i = IntToSize(begin_norm_axis); i < x_shape.size(); i++) { | |||
| for (size_t i = LongToSize(begin_norm_axis); i < x_shape.size(); i++) { | |||
| block_size_ *= x_shape[i]; | |||
| } | |||
| for (size_t i = 0; i < IntToSize(begin_params_axis); i++) { | |||
| for (size_t i = 0; i < LongToSize(begin_params_axis); i++) { | |||
| param_size_ *= x_shape[i]; | |||
| } | |||
| for (size_t i = begin_params_axis; i < x_shape.size(); i++) { | |||
| for (size_t i = LongToSize(begin_params_axis); i < x_shape.size(); i++) { | |||
| param_num_ *= x_shape[i]; | |||
| } | |||
| if (block_num_ <= 0 || block_size_ <= 0) { | |||
| if (block_num_ == 0 || block_size_ == 0) { | |||
| MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel input shape error, input shape: " << x_shape; | |||
| } | |||
| } | |||
| bool LayerNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, workspace, outputs); | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat64) { | |||
| LaunchKernel<float>(inputs, workspace, outputs); | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "input dtype only support float16, float32, float64"; | |||
| } | |||
| @@ -65,7 +65,6 @@ bool LayerNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input | |||
| template <typename T> | |||
| void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto dy = reinterpret_cast<T *>(inputs[1]->addr); | |||
| @@ -123,7 +122,7 @@ void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| auto var_sqrt = (T)std::pow(static_cast<double>(var[norm_shift]) + eps_, -0.5); | |||
| auto dx1 = dy[j] * gamma[param_shift] * var_sqrt; | |||
| auto dx2 = sum1 * (T)2.0 / block_size_ * (x[j] - mean[norm_shift]); | |||
| auto dx3 = ((T)(-1.0) * var_sqrt * sum2 + ((T)1.0 / block_size_) * sum1 * sum3) * ((T)1.0 / block_size_); | |||
| auto dx3 = ((T)(-1.0) * var_sqrt * sum2 + ((T)1.0 / (T)block_size_) * sum1 * sum3) * ((T)1.0 / (T)block_size_); | |||
| dx[j] = dx1 + dx2 + dx3; | |||
| } | |||
| } | |||
| @@ -35,8 +35,7 @@ class LayerNormGradCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| @@ -133,13 +133,13 @@ void MaximumCPUKernel<T>::InitTensorBroadcastShape() { | |||
| } | |||
| int input_x_dim_offset = output_shape_.size() - input_x_shape_.size(); | |||
| for (size_t j = 0; j < input_x_shape_.size(); j++) { | |||
| broadcast_input_x_shape_[j + input_x_dim_offset] = input_x_shape_[j]; | |||
| broadcast_input_x_shape_[j + IntToSize(input_x_dim_offset)] = input_x_shape_[j]; | |||
| input_x_num_ *= input_x_shape_[j]; | |||
| } | |||
| int input_y_dim_offset = output_shape_.size() - input_y_shape_.size(); | |||
| for (size_t k = 0; k < input_y_shape_.size(); k++) { | |||
| if (need_broadcast_) { | |||
| broadcast_input_y_shape_[k + input_y_dim_offset] = input_y_shape_[k]; | |||
| broadcast_input_y_shape_[k + IntToSize(input_y_dim_offset)] = input_y_shape_[k]; | |||
| input_y_num_ *= input_y_shape_[k]; | |||
| } | |||
| } | |||
| @@ -115,9 +115,11 @@ void MaximumGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, c | |||
| size_t y_tensor_len = GetTensorLen(y_shape_); | |||
| size_t x_tensor_size = x_tensor_len * sizeof(T); | |||
| size_t y_tensor_size = y_tensor_len * sizeof(T); | |||
| memset_s(dx_addr, x_tensor_size, 0, x_tensor_size); | |||
| memset_s(dy_addr, y_tensor_size, 0, y_tensor_size); | |||
| auto res_dx = memset_s(dx_addr, x_tensor_size, 0, x_tensor_size); | |||
| auto res_dy = memset_s(dy_addr, y_tensor_size, 0, y_tensor_size); | |||
| if (res_dx != EOK || res_dy != EOK) { | |||
| MS_LOG(EXCEPTION) << "MaximumGradCPUKernel LaunchKernel task memset failed."; | |||
| } | |||
| std::vector<size_t> x_shape(dout_shape.size(), 1); | |||
| std::vector<size_t> y_shape(dout_shape.size(), 1); | |||
| std::vector<size_t> x_cargo(dout_shape.size(), 0); | |||
| @@ -84,7 +84,7 @@ bool BatchNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input | |||
| auto wksp_in = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto scale_ret = memcpy_s(wksp_in, workspace[0]->size, inputs[2]->addr, inputs[2]->size); | |||
| auto max_size = workspace[0]->size - inputs[2]->size; | |||
| auto bias_ret = memset_s(wksp_in + (inputs[2]->size / sizeof(float)), max_size, 0., max_size); | |||
| auto bias_ret = memset_s(wksp_in + (inputs[2]->size / sizeof(float)), max_size, 0, max_size); | |||
| if (scale_ret != 0 && bias_ret != 0) { | |||
| MS_LOG(EXCEPTION) << "Memcpy_s error."; | |||
| return false; | |||
| @@ -21,7 +21,7 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| bool BatchMatMulCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| bool BatchMatMulCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "batchmatmul error input output size!"; | |||
| @@ -83,7 +83,7 @@ void BatchMatMulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| } | |||
| auto input1_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| dim_k_ = trans_a ? input1_shape[dims - 2] : input1_shape[dims - 1]; | |||
| dim_k_ = static_cast<dnnl_dim_t>(trans_a ? input1_shape[dims - 2] : input1_shape[dims - 1]); | |||
| trans_a_ = trans_a ? TRANSPOSE_YES : TRANSPOSE_NO; | |||
| trans_b_ = trans_b ? TRANSPOSE_YES : TRANSPOSE_NO; | |||
| @@ -58,8 +58,9 @@ void Conv2dGradInputCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| if (stride_me.size() < h_index + 2) { | |||
| MS_LOG(EXCEPTION) << "Strides should greater than " << (h_index + 1) << ", but got " << stride_me.size(); | |||
| } | |||
| (void)std::transform(stride_me.begin() + h_index, stride_me.begin() + h_index + 2, std::back_inserter(stride_ori), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| auto h_index_int64 = SizeToLong(h_index); | |||
| (void)std::transform(stride_me.begin() + h_index_int64, stride_me.begin() + h_index_int64 + 2, | |||
| std::back_inserter(stride_ori), [](const int64_t &value) { return static_cast<int>(value); }); | |||
| (void)std::transform(dilation_me.begin(), dilation_me.end(), std::back_inserter(dilation_ori), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| @@ -23,7 +23,7 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const CNodePtr &kernel_node, | |||
| dnnl::memory::desc src_desc) { | |||
| const dnnl::memory::desc src_desc) { | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if (kernel_name == "ReLU") { | |||
| return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_relu, src_desc, 0.0); | |||
| @@ -32,7 +32,7 @@ class EltWiseCPUKernel : public MKLCPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const CNodePtr &kernel_node, dnnl::memory::desc src_desc); | |||
| dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const CNodePtr &kernel_node, const dnnl::memory::desc src_desc); | |||
| dnnl::prop_kind DnnlForward = dnnl::prop_kind::forward_training; | |||
| }; | |||
| @@ -159,14 +159,13 @@ void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); | |||
| } | |||
| void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, string name) { | |||
| void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const { | |||
| if (memset_s(mem.get_data_handle(), mem.get_desc().get_size(), 0, mem.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << name << " memset error"; | |||
| } | |||
| } | |||
| bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| using dt = dnnl::memory::data_type; | |||
| using tag = dnnl::memory::format_tag; | |||
| @@ -45,7 +45,7 @@ class LSTMGradCPUKernel : public MKLCPUKernel { | |||
| const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory, | |||
| const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory, | |||
| const dnnl::memory &diff_bias_memory); | |||
| void ResetMemory(const dnnl::memory &mem, string name); | |||
| void ResetMemory(const dnnl::memory &mem, const string name) const; | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| int64_t weight_size_ = 0; | |||
| int64_t weight_h_size_ = 0; | |||
| @@ -107,10 +107,12 @@ bool SliceGradCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inp | |||
| } | |||
| bool can_copy_memory[3] = {CanCopyMemoryOnAxis(0), CanCopyMemoryOnAxis(1), CanCopyMemoryOnAxis(2)}; | |||
| int stride_signs[4] = {SignOfStride(0), SignOfStride(1), SignOfStride(2), SignOfStride(3)}; | |||
| size_t out_start_offset[3] = {begin_[0] * output_element_num_[0], begin_[1] * output_element_num_[1], | |||
| begin_[2] * output_element_num_[2]}; | |||
| size_t out_step_size[3] = {strides_[0] * output_element_num_[0], strides_[1] * output_element_num_[1], | |||
| strides_[2] * output_element_num_[2]}; | |||
| size_t out_start_offset[3] = {IntToSize(begin_[0]) * output_element_num_[0], | |||
| IntToSize(begin_[1]) * output_element_num_[1], | |||
| IntToSize(begin_[2]) * output_element_num_[2]}; | |||
| size_t out_step_size[3] = {IntToSize(strides_[0]) * output_element_num_[0], | |||
| IntToSize(strides_[1]) * output_element_num_[1], | |||
| IntToSize(strides_[2]) * output_element_num_[2]}; | |||
| auto in_n_offset = 0; | |||
| auto out_n_offset = out_start_offset[0]; | |||
| for (int i = begin_[0]; stride_signs[0] * i < stride_signs[0] * end_[0]; | |||
| @@ -138,7 +140,7 @@ bool SliceGradCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inp | |||
| continue; | |||
| } | |||
| for (int m = begin_[3]; stride_signs[3] * m < stride_signs[3] * end_[3]; m += strides_[3]) { | |||
| output_addr[out_n_offset + out_c_offset + out_h_offset + m] = *input_addr++; | |||
| output_addr[out_n_offset + out_c_offset + out_h_offset + IntToSize(m)] = *input_addr++; | |||
| } | |||
| } | |||
| } | |||
| @@ -42,7 +42,7 @@ void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const st | |||
| if (k < 1) { | |||
| MS_LOG(EXCEPTION) << "Input k must > 0!"; | |||
| } | |||
| int k_num = std::min<int>(inner_size_, k); | |||
| size_t k_num = IntToSize(std::min<int>(inner_size_, k)); | |||
| if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| @@ -54,10 +54,10 @@ void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const st | |||
| [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; }); | |||
| auto base_output = i * k_num; | |||
| if (!sorted_) { | |||
| std::stable_sort(idx.begin(), idx.begin() + k_num); | |||
| std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num)); | |||
| } | |||
| for (int j = 0; j < k_num; ++j) { | |||
| indices[base_output + j] = idx[j] - base_input; | |||
| for (size_t j = 0; j < k_num; ++j) { | |||
| indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input); | |||
| output[base_output + j] = input[idx[j]]; | |||
| } | |||
| } | |||