diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/cast_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/cast_impl.cu
index 2deecaf6e3..41eae121a3 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/cast_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/cast_impl.cu
@@ -28,35 +28,38 @@ __device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) {
 
 // half --> integer
+// The round-toward-zero (_rz) intrinsics are used so that a half -> integer cast drops the
+// fractional part, matching a C-style cast and numpy's astype(). The round-down (_rd)
+// variants floor instead, which would turn e.g. -0.5 into -1 rather than 0.
 __device__ __forceinline__ void CastBase(const half *input_addr, uint64_t *output_addr) {
-  *output_addr = __half2ull_rn((*input_addr));
+  *output_addr = __half2ull_rz((*input_addr));
 }
 
 __device__ __forceinline__ void CastBase(const half *input_addr, int64_t *output_addr) {
-  *output_addr = __half2ll_rn((*input_addr));
+  *output_addr = __half2ll_rz((*input_addr));
 }
 
 __device__ __forceinline__ void CastBase(const half *input_addr, uint32_t *output_addr) {
-  *output_addr = __half2uint_rn((*input_addr));
+  *output_addr = __half2uint_rz((*input_addr));
 }
 
 __device__ __forceinline__ void CastBase(const half *input_addr, int32_t *output_addr) {
-  *output_addr = __half2int_rn((*input_addr));
+  *output_addr = __half2int_rz((*input_addr));
 }
 
 __device__ __forceinline__ void CastBase(const half *input_addr, uint16_t *output_addr) {
-  *output_addr = __half2ushort_rn((*input_addr));
+  *output_addr = __half2ushort_rz((*input_addr));
 }
 
 __device__ __forceinline__ void CastBase(const half *input_addr, int16_t *output_addr) {
-  *output_addr = __half2short_rn((*input_addr));
+  *output_addr = __half2short_rz((*input_addr));
 }
 
 __device__ __forceinline__ void CastBase(const half *input_addr, uint8_t *output_addr) {
-  *output_addr = static_cast<uint8_t>(__half2ushort_rn((*input_addr)));
+  *output_addr = static_cast<uint8_t>(__half2ushort_rz((*input_addr)));
 }
 
 __device__ __forceinline__ void CastBase(const half *input_addr, int8_t *output_addr) {
-  *output_addr = static_cast<int8_t>(__half2short_rn((*input_addr)));
+  *output_addr = static_cast<int8_t>(__half2short_rz((*input_addr)));
 }
 
 // integer --> half
diff --git a/tests/st/ops/gpu/test_cast_op.py b/tests/st/ops/gpu/test_cast_op.py
index c58299de66..8316d31f2f 100644
--- a/tests/st/ops/gpu/test_cast_op.py
+++ b/tests/st/ops/gpu/test_cast_op.py
@@ -597,3 +597,27 @@ def test_cast31():
     assert type0 == 'uint16'
     type1 = output[1].asnumpy().dtype
     assert type1 == 'uint32'
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_cast32():
+    """Cast float16 to int32 and float64 on GPU; the int32 values must equal numpy's astype()."""
+    np.random.seed(10)
+    # Include negative inputs: truncation toward zero and flooring only differ below zero.
+    x = np.random.uniform(-10, 10, (3, 2)).astype(np.float16)
+    x0 = Tensor(x)
+    t0 = mstype.int32
+    x1 = Tensor(x)
+    t1 = mstype.float64
+
+    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
+    net = Net(t0, t1)
+    output = net(x0, x1)
+    type0 = output[0].asnumpy().dtype
+    assert type0 == 'int32'
+    expected = x.astype(np.int32)
+    assert (output[0].asnumpy() == expected).all()
+    type1 = output[1].asnumpy().dtype
+    assert type1 == 'float64'